cluster_time_series_handling_nan.py
"""
Description: This program clusters time courses using KMeans clustering
with pairwise Dynamic Time Warping (DTW) distance calculation
Inputs:
model: Model ID
sampled_dataset_id: Sampled Dataset ID
output_conc_dataset_path: Path to "model<model_ID>_output_conc.csv" dataset file
nrows: Number of rows (i.e. parameter sets) from "model<model_ID>_output_conc.csv" dataset file to process, default='all'
kmax: Maximum number of clusters to consider for Elbow point determination
random_seed_kmeans: Random seed to be used in KMeans clustering
cluster_label_data_path: Path to save the "cluster_label_model<model_ID>.csv" file with cluster labels
Output:
A csv file with two columns: 'param_index': <model_ID>.<param_ID> and label named "cluster_label_model<model_ID>.csv"
"""
import argparse
import os
import time

import pandas as pd
from dtaidistance import dtw
from scipy.spatial.distance import squareform
from sklearn.cluster import KMeans
from kneed import KneeLocator


def min_max_scaling(df_conc):
    # Scale each time course (row) to the range [0, 1]. A constant time
    # course yields 0/0 = NaN here, which is filtered out in process_data().
    df_conc = df_conc.T
    for column in df_conc.columns:
        df_conc[column] = (df_conc[column] - df_conc[column].min()) / (df_conc[column].max() - df_conc[column].min())
    return df_conc.T


def process_data(df_data):
    # Scale time courses between 0-1
    df_data = min_max_scaling(df_data)
    # Check whether, for any parameter set, the scaled concentration time
    # course contains a NaN
    df_data = df_data.T
    nan_idx = df_data.columns[df_data.isna().any()].tolist()
    if len(nan_idx) > 0:
        # Update dataframe by removing the Parameter IDs of time courses
        # containing NaN
        df_data = df_data.drop(columns=nan_idx)
    return df_data.T


def main(args):
    # Parse arguments
    output_conc_dataset_path = args.output_conc_dataset_path
    nrows = args.nrows
    kmax = args.kmax
    random_seed = args.random_seed_kmeans
    cluster_label_data_path = args.cluster_label_data_path

    # Record start time
    start_time = time.time()

    # Read the output concentration time course dataset for the model over
    # all parameter sets
    df_data = pd.read_csv(output_conc_dataset_path, dtype={'param_index': str})
    df_data = df_data.set_index('param_index')

    # Process output concentration time courses for only 'nrows' parameter sets
    if nrows != 'all':
        df_data = df_data.head(int(nrows))

    # Scale the data and drop any time course containing NaN after scaling
    df_data = process_data(df_data)

    # Calculate the pairwise DTW distance between concentration time courses;
    # compact=True returns a condensed distance vector, which squareform()
    # expands into a full square distance matrix
    distance = dtw.distance_matrix(df_data.to_numpy(), compact=True, parallel=True, use_c=True, show_progress=True)
    distance_matrix = squareform(distance)

    # Cap the maximum number of clusters considered in the elbow point search
    kmax = len(df_data) if kmax == 'all' else min(int(kmax), len(df_data))

    # Within-cluster-sum-of-squared distances (WCSS) for each candidate k.
    # Note that KMeans here treats each row of the distance matrix as a
    # feature vector rather than operating on the raw time courses.
    wcss = []
    for k in range(2, kmax):
        kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=random_seed)
        kmeans.fit(distance_matrix)
        wcss.append(kmeans.inertia_)

    # Locate the elbow point; because the x values passed to KneeLocator are
    # the actual cluster counts, kn.elbow is already the chosen number of
    # clusters
    kn = KneeLocator(
        range(2, kmax),
        wcss,
        curve='convex',
        direction='decreasing',
        interp_method='polynomial')

    # Do KMeans clustering with number of clusters = elbow point
    kmeans = KMeans(n_clusters=kn.elbow, init='k-means++', n_init=10, random_state=random_seed)
    labels = kmeans.fit_predict(distance_matrix)

    # Build a dataframe pairing each Parameter ID with the label found by
    # clustering
    df_cluster_labels = pd.DataFrame({'param_index': df_data.index, 'label': labels})

    # Write cluster label dataframe to csv
    df_cluster_labels.to_csv(os.path.join(cluster_label_data_path, 'cluster_label.csv'), header=True, index=False)

    # Record end time and report elapsed time
    elapsed_time = time.time() - start_time
    print("Elapsed time: ", elapsed_time)


def default_argument_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_conc_dataset_path", default='../../data/v2/',
                        help="Path to the 'model<model_ID>_output_conc.csv' dataset file")
    parser.add_argument("--nrows", default='all',
                        help="Number of rows (i.e. parameter sets) from the 'model<model_ID>_output_conc.csv' dataset file to process, or 'all'")
    parser.add_argument("--kmax", default=100,
                        help="Maximum number of clusters to consider for elbow point determination, or 'all'")
    parser.add_argument("--random_seed_kmeans", type=int, default=27,
                        help="Random seed to be used in KMeans clustering")
    parser.add_argument("--cluster_label_data_path", default='../../data/v2/',
                        help="Directory in which to save the 'cluster_label.csv' file with cluster labels")
    return parser


if __name__ == '__main__':
    args = default_argument_parser().parse_args()
    main(args)