"""
pip install \
--extra-index-url=https://pypi.nvidia.com \
cudf-cu12==23.12.* dask-cudf-cu12==23.12.* cuml-cu12==23.12.* \
cugraph-cu12==23.12.* cuspatial-cu12==23.12.* cuproj-cu12==23.12.* \
cuxfilter-cu12==23.12.* cucim-cu12==23.12.* pylibraft-cu12==23.12.* \
raft-dask-cu12==23.12.*
"""

import argparse
import time

import pandas as pd
from dtaidistance import dtw
from kneed import KneeLocator
from scipy.spatial.distance import squareform

from cuml.cluster import KMeans as cuKMeans
# from sklearn.cluster import KMeans  # CPU fallback for cuml's KMeans
from mdscuda import mds_fit
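
# Example invocation (a sketch using the argparse defaults below; adjust the
# paths to your own layout):
#
#   python cu_mds.py --output_conc_dataset_path model2_output_conc.csv \
#       --nrows all --kmax 2301 --random_seed_kmeans 27 \
#       --cluster_label_data_path output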


def min_max_scaling(df_conc):
    # Scale each concentration time course (row) to the [0, 1] range
    df_conc = df_conc.T
    for column in df_conc.columns:
        df_conc[column] = (df_conc[column] - df_conc[column].min()) / (df_conc[column].max() - df_conc[column].min())
    return df_conc.T
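

# A vectorized equivalent of min_max_scaling, included only as a sketch: pandas
# broadcasts the per-row min and max in one pass, avoiding the per-column Python
# loop above. It is not wired into main(); min_max_scaling remains the function
# this script actually uses.
def min_max_scaling_vectorized(df_conc):
    row_min = df_conc.min(axis=1)
    row_max = df_conc.max(axis=1)
    # sub/div with axis=0 align the Series on the row index, scaling each row
    return df_conc.sub(row_min, axis=0).div(row_max - row_min, axis=0)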


def process_data(df_data):
    # Scale time courses between 0 and 1
    df_data = min_max_scaling(df_data)
    # Check whether any parameter's scaled concentration time course contains a
    # NaN (this happens when a time course is constant, so max == min above)
    df_data = df_data.T
    nan_idx = df_data.columns[df_data.isna().any()].tolist()
    if len(nan_idx) > 0:
        # Drop the Parameter IDs whose time courses contain NaNs
        df_data = df_data.drop(columns=nan_idx)
    df_data = df_data.transpose()
    return df_data
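
# Sketch of process_data on a toy frame (hypothetical values): a constant time
# course scales to NaN in min_max_scaling (max == min) and is therefore dropped:
#
#   toy = pd.DataFrame([[0.0, 1.0, 2.0], [3.0, 3.0, 3.0]], index=['p0', 'p1'])
#   process_data(toy)   # -> only row 'p0' survives, scaled to [0, 1]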


def main(args):
    # Parse arguments
    output_conc_dataset_path = args.output_conc_dataset_path
    nrows = args.nrows
    kmax = args.kmax
    random_seed = args.random_seed_kmeans
    cluster_label_data_path = args.cluster_label_data_path

    # Record start time
    start_time = time.time()

    # Read the output concentration time course dataset for the model over all parameter sets
    df_data = pd.read_csv(output_conc_dataset_path, dtype={'param_index': str})
    df_data = df_data.set_index('param_index')

    # Keep only 'nrows' parameter sets if requested (argparse delivers the value as a string)
    if nrows != 'all':
        df_data = df_data.head(int(nrows))

    # Scale the time courses and drop any that contain NaNs after scaling
    df_data = process_data(df_data)

    # Calculate the pairwise DTW distance between concentration time courses
    distance = dtw.distance_matrix(df_data.to_numpy(), compact=True, parallel=True, use_c=True, show_progress=True)
    distance_matrix = squareform(distance)
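
    # Minimal sketch of the DTW -> squareform step on toy data (hypothetical
    # values, for illustration only). compact=True returns a condensed distance
    # vector; squareform expands it into a symmetric (n, n) matrix:
    #
    #   import numpy as np
    #   toy = np.array([[0.0, 0.5, 1.0], [0.0, 1.0, 0.0], [1.0, 1.0, 1.0]])
    #   d = dtw.distance_matrix(toy, compact=True)   # condensed, shape (3,)
    #   m = squareform(d)                            # symmetric, shape (3, 3)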

    # Embed the distance matrix in 3 dimensions with GPU-accelerated MDS
    x = mds_fit(distance_matrix, n_dims=3, verbosity=1)

    # Maximum number of clusters for which to compute the within-cluster sum of
    # squared distances (WCSS) when locating the elbow point; fall back to a
    # tenth of the dataset size if kmax is 'all' or larger than the dataset
    if kmax != 'all':
        kmax = int(kmax)
    if kmax == 'all' or kmax > len(df_data):
        kmax = int(len(df_data) / 10)

    wcss = []
    for k in range(2, kmax):
        kmeans = cuKMeans(n_clusters=k, random_state=random_seed)
        # CPU alternative: KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=random_seed)
        labels = kmeans.fit_predict(x)
        wcss.append(kmeans.inertia_)
        print(k)

    # Locate the elbow point
    kn = KneeLocator(
        range(2, kmax),
        wcss,
        curve='convex',
        direction='decreasing',
        interp_method='polynomial')
    print(kn.elbow)
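
    # Optional visual check of the elbow, sketched here as a comment; it assumes
    # matplotlib is installed (it is not a dependency of this script):
    #
    #   import matplotlib.pyplot as plt
    #   plt.plot(list(range(2, kmax)), wcss, marker='o')
    #   plt.axvline(kn.elbow, linestyle='--')
    #   plt.xlabel('k')
    #   plt.ylabel('WCSS')
    #   plt.savefig('wcss_elbow.png')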

    # Run the final KMeans clustering on the MDS embedding (consistent with the
    # elbow search above), with the number of clusters set to the elbow point
    # plus an offset of 2
    kmeans = cuKMeans(n_clusters=kn.elbow + 2, random_state=random_seed)
    # CPU alternative: KMeans(n_clusters=kn.elbow+2, init='k-means++', n_init=10, random_state=random_seed)
    labels = kmeans.fit_predict(x)

    # Build a dataframe mapping each Parameter ID to the cluster label found above
    df_cluster_labels = pd.DataFrame(columns=['param_index', 'label'])
    df_cluster_labels['param_index'] = df_data.index
    df_cluster_labels['label'] = labels

    # Write the cluster label dataframe to CSV
    df_cluster_labels.to_csv(cluster_label_data_path + '/cluster_label.csv', header=True, index=False)

    # Record end time and report elapsed wall-clock time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Elapsed time: ", elapsed_time)


def default_argument_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_conc_dataset_path", default='model2_output_conc.csv',
                        help="Path to 'model<model_ID>_output_conc.csv' dataset file")
    parser.add_argument("--nrows", default='all',
                        help="Number of rows (i.e. parameter sets) from 'model<model_ID>_output_conc.csv' dataset file to process, or 'all'")
    parser.add_argument("--kmax", default=2301,
                        help="Maximum number of clusters to consider when locating the elbow point, or 'all'")
    parser.add_argument("--random_seed_kmeans", type=int, default=27,
                        help="Random seed to be used in KMeans clustering")
    parser.add_argument("--cluster_label_data_path", default='output',
                        help="Path to the directory where 'cluster_label.csv' will be saved")
    return parser


if __name__ == '__main__':
    args = default_argument_parser().parse_args()
    main(args)