-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_cluster_barycenters.py
128 lines (113 loc) · 6.68 KB
/
get_cluster_barycenters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
"""
Description: This program calculates the barycenters of either time series clusters or
barycenter clusters.
When the input dataset contains cluster labels of time series, it calculates barycenters
only for clusters whose number of members is greater than min_cluster_size; the time
series belonging to smaller clusters are dropped. Each barycenter is assigned an index
given by "<model_ID>.<cluster_ID>".
When the input dataset contains cluster labels of barycenters, each calculated
barycenter is assigned an index given by "b.f<functional_cluster_ID>".
Inputs:
input_data_id: Model ID, if the input dataset contains time series cluster labels, or the
iteration number of the barycenter calculation, if the input dataset contains barycenter cluster labels
sampled_dataset_id: Sampled Dataset ID
min_cluster_size: Minimum size of time series cluster for which barycenter should be calculated
barycenter_flag: Flag to indicate if input file contains time series cluster labels (when 0) or barycenter
cluster labels (when 1)
cluster_label_data_path: Path to cluster label data files "cluster_label_model<model_ID>.csv" or
"barycenter<iteration_number>_sampled_dataset<sampled_dataset_id>.csv"
output_conc_dataset_path: Path to 'model<model_ID>_output_conc.csv' dataset file
barycenter_dataset_path: Path to save the file "barycenter<iteration_number>_sampled_dataset<sampled_dataset_id>.csv"
containing the calculated barycenters for each cluster
Outputs:
A csv file with 101 columns giving the barycenter at time points [t0-t100] and one column 'param_index' containing
"<model_ID>.<cluster_ID>" if the input dataset consists of time series clusters and "b.f<functional_cluster_ID>" if the
input dataset consists of barycenters.
"""
import pandas as pd
from tslearn.barycenters import softdtw_barycenter
import argparse
import time
import os
def min_max_scaling(df_conc):
    """Scale every row of *df_conc* to the range [0, 1].

    Each row is treated as one series: its minimum is subtracted and the
    result is divided by the row's (max - min) range.

    Args:
        df_conc: DataFrame with one series per row and numeric columns.

    Returns:
        A new DataFrame with the same index and columns holding the scaled
        values. The input frame is not modified. A row whose range is zero
        (all values equal) comes back as NaN, because 0 / 0 is undefined.
    """
    row_min = df_conc.min(axis=1)
    row_range = df_conc.max(axis=1) - row_min
    # Vectorised row-wise arithmetic replaces the per-column Python loop over
    # the transposed frame; axis=0 aligns the per-row statistics on the index.
    return df_conc.sub(row_min, axis=0).div(row_range, axis=0)
def _barycenter_csv_path(directory, iteration, sampled_dataset_id):
    """Return the path of "barycenter<iteration>_sampled_dataset<id>.csv" inside *directory*."""
    file_name = 'barycenter{}_sampled_dataset{}.csv'.format(iteration, sampled_dataset_id)
    return os.path.join(directory, file_name)


def main(args):
    """Compute one soft-DTW barycenter per cluster and save them to a CSV file.

    Reads "cluster_label_model<input_data_id>.csv" from
    ``args.cluster_label_data_path`` and "model<input_data_id>_output_conc.csv"
    from ``args.output_conc_dataset_path``, min-max scales the series and calls
    ``softdtw_barycenter`` on the members of every cluster label.

    Args:
        args: Parsed command-line namespace providing ``input_data_id``,
            ``sampled_dataset_id``, ``min_cluster_size``, ``barycenter_flag``,
            ``cluster_label_data_path``, ``output_conc_dataset_path`` and
            ``barycenter_dataset_path``.

    Output files (inside ``args.barycenter_dataset_path``):
        * ``barycenter_flag == 1``: writes
          "barycenter<input_data_id + 1>_sampled_dataset<sampled_dataset_id>.csv",
          replacing any existing file.
        * otherwise: appends to
          "barycenter0_sampled_dataset<sampled_dataset_id>.csv"; the header row
          is written only when the file does not exist yet. Nothing is written
          when no cluster produced a barycenter.

    The elapsed wall-clock time is printed at the end.
    """
    data_id = args.input_data_id
    sampled_dataset_id = args.sampled_dataset_id
    # argparse delivers a string when the option is passed without a declared
    # type; coerce so the size comparison below cannot raise TypeError.
    min_cluster_size = int(args.min_cluster_size)
    barycenter_flag = args.barycenter_flag

    # Record start time
    start_time = time.time()

    # Get cluster labels
    cluster_label_file = os.path.join(
        args.cluster_label_data_path,
        'cluster_label_model' + str(data_id) + '.csv',
    )
    df_cluster_labels = pd.read_csv(cluster_label_file, dtype={'param_index': str})

    # Get time course/barycenter dataset
    conc_file = os.path.join(
        args.output_conc_dataset_path,
        'model' + str(data_id) + '_output_conc.csv',
    )
    df_conc = pd.read_csv(conc_file, dtype={'param_index': str})
    df_conc = df_conc.set_index('param_index')

    # Scale time course or barycenter data between 0-1
    df_conc = min_max_scaling(df_conc)

    # If input dataset consists of time courses, drop every series (row) that
    # contains a NaN entry.
    if barycenter_flag == 0:
        df_conc = df_conc.dropna(axis=0, how='any')

    index = []
    barycenter_list = []
    for label in df_cluster_labels['label'].unique():
        member_positions = df_cluster_labels.loc[df_cluster_labels['label'] == label].index
        # Small time series clusters are skipped; barycenter clusters never are.
        if barycenter_flag == 0 and len(member_positions) <= min_cluster_size:
            continue
        # NOTE(review): the row positions of the label file are used as
        # positional indices into df_conc. This presumably relies on the label
        # file having been produced from the same NaN-filtered row order --
        # confirm against the clustering step.
        cluster_conc = df_conc.iloc[member_positions, :].reset_index(drop=True)
        cluster_barycenter = softdtw_barycenter(cluster_conc, gamma=1.0, max_iter=50, tol=1e-3)
        barycenter_list.append(cluster_barycenter.ravel())
        if barycenter_flag == 0:
            index.append(str(data_id) + '.' + str(label))
        else:
            index.append('b.f' + str(label))

    df_barycenter = pd.DataFrame(barycenter_list)
    df_barycenter['param_index'] = index

    if barycenter_flag == 1:
        output_file = _barycenter_csv_path(
            args.barycenter_dataset_path, int(data_id) + 1, sampled_dataset_id
        )
        df_barycenter.to_csv(output_file, index=False, header=True)
    elif not barycenter_list:
        # Writing an empty frame to a new file would emit a header holding only
        # "param_index", which no longer matches the rows appended by later runs.
        print("No cluster larger than min_cluster_size; nothing written for input", data_id)
    else:
        output_file = _barycenter_csv_path(args.barycenter_dataset_path, 0, sampled_dataset_id)
        # Several runs append to the same file; only the run that creates it
        # writes the header row.
        write_header = not os.path.isfile(output_file)
        df_barycenter.to_csv(output_file, mode='a', index=False, header=write_header)

    # Record end time
    end_time = time.time()
    # Calculate elapsed time
    elapsed_time = end_time - start_time
    print("Elapsed time: ", elapsed_time)
def default_argument_parser():
    """Build the command-line parser for the barycenter calculation script.

    Returns:
        argparse.ArgumentParser: Parser with the required options
        ``--input_data_id``, ``--sampled_dataset_id`` and ``--barycenter_flag``,
        and the optional ``--min_cluster_size``, ``--cluster_label_data_path``,
        ``--output_conc_dataset_path`` and ``--barycenter_dataset_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_data_id", help="Model ID or Barycenter iteration number", required=True)
    parser.add_argument("--sampled_dataset_id", help="Sampled Dataset ID", required=True)
    # type=int keeps a value given on the command line an integer, like the
    # default; without it the value would be parsed as a string.
    parser.add_argument("--min_cluster_size", type=int, default=10,
                        help="Minimum size of time series cluster for which barycenter should be calculated. Not applicable if barycenter_flag = 1")
    parser.add_argument("--barycenter_flag", type=int, help="Flag to indicate if input file contains time series "
                        "cluster labels (when 0) or barycenter cluster labels (when 1)", required=True)
    parser.add_argument("--cluster_label_data_path", default='/home/user/PycharmProjects/data/v2/csvs/sampled_dataset0/cluster_labels',
                        help='Path to cluster label data files "cluster_label_model<model_ID>.csv" or '
                             '"barycenter<iteration_number>_sampled_dataset<sampled_dataset_id>.csv"')
    parser.add_argument("--output_conc_dataset_path", default='/home/user/PycharmProjects/data/v2/dataset0_lhs',
                        help="Path to 'model<model_ID>_output_conc.csv' dataset file")
    parser.add_argument("--barycenter_dataset_path", default='/home/user/PycharmProjects/data/v2/csvs/sampled_dataset0/barycenter_dataset',
                        help='Path to save the file "barycenter<iteration_number>_sampled_dataset<sampled_dataset_id>.csv" containing the calculated barycenters for each cluster')
    return parser
if __name__ == '__main__':
    # Script entry point: parse the command-line options and hand them to main().
    main(default_argument_parser().parse_args())