Skip to content

Commit

Permalink
2a PM - gaussian and benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
qnater committed Jan 8, 2025
1 parent 9ae4a91 commit 597cbe7
Show file tree
Hide file tree
Showing 211 changed files with 804 additions and 872 deletions.
179 changes: 139 additions & 40 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Binary file added imputegap/assets/25_01_08_16_33_26_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_34_34_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_35_19_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_35_57_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_37_19_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_37_54_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_45_33_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_46_33_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_47_09_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_51_45_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions imputegap/params/optimal_parameters_e_eeg-reading_cdrec.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[cdrec]
rank = 9
epsilon = 1.5973053913529038e-5
iteration = 294
rank = 5
epsilon = 0.31661830435765825
iteration = 821
Binary file modified imputegap/recovery/__pycache__/benchmark.cpython-312.pyc
Binary file not shown.
Binary file modified imputegap/recovery/__pycache__/manager.cpython-312.pyc
Binary file not shown.
90 changes: 87 additions & 3 deletions imputegap/recovery/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,72 @@ def _config_optimization(self, opti_mean, ts_test, pattern, algorithm, block_siz

return i_opti

def average_runs_by_names(self, data):
    """
    Average benchmark results across runs that belong to the same dataset.

    Each entry of ``data`` is a nested result dictionary whose first
    (outermost) key is the dataset name. Runs sharing a dataset name are
    merged, and every score and time value is replaced by its mean over
    those runs.

    Parameters
    ----------
    data : list
        List of nested result dictionaries produced by benchmark runs.

    Returns
    -------
    list
        One merged (averaged) result dictionary per unique dataset name,
        ordered by sorted dataset name.
    """
    # The first key of each run dictionary identifies its dataset.
    all_names = [next(iter(run)) for run in data]
    unique_names = sorted(set(all_names))
    print("All dataset names:", *all_names, "\n")
    print("Unique dataset names:", *unique_names)

    # Bucket the run dictionaries by dataset name.
    grouped = {name: [] for name in unique_names}
    for name, run in zip(all_names, data):
        grouped[name].append(run)

    results_avg = []
    for name in unique_names:
        runs = grouped[name]
        n_runs = len(runs)
        merged = {}

        # Walk the 5-level nesting (dataset / pattern / rate / method / level)
        # and accumulate each value divided by the run count, which yields
        # the mean once every run has been folded in.
        for run in runs:
            for dataset_key, patterns in run.items():
                for pattern_key, rates in patterns.items():
                    for rate_key, methods in rates.items():
                        for method_key, levels in methods.items():
                            for level_key, payload in levels.items():
                                slot = (merged.setdefault(dataset_key, {})
                                              .setdefault(pattern_key, {})
                                              .setdefault(rate_key, {})
                                              .setdefault(method_key, {})
                                              .setdefault(level_key, {"scores": {}, "times": {}}))

                                for score_key, score_val in payload["scores"].items():
                                    slot["scores"][score_key] = slot["scores"].get(score_key, 0) + score_val / n_runs
                                for time_key, time_val in payload["times"].items():
                                    slot["times"][time_key] = slot["times"].get(time_key, 0) + time_val / n_runs

        results_avg.append(merged)

    return results_avg

def avg_results(self, *datasets):
"""
Calculate the average of all metrics and times across multiple datasets.
Expand Down Expand Up @@ -681,7 +747,7 @@ def eval(self, algorithms=["cdrec"], datasets=["eeg-alcohol"], patterns=["mcar"]
print("\t\truns_plots_scores", runs_plots_scores)

print("\truns_plots_scores : ", runs_plots_scores)
save_dir_runs = save_dir + "/run_" + str(i_run)
save_dir_runs = save_dir + "/run_" + str(i_run) + "/" + dataset
print("\truns saved in : ", save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=True, save_dir=save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=False, save_dir=save_dir_runs)
Expand All @@ -692,6 +758,24 @@ def eval(self, algorithms=["cdrec"], datasets=["eeg-alcohol"], patterns=["mcar"]
print("============================================================================\n\n\n\n\n\n")

scores_list, algos, sets = self.avg_results(*run_storage)
_ = Benchmark().generate_heatmap(scores_list, algos, sets, save_dir=save_dir, display=False)
_ = self.generate_heatmap(scores_list, algos, sets, save_dir=save_dir, display=False)

run_averaged = self.average_runs_by_names(run_storage)

save_dir_agg = save_dir + "/aggregation"
print("\taggragation of results saved in : ", save_dir_agg)

for scores in run_averaged:
all_keys = list(scores.keys())
dataset_name = str(all_keys[0])

save_dir_agg_set = save_dir_agg + "/" + dataset_name

self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=True, save_dir=save_dir_agg_set)
self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=False, save_dir=save_dir_agg_set)
self.generate_reports_txt(scores, save_dir_agg_set, dataset_name, -1)
self.generate_reports_excel(scores, save_dir_agg_set, dataset_name, -1)


return run_averaged, scores_list

return run_storage, scores_list
77 changes: 77 additions & 0 deletions imputegap/recovery/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
import importlib.resources
from scipy.stats import norm

from imputegap.tools import utils

Expand Down Expand Up @@ -617,3 +618,79 @@ def blackout(input_data, missing_rate=0.2, offset=0.1):
"""
return TimeSeries.Contamination.missing_percentage(input_data, series_rate=1, missing_rate=missing_rate,
offset=offset)

def gaussian(input_data, series_rate=0.2, missing_rate=0.2, std_dev=0.2, offset=0.1, seed=True):
    """
    Apply contamination with a Gaussian distribution to the time series data.

    The first ``series_rate`` fraction of series is contaminated: for each
    selected series, missing positions are sampled (without replacement)
    with probabilities given by a normal density centred relative to the
    series mean, leaving an uncontaminated prefix of size ``offset``.

    Parameters
    ----------
    input_data : numpy.ndarray
        The time series dataset to contaminate.
    series_rate : float, optional
        Percentage of series to contaminate (default is 0.2).
    missing_rate : float, optional
        Percentage of missing values per series (default is 0.2).
    std_dev : float, optional
        Standard deviation of the Gaussian distribution for missing values (default is 0.2).
    offset : float, optional
        Size of the uncontaminated section at the beginning of the series (default is 0.1).
    seed : bool, optional
        Whether to use a seed for reproducibility (default is True).

    Returns
    -------
    numpy.ndarray
        The contaminated time series data.
    """
    ts_contaminated = input_data.copy()
    M, _ = ts_contaminated.shape

    # Fixed seed keeps the sampled missing positions reproducible.
    if seed:
        seed_value = 42
        np.random.seed(seed_value)

    # Validation and limitation of input parameters
    missing_rate = utils.verification_limitation(missing_rate)
    series_rate = utils.verification_limitation(series_rate)
    offset = utils.verification_limitation(offset)

    nbr_series_impacted = int(np.ceil(M * series_rate))

    print("\n\nGAUSSIAN contamination has been called with :"
          "\n\ta number of series impacted ", series_rate * 100, "%",
          "\n\ta missing rate of ", missing_rate * 100, "%",
          "\n\ta starting position at ", offset,
          "\n\tGaussian std_dev ", std_dev,
          "\n\tshape of the set ", ts_contaminated.shape,
          "\n\tthis selection of series 0 to ", nbr_series_impacted, "\n\n")

    for idx in range(0, nbr_series_impacted):
        row = int(idx)
        n_values = len(ts_contaminated[row])            # number of values in the series
        protected = int(n_values * offset)              # values protected at the start of the series
        n_missing = int((n_values - protected) * missing_rate)  # data points to remove
        candidates = np.arange(protected, n_values)     # positions eligible for contamination

        # Centre of the density is driven by the series mean, clamped to
        # [-1, 1] so the location stays within one span of the window.
        center = np.mean(ts_contaminated[row])
        center = max(min(center, 1), -1)

        loc = protected + center * (n_values - protected)
        scale = std_dev * (n_values - protected)
        probabilities = norm.pdf(candidates, loc=loc, scale=scale)

        print("\n\nmean = ", center)
        print("P + mean * (N - P) = ", loc)
        print("std_dev * (N - P) = ", scale)
        print("probabilities.sum() = ", probabilities.sum())

        # normalizes the probabilities so that their sum equals 1
        probabilities = probabilities / probabilities.sum()

        # select the values based on the probability
        missing_indices = np.random.choice(candidates, size=n_missing, replace=False, p=probabilities)

        # apply missing values
        ts_contaminated[row, missing_indices] = np.nan

    return ts_contaminated
Binary file removed imputegap/reports/benchmarking_mae.jpg
Binary file not shown.
Binary file removed imputegap/reports/benchmarking_rmse.jpg
Binary file not shown.
Binary file removed imputegap/reports/benchmarking_time.jpg
Binary file not shown.
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_MAE.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_MI.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_RMSE.jpg
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_MAE.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_MI.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_RMSE.jpg
Binary file not shown.
Binary file not shown.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_MAE.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_MI.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_RMSE.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_MAE.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_MI.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_RMSE.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/fmristoptask_mcar_MI.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Loading

0 comments on commit 597cbe7

Please sign in to comment.