Skip to content

Commit

Permalink
2a PM - gaussian and benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
qnater committed Jan 8, 2025
1 parent 9ae4a91 commit 597cbe7
Show file tree
Hide file tree
Showing 211 changed files with 804 additions and 872 deletions.
179 changes: 139 additions & 40 deletions .idea/workspace.xml

Large diffs are not rendered by default.

Binary file added imputegap/assets/25_01_08_16_33_26_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_34_34_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_35_19_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_35_57_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_37_19_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_37_54_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_45_33_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_46_33_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_47_09_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added imputegap/assets/25_01_08_16_51_45_plot.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
6 changes: 3 additions & 3 deletions imputegap/params/optimal_parameters_e_eeg-reading_cdrec.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
[cdrec]
rank = 9
epsilon = 1.5973053913529038e-5
iteration = 294
rank = 5
epsilon = 0.31661830435765825
iteration = 821
Binary file modified imputegap/recovery/__pycache__/benchmark.cpython-312.pyc
Binary file not shown.
Binary file modified imputegap/recovery/__pycache__/manager.cpython-312.pyc
Binary file not shown.
90 changes: 87 additions & 3 deletions imputegap/recovery/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,72 @@ def _config_optimization(self, opti_mean, ts_test, pattern, algorithm, block_siz

return i_opti

def average_runs_by_names(self, data):
    """
    Average benchmark results across runs that belong to the same dataset.

    Each entry of ``data`` is a nested result dictionary whose first
    (outermost) key is the dataset name. Runs sharing a dataset name are
    merged, and every score and time value is replaced by its mean over
    those runs.

    Parameters
    ----------
    data : list
        List of nested result dictionaries produced by benchmark runs.

    Returns
    -------
    list
        One merged (averaged) result dictionary per unique dataset name,
        ordered by sorted dataset name.
    """
    # The first key of each run dictionary identifies its dataset.
    all_names = [next(iter(run)) for run in data]
    unique_names = sorted(set(all_names))
    print("All dataset names:", *all_names, "\n")
    print("Unique dataset names:", *unique_names)

    # Bucket the run dictionaries by dataset name.
    grouped = {name: [] for name in unique_names}
    for name, run in zip(all_names, data):
        grouped[name].append(run)

    results_avg = []
    for name in unique_names:
        runs = grouped[name]
        n_runs = len(runs)
        merged = {}

        # Walk the 5-level nesting (dataset / pattern / rate / method / level)
        # and accumulate each value divided by the run count, which yields
        # the mean once every run has been folded in.
        for run in runs:
            for dataset_key, patterns in run.items():
                for pattern_key, rates in patterns.items():
                    for rate_key, methods in rates.items():
                        for method_key, levels in methods.items():
                            for level_key, payload in levels.items():
                                slot = (merged.setdefault(dataset_key, {})
                                              .setdefault(pattern_key, {})
                                              .setdefault(rate_key, {})
                                              .setdefault(method_key, {})
                                              .setdefault(level_key, {"scores": {}, "times": {}}))

                                for score_key, score_val in payload["scores"].items():
                                    slot["scores"][score_key] = slot["scores"].get(score_key, 0) + score_val / n_runs
                                for time_key, time_val in payload["times"].items():
                                    slot["times"][time_key] = slot["times"].get(time_key, 0) + time_val / n_runs

        results_avg.append(merged)

    return results_avg

def avg_results(self, *datasets):
"""
Calculate the average of all metrics and times across multiple datasets.
Expand Down Expand Up @@ -681,7 +747,7 @@ def eval(self, algorithms=["cdrec"], datasets=["eeg-alcohol"], patterns=["mcar"]
print("\t\truns_plots_scores", runs_plots_scores)

print("\truns_plots_scores : ", runs_plots_scores)
save_dir_runs = save_dir + "/run_" + str(i_run)
save_dir_runs = save_dir + "/run_" + str(i_run) + "/" + dataset
print("\truns saved in : ", save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=True, save_dir=save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=False, save_dir=save_dir_runs)
Expand All @@ -692,6 +758,24 @@ def eval(self, algorithms=["cdrec"], datasets=["eeg-alcohol"], patterns=["mcar"]
print("============================================================================\n\n\n\n\n\n")

scores_list, algos, sets = self.avg_results(*run_storage)
_ = Benchmark().generate_heatmap(scores_list, algos, sets, save_dir=save_dir, display=False)
_ = self.generate_heatmap(scores_list, algos, sets, save_dir=save_dir, display=False)

run_averaged = self.average_runs_by_names(run_storage)

save_dir_agg = save_dir + "/aggregation"
print("\taggragation of results saved in : ", save_dir_agg)

for scores in run_averaged:
all_keys = list(scores.keys())
dataset_name = str(all_keys[0])

save_dir_agg_set = save_dir_agg + "/" + dataset_name

self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=True, save_dir=save_dir_agg_set)
self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=False, save_dir=save_dir_agg_set)
self.generate_reports_txt(scores, save_dir_agg_set, dataset_name, -1)
self.generate_reports_excel(scores, save_dir_agg_set, dataset_name, -1)


return run_averaged, scores_list

return run_storage, scores_list
77 changes: 77 additions & 0 deletions imputegap/recovery/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
import importlib.resources
from scipy.stats import norm

from imputegap.tools import utils

Expand Down Expand Up @@ -617,3 +618,79 @@ def blackout(input_data, missing_rate=0.2, offset=0.1):
"""
return TimeSeries.Contamination.missing_percentage(input_data, series_rate=1, missing_rate=missing_rate,
offset=offset)

def gaussian(input_data, series_rate=0.2, missing_rate=0.2, std_dev=0.2, offset=0.1, seed=True):
    """
    Apply contamination with a Gaussian distribution to the time series data.

    The first ``series_rate`` fraction of series is contaminated: for each
    selected series, missing positions are sampled (without replacement)
    with probabilities given by a normal density centred relative to the
    series mean, leaving an uncontaminated prefix of size ``offset``.

    Parameters
    ----------
    input_data : numpy.ndarray
        The time series dataset to contaminate.
    series_rate : float, optional
        Percentage of series to contaminate (default is 0.2).
    missing_rate : float, optional
        Percentage of missing values per series (default is 0.2).
    std_dev : float, optional
        Standard deviation of the Gaussian distribution for missing values (default is 0.2).
    offset : float, optional
        Size of the uncontaminated section at the beginning of the series (default is 0.1).
    seed : bool, optional
        Whether to use a seed for reproducibility (default is True).

    Returns
    -------
    numpy.ndarray
        The contaminated time series data.
    """
    ts_contaminated = input_data.copy()
    M, _ = ts_contaminated.shape

    # Fixed seed keeps the sampled missing positions reproducible.
    if seed:
        seed_value = 42
        np.random.seed(seed_value)

    # Validation and limitation of input parameters
    missing_rate = utils.verification_limitation(missing_rate)
    series_rate = utils.verification_limitation(series_rate)
    offset = utils.verification_limitation(offset)

    nbr_series_impacted = int(np.ceil(M * series_rate))

    print("\n\nGAUSSIAN contamination has been called with :"
          "\n\ta number of series impacted ", series_rate * 100, "%",
          "\n\ta missing rate of ", missing_rate * 100, "%",
          "\n\ta starting position at ", offset,
          "\n\tGaussian std_dev ", std_dev,
          "\n\tshape of the set ", ts_contaminated.shape,
          "\n\tthis selection of series 0 to ", nbr_series_impacted, "\n\n")

    for idx in range(0, nbr_series_impacted):
        row = int(idx)
        n_values = len(ts_contaminated[row])            # number of values in the series
        protected = int(n_values * offset)              # values protected at the start of the series
        n_missing = int((n_values - protected) * missing_rate)  # data points to remove
        candidates = np.arange(protected, n_values)     # positions eligible for contamination

        # Centre of the density is driven by the series mean, clamped to
        # [-1, 1] so the location stays within one span of the window.
        center = np.mean(ts_contaminated[row])
        center = max(min(center, 1), -1)

        loc = protected + center * (n_values - protected)
        scale = std_dev * (n_values - protected)
        probabilities = norm.pdf(candidates, loc=loc, scale=scale)

        print("\n\nmean = ", center)
        print("P + mean * (N - P) = ", loc)
        print("std_dev * (N - P) = ", scale)
        print("probabilities.sum() = ", probabilities.sum())

        # normalizes the probabilities so that their sum equals 1
        probabilities = probabilities / probabilities.sum()

        # select the values based on the probability
        missing_indices = np.random.choice(candidates, size=n_missing, replace=False, p=probabilities)

        # apply missing values
        ts_contaminated[row, missing_indices] = np.nan

    return ts_contaminated
Binary file removed imputegap/reports/benchmarking_mae.jpg
Binary file not shown.
Binary file removed imputegap/reports/benchmarking_rmse.jpg
Binary file not shown.
Binary file removed imputegap/reports/benchmarking_time.jpg
Binary file not shown.
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_MAE.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_MI.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/chlorine_mcar_RMSE.jpg
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_MAE.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_MI.jpg
Binary file not shown.
Binary file removed imputegap/reports/report_01/drift_mcar_RMSE.jpg
Binary file not shown.
Binary file not shown.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_MAE.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_MI.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegalcohol_mcar_RMSE.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_MAE.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_MI.jpg
Diff not rendered.
Binary file removed imputegap/reports/report_01/eegreading_mcar_RMSE.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Binary file removed imputegap/reports/report_01/fmristoptask_mcar_MI.jpg
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Diff not rendered.
Loading

0 comments on commit 597cbe7

Please sign in to comment.