2e PM - gaussian, overlap, disjoint and benchmark
qnater committed Jan 9, 2025
1 parent f56f717 commit 96d6ed2
Showing 43 changed files with 1,249 additions and 808 deletions.
281 changes: 36 additions & 245 deletions .idea/workspace.xml


39 changes: 38 additions & 1 deletion README.md
@@ -242,6 +242,43 @@ Explainer.print(shap_values, shap_details)

---


## Benchmark
ImputeGAP enables users to comprehensively evaluate the efficiency of imputation algorithms across multiple datasets, contamination patterns, and missing rates.


### Example Benchmark
You can find this example in the file [`runner_benchmark.py`](https://github.com/eXascaleInfolab/ImputeGAP/blob/main/imputegap/runner_benchmark.py).

```python
from imputegap.recovery.benchmark import Benchmark

# VARIABLES
save_dir = "./analysis"
nbr_run = 2

# SELECT YOUR DATASET(S) :
datasets_demo = ["eeg-alcohol", "eeg-reading"]

# SELECT YOUR OPTIMIZER :
optimizer_bayesian = {"optimizer": "bayesian", "options": {"n_calls": 15, "n_random_starts": 50, "acq_func": "gp_hedge", "metrics": "RMSE"}}
optimizers_demo = [optimizer_bayesian]  # add the optimizers you want to test

# SELECT YOUR ALGORITHM(S) :
algorithms_demo = ["mean", "cdrec", "stmvl", "iim", "mrnn"]

# SELECT YOUR CONTAMINATION PATTERN(S) :
patterns_demo = ["mcar"]

# SELECT YOUR MISSING RATE(S) :
x_axis = [0.05, 0.1, 0.2, 0.4, 0.6, 0.8]

# START THE ANALYSIS
list_results, sum_scores = Benchmark().eval(algorithms=algorithms_demo, datasets=datasets_demo, patterns=patterns_demo, x_axis=x_axis, optimizers=optimizers_demo, save_dir=save_dir, runs=nbr_run)
```

---

## Integration
To add your own imputation algorithm in Python or C++, please refer to the detailed [integration guide](https://github.com/eXascaleInfolab/ImputeGAP/tree/main/procedure/integration).

@@ -251,7 +288,7 @@ To add your own imputation algorithm in Python or C++, please refer to the detai

## References

- Mourad Khayati, Quentin Nater, and Jacques Pasquier. ImputeVIS: An Interactive Evaluator to Benchmark Imputation Techniques for Time Series Data. Proceedings of the VLDB Endowment (PVLDB). Demo Track 17, no. 1 (2024): 4329–32.
+ Mourad Khayati, Quentin Nater, and Jacques Pasquier. ImputeVIS: An Interactive Evaluator to Benchmark Imputation Techniques for Time Series Data. Proceedings of the VLDB Endowment (PVLDB). Demo Track 17, no. 1 (2024), 4329–32.

Mourad Khayati, Alberto Lerner, Zakhar Tymchenko, and Philippe Cudre-Mauroux. Mind the Gap: An Experimental Evaluation of Imputation of Missing Values Techniques in Time Series. In Proceedings of the VLDB Endowment (PVLDB), Vol. 13, 2020.

22 changes: 22 additions & 0 deletions build/lib/imputegap/assets/shap/eeg-alcohol_cdrec_results.txt
@@ -0,0 +1,22 @@
Feature : 15 cdrec with a score of 37.05 Transformation Power in the lowest 20% of frequencies SP_Summaries_welch_rect_area_5_1
Feature : 10 cdrec with a score of 28.93 Geometry Goodness of exponential fit to embedding distance distribution CO_Embed2_Dist_tau_d_expfit_meandiff
Feature : 2 cdrec with a score of 12.2 Correlation First 1/e crossing of the ACF CO_f1ecac
Feature : 21 cdrec with a score of 4.92 Trend Error of 3-point rolling mean forecast FC_LocalSimple_mean3_stderr
Feature : 14 cdrec with a score of 3.25 Geometry Negative outlier timing DN_OutlierInclude_n_001_mdrmd
Feature : 1 cdrec with a score of 3.09 Geometry 10-bin histogram mode DN_HistogramMode_10
Feature : 0 cdrec with a score of 2.73 Geometry 5-bin histogram mode DN_HistogramMode_5
Feature : 5 cdrec with a score of 2.56 Correlation Time reversibility CO_trev_1_num
Feature : 17 cdrec with a score of 1.36 Trend Entropy of successive pairs in symbolized series SB_MotifThree_quantile_hh
Feature : 6 cdrec with a score of 1.31 Geometry Proportion of high incremental changes in the series MD_hrv_classic_pnn40
Feature : 20 cdrec with a score of 0.77 Transformation Centroid frequency SP_Summaries_welch_rect_centroid
Feature : 4 cdrec with a score of 0.7 Correlation Histogram-based automutual information (lag 2, 5 bins) CO_HistogramAMI_even_2_5
Feature : 18 cdrec with a score of 0.51 Geometry Rescaled range fluctuation analysis (low-scale scaling) SC_FluctAnal_2_rsrangefit_50_1_logi_prop_r1
Feature : 13 cdrec with a score of 0.43 Geometry Positive outlier timing DN_OutlierInclude_p_001_mdrmd
Feature : 8 cdrec with a score of 0.18 Geometry Transition matrix column variance SB_TransitionMatrix_3ac_sumdiagcov
Feature : 7 cdrec with a score of 0.02 Geometry Longest stretch of above-mean values SB_BinaryStats_mean_longstretch1
Feature : 3 cdrec with a score of 0.0 Correlation First minimum of the ACF CO_FirstMin_ac
Feature : 9 cdrec with a score of 0.0 Trend Wangs periodicity metric PD_PeriodicityWang_th0_01
Feature : 11 cdrec with a score of 0.0 Correlation First minimum of the AMI function IN_AutoMutualInfoStats_40_gaussian_fmmi
Feature : 12 cdrec with a score of 0.0 Correlation Change in autocorrelation timescale after incremental differencing FC_LocalSimple_mean1_tauresrat
Feature : 16 cdrec with a score of 0.0 Geometry Longest stretch of decreasing values SB_BinaryStats_diff_longstretch0
Feature : 19 cdrec with a score of 0.0 Geometry Detrended fluctuation analysis (low-scale scaling) SC_FluctAnal_2_dfa_50_1_2_logi_prop_r1
@@ -1,4 +1,4 @@
[cdrec]
- rank = 9
- epsilon = 1.5973053913529038e-5
- iteration = 294
+ rank = 5
+ epsilon = 0.31661830435765825
+ iteration = 821
83 changes: 81 additions & 2 deletions build/lib/imputegap/recovery/README.md
@@ -56,7 +56,9 @@ This pattern uses random number generator with fixed seed and will produce the s
<br />

### MISSING PERCENTAGE
- **MISSING PERCENTAGE** selects of percent of series to contaminate from the first to the last with a desired percentage of missing value to remove.
+ **MISSING PERCENTAGE** selects a percentage of time series to contaminate, applying the desired percentage of missing values from the beginning to the end of each selected series.
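To make the definition concrete, here is a minimal NumPy sketch of the idea. The function name, the `offset`-protected prefix, and all parameters are illustrative assumptions, not ImputeGAP's actual API:

```python
import numpy as np

def missing_percentage(data, series_rate=0.5, missing_rate=0.2, offset=2):
    """Contaminate the first `series_rate` fraction of the series by masking
    `missing_rate` of each one's values, starting right after a protected
    prefix of `offset` points (a stand-in for P in the definition table)."""
    cont = data.astype(float).copy()
    n_selected = int(np.ceil(cont.shape[0] * series_rate))   # S: share of series
    width = int((cont.shape[1] - offset) * missing_rate)     # W = (N - P) * R
    cont[:n_selected, offset:offset + width] = np.nan
    return cont
```

Each selected series receives one contiguous block of NaNs of width W; unselected series are left untouched.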



<table>
<tbody>Definition</tbody>
@@ -82,7 +84,84 @@ This pattern uses random number generator with fixed seed and will produce the s


### BLACKOUT
- The **BLACKOUT** pattern selects all time series to introduce missing values. It removes a set percentage of data points from all series, creating gaps for further analysis.
+ The **BLACKOUT** pattern introduces missing values across all time series by removing a specified percentage of data points from each series, creating uniform gaps for analysis.


<table>
<tbody>Definition</tbody>
<tr>
<td>N</td><td>MAX</td>
</tr>
<tr>
<td>M</td><td>MAX</td>
</tr>
<tr>
<td>R</td><td>1 - 100%</td>
</tr>
<tr>
<td>S</td><td>100%</td>
</tr>
<tr>
<td>W</td><td>(N-P) * R</td>
</tr>
<tr>
<td>B</td><td>R</td>
</tr>
</table><br />
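Since S = 100%, BLACKOUT is the degenerate case where every series is contaminated at the same positions. A minimal NumPy sketch (hypothetical helper, not ImputeGAP's API):

```python
import numpy as np

def blackout(data, missing_rate=0.2, offset=2):
    """Mask the same `missing_rate` fraction of points in every series,
    starting after a protected prefix of `offset` points, so the gaps
    are aligned across the whole dataset."""
    cont = data.astype(float).copy()
    width = int((cont.shape[1] - offset) * missing_rate)  # W = (N - P) * R
    cont[:, offset:offset + width] = np.nan               # all rows, same columns
    return cont
```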


### GAUSSIAN
The **GAUSSIAN** pattern introduces missing values into a percentage of time series, with positions selected according to probabilities drawn from a Gaussian distribution.

<table>
<tbody>Definition</tbody>
<tr>
<td>N</td><td>MAX</td>
</tr>
<tr>
<td>M</td><td>MAX</td>
</tr>
<tr>
<td>R</td><td>1 - 100%</td>
</tr>
<tr>
<td>S</td><td>100%</td>
</tr>
<tr>
<td>W</td><td>(N-P) * R * probability</td>
</tr>
<tr>
<td>B</td><td>R</td>
</tr>
</table><br />
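One way to realize this (a sketch under stated assumptions: the Gaussian is centred mid-series, every series is contaminated for brevity, and none of these names come from ImputeGAP's API):

```python
import numpy as np

def gaussian_pattern(data, missing_rate=0.2, std=0.25, offset=2, seed=42):
    """Remove `missing_rate` of the points in each series, with positions
    drawn without replacement according to Gaussian probabilities, so
    values near the centre of the series are more likely to go missing."""
    rng = np.random.default_rng(seed)
    cont = data.astype(float).copy()
    n, m = cont.shape
    t = np.linspace(-1.0, 1.0, m - offset)      # positions after protected prefix
    weights = np.exp(-0.5 * (t / std) ** 2)     # bell curve over the series
    weights /= weights.sum()                    # normalise to probabilities
    budget = int((m - offset) * missing_rate)   # number of points to remove
    for s in range(n):
        idx = rng.choice(m - offset, size=budget, replace=False, p=weights)
        cont[s, idx + offset] = np.nan
    return cont
```

The fixed seed mirrors the document's note that contamination is reproducible across runs.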

### DISJOINT
The **DISJOINT** pattern introduces missing values into time series by selecting segments with non-overlapping intervals. This process continues until either the missing rate limit is reached or the series length is exhausted.

<table>
<tbody>Definition</tbody>
<tr>
<td>N</td><td>MAX</td>
</tr>
<tr>
<td>M</td><td>MAX</td>
</tr>
<tr>
<td>R</td><td>1 - 100%</td>
</tr>
<tr>
<td>S</td><td>100%</td>
</tr>
<tr>
<td>W</td><td>(N-P) * R</td>
</tr>
<tr>
<td>B</td><td>R</td>
</tr>
</table><br />
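The non-overlapping placement can be sketched as follows (illustrative helper, not ImputeGAP's API; each series receives one block, placed where the previous series' block ended):

```python
import numpy as np

def disjoint_pattern(data, missing_rate=0.2, offset=1):
    """Give each series one missing block of width W = (N - P) * R, placed
    at successive non-overlapping column positions, stopping when the
    series length is exhausted."""
    cont = data.astype(float).copy()
    n, m = cont.shape
    width = int((m - offset) * missing_rate)
    start = offset
    for s in range(n):
        if start + width > m:            # series length exhausted: stop early
            break
        cont[s, start:start + width] = np.nan
        start += width                   # next block begins where this one ended
    return cont
```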

### OVERLAP
The **OVERLAP** pattern selects time series segments for introducing missing values by using a disjoint interval that is shifted by a specified percentage. This process continues until either the missing rate limit is reached or the series length is exhausted.
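A minimal sketch of the shifted-interval idea (assumed behaviour: each new block starts a fixed fraction of the series length before the previous block's end, so consecutive blocks share columns; names and parameters are illustrative, not ImputeGAP's API):

```python
import numpy as np

def overlap_pattern(data, missing_rate=0.2, shift=0.1, offset=1):
    """Like DISJOINT, but each new block is shifted back by `shift` of the
    series length, so consecutive blocks overlap by that amount."""
    cont = data.astype(float).copy()
    n, m = cont.shape
    width = int((m - offset) * missing_rate)
    step = max(width - int(m * shift), 1)   # advance by less than a full block
    start = offset
    for s in range(n):
        if start + width > m:               # series length exhausted: stop early
            break
        cont[s, start:start + width] = np.nan
        start += step
    return cont
```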


<table>
102 changes: 95 additions & 7 deletions build/lib/imputegap/recovery/benchmark.py
@@ -21,15 +21,19 @@ class Benchmark:
-------
_config_optimization():
Configure and execute optimization for a selected imputation algorithm and contamination pattern.
average_runs_by_names(self, data):
Average the results of all runs depending on the dataset.
avg_results():
Calculate average metrics (e.g., RMSE) across multiple datasets and algorithm runs.
- generate_matrix():
+ generate_heatmap():
Generate and save a heatmap visualization of RMSE scores for datasets and algorithms.
generate_reports():
generate_reports_txt():
Create detailed text-based reports summarizing metrics and timing results for all evaluations.
generate_reports_excel():
Create detailed excel-based reports summarizing metrics and timing results for all evaluations.
generate_plots():
Visualize metrics (e.g., RMSE, MAE) and timing (e.g., imputation, optimization) across patterns and datasets.
- comprehensive_evaluation():
+ eval():
Perform a complete benchmarking pipeline, including contamination, imputation, evaluation, and reporting.
Example
@@ -84,6 +88,72 @@ def _config_optimization(self, opti_mean, ts_test, pattern, algorithm, block_siz

return i_opti

def average_runs_by_names(self, data):
"""
Average the results of all runs, grouped by dataset.
Parameters
----------
data : list
list of dictionaries containing the results of the benchmark runs.
Returns
-------
list
list of dictionaries containing the results of the benchmark runs, averaged by dataset.
"""
results_avg, all_names = [], []

# Extract dataset names
for dictionary in data:
all_keys = list(dictionary.keys())
dataset_name = all_keys[0]
all_names.append(dataset_name)

# Get unique dataset names
unique_names = sorted(set(all_names))
print("All dataset names:", *all_names, "\n")
print("Unique dataset names:", *unique_names)

# Initialize and populate the split matrix
split = [[0 for _ in range(all_names.count(name))] for name in unique_names]
for i, name in enumerate(unique_names):
x = 0
for y, match in enumerate(all_names):
if name == match:
split[i][x] = data[y]
x += 1

# Iterate over the split matrix to calculate averages
for datasets in split:
tmp = [dataset for dataset in datasets if dataset != 0]
merged_dict = {}
count = len(tmp)

# Process and calculate averages
for dataset in tmp:
for outer_key, outer_value in dataset.items():
for middle_key, middle_value in outer_value.items():
for mean_key, mean_value in middle_value.items():
for method_key, method_value in mean_value.items():
for level_key, level_value in method_value.items():
# Initialize scores and times if not already initialized
merger = merged_dict.setdefault(outer_key, {}
).setdefault(middle_key, {}).setdefault(mean_key, {}
).setdefault(
method_key, {}).setdefault(level_key, {"scores": {}, "times": {}})

# Add scores and times
for score_key, v in level_value["scores"].items():
merger["scores"][score_key] = (merger["scores"].get(score_key, 0) + v / count)
for time_key, time_value in level_value["times"].items():
merger["times"][time_key] = (
merger["times"].get(time_key, 0) + time_value / count)

results_avg.append(merged_dict)

return results_avg
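For intuition, the nested-dictionary merging above behaves like this toy, self-contained reimplementation on two runs (the key names dataset → pattern → algorithm → optimizer → rate are illustrative stand-ins for the outer/middle/mean/method/level keys in the code):

```python
# Two runs with the same nested layout; each score is summed divided by the
# run count, which yields the per-dataset average.
run_1 = {"eeg-alcohol": {"mcar": {"cdrec": {"bayesian": {"0.2": {"scores": {"RMSE": 0.8}, "times": {"imputation": 1.0}}}}}}}
run_2 = {"eeg-alcohol": {"mcar": {"cdrec": {"bayesian": {"0.2": {"scores": {"RMSE": 0.6}, "times": {"imputation": 3.0}}}}}}}

merged, count = {}, 2
for run in (run_1, run_2):
    for ds, patterns in run.items():
        for pat, algos in patterns.items():
            for algo, optims in algos.items():
                for opt, rates in optims.items():
                    for rate, res in rates.items():
                        slot = (merged.setdefault(ds, {}).setdefault(pat, {})
                                      .setdefault(algo, {}).setdefault(opt, {})
                                      .setdefault(rate, {"scores": {}, "times": {}}))
                        for k, v in res["scores"].items():
                            slot["scores"][k] = slot["scores"].get(k, 0) + v / count
                        for k, v in res["times"].items():
                            slot["times"][k] = slot["times"].get(k, 0) + v / count

# RMSE averages to 0.7 and imputation time to 2.0 (up to float rounding)
print(merged["eeg-alcohol"]["mcar"]["cdrec"]["bayesian"]["0.2"])
```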

def avg_results(self, *datasets):
"""
Calculate the average of all metrics and times across multiple datasets.
@@ -203,7 +273,7 @@ def generate_heatmap(self, scores_list, algos, sets, save_dir="./reports", displ
plt.savefig(filepath, dpi=300, bbox_inches='tight') # Save in HD with tight layout

# Show the plot
- if display :
+ if display:
plt.tight_layout()
plt.show()
plt.close()
@@ -681,7 +751,7 @@ def eval(self, algorithms=["cdrec"], datasets=["eeg-alcohol"], patterns=["mcar"]
print("\t\truns_plots_scores", runs_plots_scores)

print("\truns_plots_scores : ", runs_plots_scores)
- save_dir_runs = save_dir + "/run_" + str(i_run)
+ save_dir_runs = save_dir + "/run_" + str(i_run) + "/" + dataset
print("\truns saved in : ", save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=True, save_dir=save_dir_runs)
self.generate_plots(runs_plots_scores=runs_plots_scores, ticks=x_axis, subplot=False, save_dir=save_dir_runs)
@@ -692,6 +762,24 @@
print("============================================================================\n\n\n\n\n\n")

scores_list, algos, sets = self.avg_results(*run_storage)
- _ = Benchmark().generate_heatmap(scores_list, algos, sets, save_dir=save_dir)
+ _ = self.generate_heatmap(scores_list, algos, sets, save_dir=save_dir, display=False)

run_averaged = self.average_runs_by_names(run_storage)

save_dir_agg = save_dir + "/aggregation"
print("\taggregation of results saved in : ", save_dir_agg)

for scores in run_averaged:
all_keys = list(scores.keys())
dataset_name = str(all_keys[0])

save_dir_agg_set = save_dir_agg + "/" + dataset_name

self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=True, save_dir=save_dir_agg_set)
self.generate_plots(runs_plots_scores=scores, ticks=x_axis, subplot=False, save_dir=save_dir_agg_set)
self.generate_reports_txt(scores, save_dir_agg_set, dataset_name, -1)
self.generate_reports_excel(scores, save_dir_agg_set, dataset_name, -1)


return run_averaged, scores_list

return run_storage, scores_list
4 changes: 2 additions & 2 deletions build/lib/imputegap/recovery/evaluation.py
@@ -9,7 +9,7 @@ class Evaluation:
Methods
-------
- metrics_computation():
+ compute_all_metrics():
Compute various evaluation metrics (RMSE, MAE, MI, CORRELATION) for the imputation.
compute_rmse():
Compute the Root Mean Squared Error (RMSE) between the ground truth and the imputed values.
@@ -43,7 +43,7 @@ def __init__(self, input_data, recov_data, incomp_data):
self.recov_data = recov_data
self.incomp_data = incomp_data

- def metrics_computation(self):
+ def compute_all_metrics(self):
"""
Compute a set of evaluation metrics for the imputation based on the ground truth and contamination data.
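The kind of metric computation this renamed method performs can be sketched as follows for RMSE and MAE (a minimal standalone sketch, assuming NumPy arrays with NaNs marking the contaminated positions; the function name and signature are illustrative, not ImputeGAP's API):

```python
import numpy as np

def compute_rmse_mae(input_data, recov_data, incomp_data):
    """Evaluate the imputation only at the contaminated positions, i.e.
    where the incomplete matrix is NaN but the ground truth is not."""
    mask = np.isnan(incomp_data) & ~np.isnan(input_data)
    err = input_data[mask] - recov_data[mask]
    rmse = float(np.sqrt(np.mean(err ** 2)))  # penalises large errors more
    mae = float(np.mean(np.abs(err)))         # average absolute deviation
    return rmse, mae
```

Restricting the error to the masked positions matters: including the untouched values would dilute both metrics toward zero.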