Picking columns to export CSV #260

Merged (10 commits) on Apr 12, 2024
14 changes: 11 additions & 3 deletions post-processing/README.md
@@ -39,7 +39,7 @@ python post_processing.py log_path config_path [-p plot_type]
- `config_path` - Path to a configuration file containing plot details.
- `plot_type` - (Optional.) Type of plot to be generated. (`Note: only a generic bar chart is currently implemented.`)

Run `post_processing.py -h` for more information (including debugging flags).
Run `post_processing.py -h` for more information (including debugging and file output flags).

#### Streamlit

@@ -68,12 +68,13 @@ Before running post-processing, create a config file including all necessary inf
- `Format: [column_name, value]`
- `column_types` - Pandas dtype for each relevant column (axes, units, filters, series). Specified with a dictionary.
- `Accepted types: "str"/"string"/"object", "int"/"int64", "float"/"float64", "datetime"/"datetime64"`
- `additional_columns_to_csv` - (Optional.) List of extra columns to export to the CSV file, in addition to the plotting columns above. These columns are not used in plotting. (Specify an empty list if no additional columns are required; see the sketch below.)
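
As a rough illustration (not part of the repository), the sketch below shows how this option flows through to the exported CSV when post-processing is driven from Python; the perflog path, title, and the `spack_spec` column are placeholders. The same export can be triggered from the command line with the `-s`/`--save` flag, optionally alongside `-np`/`--no_plot` to skip plot generation.

```python
# Illustrative sketch only: export extra columns to CSV via the post-processing
# classes. Paths and column names below are placeholders.
from pathlib import Path

import pandas as pd

from config_handler import ConfigHandler
from post_processing import PostProcessing

config = ConfigHandler({
    "title": "Performance vs tasks",
    "x_axis": {"value": "tasks", "units": {"custom": None}},
    "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}},
    "filters": {"and": [], "or": []},
    "series": [],
    "column_types": {"tasks": "int",
                     "flops_value": "float",
                     "flops_unit": "str"},
    # exported to the CSV only; not used for plotting
    "additional_columns_to_csv": ["spack_spec"]})

# save=True writes the filtered plot columns plus the extra columns to
# output.csv (placed next to post_processing.py); plotting=False skips the plot
post = PostProcessing(Path("path/to/perflogs"), save=True, plotting=False)
post.run_post_processing(config)

# read the exported file back (written into the post-processing directory)
df_saved = pd.read_csv("output.csv", index_col=0)
print(df_saved.columns.tolist())  # e.g. ["tasks", "flops_value", "flops_unit", "spack_spec"]
```

If a column listed here is already used for plotting or filtering, it is dropped from the extra list rather than exported twice.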

#### A Note on Replaced ReFrame Columns

A perflog contains certain columns that will not be present in the DataFrame available to the graphing script. Currently, these columns are `display_name`, `extra_resources`, and `env_vars`. Removed columns should not be referenced in a plot config file.
A perflog contains certain columns whose contents are complex and have to be unpacked to be useful. Currently, these columns are `display_name`, `extra_resources`, `env_vars`, and `spack_spec_dict`. They are parsed during post-processing, removed from the DataFrame, and replaced with new columns containing the unpacked information. They will therefore not be present in the DataFrame available to the graphing script and should not be referenced in a plot config file.

When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources` and `env_vars` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents).
When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources`, `env_vars`, and `spack_spec_dict` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents).
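
For intuition only, here is a minimal sketch of the kind of unpacking described above for a dictionary-valued column (using `env_vars` and pandas `json_normalize`); the actual parsing is done by the perflog handling code and may differ in detail.

```python
# Illustrative sketch: expand a dictionary-valued perflog column so that keys
# become columns and values become row contents, then drop the original column.
import pandas as pd

df = pd.DataFrame({
    "flops_value": [100.0, 200.0],
    "env_vars": [{"OMP_NUM_THREADS": "1"}, {"OMP_NUM_THREADS": "2"}],
})

unpacked = pd.json_normalize(df["env_vars"].tolist())
df = pd.concat([df.drop(columns=["env_vars"]), unpacked], axis=1)
# df now has an OMP_NUM_THREADS column in place of env_vars
```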

#### Complete Config Template

@@ -121,6 +122,10 @@ series: <series_list>
# accepted types: string/object, int, float, datetime
column_types:
<column_name>: <column_type>

# optional (default: no extra columns are exported to the CSV file beyond the ones above)
additional_columns_to_csv:
<columns_list>
```

#### Example Config
@@ -162,6 +167,9 @@ column_types:
filter_col_1: "datetime"
filter_col_2: "int"
series_col: "str"

additional_columns_to_csv:
["additional_col_1", "additional_col_2"]
```

#### X-axis Grouping
8 changes: 8 additions & 0 deletions post-processing/config_handler.py
@@ -25,6 +25,7 @@ def __init__(self, config: dict, template=False):
self.filters = config.get("filters")
self.series = config.get("series")
self.column_types = config.get("column_types")
self.extra_columns = config.get("additional_columns_to_csv")

# parse filter information
self.and_filters = []
@@ -153,6 +154,13 @@ def parse_columns(self):
dict.fromkeys((self.plot_columns + self.filter_columns +
([self.scaling_column.get("name")] if self.scaling_column else []))))

# drop any entries in the extra_columns list that already appear among the main columns
self.extra_columns = [col for col in self.extra_columns if col not in self.all_columns]

def remove_redundant_types(self):
"""
Check for columns that are no longer in use and remove them from the type dict.
28 changes: 20 additions & 8 deletions post-processing/post_processing.py
@@ -2,6 +2,7 @@
import operator as op
import traceback
from functools import reduce
import os
from pathlib import Path

import pandas as pd
@@ -12,19 +13,23 @@

class PostProcessing:

def __init__(self, log_path: Path, debug=False, verbose=False):
def __init__(self, log_path: Path, debug=False, verbose=False, save=False, plotting=True):
"""
Initialise class.

Args:
log_path: Path, path to performance log file or directory.
debug: bool, flag to print additional information to console.
verbose: bool, flag to print more additional information to console.
save: bool, flag to save the filtered DataFrame to a CSV file.
plotting: bool, flag to generate a plot and store it in an HTML file.
"""

# FIXME (issue #264): add proper logging
self.debug = debug
self.verbose = verbose
self.save = save
self.plotting = plotting
# find and read perflogs
self.original_df = PerflogHandler(log_path, self.debug).get_df()
# copy original data for modification during post-processing
@@ -58,16 +63,18 @@ def run_post_processing(self, config: ConfigHandler):
# scale y-axis
self.transform_df_data(
config.x_axis["value"], config.y_axis["value"], *config.get_y_scaling(), config.series_filters)

# FIXME (#issue #255): have an option to put this into a file (-s / --save flag?)
if self.debug:
print("Selected dataframe:")
print(self.df[self.mask][config.plot_columns])
print(self.df[self.mask][config.plot_columns + config.extra_columns])
if self.save:
self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(
path_or_buf=os.path.join(Path(__file__).parent, 'output.csv'), index=True)  # set index=False to exclude the DataFrame index from the CSV

# call a plotting script
self.plot = plot_generic(
config.title, self.df[self.mask][config.plot_columns],
config.x_axis, config.y_axis, config.series_filters, self.debug)
if self.plotting:
self.plot = plot_generic(
config.title, self.df[self.mask][config.plot_columns],
config.x_axis, config.y_axis, config.series_filters, self.debug)

# FIXME (#issue #255): maybe save this bit to a file as well for easier viewing
if self.debug & self.verbose:
@@ -396,6 +403,11 @@ def read_args():
parser.add_argument("-v", "--verbose", action="store_true",
help="verbose flag for printing more debug information \
(must be used in conjunction with the debug flag)")
parser.add_argument("-s", "--save", action="store_true",
help="save flag for saving the filtered dataframe in csv file")
parser.add_argument("-np", "--no_plot", action="store_true",
help="no-plot flag for disabling generating and storing a plot")


return parser.parse_args()

@@ -405,7 +417,7 @@ def main():
args = read_args()

try:
post = PostProcessing(args.log_path, args.debug, args.verbose)
post = PostProcessing(args.log_path, args.debug, args.verbose, args.save, not args.no_plot)
config = ConfigHandler.from_path(args.config_path)
post.run_post_processing(config)

5 changes: 5 additions & 0 deletions post-processing/post_processing_config.yaml
@@ -47,3 +47,8 @@ column_types:
flops_unit: "str"
system: "str"
cpus_per_task: "int"

# Optional setting to specify additional columns to export to the CSV file, in addition to
# the ones used in axes/series/filters
additional_columns_to_csv:
["spack_spec"]
107 changes: 91 additions & 16 deletions post-processing/test_post_processing.py
@@ -236,7 +236,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"fake_column": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except KeyError as e:
assert e.args[1] == ["fake_column"]
else:
@@ -256,7 +257,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except KeyError as e:
assert e.args[1] == "!!"
else:
@@ -276,7 +278,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except ValueError:
assert True
else:
@@ -296,7 +299,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except pd.errors.EmptyDataError:
assert True
else:
@@ -315,7 +319,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except RuntimeError:
assert True
else:
@@ -334,7 +339,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"cpus_per_task": "int",
"extra_param": "int"}}))
"extra_param": "int"},
"additional_columns_to_csv": []}))
except RuntimeError as e:
# three param columns found in changed log
EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"]
@@ -356,7 +362,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"job_completion_time": "datetime",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
# check returned subset is as expected
assert len(df) == 2

@@ -374,7 +381,8 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"cpus_per_task": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
# check returned subset is as expected
assert len(df) == 4

@@ -394,7 +402,8 @@ def test_high_level_script(run_sombrero):
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int",
"OMP_NUM_THREADS": "int"}}))
"OMP_NUM_THREADS": "int"},
"additional_columns_to_csv": []}))
# check flops values are halved compared to previous df
assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()

@@ -413,7 +422,8 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"}}))
"cpus_per_task": "int"},
"additional_columns_to_csv": []}))
assert (dfs[dfs["cpus_per_task"] == 1]["flops_value"].values ==
df[df["cpus_per_task"] == 1]["flops_value"].values /
df[df["cpus_per_task"] == 1]["flops_value"].values).all()
@@ -437,7 +447,8 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"}}))
"cpus_per_task": "int"},
"additional_columns_to_csv": []}))
assert (dfs["flops_value"].values == df["flops_value"].values /
df[(df["cpus_per_task"] == 1) & (df["tasks"] == 2)]["flops_value"].iloc[0]).all()

@@ -456,7 +467,8 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"}}))
"cpus_per_task": "int"},
"additional_columns_to_csv": []}))
# check flops values are halved compared to previous df
assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()

@@ -476,7 +488,8 @@ def test_high_level_script(run_sombrero):
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int",
"OMP_NUM_THREADS": "str"}}))
"OMP_NUM_THREADS": "str"},
"additional_columns_to_csv": []}))
except TypeError:
assert True

@@ -496,7 +509,8 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"}}))
"cpus_per_task": "int"},
"additional_columns_to_csv": []}))
except ValueError:
assert True

@@ -514,7 +528,8 @@ def test_high_level_script(run_sombrero):
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str"}}))
"flops_unit": "str"},
"additional_columns_to_csv": []}))
except RuntimeError as e:
# dataframe has records from both files
assert len(e.args[1]) == 8
@@ -535,9 +550,69 @@ def test_high_level_script(run_sombrero):
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"}}))
"cpus_per_task": "int"},
"additional_columns_to_csv": []}))

EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
# check returned subset is as expected
assert df.columns.tolist() == EXPECTED_FIELDS
assert len(df) == 1

# get filtered dataframe with extra columns for csv
df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
ConfigHandler(
{"title": "Title",
"x_axis": {"value": "tasks",
"units": {"custom": None}},
"y_axis": {"value": "flops_value",
"units": {"column": "flops_unit"}},
"filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
"or": []},
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"},
"additional_columns_to_csv": ["spack_spec"]}
))

EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
# check returned subset is as expected
assert df.columns.tolist() == EXPECTED_FIELDS
assert len(df) == 1

EXPECTED_FIELDS.append("spack_spec")
# check subset written to csv is as expected
output_file = "output.csv"
df_saved = pd.read_csv(output_file, index_col=0)
assert df_saved.columns.tolist() == EXPECTED_FIELDS
assert len(df_saved) == 1

# get filtered dataframe with duplicated extra columns for csv
df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
ConfigHandler(
{"title": "Title",
"x_axis": {"value": "tasks",
"units": {"custom": None}},
"y_axis": {"value": "flops_value",
"units": {"column": "flops_unit"}},
"filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
"or": []},
"series": [],
"column_types": {"tasks": "int",
"flops_value": "float",
"flops_unit": "str",
"cpus_per_task": "int"},
"additional_columns_to_csv": ["tasks", "tasks"]}
))

EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
# check returned subset is as expected
assert df.columns.tolist() == EXPECTED_FIELDS
assert len(df) == 1

# check subset written to csv is as expected
output_file = "output.csv"
df_saved = pd.read_csv(output_file, index_col=0)
assert df_saved.columns.tolist() == EXPECTED_FIELDS
assert len(df_saved) == 1