From 626418620a6aa070a85b1cfe3fae5a7f7ed689c0 Mon Sep 17 00:00:00 2001
From: Struan Donald <struan@exo.org.uk>
Date: Tue, 26 Nov 2024 10:55:38 +0000
Subject: [PATCH 1/4] tidy up checking for empty dataframes

use df.empty to check for data but also check for None where appropriate

Fixes #622
---
 hub/management/commands/base_importers.py                 | 8 ++++----
 hub/management/commands/import_air_quality_data.py        | 4 +++-
 hub/management/commands/import_cen_nzsg_members.py        | 2 +-
 hub/management/commands/import_flood_risk_data.py         | 8 +++++++-
 hub/management/commands/import_last_election_data.py      | 4 +++-
 hub/management/commands/import_mp_engagement.py           | 6 ++++--
 hub/management/commands/import_mp_job_titles.py           | 2 +-
 .../commands/import_mps_select_committee_membership.py    | 2 +-
 hub/management/commands/run_all_import_scripts.py         | 3 ---
 9 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/hub/management/commands/base_importers.py b/hub/management/commands/base_importers.py
index e0c426100..c1a6c851c 100644
--- a/hub/management/commands/base_importers.py
+++ b/hub/management/commands/base_importers.py
@@ -210,7 +210,7 @@ def process_data(self, df: pd.DataFrame):
     def handle(self, quiet=False, *args, **kwargs):
         self._quiet = quiet
         df = self.get_df()
-        if not df:
+        if df is None or df.empty:
             return
         self.add_data_sets()
         self.delete_data()
@@ -271,7 +271,7 @@ def handle(self, quiet=False, skip_new_areatype_conversion=False, *args, **optio
         if not hasattr(self, "do_not_convert"):
             self.do_not_convert = skip_new_areatype_conversion
         df = self.get_dataframe()
-        if df is None:
+        if df is None or df.empty:
             if not self._quiet:
                 self.stdout.write(f"missing data for {self.message} ({self.area_type})")
             return
@@ -374,7 +374,7 @@ def process_data(self, df: pd.DataFrame):
     def handle(self, quiet=False, *args, **kwargs):
         self._quiet = quiet
         df = self.get_df()
-        if df is None:
+        if df is None or df.empty:
             if not self._quiet:
                 self.stdout.write(f"missing data for {self.message} ({self.area_type})")
             return
@@ -429,7 +429,7 @@ def process_data(self, df):
     def handle(self, quiet=False, *args, **options):
         self._quiet = quiet
         df = self.get_dataframe()
-        if df.empty:
+        if df is None or df.empty:
             if not self._quiet:
                 self.stdout.write(f"missing data for {self.message} ({self.area_type})")
             return
diff --git a/hub/management/commands/import_air_quality_data.py b/hub/management/commands/import_air_quality_data.py
index 2b5373317..9b69704b6 100644
--- a/hub/management/commands/import_air_quality_data.py
+++ b/hub/management/commands/import_air_quality_data.py
@@ -94,7 +94,7 @@ def add_arguments(self, parser):
     def handle(self, quiet=False, *args, **options):
         self._quiet = quiet
         df = self.get_dataframe()
-        if not df:
+        if df is None:
             self.stdout.write(
                 "Failed to import air quality data. Please ensure that the gridcode_lookup file is available."
             )
@@ -192,4 +192,6 @@ def get_dataframe(self):
 
         # Prepare the df for useful importing
         df = df.drop(columns=["gridcode"]).groupby("gss").mean()
+        if df.empty:
+            return None
         return df
diff --git a/hub/management/commands/import_cen_nzsg_members.py b/hub/management/commands/import_cen_nzsg_members.py
index f0ad1fe60..e67b930a9 100644
--- a/hub/management/commands/import_cen_nzsg_members.py
+++ b/hub/management/commands/import_cen_nzsg_members.py
@@ -80,7 +80,7 @@ def create_data_types(self):
     def get_results(self):
         mps = Person.objects.filter(person_type="MP")
         df = self.get_df()
-        if df is None:
+        if df is None or df.empty:
             return {}
         results = {}
         print("Name matching MPs")
diff --git a/hub/management/commands/import_flood_risk_data.py b/hub/management/commands/import_flood_risk_data.py
index 82d56b0ea..161dfde60 100644
--- a/hub/management/commands/import_flood_risk_data.py
+++ b/hub/management/commands/import_flood_risk_data.py
@@ -43,7 +43,9 @@ def handle(self, quiet=False, *args, **options):
         df = self.get_dataframe()
         if df is None:
             if not self._quiet:
-                self.stdout.write(f"Data file {self.data_file} not found")
+                self.stdout.write(
+                    f"Data file {self.data_file} not found or contains no data"
+                )
             return
         self.data_types = self.create_data_types(df)
         self.delete_data()
@@ -101,6 +103,8 @@ def get_dataframe(self):
         if self.data_file.exists() is False:
             return None
         df = pd.read_csv(self.data_file)
+        if df.empty:
+            return None
         totals = (
             df.dropna()[["gss", "prob_4band"]]
             .groupby("gss")
@@ -117,4 +121,6 @@ def get_dataframe(self):
         )
         df["percentage"] = df.value / df.total * 100
         df = df.pivot(columns="prob_4band", values="percentage", index="gss").fillna(0)
+        if df.empty:
+            return None
         return df
diff --git a/hub/management/commands/import_last_election_data.py b/hub/management/commands/import_last_election_data.py
index d47f7aaf8..ada7e683a 100644
--- a/hub/management/commands/import_last_election_data.py
+++ b/hub/management/commands/import_last_election_data.py
@@ -69,7 +69,7 @@ def handle(self, quiet=False, *args, **options):
         self._quiet = quiet
         self.delete_data()
         df = self.get_last_election_df()
-        if df.empty is not True:
+        if df is not None:
             self.data_types = self.create_data_types()
             self.import_results(df)
 
@@ -202,6 +202,8 @@ def get_last_election_df(self):
         df = df.rename(
             columns=lambda party: self.party_translate_up_dict.get(party.lower(), party)
         )
+        if df.empty:
+            return None
         return df
 
     def create_data_types(self):
diff --git a/hub/management/commands/import_mp_engagement.py b/hub/management/commands/import_mp_engagement.py
index 578e370cc..72a869c58 100644
--- a/hub/management/commands/import_mp_engagement.py
+++ b/hub/management/commands/import_mp_engagement.py
@@ -15,9 +15,11 @@ def handle(self, quiet=False, *args, **options):
         self._quiet = quiet
         self.data_types = self.create_data_types()
         df = self.get_df()
-        if df is None:
+        if df is None or df.empty:
             if not self._quiet:
-                self.stdout.write(f"Data file {self.data_file} not found")
+                self.stdout.write(
+                    f"Data file {self.data_file} not found or contains no data"
+                )
             return
         self.import_results(df)
 
diff --git a/hub/management/commands/import_mp_job_titles.py b/hub/management/commands/import_mp_job_titles.py
index de8f75421..e0e0ce1d8 100644
--- a/hub/management/commands/import_mp_job_titles.py
+++ b/hub/management/commands/import_mp_job_titles.py
@@ -89,7 +89,7 @@ def import_results(self):
 
         df = self.get_df()
 
-        if df is None:
+        if df is None or df.empty:
             return
 
         data_type = self.create_data_type()
diff --git a/hub/management/commands/import_mps_select_committee_membership.py b/hub/management/commands/import_mps_select_committee_membership.py
index 1e7e81b41..3f2e75366 100644
--- a/hub/management/commands/import_mps_select_committee_membership.py
+++ b/hub/management/commands/import_mps_select_committee_membership.py
@@ -95,5 +95,5 @@ def add_results(self, results: pd.DataFrame, data_type):
     def import_results(self):
         data_type = self.create_data_types()
         df = self.get_df()
-        if df:
+        if df is not None and not df.empty:  # guard in case get_df() returns None
             self.add_results(df, data_type)
diff --git a/hub/management/commands/run_all_import_scripts.py b/hub/management/commands/run_all_import_scripts.py
index ab2a7d530..c1fbf8941 100644
--- a/hub/management/commands/run_all_import_scripts.py
+++ b/hub/management/commands/run_all_import_scripts.py
@@ -15,13 +15,10 @@ class Command(BaseCommand):
     ]
     skip_imports = [
         "import_2024_ppcs",  # no longer relevant post-election
-        "import_air_quality_data",  # ValueError because it checks for truthiness of a Pandas dataframe?
         "import_christian_aid_group_locations",  # JSONDecodeError because parlparse JSON file not found
         "import_mps_appg_data",  # hasn't been updated for Autumn 2024 APPGs
         "import_mps_relevant_votes",  # hasn't been updated for a while (and we import EDMs separately now)
         "import_mps_standing_down_2024",  # no longer relevant post-election
-        "import_mps_select_committee_membership",  # ValueError because it checks for truthiness of a Pandas dataframe?
-        "import_nt_property_locations",  # ValueError because it checks for truthiness of a Pandas dataframe?
         "import_onward_polling_data",  # JSONDecodeError because parlparse JSON file not found
     ]
 

From d21b7896994414e20b13b591cf967a7fd61711ae Mon Sep 17 00:00:00 2001
From: Struan Donald <struan@exo.org.uk>
Date: Tue, 26 Nov 2024 15:31:16 +0000
Subject: [PATCH 2/4] move constituency lookup table generator to base importer

Rather than duplicating this across two files. Also updates it to use
people.json as a source rather than the now-removed constituencies.json
---
 hub/management/commands/base_importers.py | 47 +++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/hub/management/commands/base_importers.py b/hub/management/commands/base_importers.py
index c1a6c851c..d6c6d5007 100644
--- a/hub/management/commands/base_importers.py
+++ b/hub/management/commands/base_importers.py
@@ -4,6 +4,7 @@
 from django.core.management.base import BaseCommand
 
 import pandas as pd
+import requests
 from tqdm import tqdm
 
 from hub.models import Area, AreaData, AreaType, DataSet, DataType
@@ -36,6 +37,16 @@
     "Independents": "#DCDCDC",
 }
 
+TWFY_CONSTITUENCIES_DATA_URL = (
+    "https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json"
+)
+HARD_CODED_CONSTITUENCY_LOOKUP = {
+    "Cotswolds The": "The Cotswolds",
+    "Basildon South and East Thurrock": "South Basildon and East Thurrock",
+    "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar",
+    "Ynys M¶n": "Ynys Môn",
+}
+
 
 class MultipleAreaTypesMixin:
     def handle(self, *args, **options):
@@ -64,6 +75,42 @@ def add_arguments(self, parser):
             help="do not auto convert to new constituency data",
         )
 
+    def add_to_dict(self, df):
+        """Map each comma-stripped name in df.area to the first (ideal) name."""
+        # The first name listed is the ideal form; the rest are alternatives
+        ideal, *alternatives = df.area.tolist()
+        # Also key the ideal name itself, so its comma-free spelling resolves
+        alternatives.append(ideal)
+        return {alt.replace(",", ""): ideal for alt in alternatives}
+
+    def build_constituency_name_lookup(self, old_cons=False):
+        """Map alternative constituency names to the preferred TWFY name.
+
+        Uses posts with no end_date, or with end_date 2024-07-03 if old_cons.
+        """
+        cons_filter = "end_date == '2024-07-03'" if old_cons else "end_date.isna()"
+
+        # Raise on HTTP errors rather than failing later with a JSONDecodeError
+        response = requests.get(TWFY_CONSTITUENCIES_DATA_URL, timeout=30)
+        response.raise_for_status()
+        df = pd.DataFrame.from_records(response.json()["posts"])
+        # One row per post: [main name, *other names], renumbered from 0
+        df = (
+            df.query(cons_filter)["area"]
+            .map(lambda a: [a["name"], *a.get("other_names", [])])
+            .reset_index(drop=True)
+            .reset_index()
+        )
+        # One row per name; "index" still identifies the originating post
+        df = df.explode("area", ignore_index=True)
+
+        # Start with hard-coded lookup
+        names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
+        for _, names_df in df.groupby("index"):
+            names_lookup_dict.update(self.add_to_dict(names_df))
+
+        return names_lookup_dict
+
     def get_label(self, config):
         return config["defaults"]["label"]
 

From a642b3dc1236a0188b519eeb519b101e08ffdf87 Mon Sep 17 00:00:00 2001
From: Struan Donald <struan@exo.org.uk>
Date: Tue, 26 Nov 2024 15:36:09 +0000
Subject: [PATCH 3/4] update constituency name lookup in importers

update the christian aid and onward polling importers to use the
constituency name lookup from the base importer

Fixes #621
---
 .../import_christian_aid_group_locations.py   | 33 +----------------
 .../commands/import_onward_polling_data.py    | 37 +------------------
 .../commands/run_all_import_scripts.py        |  2 -
 3 files changed, 2 insertions(+), 70 deletions(-)

diff --git a/hub/management/commands/import_christian_aid_group_locations.py b/hub/management/commands/import_christian_aid_group_locations.py
index 0fdae3b9b..46408f465 100644
--- a/hub/management/commands/import_christian_aid_group_locations.py
+++ b/hub/management/commands/import_christian_aid_group_locations.py
@@ -1,15 +1,11 @@
 from django.conf import settings
 
 import pandas as pd
-import requests
 
 from hub.models import DataSet
 
 from .base_importers import BaseConstituencyGroupListImportCommand
 
-TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json"
-HARD_CODED_CONSTITUENCY_LOOKUP = {}
-
 
 class Command(BaseConstituencyGroupListImportCommand):
     help = "Import data about Christian Aid groups per constituency"
@@ -61,33 +57,6 @@ class Command(BaseConstituencyGroupListImportCommand):
     group_data_type = "constituency_christian_aid_groups"
     count_data_type = "constituency_christian_aid_group_count"
 
-    def add_to_dict(self, df):
-        names = df.names.tolist()
-        # Add a version of the main name, without any commas
-        names.append(names[0].replace(",", ""))
-        # The first name listed is the ideal form
-        name = names.pop(0)
-        return {alt_name.replace(",", ""): name for alt_name in names}
-
-    def build_constituency_name_lookup(self):
-        # Grab the TWFY data, and ignore any constituencies that no longer exist
-        # We're only interested in the names, so keep them, and explode the column.
-        # Then group by (arbitrary) index, and build the dictionary from these groups
-
-        response = requests.get(TWFY_CONSTITUENCIES_DATA_URL)
-        df = pd.DataFrame.from_records(response.json())
-        df = df.query("end_date.isna()")["names"].reset_index()
-        df = df.explode("names", ignore_index=True)
-
-        # Start with hard-coded lookup
-        names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
-        for i, names_df in df.groupby("index"):
-            new_dict = self.add_to_dict(names_df)
-            if new_dict:
-                names_lookup_dict.update(new_dict)
-
-        return names_lookup_dict
-
     def get_df(self):
 
         if self.data_file.exists() is False:
@@ -107,7 +76,7 @@ def get_df(self):
         ]
 
         # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises
-        constituency_lookup = self.build_constituency_name_lookup()
+        constituency_lookup = self.build_constituency_name_lookup(old_cons=True)
         df.constituency = df.constituency.apply(
             lambda x: (
                 constituency_lookup.get(x.replace(",", ""), x) if not pd.isna(x) else ""
diff --git a/hub/management/commands/import_onward_polling_data.py b/hub/management/commands/import_onward_polling_data.py
index dbd2c07b0..4ec40a18d 100644
--- a/hub/management/commands/import_onward_polling_data.py
+++ b/hub/management/commands/import_onward_polling_data.py
@@ -1,19 +1,11 @@
 from django.conf import settings
 
 import pandas as pd
-import requests
 
 from hub.models import AreaData, DataSet
 
 from .base_importers import BaseImportFromDataFrameCommand
 
-TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json"
-HARD_CODED_CONSTITUENCY_LOOKUP = {
-    "Cotswolds The": "The Cotswolds",
-    "Basildon South and East Thurrock": "South Basildon and East Thurrock",
-    "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar",
-}
-
 
 class Command(BaseImportFromDataFrameCommand):
     help = "Import Onward polling data on attitudes to net zero and climate change"
@@ -60,33 +52,6 @@ class Command(BaseImportFromDataFrameCommand):
     }
     del data_sets["constituency_cc_high"]["defaults"]["subcategory"]
 
-    def add_to_dict(self, df):
-        names = df.names.tolist()
-        # Add a version of the main name, without any commas
-        names.append(names[0].replace(",", ""))
-        # The first name listed is the ideal form
-        name = names.pop(0)
-        return {alt_name.replace(",", ""): name for alt_name in names}
-
-    def build_constituency_name_lookup(self):
-        # Grab the TWFY data, and ignore any constituencies that no longer exist
-        # We're only interested in the names, so keep them, and explode the column.
-        # Then group by (arbitrary) index, and build the dictionary from these groups
-
-        response = requests.get(TWFY_CONSTITUENCIES_DATA_URL)
-        df = pd.DataFrame.from_records(response.json())
-        df = df.query("end_date.isna()")["names"].reset_index()
-        df = df.explode("names", ignore_index=True)
-
-        # Start with hard-coded lookup
-        names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
-        for i, names_df in df.groupby("index"):
-            new_dict = self.add_to_dict(names_df)
-            if new_dict:
-                names_lookup_dict.update(new_dict)
-
-        return names_lookup_dict
-
     def get_dataframe(self):
 
         if not self.data_file.exists():
@@ -110,7 +75,7 @@ def get_dataframe(self):
         ]
 
         # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises
-        constituency_lookup = self.build_constituency_name_lookup()
+        constituency_lookup = self.build_constituency_name_lookup(old_cons=True)
         df.constituency = df.constituency.apply(
             lambda x: constituency_lookup.get(x.replace(",", ""), x)
         )
diff --git a/hub/management/commands/run_all_import_scripts.py b/hub/management/commands/run_all_import_scripts.py
index c1fbf8941..b6d2f829c 100644
--- a/hub/management/commands/run_all_import_scripts.py
+++ b/hub/management/commands/run_all_import_scripts.py
@@ -15,11 +15,9 @@ class Command(BaseCommand):
     ]
     skip_imports = [
         "import_2024_ppcs",  # no longer relevant post-election
-        "import_christian_aid_group_locations",  # JSONDecodeError because parlparse JSON file not found
         "import_mps_appg_data",  # hasn't been updated for Autumn 2024 APPGs
         "import_mps_relevant_votes",  # hasn't been updated for a while (and we import EDMs separately now)
         "import_mps_standing_down_2024",  # no longer relevant post-election
-        "import_onward_polling_data",  # JSONDecodeError because parlparse JSON file not found
     ]
 
     def get_scripts(self, *args, **options):

From 0de5bf23c2e8e954ee5ee32bb22cf7d483609b30 Mon Sep 17 00:00:00 2001
From: Zarino Zappia <mail@zarino.co.uk>
Date: Tue, 10 Dec 2024 17:17:51 +0000
Subject: [PATCH 4/4] Add dataset visibility reminder to README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index eb97ccaf8..abbf67cd2 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,8 @@ You could alternatively run commands individually (again, from inside the contai
     ./manage.py import_areas
     ./manage.py import_mps
 
+Finally, log in to `/admin` and mark a selection of datasets as “Public”, so that they appear to logged-out users on the site.
+
 ### Running the tests
 
 First start the Docker environment: