From 626418620a6aa070a85b1cfe3fae5a7f7ed689c0 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 26 Nov 2024 10:55:38 +0000 Subject: [PATCH 1/4] tidy up checking for empty dataframes use df.empty to check for data but also check for None where appropriate Fixes #622 --- hub/management/commands/base_importers.py | 8 ++++---- hub/management/commands/import_air_quality_data.py | 4 +++- hub/management/commands/import_cen_nzsg_members.py | 2 +- hub/management/commands/import_flood_risk_data.py | 8 +++++++- hub/management/commands/import_last_election_data.py | 4 +++- hub/management/commands/import_mp_engagement.py | 6 ++++-- hub/management/commands/import_mp_job_titles.py | 2 +- .../commands/import_mps_select_committee_membership.py | 2 +- hub/management/commands/run_all_import_scripts.py | 3 --- 9 files changed, 24 insertions(+), 15 deletions(-) diff --git a/hub/management/commands/base_importers.py b/hub/management/commands/base_importers.py index e0c426100..c1a6c851c 100644 --- a/hub/management/commands/base_importers.py +++ b/hub/management/commands/base_importers.py @@ -210,7 +210,7 @@ def process_data(self, df: pd.DataFrame): def handle(self, quiet=False, *args, **kwargs): self._quiet = quiet df = self.get_df() - if not df: + if df is None or df.empty: return self.add_data_sets() self.delete_data() @@ -271,7 +271,7 @@ def handle(self, quiet=False, skip_new_areatype_conversion=False, *args, **optio if not hasattr(self, "do_not_convert"): self.do_not_convert = skip_new_areatype_conversion df = self.get_dataframe() - if df is None: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} ({self.area_type})") return @@ -374,7 +374,7 @@ def process_data(self, df: pd.DataFrame): def handle(self, quiet=False, *args, **kwargs): self._quiet = quiet df = self.get_df() - if df is None: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} ({self.area_type})") return @@ -429,7 
+429,7 @@ def process_data(self, df): def handle(self, quiet=False, *args, **options): self._quiet = quiet df = self.get_dataframe() - if df.empty: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} ({self.area_type})") return diff --git a/hub/management/commands/import_air_quality_data.py b/hub/management/commands/import_air_quality_data.py index 2b5373317..9b69704b6 100644 --- a/hub/management/commands/import_air_quality_data.py +++ b/hub/management/commands/import_air_quality_data.py @@ -94,7 +94,7 @@ def add_arguments(self, parser): def handle(self, quiet=False, *args, **options): self._quiet = quiet df = self.get_dataframe() - if not df: + if df is None: self.stdout.write( "Failed to import air quality data. Please ensure that the gridcode_lookup file is available." ) @@ -192,4 +192,6 @@ def get_dataframe(self): # Prepare the df for useful importing df = df.drop(columns=["gridcode"]).groupby("gss").mean() + if df.empty: + return None return df diff --git a/hub/management/commands/import_cen_nzsg_members.py b/hub/management/commands/import_cen_nzsg_members.py index f0ad1fe60..e67b930a9 100644 --- a/hub/management/commands/import_cen_nzsg_members.py +++ b/hub/management/commands/import_cen_nzsg_members.py @@ -80,7 +80,7 @@ def create_data_types(self): def get_results(self): mps = Person.objects.filter(person_type="MP") df = self.get_df() - if df is None: + if df is None or df.empty: return {} results = {} print("Name matching MPs") diff --git a/hub/management/commands/import_flood_risk_data.py b/hub/management/commands/import_flood_risk_data.py index 82d56b0ea..161dfde60 100644 --- a/hub/management/commands/import_flood_risk_data.py +++ b/hub/management/commands/import_flood_risk_data.py @@ -43,7 +43,9 @@ def handle(self, quiet=False, *args, **options): df = self.get_dataframe() if df is None: if not self._quiet: - self.stdout.write(f"Data file {self.data_file} not found") + self.stdout.write( + f"Data file 
{self.data_file} not found or contains no data" + ) return self.data_types = self.create_data_types(df) self.delete_data() @@ -101,6 +103,8 @@ def get_dataframe(self): if self.data_file.exists() is False: return None df = pd.read_csv(self.data_file) + if df.empty: + return None totals = ( df.dropna()[["gss", "prob_4band"]] .groupby("gss") @@ -117,4 +121,6 @@ def get_dataframe(self): ) df["percentage"] = df.value / df.total * 100 df = df.pivot(columns="prob_4band", values="percentage", index="gss").fillna(0) + if df.empty: + return None return df diff --git a/hub/management/commands/import_last_election_data.py b/hub/management/commands/import_last_election_data.py index d47f7aaf8..ada7e683a 100644 --- a/hub/management/commands/import_last_election_data.py +++ b/hub/management/commands/import_last_election_data.py @@ -69,7 +69,7 @@ def handle(self, quiet=False, *args, **options): self._quiet = quiet self.delete_data() df = self.get_last_election_df() - if df.empty is not True: + if df is not None: self.data_types = self.create_data_types() self.import_results(df) @@ -202,6 +202,8 @@ def get_last_election_df(self): df = df.rename( columns=lambda party: self.party_translate_up_dict.get(party.lower(), party) ) + if df.empty: + return None return df def create_data_types(self): diff --git a/hub/management/commands/import_mp_engagement.py b/hub/management/commands/import_mp_engagement.py index 578e370cc..72a869c58 100644 --- a/hub/management/commands/import_mp_engagement.py +++ b/hub/management/commands/import_mp_engagement.py @@ -15,9 +15,11 @@ def handle(self, quiet=False, *args, **options): self._quiet = quiet self.data_types = self.create_data_types() df = self.get_df() - if df is None: + if df is None or df.empty: if not self._quiet: - self.stdout.write(f"Data file {self.data_file} not found") + self.stdout.write( + f"Data file {self.data_file} not found or contains no data" + ) return self.import_results(df) diff --git 
a/hub/management/commands/import_mp_job_titles.py b/hub/management/commands/import_mp_job_titles.py index de8f75421..e0e0ce1d8 100644 --- a/hub/management/commands/import_mp_job_titles.py +++ b/hub/management/commands/import_mp_job_titles.py @@ -89,7 +89,7 @@ def import_results(self): df = self.get_df() - if df is None: + if df is None or df.empty: return data_type = self.create_data_type() diff --git a/hub/management/commands/import_mps_select_committee_membership.py b/hub/management/commands/import_mps_select_committee_membership.py index 1e7e81b41..3f2e75366 100644 --- a/hub/management/commands/import_mps_select_committee_membership.py +++ b/hub/management/commands/import_mps_select_committee_membership.py @@ -95,5 +95,5 @@ def add_results(self, results: pd.DataFrame, data_type): def import_results(self): data_type = self.create_data_types() df = self.get_df() - if df: + if not df.empty: self.add_results(df, data_type) diff --git a/hub/management/commands/run_all_import_scripts.py b/hub/management/commands/run_all_import_scripts.py index ab2a7d530..c1fbf8941 100644 --- a/hub/management/commands/run_all_import_scripts.py +++ b/hub/management/commands/run_all_import_scripts.py @@ -15,13 +15,10 @@ class Command(BaseCommand): ] skip_imports = [ "import_2024_ppcs", # no longer relevant post-election - "import_air_quality_data", # ValueError because it checks for truthiness of a Pandas dataframe? "import_christian_aid_group_locations", # JSONDecodeError because parlparse JSON file not found "import_mps_appg_data", # hasn't been updated for Autumn 2024 APPGs "import_mps_relevant_votes", # hasn't been updated for a while (and we import EDMs separately now) "import_mps_standing_down_2024", # no longer relevant post-election - "import_mps_select_committee_membership", # ValueError because it checks for truthiness of a Pandas dataframe? - "import_nt_property_locations", # ValueError because it checks for truthiness of a Pandas dataframe? 
"import_onward_polling_data", # JSONDecodeError because parlparse JSON file not found ] From d21b7896994414e20b13b591cf967a7fd61711ae Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 26 Nov 2024 15:31:16 +0000 Subject: [PATCH 2/4] move constituency lookup table generator to base importer Rather than duplicating this across two files. Also updates it to use people.json as a source rather than the now removed constituencies.json --- hub/management/commands/base_importers.py | 47 +++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/hub/management/commands/base_importers.py b/hub/management/commands/base_importers.py index c1a6c851c..d6c6d5007 100644 --- a/hub/management/commands/base_importers.py +++ b/hub/management/commands/base_importers.py @@ -4,6 +4,7 @@ from django.core.management.base import BaseCommand import pandas as pd +import requests from tqdm import tqdm from hub.models import Area, AreaData, AreaType, DataSet, DataType @@ -36,6 +37,16 @@ "Independents": "#DCDCDC", } +TWFY_CONSTITUENCIES_DATA_URL = ( + "https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json" +) +HARD_CODED_CONSTITUENCY_LOOKUP = { + "Cotswolds The": "The Cotswolds", + "Basildon South and East Thurrock": "South Basildon and East Thurrock", + "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar", + "Ynys M¶n": "Ynys Môn", +} + class MultipleAreaTypesMixin: def handle(self, *args, **options): @@ -64,6 +75,42 @@ def add_arguments(self, parser): help="do not auto convert to new constituency data", ) + def add_to_dict(self, df): + names = df.area.tolist() + # Add a version of the main name, without any commas + names.append(names[0].replace(",", "")) + # The first name listed is the ideal form + name = names.pop(0) + return {alt_name.replace(",", ""): name for alt_name in names} + + def build_constituency_name_lookup(self, old_cons=False): + # Grab the TWFY data, and ignore any constituencies that no longer exist + # We're only interested in 
the names, so keep them, and explode the column. + # Then group by (arbitrary) index, and build the dictionary from these groups + + cons_filter = "end_date.isna()" + if old_cons: + cons_filter = "end_date == '2024-07-03'" + + response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) + df = pd.DataFrame.from_records(response.json()["posts"]) + df = df.query(cons_filter)["area"].reset_index() + df = ( + df["area"] + .map(lambda a: [a["name"]] + [o for o in a.get("other_names", [])]) + .reset_index() + ) + df = df.explode("area", ignore_index=True) + + # Start with hard-coded lookup + names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() + for i, names_df in df.groupby("index"): + new_dict = self.add_to_dict(names_df) + if new_dict: + names_lookup_dict.update(new_dict) + + return names_lookup_dict + def get_label(self, config): return config["defaults"]["label"] From a642b3dc1236a0188b519eeb519b101e08ffdf87 Mon Sep 17 00:00:00 2001 From: Struan Donald Date: Tue, 26 Nov 2024 15:36:09 +0000 Subject: [PATCH 3/4] update constituency name lookup in importers update the christian aid and onward polling importers to use the constituency name lookup from the base importer Fixes #621 --- .../import_christian_aid_group_locations.py | 33 +---------------- .../commands/import_onward_polling_data.py | 37 +------------------ .../commands/run_all_import_scripts.py | 2 - 3 files changed, 2 insertions(+), 70 deletions(-) diff --git a/hub/management/commands/import_christian_aid_group_locations.py b/hub/management/commands/import_christian_aid_group_locations.py index 0fdae3b9b..46408f465 100644 --- a/hub/management/commands/import_christian_aid_group_locations.py +++ b/hub/management/commands/import_christian_aid_group_locations.py @@ -1,15 +1,11 @@ from django.conf import settings import pandas as pd -import requests from hub.models import DataSet from .base_importers import BaseConstituencyGroupListImportCommand -TWFY_CONSTITUENCIES_DATA_URL = 
"https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json" -HARD_CODED_CONSTITUENCY_LOOKUP = {} - class Command(BaseConstituencyGroupListImportCommand): help = "Import data about Christian Aid groups per constituency" @@ -61,33 +57,6 @@ class Command(BaseConstituencyGroupListImportCommand): group_data_type = "constituency_christian_aid_groups" count_data_type = "constituency_christian_aid_group_count" - def add_to_dict(self, df): - names = df.names.tolist() - # Add a version of the main name, without any commas - names.append(names[0].replace(",", "")) - # The first name listed is the ideal form - name = names.pop(0) - return {alt_name.replace(",", ""): name for alt_name in names} - - def build_constituency_name_lookup(self): - # Grab the TWFY data, and ignore any constituencies that no longer exist - # We're only interested in the names, so keep them, and explode the column. - # Then group by (arbitrary) index, and build the dictionary from these groups - - response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) - df = pd.DataFrame.from_records(response.json()) - df = df.query("end_date.isna()")["names"].reset_index() - df = df.explode("names", ignore_index=True) - - # Start with hard-coded lookup - names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() - for i, names_df in df.groupby("index"): - new_dict = self.add_to_dict(names_df) - if new_dict: - names_lookup_dict.update(new_dict) - - return names_lookup_dict - def get_df(self): if self.data_file.exists() is False: @@ -107,7 +76,7 @@ def get_df(self): ] # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises - constituency_lookup = self.build_constituency_name_lookup() + constituency_lookup = self.build_constituency_name_lookup(old_cons=True) df.constituency = df.constituency.apply( lambda x: ( constituency_lookup.get(x.replace(",", ""), x) if not pd.isna(x) else "" diff --git 
a/hub/management/commands/import_onward_polling_data.py b/hub/management/commands/import_onward_polling_data.py index dbd2c07b0..4ec40a18d 100644 --- a/hub/management/commands/import_onward_polling_data.py +++ b/hub/management/commands/import_onward_polling_data.py @@ -1,19 +1,11 @@ from django.conf import settings import pandas as pd -import requests from hub.models import AreaData, DataSet from .base_importers import BaseImportFromDataFrameCommand -TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json" -HARD_CODED_CONSTITUENCY_LOOKUP = { - "Cotswolds The": "The Cotswolds", - "Basildon South and East Thurrock": "South Basildon and East Thurrock", - "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar", -} - class Command(BaseImportFromDataFrameCommand): help = "Import Onward polling data on attitudes to net zero and climate change" @@ -60,33 +52,6 @@ class Command(BaseImportFromDataFrameCommand): } del data_sets["constituency_cc_high"]["defaults"]["subcategory"] - def add_to_dict(self, df): - names = df.names.tolist() - # Add a version of the main name, without any commas - names.append(names[0].replace(",", "")) - # The first name listed is the ideal form - name = names.pop(0) - return {alt_name.replace(",", ""): name for alt_name in names} - - def build_constituency_name_lookup(self): - # Grab the TWFY data, and ignore any constituencies that no longer exist - # We're only interested in the names, so keep them, and explode the column. 
- # Then group by (arbitrary) index, and build the dictionary from these groups - - response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) - df = pd.DataFrame.from_records(response.json()) - df = df.query("end_date.isna()")["names"].reset_index() - df = df.explode("names", ignore_index=True) - - # Start with hard-coded lookup - names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() - for i, names_df in df.groupby("index"): - new_dict = self.add_to_dict(names_df) - if new_dict: - names_lookup_dict.update(new_dict) - - return names_lookup_dict - def get_dataframe(self): if not self.data_file.exists(): @@ -110,7 +75,7 @@ def get_dataframe(self): ] # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises - constituency_lookup = self.build_constituency_name_lookup() + constituency_lookup = self.build_constituency_name_lookup(old_cons=True) df.constituency = df.constituency.apply( lambda x: constituency_lookup.get(x.replace(",", ""), x) ) diff --git a/hub/management/commands/run_all_import_scripts.py b/hub/management/commands/run_all_import_scripts.py index c1fbf8941..b6d2f829c 100644 --- a/hub/management/commands/run_all_import_scripts.py +++ b/hub/management/commands/run_all_import_scripts.py @@ -15,11 +15,9 @@ class Command(BaseCommand): ] skip_imports = [ "import_2024_ppcs", # no longer relevant post-election - "import_christian_aid_group_locations", # JSONDecodeError because parlparse JSON file not found "import_mps_appg_data", # hasn't been updated for Autumn 2024 APPGs "import_mps_relevant_votes", # hasn't been updated for a while (and we import EDMs separately now) "import_mps_standing_down_2024", # no longer relevant post-election - "import_onward_polling_data", # JSONDecodeError because parlparse JSON file not found ] def get_scripts(self, *args, **options): From 0de5bf23c2e8e954ee5ee32bb22cf7d483609b30 Mon Sep 17 00:00:00 2001 From: Zarino Zappia Date: Tue, 10 Dec 2024 
17:17:51 +0000 Subject: [PATCH 4/4] Add dataset visibility reminder to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index eb97ccaf8..abbf67cd2 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ You could alternatively run commands individually (again, from inside the contai ./manage.py import_areas ./manage.py import_mps +Finally, you will want to log in to `/admin` and mark a selection of datasets as “Public”, so that they appear to logged-out users on the site. + ### Running the tests First start the Docker environment: