diff --git a/README.md b/README.md index eb97ccaf8..abbf67cd2 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,8 @@ You could alternatively run commands individually (again, from inside the contai ./manage.py import_areas ./manage.py import_mps +Finally, you will want to log in to `/admin` and mark a selection of datasets as “Public”, so that they appear to logged-out users on the site. + ### Running the tests First start the Docker environment: diff --git a/hub/management/commands/base_importers.py b/hub/management/commands/base_importers.py index e0c426100..d6c6d5007 100644 --- a/hub/management/commands/base_importers.py +++ b/hub/management/commands/base_importers.py @@ -4,6 +4,7 @@ from django.core.management.base import BaseCommand import pandas as pd +import requests from tqdm import tqdm from hub.models import Area, AreaData, AreaType, DataSet, DataType @@ -36,6 +37,16 @@ "Independents": "#DCDCDC", } +TWFY_CONSTITUENCIES_DATA_URL = ( + "https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json" +) +HARD_CODED_CONSTITUENCY_LOOKUP = { + "Cotswolds The": "The Cotswolds", + "Basildon South and East Thurrock": "South Basildon and East Thurrock", + "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar", + "Ynys M¶n": "Ynys Môn", +} + class MultipleAreaTypesMixin: def handle(self, *args, **options): @@ -64,6 +75,42 @@ def add_arguments(self, parser): help="do not auto convert to new constituency data", ) + def add_to_dict(self, df): + names = df.area.tolist() + # Add a version of the main name, without any commas + names.append(names[0].replace(",", "")) + # The first name listed is the ideal form + name = names.pop(0) + return {alt_name.replace(",", ""): name for alt_name in names} + + def build_constituency_name_lookup(self, old_cons=False): + # Grab the TWFY data, and ignore any constituencies that no longer exist + # We're only interested in the names, so keep them, and explode the column. 
+ # Then group by (arbitrary) index, and build the dictionary from these groups + + cons_filter = "end_date.isna()" + if old_cons: + cons_filter = "end_date == '2024-07-03'" + + response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) + df = pd.DataFrame.from_records(response.json()["posts"]) + df = df.query(cons_filter)["area"].reset_index() + df = ( + df["area"] + .map(lambda a: [a["name"]] + [o for o in a.get("other_names", [])]) + .reset_index() + ) + df = df.explode("area", ignore_index=True) + + # Start with hard-coded lookup + names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() + for i, names_df in df.groupby("index"): + new_dict = self.add_to_dict(names_df) + if new_dict: + names_lookup_dict.update(new_dict) + + return names_lookup_dict + def get_label(self, config): return config["defaults"]["label"] @@ -210,7 +257,7 @@ def process_data(self, df: pd.DataFrame): def handle(self, quiet=False, *args, **kwargs): self._quiet = quiet df = self.get_df() - if not df: + if df is None or df.empty: return self.add_data_sets() self.delete_data() @@ -271,7 +318,7 @@ def handle(self, quiet=False, skip_new_areatype_conversion=False, *args, **optio if not hasattr(self, "do_not_convert"): self.do_not_convert = skip_new_areatype_conversion df = self.get_dataframe() - if df is None: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} ({self.area_type})") return @@ -374,7 +421,7 @@ def process_data(self, df: pd.DataFrame): def handle(self, quiet=False, *args, **kwargs): self._quiet = quiet df = self.get_df() - if df is None: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} ({self.area_type})") return @@ -429,7 +476,7 @@ def process_data(self, df): def handle(self, quiet=False, *args, **options): self._quiet = quiet df = self.get_dataframe() - if df.empty: + if df is None or df.empty: if not self._quiet: self.stdout.write(f"missing data for {self.message} 
({self.area_type})") return diff --git a/hub/management/commands/import_air_quality_data.py b/hub/management/commands/import_air_quality_data.py index 2b5373317..9b69704b6 100644 --- a/hub/management/commands/import_air_quality_data.py +++ b/hub/management/commands/import_air_quality_data.py @@ -94,7 +94,7 @@ def add_arguments(self, parser): def handle(self, quiet=False, *args, **options): self._quiet = quiet df = self.get_dataframe() - if not df: + if df is None: self.stdout.write( "Failed to import air quality data. Please ensure that the gridcode_lookup file is available." ) @@ -192,4 +192,6 @@ def get_dataframe(self): # Prepare the df for useful importing df = df.drop(columns=["gridcode"]).groupby("gss").mean() + if df.empty: + return None return df diff --git a/hub/management/commands/import_cen_nzsg_members.py b/hub/management/commands/import_cen_nzsg_members.py index f0ad1fe60..e67b930a9 100644 --- a/hub/management/commands/import_cen_nzsg_members.py +++ b/hub/management/commands/import_cen_nzsg_members.py @@ -80,7 +80,7 @@ def create_data_types(self): def get_results(self): mps = Person.objects.filter(person_type="MP") df = self.get_df() - if df is None: + if df is None or df.empty: return {} results = {} print("Name matching MPs") diff --git a/hub/management/commands/import_christian_aid_group_locations.py b/hub/management/commands/import_christian_aid_group_locations.py index 0fdae3b9b..46408f465 100644 --- a/hub/management/commands/import_christian_aid_group_locations.py +++ b/hub/management/commands/import_christian_aid_group_locations.py @@ -1,15 +1,11 @@ from django.conf import settings import pandas as pd -import requests from hub.models import DataSet from .base_importers import BaseConstituencyGroupListImportCommand -TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json" -HARD_CODED_CONSTITUENCY_LOOKUP = {} - class Command(BaseConstituencyGroupListImportCommand): help = "Import data 
about Christian Aid groups per constituency" @@ -61,33 +57,6 @@ class Command(BaseConstituencyGroupListImportCommand): group_data_type = "constituency_christian_aid_groups" count_data_type = "constituency_christian_aid_group_count" - def add_to_dict(self, df): - names = df.names.tolist() - # Add a version of the main name, without any commas - names.append(names[0].replace(",", "")) - # The first name listed is the ideal form - name = names.pop(0) - return {alt_name.replace(",", ""): name for alt_name in names} - - def build_constituency_name_lookup(self): - # Grab the TWFY data, and ignore any constituencies that no longer exist - # We're only interested in the names, so keep them, and explode the column. - # Then group by (arbitrary) index, and build the dictionary from these groups - - response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) - df = pd.DataFrame.from_records(response.json()) - df = df.query("end_date.isna()")["names"].reset_index() - df = df.explode("names", ignore_index=True) - - # Start with hard-coded lookup - names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() - for i, names_df in df.groupby("index"): - new_dict = self.add_to_dict(names_df) - if new_dict: - names_lookup_dict.update(new_dict) - - return names_lookup_dict - def get_df(self): if self.data_file.exists() is False: @@ -107,7 +76,7 @@ def get_df(self): ] # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises - constituency_lookup = self.build_constituency_name_lookup() + constituency_lookup = self.build_constituency_name_lookup(old_cons=True) df.constituency = df.constituency.apply( lambda x: ( constituency_lookup.get(x.replace(",", ""), x) if not pd.isna(x) else "" diff --git a/hub/management/commands/import_flood_risk_data.py b/hub/management/commands/import_flood_risk_data.py index 82d56b0ea..161dfde60 100644 --- a/hub/management/commands/import_flood_risk_data.py +++ 
b/hub/management/commands/import_flood_risk_data.py @@ -43,7 +43,9 @@ def handle(self, quiet=False, *args, **options): df = self.get_dataframe() if df is None: if not self._quiet: - self.stdout.write(f"Data file {self.data_file} not found") + self.stdout.write( + f"Data file {self.data_file} not found or contains no data" + ) return self.data_types = self.create_data_types(df) self.delete_data() @@ -101,6 +103,8 @@ def get_dataframe(self): if self.data_file.exists() is False: return None df = pd.read_csv(self.data_file) + if df.empty: + return None totals = ( df.dropna()[["gss", "prob_4band"]] .groupby("gss") @@ -117,4 +121,6 @@ def get_dataframe(self): ) df["percentage"] = df.value / df.total * 100 df = df.pivot(columns="prob_4band", values="percentage", index="gss").fillna(0) + if df.empty: + return None return df diff --git a/hub/management/commands/import_last_election_data.py b/hub/management/commands/import_last_election_data.py index d47f7aaf8..ada7e683a 100644 --- a/hub/management/commands/import_last_election_data.py +++ b/hub/management/commands/import_last_election_data.py @@ -69,7 +69,7 @@ def handle(self, quiet=False, *args, **options): self._quiet = quiet self.delete_data() df = self.get_last_election_df() - if df.empty is not True: + if df is not None: self.data_types = self.create_data_types() self.import_results(df) @@ -202,6 +202,8 @@ def get_last_election_df(self): df = df.rename( columns=lambda party: self.party_translate_up_dict.get(party.lower(), party) ) + if df.empty: + return None return df def create_data_types(self): diff --git a/hub/management/commands/import_mp_engagement.py b/hub/management/commands/import_mp_engagement.py index 578e370cc..72a869c58 100644 --- a/hub/management/commands/import_mp_engagement.py +++ b/hub/management/commands/import_mp_engagement.py @@ -15,9 +15,11 @@ def handle(self, quiet=False, *args, **options): self._quiet = quiet self.data_types = self.create_data_types() df = self.get_df() - if df is None: + if df 
is None or df.empty: if not self._quiet: - self.stdout.write(f"Data file {self.data_file} not found") + self.stdout.write( + f"Data file {self.data_file} not found or contains no data" + ) return self.import_results(df) diff --git a/hub/management/commands/import_mp_job_titles.py b/hub/management/commands/import_mp_job_titles.py index de8f75421..e0e0ce1d8 100644 --- a/hub/management/commands/import_mp_job_titles.py +++ b/hub/management/commands/import_mp_job_titles.py @@ -89,7 +89,7 @@ def import_results(self): df = self.get_df() - if df is None: + if df is None or df.empty: return data_type = self.create_data_type() diff --git a/hub/management/commands/import_mps_select_committee_membership.py b/hub/management/commands/import_mps_select_committee_membership.py index 1e7e81b41..3f2e75366 100644 --- a/hub/management/commands/import_mps_select_committee_membership.py +++ b/hub/management/commands/import_mps_select_committee_membership.py @@ -95,5 +95,5 @@ def add_results(self, results: pd.DataFrame, data_type): def import_results(self): data_type = self.create_data_types() df = self.get_df() - if df: + if not df.empty: self.add_results(df, data_type) diff --git a/hub/management/commands/import_onward_polling_data.py b/hub/management/commands/import_onward_polling_data.py index dbd2c07b0..4ec40a18d 100644 --- a/hub/management/commands/import_onward_polling_data.py +++ b/hub/management/commands/import_onward_polling_data.py @@ -1,19 +1,11 @@ from django.conf import settings import pandas as pd -import requests from hub.models import AreaData, DataSet from .base_importers import BaseImportFromDataFrameCommand -TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json" -HARD_CODED_CONSTITUENCY_LOOKUP = { - "Cotswolds The": "The Cotswolds", - "Basildon South and East Thurrock": "South Basildon and East Thurrock", - "Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar", -} - class 
Command(BaseImportFromDataFrameCommand): help = "Import Onward polling data on attitudes to net zero and climate change" @@ -60,33 +52,6 @@ class Command(BaseImportFromDataFrameCommand): } del data_sets["constituency_cc_high"]["defaults"]["subcategory"] - def add_to_dict(self, df): - names = df.names.tolist() - # Add a version of the main name, without any commas - names.append(names[0].replace(",", "")) - # The first name listed is the ideal form - name = names.pop(0) - return {alt_name.replace(",", ""): name for alt_name in names} - - def build_constituency_name_lookup(self): - # Grab the TWFY data, and ignore any constituencies that no longer exist - # We're only interested in the names, so keep them, and explode the column. - # Then group by (arbitrary) index, and build the dictionary from these groups - - response = requests.get(TWFY_CONSTITUENCIES_DATA_URL) - df = pd.DataFrame.from_records(response.json()) - df = df.query("end_date.isna()")["names"].reset_index() - df = df.explode("names", ignore_index=True) - - # Start with hard-coded lookup - names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy() - for i, names_df in df.groupby("index"): - new_dict = self.add_to_dict(names_df) - if new_dict: - names_lookup_dict.update(new_dict) - - return names_lookup_dict - def get_dataframe(self): if not self.data_file.exists(): @@ -110,7 +75,7 @@ def get_dataframe(self): ] # Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises - constituency_lookup = self.build_constituency_name_lookup() + constituency_lookup = self.build_constituency_name_lookup(old_cons=True) df.constituency = df.constituency.apply( lambda x: constituency_lookup.get(x.replace(",", ""), x) ) diff --git a/hub/management/commands/run_all_import_scripts.py b/hub/management/commands/run_all_import_scripts.py index ab2a7d530..b6d2f829c 100644 --- a/hub/management/commands/run_all_import_scripts.py +++ 
b/hub/management/commands/run_all_import_scripts.py @@ -15,14 +15,9 @@ class Command(BaseCommand): ] skip_imports = [ "import_2024_ppcs", # no longer relevant post-election - "import_air_quality_data", # ValueError because it checks for truthiness of a Pandas dataframe? - "import_christian_aid_group_locations", # JSONDecodeError because parlparse JSON file not found "import_mps_appg_data", # hasn't been updated for Autumn 2024 APPGs "import_mps_relevant_votes", # hasn't been updated for a while (and we import EDMs separately now) "import_mps_standing_down_2024", # no longer relevant post-election - "import_mps_select_committee_membership", # ValueError because it checks for truthiness of a Pandas dataframe? - "import_nt_property_locations", # ValueError because it checks for truthiness of a Pandas dataframe? - "import_onward_polling_data", # JSONDecodeError because parlparse JSON file not found ] def get_scripts(self, *args, **options):