Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix some import bugs #623

Merged
merged 4 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ You could alternatively run commands individually (again, from inside the contai
./manage.py import_areas
./manage.py import_mps

Finally, you will want to log in to `/admin` and mark a selection of datasets “Public”, so that they appear on the site to logged-out users.

### Running the tests

First start the Docker environment:
Expand Down
55 changes: 51 additions & 4 deletions hub/management/commands/base_importers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from django.core.management.base import BaseCommand

import pandas as pd
import requests
from tqdm import tqdm

from hub.models import Area, AreaData, AreaType, DataSet, DataType
Expand Down Expand Up @@ -36,6 +37,16 @@
"Independents": "#DCDCDC",
}

TWFY_CONSTITUENCIES_DATA_URL = (
"https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json"
)
HARD_CODED_CONSTITUENCY_LOOKUP = {
"Cotswolds The": "The Cotswolds",
"Basildon South and East Thurrock": "South Basildon and East Thurrock",
"Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar",
"Ynys M¶n": "Ynys Môn",
}


class MultipleAreaTypesMixin:
def handle(self, *args, **options):
Expand Down Expand Up @@ -64,6 +75,42 @@ def add_arguments(self, parser):
help="do not auto convert to new constituency data",
)

def add_to_dict(self, df):
    """Build a lookup from alternative constituency names to the canonical one.

    ``df`` is a single ``groupby`` group whose ``area`` column lists every
    known name for one constituency, canonical form first.  Returns a dict
    mapping each comma-stripped alternative name to the canonical name, or
    an empty dict if the group has no names at all.
    """
    names = df.area.tolist()
    if not names:
        # Defensive: an empty group would otherwise raise IndexError below;
        # the caller already skips falsy results.
        return {}
    # Add a version of the main name, without any commas
    names.append(names[0].replace(",", ""))
    # The first name listed is the ideal form
    name = names.pop(0)
    return {alt_name.replace(",", ""): name for alt_name in names}

def build_constituency_name_lookup(self, old_cons=False):
    """Build a dict mapping alternative constituency names to canonical names.

    Fetches the TWFY (parlparse) people.json data and keeps only current
    constituencies, or — when ``old_cons`` is True — those that ended at the
    2024-07-03 boundary change.  The hard-coded fixups in
    HARD_CODED_CONSTITUENCY_LOOKUP seed the result and may be overridden by
    TWFY-derived entries.
    """
    # Grab the TWFY data, and ignore any constituencies that no longer exist
    # We're only interested in the names, so keep them, and explode the column.
    # Then group by (arbitrary) index, and build the dictionary from these groups

    cons_filter = "end_date.isna()"
    if old_cons:
        cons_filter = "end_date == '2024-07-03'"

    # Without a timeout, requests.get can block forever if GitHub is
    # unreachable, hanging the whole import run.
    response = requests.get(TWFY_CONSTITUENCIES_DATA_URL, timeout=60)
    df = pd.DataFrame.from_records(response.json()["posts"])
    df = df.query(cons_filter)["area"].reset_index()
    # Each "area" record carries a canonical name plus optional other_names;
    # flatten to one row per name, keeping group membership in "index".
    df = (
        df["area"]
        .map(lambda a: [a["name"]] + [o for o in a.get("other_names", [])])
        .reset_index()
    )
    df = df.explode("area", ignore_index=True)

    # Start with hard-coded lookup
    names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
    for i, names_df in df.groupby("index"):
        new_dict = self.add_to_dict(names_df)
        if new_dict:
            names_lookup_dict.update(new_dict)

    return names_lookup_dict

def get_label(self, config):
    """Return the human-readable label from a data-set config's defaults."""
    defaults = config["defaults"]
    return defaults["label"]

Expand Down Expand Up @@ -210,7 +257,7 @@ def process_data(self, df: pd.DataFrame):
def handle(self, quiet=False, *args, **kwargs):
self._quiet = quiet
df = self.get_df()
if not df:
if df is None or df.empty:
return
self.add_data_sets()
self.delete_data()
Expand Down Expand Up @@ -271,7 +318,7 @@ def handle(self, quiet=False, skip_new_areatype_conversion=False, *args, **optio
if not hasattr(self, "do_not_convert"):
self.do_not_convert = skip_new_areatype_conversion
df = self.get_dataframe()
if df is None:
if df is None or df.empty:
if not self._quiet:
self.stdout.write(f"missing data for {self.message} ({self.area_type})")
return
Expand Down Expand Up @@ -374,7 +421,7 @@ def process_data(self, df: pd.DataFrame):
def handle(self, quiet=False, *args, **kwargs):
self._quiet = quiet
df = self.get_df()
if df is None:
if df is None or df.empty:
if not self._quiet:
self.stdout.write(f"missing data for {self.message} ({self.area_type})")
return
Expand Down Expand Up @@ -429,7 +476,7 @@ def process_data(self, df):
def handle(self, quiet=False, *args, **options):
self._quiet = quiet
df = self.get_dataframe()
if df.empty:
if df is None or df.empty:
if not self._quiet:
self.stdout.write(f"missing data for {self.message} ({self.area_type})")
return
Expand Down
4 changes: 3 additions & 1 deletion hub/management/commands/import_air_quality_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def add_arguments(self, parser):
def handle(self, quiet=False, *args, **options):
self._quiet = quiet
df = self.get_dataframe()
if not df:
if df is None:
self.stdout.write(
"Failed to import air quality data. Please ensure that the gridcode_lookup file is available."
)
Expand Down Expand Up @@ -192,4 +192,6 @@ def get_dataframe(self):

# Prepare the df for useful importing
df = df.drop(columns=["gridcode"]).groupby("gss").mean()
if df.empty:
return None
return df
2 changes: 1 addition & 1 deletion hub/management/commands/import_cen_nzsg_members.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def create_data_types(self):
def get_results(self):
mps = Person.objects.filter(person_type="MP")
df = self.get_df()
if df is None:
if df is None or df.empty:
return {}
results = {}
print("Name matching MPs")
Expand Down
33 changes: 1 addition & 32 deletions hub/management/commands/import_christian_aid_group_locations.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
from django.conf import settings

import pandas as pd
import requests

from hub.models import DataSet

from .base_importers import BaseConstituencyGroupListImportCommand

TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json"
HARD_CODED_CONSTITUENCY_LOOKUP = {}


class Command(BaseConstituencyGroupListImportCommand):
help = "Import data about Christian Aid groups per constituency"
Expand Down Expand Up @@ -61,33 +57,6 @@ class Command(BaseConstituencyGroupListImportCommand):
group_data_type = "constituency_christian_aid_groups"
count_data_type = "constituency_christian_aid_group_count"

def add_to_dict(self, df):
names = df.names.tolist()
# Add a version of the main name, without any commas
names.append(names[0].replace(",", ""))
# The first name listed is the ideal form
name = names.pop(0)
return {alt_name.replace(",", ""): name for alt_name in names}

def build_constituency_name_lookup(self):
# Grab the TWFY data, and ignore any constituencies that no longer exist
# We're only interested in the names, so keep them, and explode the column.
# Then group by (arbitrary) index, and build the dictionary from these groups

response = requests.get(TWFY_CONSTITUENCIES_DATA_URL)
df = pd.DataFrame.from_records(response.json())
df = df.query("end_date.isna()")["names"].reset_index()
df = df.explode("names", ignore_index=True)

# Start with hard-coded lookup
names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
for i, names_df in df.groupby("index"):
new_dict = self.add_to_dict(names_df)
if new_dict:
names_lookup_dict.update(new_dict)

return names_lookup_dict

def get_df(self):

if self.data_file.exists() is False:
Expand All @@ -107,7 +76,7 @@ def get_df(self):
]

# Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises
constituency_lookup = self.build_constituency_name_lookup()
constituency_lookup = self.build_constituency_name_lookup(old_cons=True)
df.constituency = df.constituency.apply(
lambda x: (
constituency_lookup.get(x.replace(",", ""), x) if not pd.isna(x) else ""
Expand Down
8 changes: 7 additions & 1 deletion hub/management/commands/import_flood_risk_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def handle(self, quiet=False, *args, **options):
df = self.get_dataframe()
if df is None:
if not self._quiet:
self.stdout.write(f"Data file {self.data_file} not found")
self.stdout.write(
f"Data file {self.data_file} not found or contains no data"
)
return
self.data_types = self.create_data_types(df)
self.delete_data()
Expand Down Expand Up @@ -101,6 +103,8 @@ def get_dataframe(self):
if self.data_file.exists() is False:
return None
df = pd.read_csv(self.data_file)
if df.empty:
return None
totals = (
df.dropna()[["gss", "prob_4band"]]
.groupby("gss")
Expand All @@ -117,4 +121,6 @@ def get_dataframe(self):
)
df["percentage"] = df.value / df.total * 100
df = df.pivot(columns="prob_4band", values="percentage", index="gss").fillna(0)
if df.empty:
return None
return df
4 changes: 3 additions & 1 deletion hub/management/commands/import_last_election_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def handle(self, quiet=False, *args, **options):
self._quiet = quiet
self.delete_data()
df = self.get_last_election_df()
if df.empty is not True:
if df is not None:
self.data_types = self.create_data_types()
self.import_results(df)

Expand Down Expand Up @@ -202,6 +202,8 @@ def get_last_election_df(self):
df = df.rename(
columns=lambda party: self.party_translate_up_dict.get(party.lower(), party)
)
if df.empty:
return None
return df

def create_data_types(self):
Expand Down
6 changes: 4 additions & 2 deletions hub/management/commands/import_mp_engagement.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ def handle(self, quiet=False, *args, **options):
self._quiet = quiet
self.data_types = self.create_data_types()
df = self.get_df()
if df is None:
if df is None or df.empty:
if not self._quiet:
self.stdout.write(f"Data file {self.data_file} not found")
self.stdout.write(
f"Data file {self.data_file} not found or contains no data"
)
return
self.import_results(df)

Expand Down
2 changes: 1 addition & 1 deletion hub/management/commands/import_mp_job_titles.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def import_results(self):

df = self.get_df()

if df is None:
if df is None or df.empty:
return

data_type = self.create_data_type()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,5 +95,5 @@ def add_results(self, results: pd.DataFrame, data_type):
def import_results(self):
    """Create the data types, fetch the data frame, and import its rows."""
    data_type = self.create_data_types()
    df = self.get_df()
    # get_df may return None (e.g. missing data file) as well as an empty
    # frame; `df.empty` on None raises AttributeError, so guard both —
    # matching the `df is None or df.empty` pattern used by the other
    # importers in this change.
    if df is not None and not df.empty:
        self.add_results(df, data_type)
37 changes: 1 addition & 36 deletions hub/management/commands/import_onward_polling_data.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,11 @@
from django.conf import settings

import pandas as pd
import requests

from hub.models import AreaData, DataSet

from .base_importers import BaseImportFromDataFrameCommand

TWFY_CONSTITUENCIES_DATA_URL = "https://raw.githubusercontent.com/mysociety/parlparse/master/members/constituencies.json"
HARD_CODED_CONSTITUENCY_LOOKUP = {
"Cotswolds The": "The Cotswolds",
"Basildon South and East Thurrock": "South Basildon and East Thurrock",
"Na h-Eileanan An Iar (Western Isles)": "Na h-Eileanan an Iar",
}


class Command(BaseImportFromDataFrameCommand):
help = "Import Onward polling data on attitudes to net zero and climate change"
Expand Down Expand Up @@ -60,33 +52,6 @@ class Command(BaseImportFromDataFrameCommand):
}
del data_sets["constituency_cc_high"]["defaults"]["subcategory"]

def add_to_dict(self, df):
names = df.names.tolist()
# Add a version of the main name, without any commas
names.append(names[0].replace(",", ""))
# The first name listed is the ideal form
name = names.pop(0)
return {alt_name.replace(",", ""): name for alt_name in names}

def build_constituency_name_lookup(self):
# Grab the TWFY data, and ignore any constituencies that no longer exist
# We're only interested in the names, so keep them, and explode the column.
# Then group by (arbitrary) index, and build the dictionary from these groups

response = requests.get(TWFY_CONSTITUENCIES_DATA_URL)
df = pd.DataFrame.from_records(response.json())
df = df.query("end_date.isna()")["names"].reset_index()
df = df.explode("names", ignore_index=True)

# Start with hard-coded lookup
names_lookup_dict = HARD_CODED_CONSTITUENCY_LOOKUP.copy()
for i, names_df in df.groupby("index"):
new_dict = self.add_to_dict(names_df)
if new_dict:
names_lookup_dict.update(new_dict)

return names_lookup_dict

def get_dataframe(self):

if not self.data_file.exists():
Expand All @@ -110,7 +75,7 @@ def get_dataframe(self):
]

# Build a constituency lookup from TWFY data, and apply it to the constituency column, so that the names are all in a form that LIH recognises
constituency_lookup = self.build_constituency_name_lookup()
constituency_lookup = self.build_constituency_name_lookup(old_cons=True)
df.constituency = df.constituency.apply(
lambda x: constituency_lookup.get(x.replace(",", ""), x)
)
Expand Down
5 changes: 0 additions & 5 deletions hub/management/commands/run_all_import_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,9 @@ class Command(BaseCommand):
]
skip_imports = [
"import_2024_ppcs", # no longer relevant post-election
"import_air_quality_data", # ValueError because it checks for truthiness of a Pandas dataframe?
"import_christian_aid_group_locations", # JSONDecodeError because parlparse JSON file not found
"import_mps_appg_data", # hasn't been updated for Autumn 2024 APPGs
"import_mps_relevant_votes", # hasn't been updated for a while (and we import EDMs separately now)
"import_mps_standing_down_2024", # no longer relevant post-election
"import_mps_select_committee_membership", # ValueError because it checks for truthiness of a Pandas dataframe?
"import_nt_property_locations", # ValueError because it checks for truthiness of a Pandas dataframe?
"import_onward_polling_data", # JSONDecodeError because parlparse JSON file not found
]

def get_scripts(self, *args, **options):
Expand Down
Loading