Merge pull request #438 from biglocalnews/ia-blanks

Cut blank IA rows and abstracted new utility
biglocalnews · Feb 22, 2022 · 6b6246f · 6b6246f
2 parents 08cabea + 47221f9
commit 6b6246f
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 27 deletions.
diff --git a/warn/scrapers/ia.py b/warn/scrapers/ia.py
@@ -2,7 +2,6 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
-from openpyxl import load_workbook
 
 from .. import utils
 from ..cache import Cache
@@ -43,14 +42,14 @@ def scrape(
     excel_path = cache.download("ia/source.xlsx", excel_url)
 
     # Parse it
-    row_list = _parse_xlsx(excel_path)
+    row_list = utils.parse_excel(excel_path)
 
     # Get historic file
     historic_url = "https://www.iowaworkforcedevelopment.gov/sites/search.iowaworkforcedevelopment.gov/files/documents/2018/WARN_20180503.xlsx"
     historic_excel_path = cache.download("ia/historic.xlsx", historic_url)
 
     # Parse it, minus the header
-    row_list += _parse_xlsx(historic_excel_path)[1:]
+    row_list += utils.parse_excel(historic_excel_path, keep_header=False)
 
     # Set the export path
     data_path = data_dir / "ia.csv"
@@ -62,29 +61,5 @@ def scrape(
     return data_path
 
 
-def _parse_xlsx(excel_path: Path) -> list:
-    """Parse the XLSX file at the provided path.
-
-    Args:
-    excel_path (Path): The path to an XLSX file
-
-    Returns a list of values ready to write.
-    """
-    # Open it up
-    workbook = load_workbook(filename=excel_path)
-
-    # Get the first sheet
-    worksheet = workbook.worksheets[0]
-
-    # Convert the sheet to a list of lists
-    row_list = []
-    for r in worksheet.rows:
-        column = [cell.value for cell in r]
-        row_list.append(column)
-
-    # Pass it back
-    return row_list
-
-
 if __name__ == "__main__":
     scrape()
diff --git a/warn/utils.py b/warn/utils.py
@@ -1,9 +1,11 @@
 import csv
 import logging
 import os
+import typing
 from pathlib import Path
 
 import requests
+from openpyxl import load_workbook
 
 logger = logging.getLogger(__name__)
 
@@ -119,3 +121,42 @@ def get_url(
         response = requests.get(url, **kwargs)
     logger.debug(f"Response code: {response.status_code}")
     return response
+
+
+def parse_excel(excel_path: Path, keep_header: bool = True) -> typing.List[typing.List]:
+    """Parse the Excel file at the provided path.
+
+    Args:
+        excel_path (Path): The path to an XLSX file
+        keep_header (bool): Whether or not to return the header row. Defaults to True.
+
+    Returns: List of values ready to write.
+    """
+    # Open it up
+    workbook = load_workbook(filename=excel_path)
+
+    # Get the first sheet
+    worksheet = workbook.worksheets[0]
+
+    # Convert the sheet to a list of lists
+    row_list = []
+    for i, r in enumerate(worksheet.rows):
+        # Skip the header row, if that's what the user wants
+        if i == 0 and not keep_header:
+            continue
+
+        # Parse cells
+        cell_list = [cell.value for cell in r]
+
+        # Skip empty rows
+        try:
+            # next() raises StopIteration when no cell in the row is truthy
+            next(c for c in cell_list if c)
+        except StopIteration:
+            continue
+
+        # Add to the master list
+        row_list.append(cell_list)
+
+    # Pass it back
+    return row_list