Skip to content

Commit

Permalink
Merge pull request #438 from biglocalnews/ia-blanks
Browse files Browse the repository at this point in the history
Cut blank IA rows and abstracted a new utility
  • Loading branch information
palewire authored Feb 22, 2022
2 parents 08cabea + 47221f9 commit 6b6246f
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 27 deletions.
29 changes: 2 additions & 27 deletions warn/scrapers/ia.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from pathlib import Path

from bs4 import BeautifulSoup
from openpyxl import load_workbook

from .. import utils
from ..cache import Cache
Expand Down Expand Up @@ -43,14 +42,14 @@ def scrape(
excel_path = cache.download("ia/source.xlsx", excel_url)

# Parse it
row_list = _parse_xlsx(excel_path)
row_list = utils.parse_excel(excel_path)

# Get historic file
historic_url = "https://www.iowaworkforcedevelopment.gov/sites/search.iowaworkforcedevelopment.gov/files/documents/2018/WARN_20180503.xlsx"
historic_excel_path = cache.download("ia/historic.xlsx", historic_url)

# Parse it, minus the header
row_list += _parse_xlsx(historic_excel_path)[1:]
row_list += utils.parse_excel(historic_excel_path, keep_header=False)

# Set the export path
data_path = data_dir / "ia.csv"
Expand All @@ -62,29 +61,5 @@ def scrape(
return data_path


def _parse_xlsx(excel_path: Path) -> list:
    """Read the first worksheet of an XLSX file into a list of rows.

    Args:
        excel_path (Path): The path to an XLSX file

    Returns a list of lists of cell values, one per row, ready to write.
    """
    # Only the first sheet of the workbook is read
    first_sheet = load_workbook(filename=excel_path).worksheets[0]

    # One inner list of raw cell values per worksheet row
    return [[cell.value for cell in row] for row in first_sheet.rows]


if __name__ == "__main__":
scrape()
41 changes: 41 additions & 0 deletions warn/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import csv
import logging
import os
import typing
from pathlib import Path

import requests
from openpyxl import load_workbook

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -119,3 +121,42 @@ def get_url(
response = requests.get(url, **kwargs)
logger.debug(f"Response code: {response.status_code}")
return response


def parse_excel(excel_path: Path, keep_header: bool = True) -> typing.List[typing.List]:
    """Parse the first worksheet of the Excel file at the provided path.

    Args:
        excel_path (Path): The path to an XLSX file
        keep_header (bool): Whether or not to return the header row. Default True.

    Returns: List of rows, each a list of cell values, ready to write.
        Rows in which every cell is empty (None or "") are left out.
    """
    # Open it up
    workbook = load_workbook(filename=excel_path)

    # Get the first sheet
    worksheet = workbook.worksheets[0]

    # Convert the sheet to a list of lists
    row_list = []
    for i, r in enumerate(worksheet.rows):
        # Skip the header row, if that's what the user wants.
        # The header is always the sheet's first row, even if it is blank.
        if i == 0 and not keep_header:
            continue

        # Parse cells
        cell_list = [cell.value for cell in r]

        # Skip empty rows. Only None and "" count as empty, so a row whose
        # sole values are falsy-but-real data (0, 0.0, False) is still kept.
        if all(c is None or c == "" for c in cell_list):
            continue

        # Add to the master list
        row_list.append(cell_list)

    # Pass it back
    return row_list

0 comments on commit 6b6246f

Please sign in to comment.