Merge branch 'id-655' of github.com:biglocalnews/warn-scraper into id-655
stucka committed Jul 15, 2024
2 parents 896ee37 + 92f6e48 commit cf344fc
Showing 6 changed files with 63 additions and 121 deletions.
33 changes: 19 additions & 14 deletions warn/scrapers/ca.py
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
if "summary" in first_cell:
continue
for row in rows:
data_row = {}
for i, value in enumerate(row):
this_raw_header = raw_header[i]
this_clean_header = header_crosswalk[this_raw_header]
data_row[this_clean_header] = value
# Data clean-ups
data_row.update(
{
"effective_date": data_row["effective_date"].replace(" ", ""),
"received_date": data_row["received_date"].replace(" ", ""),
"source_file": str(pdf_path).split("/")[-1],
}
)
data.append(data_row)
# Summary rows have an extra field, and the above code does not
# block the summary table from being parsed if it jumps onto another page.
if len(row) != len(raw_header) + 1:
data_row = {}
for i, value in enumerate(row):
this_raw_header = raw_header[i]
this_clean_header = header_crosswalk[this_raw_header]
data_row[this_clean_header] = value
# Data clean-ups
data_row.update(
{
"effective_date": data_row["effective_date"].replace(
" ", ""
),
"received_date": data_row["received_date"].replace(" ", ""),
"source_file": str(pdf_path).split("/")[-1],
}
)
data.append(data_row)
return data
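The guard added above works because the PDF's summary table rows carry one extra column compared with the data rows, so any row whose length is len(raw_header) + 1 can be discarded even when the summary table spills onto a later page. A minimal sketch of that length check, using hypothetical header and row values rather than the real PDF layout:

# Sketch of the summary-row guard; the header and rows are hypothetical stand-ins.
raw_header = ["Company", "County", "Received Date", "Effective Date", "Employees"]
rows = [
    ["Acme Corp", "Alameda", "06/01/2024", "08/01/2024", "120"],
    ["June 2024", "Totals", "06/2024", "08/2024", "1234", "extra"],  # summary row: one extra field
]

data = []
for row in rows:
    # Summary rows have len(raw_header) + 1 fields, so they are skipped
    # even if the summary table jumps onto another page.
    if len(row) != len(raw_header) + 1:
        data.append(dict(zip(raw_header, row)))

print(data)  # only the ordinary data row remains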


1 change: 0 additions & 1 deletion warn/scrapers/de.py
@@ -44,7 +44,6 @@ def scrape(
stop_year,
cache_dir,
use_cache=use_cache,
verify=False,
)

# Return the resulting CSV file path
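Removing verify=False here lets the underlying HTTP call fall back to the requests default, which validates the server's TLS certificate. A minimal illustration of that default with plain requests (the scraper itself goes through the project's own helper, whose keyword is assumed to be passed straight through):

import requests

# With no verify argument, requests checks the TLS certificate by default
# (equivalent to verify=True); verify=False disables the check and emits
# an InsecureRequestWarning instead.
response = requests.get("https://example.com")
print(response.status_code)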
6 changes: 3 additions & 3 deletions warn/scrapers/fl.py
@@ -17,7 +17,7 @@
__tags__ = ["html", "pdf"]
__source__ = {
"name": "Florida Department of Economic Opportunity",
"url": "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
"url": "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
}

logger = logging.getLogger(__name__)
@@ -53,7 +53,7 @@ def scrape(
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
url = "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
url = "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
response = requests.get(url, headers=headers, verify=False)
logger.debug(f"Request status is {response.status_code} for {url}")
soup = BeautifulSoup(response.text, "html.parser")
@@ -138,7 +138,7 @@ def _scrape_html(cache, url, headers, page=1):
) # find link to next page, if exists
# recursively scrape until we have a list of all the pages' html
if nextPageLink:
url = "http://reactwarn.floridajobs.org" + nextPageLink.get(
url = "https://reactwarn.floridajobs.org" + nextPageLink.get(
"href"
) # /WarnList/Records?year=XXXX&page=X
# recursively make list of all the next pages' html
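The block above pages through the notice list by following the "next page" link and recursing until none is found, now over HTTPS. A rough sketch of that pattern with hypothetical markup and a simplified selector (the real scraper also caches each page and sends custom headers):

import requests
from bs4 import BeautifulSoup

def collect_pages(url, collected=None):
    """Recursively gather the HTML of each page in a paginated listing."""
    collected = collected or []
    html = requests.get(url).text
    collected.append(html)
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.find("a", string="Next")  # hypothetical link text
    if next_link:
        next_url = "https://reactwarn.floridajobs.org" + next_link.get("href")
        return collect_pages(next_url, collected)
    return collected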
2 changes: 1 addition & 1 deletion warn/scrapers/md.py
@@ -11,7 +11,7 @@
__tags__ = ["html"]
__source__ = {
"name": "Maryland Department of Labor",
"url": "http://www.dllr.state.md.us/employment/warn.shtml",
"url": "https://www.dllr.state.md.us/employment/warn.shtml",
}

logger = logging.getLogger(__name__)
8 changes: 4 additions & 4 deletions warn/scrapers/mt.py
@@ -28,8 +28,8 @@ def scrape(
Returns: the Path where the file is written
"""
# Get the URL
url = "http://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
r = utils.get_url(url, verify=False)
url = "https://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
r = utils.get_url(url, verify=True)
html = r.text

# Save it to the cache
@@ -50,10 +50,10 @@
"xlsx"
) # URL will look like: ="../../_docs/wioa/warn-9-1-21.xlsx"
][0].split("/")[-1]
excel_url = f"http://wsd.dli.mt.gov/_docs/wioa/{excel_name}"
excel_url = f"https://wsd.dli.mt.gov/_docs/wioa/{excel_name}"

# Download the Excel file
excel_path = cache.download("mt/source.xlsx", excel_url, verify=False)
excel_path = cache.download("mt/source.xlsx", excel_url, verify=True)

# Open it up
workbook = load_workbook(filename=excel_path)
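With the Montana site now served over HTTPS with a valid certificate, both the page fetch and the Excel download can leave certificate verification on. A small sketch of the same download-and-open flow using plain requests and openpyxl (cache.download is the project's helper; its verify keyword is assumed to pass through to requests, and the filename below is the example from the comment above):

import requests
from openpyxl import load_workbook

excel_url = "https://wsd.dli.mt.gov/_docs/wioa/warn-9-1-21.xlsx"
response = requests.get(excel_url)  # certificate verification is on by default
with open("mt_source.xlsx", "wb") as fh:
    fh.write(response.content)

workbook = load_workbook(filename="mt_source.xlsx")
worksheet = workbook.worksheets[0]
for row in worksheet.iter_rows(values_only=True):
    print(row)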
134 changes: 36 additions & 98 deletions warn/scrapers/tn.py
@@ -1,14 +1,14 @@
import csv
import typing
from pathlib import Path

import pdfplumber
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache

__authors__ = ["anikasikka"]
__tags__ = ["html", "pdf"]
__authors__ = ["anikasikka", "stucka"]
__tags__ = ["html"]
__source__ = {
"name": "Tennessee Department of Labor and Workforce Development",
"url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
@@ -37,13 +37,11 @@ def scrape(
)
html = page.text
cache.write("tn/source.html", html)
soup = BeautifulSoup(html, "html5lib")
tables = soup.find_all(attrs={"class": "tn-datatable"})
rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")

# Grab the PDF with the archived historial data
pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
pdf_file = cache.download("tn/pdffile.pdf", pdf_url)

# Set the headers we'll use for both sources
tn_headers = [
dataheaders: typing.List = [
"Notice Date",
"Effective Date",
"Received Date",
@@ -53,102 +53,42 @@
"No. Of Employees",
"Layoff/Closure",
"Notice ID",
# "Notice URL",
]
cleaned_data: typing.List[typing.Any] = [tn_headers]

# Parse the latest HTML file and convert to a list of rows, with a header in the first row.
soup = BeautifulSoup(html, "html5lib")
staginglist: typing.List = []
for row in reversed(rows):
cells = row.find_all("td")
if len(cells) == 6: # Filter for potentially valid rows
line: typing.Dict = {}
for item in dataheaders: # Build an ordered dictionary with null values
line[item] = None
line["Notice Date"] = cells[0].text.strip()
line["Effective Date"] = cells[4].text.strip()
line["Company"] = cells[1].text.strip()
line["County"] = cells[2].text.strip()
line["No. Of Employees"] = cells[3].text.strip()
line["Notice ID"] = cells[5].text.strip()
# line['Notice URL'] = cells[1].find("a")['href']
staginglist.append(line)

# Bring in historical data
historical_file = cache_dir / "tn/tn_historical.csv"
historical_url = (
"https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
)
utils.fetch_if_not_cached(historical_file, historical_url)
historical_str = cache.read("tn/tn_historical.csv")

# Grab all the list items on the page
data_list = soup.find_all("p")

# Loop through them all, skipping the first item, which is a header
for data in data_list[1:]:
# splitting the data on its delimiter
items = str(data).split("|")

# making sure that the last item in the list is the data value of interest
# splitting based on last character of each text-html data sequence
raw_data = []
for item in items:
value_html = item.split(":")[-1]
value_soup = BeautifulSoup(value_html, "html5lib")
string_list = list(value_soup.stripped_strings)
if len(string_list) > 0:
value = string_list[-1]
else:
continue
raw_data.append(value)

# If there aren't six entries it's junk
if len(raw_data) != 6:
continue

# Pluck out the values we want
nice_data = [
raw_data[0], # Notice Date
raw_data[4], # Effective Date
"", # Received Date
raw_data[1], # Company
"", # City
raw_data[2], # County
raw_data[3], # Number of employees
"", # Layoff/Closure
raw_data[5], # Notice ID
]

# Add them to the master list
cleaned_data.append(nice_data)

# The PDF header blacklist of rows to toss
pdf_header_blacklist = [
"Notice Date",
"Total",
]
historicallist = list(csv.DictReader(historical_str.splitlines()))

# Combine fresh and historical
staginglist.extend(historicallist)

# Open the PDF
with pdfplumber.open(pdf_file) as pdf:
# Loop through all the pages
for i, my_page in enumerate(pdf.pages):
# Sll even pages have data, odd pages don't have the data
if i % 2 != 0:
continue

# Pull out the table and loop through the rows
table = my_page.extract_table()
if not table:
continue

# Cut empty rows
row_list = [r for r in table if any(r)]
if not row_list:
continue

# If this is a summary table, skip it
first_cell = row_list[0][0]
assert first_cell
if first_cell.lower().strip() == "summary by month":
continue

# Loop through all the rows ...
for row in row_list:
# Skip remove redundant headers
if row[0] in pdf_header_blacklist:
continue

# Toss in an empty Notice ID since it isn't in the PDF
row.append("")

# Add the data to our output
cleaned_data.append(row)

# Set the path to the final CSV
output_csv = data_dir / "tn.csv"

# Write out the rows to the export directory
utils.write_rows_to_csv(output_csv, cleaned_data)
utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)

# Return the path to the final CSV
return output_csv
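The rewrite drops the pdfplumber pass over the archived PDF entirely: current notices are read from the tn-datatable rows in the live HTML, the older history comes from a hosted CSV, and the two lists of dictionaries are concatenated and written out under shared headers. A condensed sketch of that merge step, with a shortened header list and hypothetical row values (csv.DictWriter stands in for the project's utils.write_dict_rows_to_csv helper):

import csv

dataheaders = ["Notice Date", "Company", "County", "No. Of Employees", "Notice ID"]

# Rows scraped from the live HTML table (hypothetical values).
staginglist = [
    {"Notice Date": "7/1/2024", "Company": "Acme Corp", "County": "Davidson",
     "No. Of Employees": "120", "Notice ID": "2024-001"},
]

# Historical rows arrive as CSV text; DictReader yields one dictionary per
# line, keyed by the header row, so both sources share the same shape.
historical_str = (
    "Notice Date,Company,County,No. Of Employees,Notice ID\n"
    "1/5/2015,Old Mill,Knox,75,2015-003\n"
)
historicallist = list(csv.DictReader(historical_str.splitlines()))

# Combine fresh and historical, then write the dictionaries to CSV.
staginglist.extend(historicallist)
with open("tn.csv", "w", newline="") as fh:
    writer = csv.DictWriter(fh, fieldnames=dataheaders)
    writer.writeheader()
    writer.writerows(staginglist)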


