From c0d87c5a13af697b597c9cbbae8b4d2cc969bb57 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sun, 10 Mar 2024 17:33:35 +0800
Subject: [PATCH 1/5] Re-enable SSL verification for DE

---
 warn/scrapers/de.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/warn/scrapers/de.py b/warn/scrapers/de.py
index 0d7dd8c..5f45797 100644
--- a/warn/scrapers/de.py
+++ b/warn/scrapers/de.py
@@ -44,7 +44,6 @@ def scrape(
         stop_year,
         cache_dir,
         use_cache=use_cache,
-        verify=False,
     )

     # Return the resulting CSV file path

From 214269768ca033f1b681efb5f31bc06f5625ccd9 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Mon, 25 Mar 2024 10:52:52 -0400
Subject: [PATCH 2/5] Patch HTTP #622

---
 warn/scrapers/fl.py | 6 +++---
 warn/scrapers/md.py | 2 +-
 warn/scrapers/mt.py | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/fl.py b/warn/scrapers/fl.py
index e5f2bb2..4854866 100755
--- a/warn/scrapers/fl.py
+++ b/warn/scrapers/fl.py
@@ -17,7 +17,7 @@
 __tags__ = ["html", "pdf"]
 __source__ = {
     "name": "Florida Department of Economic Opportunity",
-    "url": "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
+    "url": "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
 }

 logger = logging.getLogger(__name__)
@@ -53,7 +53,7 @@ def scrape(
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
     }
-    url = "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
+    url = "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
     response = requests.get(url, headers=headers, verify=False)
     logger.debug(f"Request status is {response.status_code} for {url}")
     soup = BeautifulSoup(response.text, "html.parser")
@@ -138,7 +138,7 @@ def _scrape_html(cache, url, headers, page=1):
     )  # find link to next page, if exists
     # recursively scrape until we have a list of all the pages' html
     if nextPageLink:
-        url = "http://reactwarn.floridajobs.org" + nextPageLink.get(
+        url = "https://reactwarn.floridajobs.org" + nextPageLink.get(
             "href"
         )  # /WarnList/Records?year=XXXX&page=X
         # recursively make list of all the next pages' html
diff --git a/warn/scrapers/md.py b/warn/scrapers/md.py
index 0765f4f..d637089 100644
--- a/warn/scrapers/md.py
+++ b/warn/scrapers/md.py
@@ -11,7 +11,7 @@
 __tags__ = ["html"]
 __source__ = {
     "name": "Maryland Department of Labor",
-    "url": "http://www.dllr.state.md.us/employment/warn.shtml",
+    "url": "https://www.dllr.state.md.us/employment/warn.shtml",
 }

 logger = logging.getLogger(__name__)
diff --git a/warn/scrapers/mt.py b/warn/scrapers/mt.py
index 3bdc6cb..3279b27 100644
--- a/warn/scrapers/mt.py
+++ b/warn/scrapers/mt.py
@@ -28,8 +28,8 @@ def scrape(
         Returns: the Path where the file is written
     """
     # Get the URL
-    url = "http://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
-    r = utils.get_url(url, verify=False)
+    url = "https://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
+    r = utils.get_url(url, verify=True)
     html = r.text

     # Save it to the cache
@@ -50,10 +50,10 @@
             "xlsx"
         )  # URL will look like: <a href="../../_docs/wioa/warn-9-1-21.xlsx">
="../../_docs/wioa/warn-9-1-21.xlsx" ][0].split("/")[-1] - excel_url = f"http://wsd.dli.mt.gov/_docs/wioa/{excel_name}" + excel_url = f"https://wsd.dli.mt.gov/_docs/wioa/{excel_name}" # Download the Excel file - excel_path = cache.download("mt/source.xlsx", excel_url, verify=False) + excel_path = cache.download("mt/source.xlsx", excel_url, verify=True) # Open it up workbook = load_workbook(filename=excel_path) From 9c5cb8b5023146128f540afd867d403218e20b4f Mon Sep 17 00:00:00 2001 From: Mike Stucka Date: Mon, 27 May 2024 09:15:48 -0400 Subject: [PATCH 3/5] Rebuild TN --- warn/scrapers/tn.py | 134 ++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 98 deletions(-) diff --git a/warn/scrapers/tn.py b/warn/scrapers/tn.py index 6136a23..7b7a522 100644 --- a/warn/scrapers/tn.py +++ b/warn/scrapers/tn.py @@ -1,14 +1,14 @@ +import csv import typing from pathlib import Path -import pdfplumber from bs4 import BeautifulSoup from .. import utils from ..cache import Cache -__authors__ = ["anikasikka"] -__tags__ = ["html", "pdf"] +__authors__ = ["anikasikka", "stucka"] +__tags__ = ["html"] __source__ = { "name": "Tennessee Department of Labor and Workforce Development", "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html", @@ -37,13 +37,11 @@ def scrape( ) html = page.text cache.write("tn/source.html", html) + soup = BeautifulSoup(html, "html5lib") + tables = soup.find_all(attrs={"class": "tn-datatable"}) + rows = BeautifulSoup(str(tables), "html5lib").find_all("tr") - # Grab the PDF with the archived historial data - pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf" - pdf_file = cache.download("tn/pdffile.pdf", pdf_url) - - # Set the headers we'll use for both sources - tn_headers = [ + dataheaders: typing.List = [ "Notice Date", "Effective Date", "Received Date", @@ -53,102 +51,42 @@ def scrape( "No. Of Employees", "Layoff/Closure", "Notice ID", + # "Notice URL", ] - cleaned_data: typing.List[typing.Any] = [tn_headers] - # Parse the latest HTML file and convert to a list of rows, with a header in the first row. - soup = BeautifulSoup(html, "html5lib") + staginglist: typing.List = [] + for row in reversed(rows): + cells = row.find_all("td") + if len(cells) == 6: # Filter for potentially valid rows + line: typing.Dict = {} + for item in dataheaders: # Build an ordered dictionary with null values + line[item] = None + line["Notice Date"] = cells[0].text.strip() + line["Effective Date"] = cells[4].text.strip() + line["Company"] = cells[1].text.strip() + line["County"] = cells[2].text.strip() + line["No. 
Of Employees"] = cells[3].text.strip() + line["Notice ID"] = cells[5].text.strip() + # line['Notice URL'] = cells[1].find("a")['href'] + staginglist.append(line) + + # Bring in historical data + historical_file = cache_dir / "tn/tn_historical.csv" + historical_url = ( + "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv" + ) + utils.fetch_if_not_cached(historical_file, historical_url) + historical_str = cache.read("tn/tn_historical.csv") - # Grab all the list items on the page - data_list = soup.find_all("p") - - # Loop through them all, skipping the first item, which is a header - for data in data_list[1:]: - # splitting the data on its delimiter - items = str(data).split("|") - - # making sure that the last item in the list is the data value of interest - # splitting based on last character of each text-html data sequence - raw_data = [] - for item in items: - value_html = item.split(":")[-1] - value_soup = BeautifulSoup(value_html, "html5lib") - string_list = list(value_soup.stripped_strings) - if len(string_list) > 0: - value = string_list[-1] - else: - continue - raw_data.append(value) - - # If there aren't six entries it's junk - if len(raw_data) != 6: - continue - - # Pluck out the values we want - nice_data = [ - raw_data[0], # Notice Date - raw_data[4], # Effective Date - "", # Received Date - raw_data[1], # Company - "", # City - raw_data[2], # County - raw_data[3], # Number of employees - "", # Layoff/Closure - raw_data[5], # Notice ID - ] - - # Add them to the master list - cleaned_data.append(nice_data) - - # The PDF header blacklist of rows to toss - pdf_header_blacklist = [ - "Notice Date", - "Total", - ] + historicallist = list(csv.DictReader(historical_str.splitlines())) + + # Combine fresh and historical + staginglist.extend(historicallist) - # Open the PDF - with pdfplumber.open(pdf_file) as pdf: - # Loop through all the pages - for i, my_page in enumerate(pdf.pages): - # Sll even pages have data, odd pages don't have the data - if i % 2 != 0: - continue - - # Pull out the table and loop through the rows - table = my_page.extract_table() - if not table: - continue - - # Cut empty rows - row_list = [r for r in table if any(r)] - if not row_list: - continue - - # If this is a summary table, skip it - first_cell = row_list[0][0] - assert first_cell - if first_cell.lower().strip() == "summary by month": - continue - - # Loop through all the rows ... 
-            for row in row_list:
-                # Skip remove redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
     output_csv = data_dir / "tn.csv"

-    # Write out the rows to the export directory
-    utils.write_rows_to_csv(output_csv, cleaned_data)
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)

-    # Return the path to the final CSV
     return output_csv

From 639c252d5249dc8473cd0c5ae09fb927305a1242 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Mon, 8 Jul 2024 15:41:40 -0400
Subject: [PATCH 4/5] Patch CA PDF parsing flaw

---
 warn/scrapers/ca.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/warn/scrapers/ca.py b/warn/scrapers/ca.py
index 9142865..f36df39 100644
--- a/warn/scrapers/ca.py
+++ b/warn/scrapers/ca.py
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
             if "summary" in first_cell:
                 continue
             for row in rows:
-                data_row = {}
-                for i, value in enumerate(row):
-                    this_raw_header = raw_header[i]
-                    this_clean_header = header_crosswalk[this_raw_header]
-                    data_row[this_clean_header] = value
-                # Data clean-ups
-                data_row.update(
-                    {
-                        "effective_date": data_row["effective_date"].replace(" ", ""),
-                        "received_date": data_row["received_date"].replace(" ", ""),
-                        "source_file": str(pdf_path).split("/")[-1],
-                    }
-                )
-                data.append(data_row)
+                # Summary rows have an extra field, and the above code does not
+                # block the summary table from being parsed if it jumps onto another page.
+                if len(row) != len(raw_header) + 1:
+                    data_row = {}
+                    for i, value in enumerate(row):
+                        this_raw_header = raw_header[i]
+                        this_clean_header = header_crosswalk[this_raw_header]
+                        data_row[this_clean_header] = value
+                    # Data clean-ups
+                    data_row.update(
+                        {
+                            "effective_date": data_row["effective_date"].replace(
+                                " ", ""
+                            ),
+                            "received_date": data_row["received_date"].replace(" ", ""),
+                            "source_file": str(pdf_path).split("/")[-1],
+                        }
+                    )
+                    data.append(data_row)

     return data

From dd7bdc08601207e5e4cbc138a701e2ff9d2df11b Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Thu, 11 Jul 2024 20:46:31 -0400
Subject: [PATCH 5/5] Patch ID

---
 warn/scrapers/id.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 7210260..44953bf 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -12,7 +12,7 @@
 __tags__ = ["pdf"]
 __source__ = {
     "name": "Idaho Department of Labor",
-    "url": "https://www.labor.idaho.gov/dnn/Businesses/Layoff-Assistance#2",
+    "url": "https://www.labor.idaho.gov/warnnotice/",
 }

 logger = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def scrape(
         Returns: the Path where the file is written
     """
     # Create the URL of the source PDF
-    base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
+    base_url = "https://www.labor.idaho.gov/warnnotice/"
     file_name = "WARNNotice.pdf"
     # There's a numeric parameter called v on this PDF URL that updates
     # from time to time. Suspect this is a cache-buster. We're using a
@@ -40,10 +40,8 @@ def scrape(
     min_cache_buster = 0
     max_cache_buster = 10000000000
     cache_buster = random.randrange(min_cache_buster, max_cache_buster)
-    url = f"{base_url}{file_name}?v={cache_buster}"
+    url = f"{base_url}?v={cache_buster}"

-    # Download the PDF with verify=False because
-    # there's a persistent cert error we're working around.
     cache = Cache(cache_dir)
     state_code = "id"
     cache_key = f"{state_code}/{file_name}"
@@ -126,9 +124,9 @@ def filter_garbage_rows(incoming: list):
             badrows += 1
     if badrows == 0:
         logger.debug("No bad rows found.")
-        logger.debug(
-            f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
-        )
+    logger.debug(
+        f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
+    )
     return outgoing