From c0d87c5a13af697b597c9cbbae8b4d2cc969bb57 Mon Sep 17 00:00:00 2001
From: Chris Zubak-Skees
Date: Sun, 10 Mar 2024 17:33:35 +0800
Subject: [PATCH 1/5] Re-enable SSL verification for DE

---
 warn/scrapers/de.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/warn/scrapers/de.py b/warn/scrapers/de.py
index 0d7dd8c..5f45797 100644
--- a/warn/scrapers/de.py
+++ b/warn/scrapers/de.py
@@ -44,7 +44,6 @@ def scrape(
         stop_year,
         cache_dir,
         use_cache=use_cache,
-        verify=False,
     )

     # Return the resulting CSV file path

From 214269768ca033f1b681efb5f31bc06f5625ccd9 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Mon, 25 Mar 2024 10:52:52 -0400
Subject: [PATCH 2/5] Patch HTTP #622

---
 warn/scrapers/fl.py | 6 +++---
 warn/scrapers/md.py | 2 +-
 warn/scrapers/mt.py | 8 ++++----
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/fl.py b/warn/scrapers/fl.py
index e5f2bb2..4854866 100755
--- a/warn/scrapers/fl.py
+++ b/warn/scrapers/fl.py
@@ -17,7 +17,7 @@
 __tags__ = ["html", "pdf"]
 __source__ = {
     "name": "Florida Department of Economic Opportunity",
-    "url": "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
+    "url": "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
 }

 logger = logging.getLogger(__name__)
@@ -53,7 +53,7 @@ def scrape(
     headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
     }
-    url = "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
+    url = "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
     response = requests.get(url, headers=headers, verify=False)
     logger.debug(f"Request status is {response.status_code} for {url}")
     soup = BeautifulSoup(response.text, "html.parser")
@@ -138,7 +138,7 @@ def _scrape_html(cache, url, headers, page=1):
     )  # find link to next page, if exists
     # recursively scrape until we have a list of all the pages' html
     if nextPageLink:
-        url = "http://reactwarn.floridajobs.org" + nextPageLink.get(
+        url = "https://reactwarn.floridajobs.org" + nextPageLink.get(
             "href"
         )  # /WarnList/Records?year=XXXX&page=X
         # recursively make list of all the next pages' html
diff --git a/warn/scrapers/md.py b/warn/scrapers/md.py
index 0765f4f..d637089 100644
--- a/warn/scrapers/md.py
+++ b/warn/scrapers/md.py
@@ -11,7 +11,7 @@
 __tags__ = ["html"]
 __source__ = {
     "name": "Maryland Department of Labor",
-    "url": "http://www.dllr.state.md.us/employment/warn.shtml",
+    "url": "https://www.dllr.state.md.us/employment/warn.shtml",
 }

 logger = logging.getLogger(__name__)
diff --git a/warn/scrapers/mt.py b/warn/scrapers/mt.py
index 3bdc6cb..3279b27 100644
--- a/warn/scrapers/mt.py
+++ b/warn/scrapers/mt.py
@@ -28,8 +28,8 @@ def scrape(
         Returns: the Path where the file is written
     """
     # Get the URL
-    url = "http://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
-    r = utils.get_url(url, verify=False)
+    url = "https://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
+    r = utils.get_url(url, verify=True)
     html = r.text

     # Save it to the cache
@@ -50,10 +50,10 @@
             "xlsx"
         )  # URL will look like: <a href="../../_docs/wioa/warn-9-1-21.xlsx">
="../../_docs/wioa/warn-9-1-21.xlsx" ][0].split("/")[-1] - excel_url = f"http://wsd.dli.mt.gov/_docs/wioa/{excel_name}" + excel_url = f"https://wsd.dli.mt.gov/_docs/wioa/{excel_name}" # Download the Excel file - excel_path = cache.download("mt/source.xlsx", excel_url, verify=False) + excel_path = cache.download("mt/source.xlsx", excel_url, verify=True) # Open it up workbook = load_workbook(filename=excel_path) From 9c5cb8b5023146128f540afd867d403218e20b4f Mon Sep 17 00:00:00 2001 From: Mike Stucka Date: Mon, 27 May 2024 09:15:48 -0400 Subject: [PATCH 3/5] Rebuild TN --- warn/scrapers/tn.py | 134 ++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 98 deletions(-) diff --git a/warn/scrapers/tn.py b/warn/scrapers/tn.py index 6136a23..7b7a522 100644 --- a/warn/scrapers/tn.py +++ b/warn/scrapers/tn.py @@ -1,14 +1,14 @@ +import csv import typing from pathlib import Path -import pdfplumber from bs4 import BeautifulSoup from .. import utils from ..cache import Cache -__authors__ = ["anikasikka"] -__tags__ = ["html", "pdf"] +__authors__ = ["anikasikka", "stucka"] +__tags__ = ["html"] __source__ = { "name": "Tennessee Department of Labor and Workforce Development", "url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html", @@ -37,13 +37,11 @@ def scrape( ) html = page.text cache.write("tn/source.html", html) + soup = BeautifulSoup(html, "html5lib") + tables = soup.find_all(attrs={"class": "tn-datatable"}) + rows = BeautifulSoup(str(tables), "html5lib").find_all("tr") - # Grab the PDF with the archived historial data - pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf" - pdf_file = cache.download("tn/pdffile.pdf", pdf_url) - - # Set the headers we'll use for both sources - tn_headers = [ + dataheaders: typing.List = [ "Notice Date", "Effective Date", "Received Date", @@ -53,102 +51,42 @@ def scrape( "No. Of Employees", "Layoff/Closure", "Notice ID", + # "Notice URL", ] - cleaned_data: typing.List[typing.Any] = [tn_headers] - # Parse the latest HTML file and convert to a list of rows, with a header in the first row. - soup = BeautifulSoup(html, "html5lib") + staginglist: typing.List = [] + for row in reversed(rows): + cells = row.find_all("td") + if len(cells) == 6: # Filter for potentially valid rows + line: typing.Dict = {} + for item in dataheaders: # Build an ordered dictionary with null values + line[item] = None + line["Notice Date"] = cells[0].text.strip() + line["Effective Date"] = cells[4].text.strip() + line["Company"] = cells[1].text.strip() + line["County"] = cells[2].text.strip() + line["No. 
Of Employees"] = cells[3].text.strip() + line["Notice ID"] = cells[5].text.strip() + # line['Notice URL'] = cells[1].find("a")['href'] + staginglist.append(line) + + # Bring in historical data + historical_file = cache_dir / "tn/tn_historical.csv" + historical_url = ( + "https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv" + ) + utils.fetch_if_not_cached(historical_file, historical_url) + historical_str = cache.read("tn/tn_historical.csv") - # Grab all the list items on the page - data_list = soup.find_all("p") - - # Loop through them all, skipping the first item, which is a header - for data in data_list[1:]: - # splitting the data on its delimiter - items = str(data).split("|") - - # making sure that the last item in the list is the data value of interest - # splitting based on last character of each text-html data sequence - raw_data = [] - for item in items: - value_html = item.split(":")[-1] - value_soup = BeautifulSoup(value_html, "html5lib") - string_list = list(value_soup.stripped_strings) - if len(string_list) > 0: - value = string_list[-1] - else: - continue - raw_data.append(value) - - # If there aren't six entries it's junk - if len(raw_data) != 6: - continue - - # Pluck out the values we want - nice_data = [ - raw_data[0], # Notice Date - raw_data[4], # Effective Date - "", # Received Date - raw_data[1], # Company - "", # City - raw_data[2], # County - raw_data[3], # Number of employees - "", # Layoff/Closure - raw_data[5], # Notice ID - ] - - # Add them to the master list - cleaned_data.append(nice_data) - - # The PDF header blacklist of rows to toss - pdf_header_blacklist = [ - "Notice Date", - "Total", - ] + historicallist = list(csv.DictReader(historical_str.splitlines())) + + # Combine fresh and historical + staginglist.extend(historicallist) - # Open the PDF - with pdfplumber.open(pdf_file) as pdf: - # Loop through all the pages - for i, my_page in enumerate(pdf.pages): - # Sll even pages have data, odd pages don't have the data - if i % 2 != 0: - continue - - # Pull out the table and loop through the rows - table = my_page.extract_table() - if not table: - continue - - # Cut empty rows - row_list = [r for r in table if any(r)] - if not row_list: - continue - - # If this is a summary table, skip it - first_cell = row_list[0][0] - assert first_cell - if first_cell.lower().strip() == "summary by month": - continue - - # Loop through all the rows ... 
-            for row in row_list:
-                # Skip remove redundant headers
-                if row[0] in pdf_header_blacklist:
-                    continue
-
-                # Toss in an empty Notice ID since it isn't in the PDF
-                row.append("")
-
-                # Add the data to our output
-                cleaned_data.append(row)
-
-    # Set the path to the final CSV
     output_csv = data_dir / "tn.csv"

-    # Write out the rows to the export directory
-    utils.write_rows_to_csv(output_csv, cleaned_data)
+    utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)

-    # Return the path to the final CSV
     return output_csv

From 639c252d5249dc8473cd0c5ae09fb927305a1242 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Mon, 8 Jul 2024 15:41:40 -0400
Subject: [PATCH 4/5] Patch CA PDF parsing flaw

---
 warn/scrapers/ca.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/warn/scrapers/ca.py b/warn/scrapers/ca.py
index 9142865..f36df39 100644
--- a/warn/scrapers/ca.py
+++ b/warn/scrapers/ca.py
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
             if "summary" in first_cell:
                 continue
             for row in rows:
-                data_row = {}
-                for i, value in enumerate(row):
-                    this_raw_header = raw_header[i]
-                    this_clean_header = header_crosswalk[this_raw_header]
-                    data_row[this_clean_header] = value
-                # Data clean-ups
-                data_row.update(
-                    {
-                        "effective_date": data_row["effective_date"].replace(" ", ""),
-                        "received_date": data_row["received_date"].replace(" ", ""),
-                        "source_file": str(pdf_path).split("/")[-1],
-                    }
-                )
-                data.append(data_row)
+                # Summary rows have an extra field, and the above code does not
+                # block the summary table from being parsed if it jumps onto another page.
+                if len(row) != len(raw_header) + 1:
+                    data_row = {}
+                    for i, value in enumerate(row):
+                        this_raw_header = raw_header[i]
+                        this_clean_header = header_crosswalk[this_raw_header]
+                        data_row[this_clean_header] = value
+                    # Data clean-ups
+                    data_row.update(
+                        {
+                            "effective_date": data_row["effective_date"].replace(
+                                " ", ""
+                            ),
+                            "received_date": data_row["received_date"].replace(" ", ""),
+                            "source_file": str(pdf_path).split("/")[-1],
+                        }
+                    )
+                    data.append(data_row)

     return data

From dd7bdc08601207e5e4cbc138a701e2ff9d2df11b Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Thu, 11 Jul 2024 20:46:31 -0400
Subject: [PATCH 5/5] Patch ID

---
 warn/scrapers/id.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 7210260..44953bf 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -12,7 +12,7 @@
 __tags__ = ["pdf"]
 __source__ = {
     "name": "Idaho Department of Labor",
-    "url": "https://www.labor.idaho.gov/dnn/Businesses/Layoff-Assistance#2",
+    "url": "https://www.labor.idaho.gov/warnnotice/",
 }

 logger = logging.getLogger(__name__)
@@ -32,7 +32,7 @@ def scrape(
         Returns: the Path where the file is written
     """
     # Create the URL of the source PDF
-    base_url = "https://www.labor.idaho.gov/dnn/Portals/0/Publications/"
+    base_url = "https://www.labor.idaho.gov/warnnotice/"
     file_name = "WARNNotice.pdf"
     # There's a numeric parameter called v on this PDF URL that updates
     # from time to time. Suspect this is a cache-buster. We're using a
@@ -40,10 +40,8 @@ def scrape(
     min_cache_buster = 0
     max_cache_buster = 10000000000
     cache_buster = random.randrange(min_cache_buster, max_cache_buster)
-    url = f"{base_url}{file_name}?v={cache_buster}"
+    url = f"{base_url}?v={cache_buster}"

-    # Download the PDF with verify=False because
-    # there's a persistent cert error we're working around.
     cache = Cache(cache_dir)
     state_code = "id"
     cache_key = f"{state_code}/{file_name}"
@@ -126,9 +124,9 @@ def filter_garbage_rows(incoming: list):
             badrows += 1
     if badrows == 0:
         logger.debug("No bad rows found.")
-        logger.debug(
-            f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
-        )
+    logger.debug(
+        f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
+    )
     return outgoing