Merge branch 'id-655' of github.com:biglocalnews/warn-scraper into id-655
stucka committed Jul 15, 2024
2 parents 896ee37 + 92f6e48 commit cf344fc
Showing 6 changed files with 63 additions and 121 deletions.
33 changes: 19 additions & 14 deletions warn/scrapers/ca.py
@@ -224,20 +224,25 @@ def _extract_pdf_data(pdf_path):
if "summary" in first_cell:
continue
for row in rows:
data_row = {}
for i, value in enumerate(row):
this_raw_header = raw_header[i]
this_clean_header = header_crosswalk[this_raw_header]
data_row[this_clean_header] = value
# Data clean-ups
data_row.update(
{
"effective_date": data_row["effective_date"].replace(" ", ""),
"received_date": data_row["received_date"].replace(" ", ""),
"source_file": str(pdf_path).split("/")[-1],
}
)
data.append(data_row)
# Summary rows have an extra field, and the above code does not
# block the summary table from being parsed if it jumps onto another page.
if len(row) != len(raw_header) + 1:
data_row = {}
for i, value in enumerate(row):
this_raw_header = raw_header[i]
this_clean_header = header_crosswalk[this_raw_header]
data_row[this_clean_header] = value
# Data clean-ups
data_row.update(
{
"effective_date": data_row["effective_date"].replace(
" ", ""
),
"received_date": data_row["received_date"].replace(" ", ""),
"source_file": str(pdf_path).split("/")[-1],
}
)
data.append(data_row)
return data
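The guard added above works because the PDF's summary table rows carry one extra column compared with the data rows, so any row whose length is len(raw_header) + 1 can be discarded even when the summary table spills onto a later page. A minimal sketch of that length check, using hypothetical header and row values rather than the real PDF layout:

# Sketch of the summary-row guard; the header and rows are hypothetical stand-ins.
raw_header = ["Company", "County", "Received Date", "Effective Date", "Employees"]
rows = [
    ["Acme Corp", "Alameda", "06/01/2024", "08/01/2024", "120"],
    ["June 2024", "Totals", "06/2024", "08/2024", "1234", "extra"],  # summary row: one extra field
]

data = []
for row in rows:
    # Summary rows have len(raw_header) + 1 fields, so they are skipped
    # even if the summary table jumps onto another page.
    if len(row) != len(raw_header) + 1:
        data.append(dict(zip(raw_header, row)))

print(data)  # only the ordinary data row remains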


1 change: 0 additions & 1 deletion warn/scrapers/de.py
@@ -44,7 +44,6 @@ def scrape(
stop_year,
cache_dir,
use_cache=use_cache,
verify=False,
)

# Return the resulting CSV file path
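Removing verify=False here lets the underlying HTTP call fall back to the requests default, which validates the server's TLS certificate. A minimal illustration of that default with plain requests (the scraper itself goes through the project's own helper, whose keyword is assumed to be passed straight through):

import requests

# With no verify argument, requests checks the TLS certificate by default
# (equivalent to verify=True); verify=False disables the check and emits
# an InsecureRequestWarning instead.
response = requests.get("https://example.com")
print(response.status_code)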
6 changes: 3 additions & 3 deletions warn/scrapers/fl.py
@@ -17,7 +17,7 @@
__tags__ = ["html", "pdf"]
__source__ = {
"name": "Florida Department of Economic Opportunity",
"url": "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
"url": "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices",
}

logger = logging.getLogger(__name__)
@@ -53,7 +53,7 @@ def scrape(
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
}
url = "http://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
url = "https://floridajobs.org/office-directory/division-of-workforce-services/workforce-programs/reemployment-and-emergency-assistance-coordination-team-react/warn-notices"
response = requests.get(url, headers=headers, verify=False)
logger.debug(f"Request status is {response.status_code} for {url}")
soup = BeautifulSoup(response.text, "html.parser")
@@ -138,7 +138,7 @@ def _scrape_html(cache, url, headers, page=1):
) # find link to next page, if exists
# recursively scrape until we have a list of all the pages' html
if nextPageLink:
url = "http://reactwarn.floridajobs.org" + nextPageLink.get(
url = "https://reactwarn.floridajobs.org" + nextPageLink.get(
"href"
) # /WarnList/Records?year=XXXX&page=X
# recursively make list of all the next pages' html
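The block above pages through the notice list by following the "next page" link and recursing until none is found, now over HTTPS. A rough sketch of that pattern with hypothetical markup and a simplified selector (the real scraper also caches each page and sends custom headers):

import requests
from bs4 import BeautifulSoup

def collect_pages(url, collected=None):
    """Recursively gather the HTML of each page in a paginated listing."""
    collected = collected or []
    html = requests.get(url).text
    collected.append(html)
    soup = BeautifulSoup(html, "html.parser")
    next_link = soup.find("a", string="Next")  # hypothetical link text
    if next_link:
        next_url = "https://reactwarn.floridajobs.org" + next_link.get("href")
        return collect_pages(next_url, collected)
    return collected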
2 changes: 1 addition & 1 deletion warn/scrapers/md.py
@@ -11,7 +11,7 @@
__tags__ = ["html"]
__source__ = {
"name": "Maryland Department of Labor",
"url": "http://www.dllr.state.md.us/employment/warn.shtml",
"url": "https://www.dllr.state.md.us/employment/warn.shtml",
}

logger = logging.getLogger(__name__)
8 changes: 4 additions & 4 deletions warn/scrapers/mt.py
@@ -28,8 +28,8 @@ def scrape(
Returns: the Path where the file is written
"""
# Get the URL
url = "http://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
r = utils.get_url(url, verify=False)
url = "https://wsd.dli.mt.gov/wioa/related-links/warn-notice-page"
r = utils.get_url(url, verify=True)
html = r.text

# Save it to the cache
@@ -50,10 +50,10 @@
"xlsx"
) # URL will look like: ="../../_docs/wioa/warn-9-1-21.xlsx"
][0].split("/")[-1]
excel_url = f"http://wsd.dli.mt.gov/_docs/wioa/{excel_name}"
excel_url = f"https://wsd.dli.mt.gov/_docs/wioa/{excel_name}"

# Download the Excel file
excel_path = cache.download("mt/source.xlsx", excel_url, verify=False)
excel_path = cache.download("mt/source.xlsx", excel_url, verify=True)

# Open it up
workbook = load_workbook(filename=excel_path)
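With the Montana site now served over HTTPS with a valid certificate, both the page fetch and the Excel download can leave certificate verification on. A small sketch of the same download-and-open flow using plain requests and openpyxl (cache.download is the project's helper; its verify keyword is assumed to pass through to requests, and the filename below is the example from the comment above):

import requests
from openpyxl import load_workbook

excel_url = "https://wsd.dli.mt.gov/_docs/wioa/warn-9-1-21.xlsx"
response = requests.get(excel_url)  # certificate verification is on by default
with open("mt_source.xlsx", "wb") as fh:
    fh.write(response.content)

workbook = load_workbook(filename="mt_source.xlsx")
worksheet = workbook.worksheets[0]
for row in worksheet.iter_rows(values_only=True):
    print(row)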
134 changes: 36 additions & 98 deletions warn/scrapers/tn.py
@@ -1,14 +1,14 @@
import csv
import typing
from pathlib import Path

import pdfplumber
from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache

__authors__ = ["anikasikka"]
__tags__ = ["html", "pdf"]
__authors__ = ["anikasikka", "stucka"]
__tags__ = ["html"]
__source__ = {
"name": "Tennessee Department of Labor and Workforce Development",
"url": "https://www.tn.gov/workforce/general-resources/major-publications0/major-publications-redirect/reports.html",
@@ -37,13 +37,11 @@ def scrape(
)
html = page.text
cache.write("tn/source.html", html)
soup = BeautifulSoup(html, "html5lib")
tables = soup.find_all(attrs={"class": "tn-datatable"})
rows = BeautifulSoup(str(tables), "html5lib").find_all("tr")

# Grab the PDF with the archived historial data
pdf_url = "https://www.tn.gov/content/dam/tn/workforce/documents/majorpublications/reports/WarnReportByMonth.pdf"
pdf_file = cache.download("tn/pdffile.pdf", pdf_url)

# Set the headers we'll use for both sources
tn_headers = [
dataheaders: typing.List = [
"Notice Date",
"Effective Date",
"Received Date",
@@ -53,102 +53,42 @@
"No. Of Employees",
"Layoff/Closure",
"Notice ID",
# "Notice URL",
]
cleaned_data: typing.List[typing.Any] = [tn_headers]

# Parse the latest HTML file and convert to a list of rows, with a header in the first row.
soup = BeautifulSoup(html, "html5lib")
staginglist: typing.List = []
for row in reversed(rows):
cells = row.find_all("td")
if len(cells) == 6: # Filter for potentially valid rows
line: typing.Dict = {}
for item in dataheaders: # Build an ordered dictionary with null values
line[item] = None
line["Notice Date"] = cells[0].text.strip()
line["Effective Date"] = cells[4].text.strip()
line["Company"] = cells[1].text.strip()
line["County"] = cells[2].text.strip()
line["No. Of Employees"] = cells[3].text.strip()
line["Notice ID"] = cells[5].text.strip()
# line['Notice URL'] = cells[1].find("a")['href']
staginglist.append(line)

# Bring in historical data
historical_file = cache_dir / "tn/tn_historical.csv"
historical_url = (
"https://storage.googleapis.com/bln-data-public/warn-layoffs/tn_historical.csv"
)
utils.fetch_if_not_cached(historical_file, historical_url)
historical_str = cache.read("tn/tn_historical.csv")

# Grab all the list items on the page
data_list = soup.find_all("p")

# Loop through them all, skipping the first item, which is a header
for data in data_list[1:]:
# splitting the data on its delimiter
items = str(data).split("|")

# making sure that the last item in the list is the data value of interest
# splitting based on last character of each text-html data sequence
raw_data = []
for item in items:
value_html = item.split(":")[-1]
value_soup = BeautifulSoup(value_html, "html5lib")
string_list = list(value_soup.stripped_strings)
if len(string_list) > 0:
value = string_list[-1]
else:
continue
raw_data.append(value)

# If there aren't six entries it's junk
if len(raw_data) != 6:
continue

# Pluck out the values we want
nice_data = [
raw_data[0], # Notice Date
raw_data[4], # Effective Date
"", # Received Date
raw_data[1], # Company
"", # City
raw_data[2], # County
raw_data[3], # Number of employees
"", # Layoff/Closure
raw_data[5], # Notice ID
]

# Add them to the master list
cleaned_data.append(nice_data)

# The PDF header blacklist of rows to toss
pdf_header_blacklist = [
"Notice Date",
"Total",
]
historicallist = list(csv.DictReader(historical_str.splitlines()))

# Combine fresh and historical
staginglist.extend(historicallist)

# Open the PDF
with pdfplumber.open(pdf_file) as pdf:
# Loop through all the pages
for i, my_page in enumerate(pdf.pages):
# Sll even pages have data, odd pages don't have the data
if i % 2 != 0:
continue

# Pull out the table and loop through the rows
table = my_page.extract_table()
if not table:
continue

# Cut empty rows
row_list = [r for r in table if any(r)]
if not row_list:
continue

# If this is a summary table, skip it
first_cell = row_list[0][0]
assert first_cell
if first_cell.lower().strip() == "summary by month":
continue

# Loop through all the rows ...
for row in row_list:
# Skip remove redundant headers
if row[0] in pdf_header_blacklist:
continue

# Toss in an empty Notice ID since it isn't in the PDF
row.append("")

# Add the data to our output
cleaned_data.append(row)

# Set the path to the final CSV
output_csv = data_dir / "tn.csv"

# Write out the rows to the export directory
utils.write_rows_to_csv(output_csv, cleaned_data)
utils.write_dict_rows_to_csv(output_csv, dataheaders, staginglist)

# Return the path to the final CSV
return output_csv
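The rewrite drops the pdfplumber pass over the archived PDF entirely: current notices are read from the tn-datatable rows in the live HTML, the older history comes from a hosted CSV, and the two lists of dictionaries are concatenated and written out under shared headers. A condensed sketch of that merge step, with a shortened header list and hypothetical row values (csv.DictWriter stands in for the project's utils.write_dict_rows_to_csv helper):

import csv

dataheaders = ["Notice Date", "Company", "County", "No. Of Employees", "Notice ID"]

# Rows scraped from the live HTML table (hypothetical values).
staginglist = [
    {"Notice Date": "7/1/2024", "Company": "Acme Corp", "County": "Davidson",
     "No. Of Employees": "120", "Notice ID": "2024-001"},
]

# Historical rows arrive as CSV text; DictReader yields one dictionary per
# line, keyed by the header row, so both sources share the same shape.
historical_str = (
    "Notice Date,Company,County,No. Of Employees,Notice ID\n"
    "1/5/2015,Old Mill,Knox,75,2015-003\n"
)
historicallist = list(csv.DictReader(historical_str.splitlines()))

# Combine fresh and historical, then write the dictionaries to CSV.
staginglist.extend(historicallist)
with open("tn.csv", "w", newline="") as fh:
    writer = csv.DictWriter(fh, fieldnames=dataheaders)
    writer.writeheader()
    writer.writerows(staginglist)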


