Merge pull request #656 from biglocalnews/id-655
Patch ID #655
stucka authored Jul 15, 2024
2 parents f95aa02 + 92f6e48 commit 618eaba
Showing 1 changed file with 19 additions and 4 deletions.
warn/scrapers/id.py: 23 changes (19 additions & 4 deletions)
@@ -4,6 +4,8 @@
from pathlib import Path

import pdfplumber
+ import requests
+ from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
@@ -12,7 +14,7 @@
__tags__ = ["pdf"]
__source__ = {
"name": "Idaho Department of Labor",
"url": "https://www.labor.idaho.gov/warnnotice/",
"url": "https://www.labor.idaho.gov/businesss/layoff-assistance/",
}

logger = logging.getLogger(__name__)
@@ -32,20 +34,32 @@ def scrape(
Returns: the Path where the file is written
"""
# Create the URL of the source PDF
base_url = "https://www.labor.idaho.gov/warnnotice/"
base_url = "https://www.labor.idaho.gov"
start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"
file_name = "WARNNotice.pdf"
# There's a numeric parameter called v on this PDF URL that updates
# from time to time. Suspect this is a cache-buster. We're using a
# random number instead.
min_cache_buster = 0
max_cache_buster = 10000000000
cache_buster = random.randrange(min_cache_buster, max_cache_buster)
url = f"{base_url}?v={cache_buster}"
page_url = f"{start_url}?v={cache_buster}"

cache = Cache(cache_dir)
state_code = "id"
logger.debug(f"Trying to fetch page at {page_url}")
r = requests.get(page_url)

# Start finding the link before "Who to contact"
html = r.text
localizedhtml = html.split("<h2>Who to contact")[0]
soup = BeautifulSoup(localizedhtml, features="lxml")
last_url = soup.find_all("a")[-1]["href"]
pdf_url = f"{base_url}{last_url}"

logger.debug(f"Trying to fetch PDF at {pdf_url}")
cache_key = f"{state_code}/{file_name}"
- pdf_file = cache.download(cache_key, url, verify=True)
+ pdf_file = cache.download(cache_key, pdf_url, verify=True)

# Loop through the PDF pages and scrape out the data
output_rows: list = []
@@ -124,6 +138,7 @@ def filter_garbage_rows(incoming: list):
badrows += 1
if badrows == 0:
logger.debug("No bad rows found.")
+ else:
logger.debug(
f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
)
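For anyone who wants to try the new link-discovery step outside the scraper, here is a minimal standalone sketch of the same logic. Variable names other than the URLs are illustrative; it assumes the requests, beautifulsoup4, and lxml packages are installed and that the page still places the WARN notice PDF link above the "Who to contact" heading, with a site-relative href. The cache-buster and the project's Cache plumbing are omitted.

# Minimal sketch of the link-discovery step added in this patch; not the
# full scraper. Assumes the PDF link is the last anchor above the
# "Who to contact" heading on the layoff-assistance page.
import requests
from bs4 import BeautifulSoup

base_url = "https://www.labor.idaho.gov"
start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"

response = requests.get(start_url)
response.raise_for_status()

# Keep only the markup that appears before the "Who to contact" heading,
# then take the href of the last anchor tag in that fragment.
fragment = response.text.split("<h2>Who to contact")[0]
soup = BeautifulSoup(fragment, features="lxml")
last_href = soup.find_all("a")[-1]["href"]

# The diff treats the href as site-relative, so join it to the base URL.
pdf_url = f"{base_url}{last_href}"
print(pdf_url)

Splitting the raw HTML at the heading before parsing keeps the search scoped to the top of the page, so the last anchor in that fragment is the notice PDF rather than a footer or navigation link further down.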
