Merge pull request #656 from biglocalnews/id-655
Patch ID #655
stucka authored Jul 15, 2024
2 parents f95aa02 + 92f6e48 commit 618eaba
Showing 1 changed file with 19 additions and 4 deletions.
warn/scrapers/id.py: 23 changes (19 additions & 4 deletions)
@@ -4,6 +4,8 @@
from pathlib import Path

import pdfplumber
+ import requests
+ from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
@@ -12,7 +14,7 @@
__tags__ = ["pdf"]
__source__ = {
"name": "Idaho Department of Labor",
"url": "https://www.labor.idaho.gov/warnnotice/",
"url": "https://www.labor.idaho.gov/businesss/layoff-assistance/",
}

logger = logging.getLogger(__name__)
@@ -32,20 +34,32 @@ def scrape(
Returns: the Path where the file is written
"""
# Create the URL of the source PDF
base_url = "https://www.labor.idaho.gov/warnnotice/"
base_url = "https://www.labor.idaho.gov"
start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"
file_name = "WARNNotice.pdf"
# There's a numeric parameter called v on this PDF URL that updates
# from time to time. Suspect this is a cache-buster. We're using a
# random number instead.
min_cache_buster = 0
max_cache_buster = 10000000000
cache_buster = random.randrange(min_cache_buster, max_cache_buster)
url = f"{base_url}?v={cache_buster}"
page_url = f"{start_url}?v={cache_buster}"

cache = Cache(cache_dir)
state_code = "id"
logger.debug(f"Trying to fetch page at {page_url}")
r = requests.get(page_url)

# Start finding the link before "Who to contact"
html = r.text
localizedhtml = html.split("<h2>Who to contact")[0]
soup = BeautifulSoup(localizedhtml, features="lxml")
last_url = soup.find_all("a")[-1]["href"]
pdf_url = f"{base_url}{last_url}"

logger.debug(f"Trying to fetch PDF at {pdf_url}")
cache_key = f"{state_code}/{file_name}"
- pdf_file = cache.download(cache_key, url, verify=True)
+ pdf_file = cache.download(cache_key, pdf_url, verify=True)

# Loop through the PDF pages and scrape out the data
output_rows: list = []
@@ -124,6 +138,7 @@ def filter_garbage_rows(incoming: list):
badrows += 1
if badrows == 0:
logger.debug("No bad rows found.")
+ else:
logger.debug(
f"!!!!! {badrows:,} bad rows dropped from the data set with insufficient number of fields."
)
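For anyone who wants to try the new link-discovery step outside the scraper, here is a minimal standalone sketch of the same logic. Variable names other than the URLs are illustrative; it assumes the requests, beautifulsoup4, and lxml packages are installed and that the page still places the WARN notice PDF link above the "Who to contact" heading, with a site-relative href. The cache-buster and the project's Cache plumbing are omitted.

# Minimal sketch of the link-discovery step added in this patch; not the
# full scraper. Assumes the PDF link is the last anchor above the
# "Who to contact" heading on the layoff-assistance page.
import requests
from bs4 import BeautifulSoup

base_url = "https://www.labor.idaho.gov"
start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"

response = requests.get(start_url)
response.raise_for_status()

# Keep only the markup that appears before the "Who to contact" heading,
# then take the href of the last anchor tag in that fragment.
fragment = response.text.split("<h2>Who to contact")[0]
soup = BeautifulSoup(fragment, features="lxml")
last_href = soup.find_all("a")[-1]["href"]

# The diff treats the href as site-relative, so join it to the base URL.
pdf_url = f"{base_url}{last_href}"
print(pdf_url)

Splitting the raw HTML at the heading before parsing keeps the search scoped to the top of the page, so the last anchor in that fragment is the notice PDF rather than a footer or navigation link further down.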
