diff --git a/warn/scrapers/id.py b/warn/scrapers/id.py
index 44953bf..1f04735 100644
--- a/warn/scrapers/id.py
+++ b/warn/scrapers/id.py
@@ -4,6 +4,8 @@ from pathlib import Path
 
 import pdfplumber
+import requests
+from bs4 import BeautifulSoup
 
 from .. import utils
 from ..cache import Cache
 
@@ -12,7 +14,7 @@ __tags__ = ["pdf"]
 __source__ = {
     "name": "Idaho Department of Labor",
-    "url": "https://www.labor.idaho.gov/warnnotice/",
+    "url": "https://www.labor.idaho.gov/businesss/layoff-assistance/",
 }
 
 logger = logging.getLogger(__name__)
 
@@ -32,7 +34,8 @@ def scrape(
     Returns: the Path where the file is written
     """
     # Create the URL of the source PDF
-    base_url = "https://www.labor.idaho.gov/warnnotice/"
+    base_url = "https://www.labor.idaho.gov"
+    start_url = "https://www.labor.idaho.gov/businesss/layoff-assistance/"
     file_name = "WARNNotice.pdf"
     # There's a numeric parameter called v on this PDF URL that updates
     # from time to time. Suspect this is a cache-buster. We're using a
@@ -40,12 +43,23 @@ def scrape(
     min_cache_buster = 0
     max_cache_buster = 10000000000
     cache_buster = random.randrange(min_cache_buster, max_cache_buster)
-    url = f"{base_url}?v={cache_buster}"
+    page_url = f"{start_url}?v={cache_buster}"
 
     cache = Cache(cache_dir)
     state_code = "id"
 
+    logger.debug(f"Trying to fetch page at {page_url}")
+    r = requests.get(page_url)
+
+    # Start finding the link before "Who to contact"
+    html = r.text
+    localizedhtml = html.split("