From aad3a2b3d0ebf5d520dcf0700d9a780596f8c8b6 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Sun, 22 Dec 2024 18:54:55 -0500
Subject: [PATCH] Move VA to JS-aware browser; add UTF-8 compatibility

---
 warn/cache.py       |  2 +-
 warn/scrapers/va.py | 69 ++++++++++++++++++++++++++++++---------------
 warn/utils.py       |  2 +-
 3 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/warn/cache.py b/warn/cache.py
index 8992ba6c..092140b9 100644
--- a/warn/cache.py
+++ b/warn/cache.py
@@ -69,7 +69,7 @@ def read_csv(self, name):
         """
         path = Path(self.path, name)
         logger.debug(f"Reading CSV from cache {path}")
-        with open(path) as fh:
+        with open(path, encoding="utf-8") as fh:
             return list(csv.reader(fh))
 
     def download(
diff --git a/warn/scrapers/va.py b/warn/scrapers/va.py
index 7c68058c..22b48826 100644
--- a/warn/scrapers/va.py
+++ b/warn/scrapers/va.py
@@ -1,13 +1,19 @@
 import logging
+import os
+from glob import glob
 from pathlib import Path
+from shutil import copyfile
+from time import sleep
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
 
 from .. import utils
 from ..cache import Cache
 
-# from bs4 import BeautifulSoup, Tag
-
-
-__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
+__authors__ = ["zstumgoren", "Dilcia19", "shallotly", "stucka"]
 __tags__ = ["html", "csv"]
 __source__ = {
     "name": "Virginia Employment Commission",
@@ -35,32 +41,49 @@ def scrape(
     # This may break again, but this revised attempt has far fewer moving parts and actually fetches the complete data set.
     # Blame Stucka in December 2024.
+    # And it broke again in December 2024, but not even Stucka will blame Stucka for this mess.
 
-    # Get the WARN page
-    # url = "https://www.vec.virginia.gov/warn-notices"
-    # url = "https://vec.virginia.gov/warn-notices?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
-    # r = utils.get_url(url, verify=True)
-    # html = r.text
-
-    # Save it to the cache
     cache = Cache(cache_dir)
-    # cache.write("va/source.html", html)
+    # csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
+
+    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
+
+    # driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
+    logger.debug("Attempting to launch Chrome")
+    chromeoptionsholder = ChromeOptions()
+    chrome_install = ChromeDriverManager().install()
+    folder = os.path.dirname(chrome_install)
+    chromedriver_path = os.path.join(folder, "chromedriver.exe" if os.name == "nt" else "chromedriver")
+    service = ChromeService(chromedriver_path)
+    driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
+    logger.debug(f"Attempting to fetch {csv_url}")
+    driver.get(csv_url)
+
+    sleep(25)
+
+    logger.debug(driver.page_source)
+
+    # get the user download folder (dynamic so will work on any machine)
+    downLoadFolder = os.path.join(os.path.expanduser("~"), "Downloads")
+    # get the list of files
+    list_of_files = glob(downLoadFolder + "/*.csv")
+    # get the latest file name
+    latest_file = max(list_of_files, key=os.path.getctime)
+    # print the latest file name
+    logger.debug(f"CSV saved to {latest_file}")
+
+    target_filename = cache_dir / "va" / "source.csv"
+
+    utils.create_directory(path=cache_dir / "va", is_file=False)
 
-    # Parse out the CSV download link
-    # soup = BeautifulSoup(html, "html.parser")
-    # csv_link = soup.find("a", text="Download")
-    # if isinstance(csv_link, Tag):
-    #     csv_href = csv_link["href"]
-    # else:
-    #     raise ValueError("Could not find CSV link")
+    logger.debug(f"Saving file to {target_filename}")
 
-    # csv_href = "/warn-notices-csv.csv?"
-    # csv_url = f"https://www.vec.virginia.gov{csv_href}"
+    copyfile(latest_file, target_filename)
 
-    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
+    driver.quit()
 
     # Download it to the cache
-    cache.download("va/source.csv", csv_url, verify=True)
+    # cache.download("va/source.csv", csv_url, verify=True)
 
     # Open it up as a list of rows
     csv_rows = cache.read_csv("va/source.csv")
diff --git a/warn/utils.py b/warn/utils.py
index d9bfc713..bf7995cf 100644
--- a/warn/utils.py
+++ b/warn/utils.py
@@ -104,7 +104,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
     """
     create_directory(output_path, is_file=True)
    logger.debug(f"Writing {len(rows)} rows to {output_path}")
-    with open(output_path, mode, newline="") as f:
+    with open(output_path, mode, newline="", encoding="utf-8") as f:
         writer = csv.writer(f)
         writer.writerows(rows)