From aad3a2b3d0ebf5d520dcf0700d9a780596f8c8b6 Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Sun, 22 Dec 2024 18:54:55 -0500
Subject: [PATCH] Move VA to JS-aware browser; add UTF-8 compatibility

---
 warn/cache.py       |  2 +-
 warn/scrapers/va.py | 69 ++++++++++++++++++++++++++++++---------------
 warn/utils.py       |  2 +-
 3 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/warn/cache.py b/warn/cache.py
index 8992ba6c..092140b9 100644
--- a/warn/cache.py
+++ b/warn/cache.py
@@ -69,7 +69,7 @@ def read_csv(self, name):
         """
         path = Path(self.path, name)
         logger.debug(f"Reading CSV from cache {path}")
-        with open(path) as fh:
+        with open(path, encoding="utf-8") as fh:
             return list(csv.reader(fh))
 
     def download(
diff --git a/warn/scrapers/va.py b/warn/scrapers/va.py
index 7c68058c..22b48826 100644
--- a/warn/scrapers/va.py
+++ b/warn/scrapers/va.py
@@ -1,13 +1,19 @@
 import logging
+import os
+from glob import glob
 from pathlib import Path
+from shutil import copyfile
+from time import sleep
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as ChromeOptions
+from selenium.webdriver.chrome.service import Service as ChromeService
+from webdriver_manager.chrome import ChromeDriverManager
 
 from .. import utils
 from ..cache import Cache
 
-# from bs4 import BeautifulSoup, Tag
-
-
-__authors__ = ["zstumgoren", "Dilcia19", "shallotly"]
+__authors__ = ["zstumgoren", "Dilcia19", "shallotly", "stucka"]
 __tags__ = ["html", "csv"]
 __source__ = {
     "name": "Virginia Employment Commission",
@@ -35,32 +41,49 @@ def scrape(
     # This may break again, but this revised attempt has far fewer moving parts and actually fetches the complete data set.
     # Blame Stucka in December 2024.
+    # And it broke again in December 2024, but not even Stucka will blame Stucka for this mess.
 
-    # Get the WARN page
-    # url = "https://www.vec.virginia.gov/warn-notices"
-    # url = "https://vec.virginia.gov/warn-notices?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
-    # r = utils.get_url(url, verify=True)
-    # html = r.text
-
-    # Save it to the cache
     cache = Cache(cache_dir)
-    # cache.write("va/source.html", html)
+    # csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
+
+    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv"
+
+    # driver = webdriver.Chrome(options=chromeoptionsholder, service=Service(ChromeDriverManager().install()))
+    logger.debug("Attempting to launch Chrome")
+    chromeoptionsholder = ChromeOptions()
+    chrome_install = ChromeDriverManager().install()
+    folder = os.path.dirname(chrome_install)
+    chromedriver_path = os.path.join(folder, "chromedriver.exe" if os.name == "nt" else "chromedriver")
+    service = ChromeService(chromedriver_path)
+    driver = webdriver.Chrome(options=chromeoptionsholder, service=service)
+    logger.debug(f"Attempting to fetch {csv_url}")
+    driver.get(csv_url)
+
+    sleep(25)
+
+    logger.debug(driver.page_source)
+
+    # get the user download folder (dynamic so will work on any machine)
+    downLoadFolder = os.path.join(os.path.expanduser("~"), "Downloads")
+    # get the list of files
+    list_of_files = glob(downLoadFolder + "/*.csv")
+    # get the latest file name
+    latest_file = max(list_of_files, key=os.path.getctime)
+    # print the latest file name
+    logger.debug(f"CSV saved to {latest_file}")
+
+    target_filename = cache_dir / "va" / "source.csv"
+
+    utils.create_directory(path=cache_dir / "va", is_file=False)
 
-    # Parse out the CSV download link
-    # soup = BeautifulSoup(html, "html.parser")
-    # csv_link = soup.find("a", text="Download")
-    # if isinstance(csv_link, Tag):
-    #     csv_href = csv_link["href"]
-    # else:
-    #     raise ValueError("Could not find CSV link")
+    logger.debug(f"Saving file to {target_filename}")
 
-    # csv_href = "/warn-notices-csv.csv?"
-    # csv_url = f"https://www.vec.virginia.gov{csv_href}"
+    copyfile(latest_file, target_filename)
 
-    csv_url = "https://vec.virginia.gov/warn-notices-csv.csv?field_notice_date_value%5Bmin%5D%5Bdate%5D=1%2F1%2F1990&field_notice_date_value%5Bmax%5D%5Bdate%5D=&field_region_warn_tid=All"
+    driver.quit()
 
     # Download it to the cache
-    cache.download("va/source.csv", csv_url, verify=True)
+    # cache.download("va/source.csv", csv_url, verify=True)
 
     # Open it up as a list of rows
     csv_rows = cache.read_csv("va/source.csv")
diff --git a/warn/utils.py b/warn/utils.py
index d9bfc713..bf7995cf 100644
--- a/warn/utils.py
+++ b/warn/utils.py
@@ -104,7 +104,7 @@ def write_rows_to_csv(output_path: Path, rows: list, mode="w"):
     """
     create_directory(output_path, is_file=True)
    logger.debug(f"Writing {len(rows)} rows to {output_path}")
-    with open(output_path, mode, newline="") as f:
+    with open(output_path, mode, newline="", encoding="utf-8") as f:
         writer = csv.writer(f)
         writer.writerows(rows)