Added download script and modified pvmap for USA_DOL_Wages import #1197

Open · wants to merge 7 commits into base: master
20 changes: 0 additions & 20 deletions statvar_imports/USA/DOL_Wages/README.md

This file was deleted.

59 changes: 0 additions & 59 deletions statvar_imports/USA/DOL_Wages/USA_State_Places_Resolved.csv

This file was deleted.

40 changes: 0 additions & 40 deletions statvar_imports/USA/DOL_Wages/pv_map/US_Dol_Wages_pvmap.csv

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

25 changes: 25 additions & 0 deletions statvar_imports/usa_dol/minimum_wage/README.md
@@ -0,0 +1,25 @@
# USA_DOL_Wages Import

- source: https://www.dol.gov/agencies/whd/state/minimum-wage/history

- how to download data: run the download script (`download_script/main.py`) from within the `download_script` directory. The script creates two folders, `input_files` and `source_files`, relative to the working directory: the file to be processed is written to `input_files`, and the raw source files are saved in `source_files`.

- type of place: Country and State.

- statvars: Economy

- years: 1968 to 2024

- place_resolution: Places are resolved to geoId in the pvmap itself (see the illustrative snippet after this list).

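For illustration, a pvmap entry that resolves a jurisdiction column header to its geoId might look like the following. This is a schematic sketch, not the exact syntax of `us_dol_wages_pvmap.csv`; consult that file for the real column layout:

```
# Hypothetical sketch of place resolution inside the pvmap:
# each jurisdiction header from the source table maps to a geoId.
Alabama,observationAbout=dcid:geoId/01
Alaska,observationAbout=dcid:geoId/02
Federal (FLSA),observationAbout=dcid:country/USA
```
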
### How to run:

`python3 stat_var_processor.py --input_data=<input_file>.csv --pv_map=statvar_imports/usa_dol/minimum_wage/<filename>_pvmap.csv --config=statvar_imports/usa_dol/minimum_wage/us_dol_wages_metadata.csv --output_path=<filepath/filename>`

#### Example
#### Download:
`python3 main.py`

#### Processing
`python3 stat_var_processor.py --input_data=statvar_imports/usa_dol/minimum_wage/download_script/input_files/final_data.csv --pv_map=statvar_imports/usa_dol/minimum_wage/us_dol_wages_pvmap.csv --config=statvar_imports/usa_dol/minimum_wage/us_dol_wages_metadata.csv --output_path=statvar_imports/usa_dol/minimum_wage/test_data/sample_output/us_dol_wages`
73 changes: 73 additions & 0 deletions statvar_imports/usa_dol/minimum_wage/download_script/main.py
@@ -0,0 +1,73 @@
import os
import csv

import requests
import pandas as pd
from absl import logging
from bs4 import BeautifulSoup
from retry import retry

url = "https://www.dol.gov/agencies/whd/state/minimum-wage/history"
source_folder = "source_files"
input_folder = "input_files"
os.makedirs(source_folder, exist_ok=True)
os.makedirs(input_folder, exist_ok=True)
file_path = os.path.join(source_folder, "raw_data.html")
file_path2 = os.path.join(source_folder, "raw_data.csv")


@retry(tries=5, delay=3, backoff=5)
def download_with_retry(url):
    """Fetches the page, retrying up to 5 times with backoff."""
    logging.info(f"Trying to access url: {url}")
    return requests.get(url, timeout=60)


def extract_all_table_data(url):
    """Downloads the page, saves the raw HTML, and extracts every table.

    Returns a list of tables, each a list of rows (lists of cell strings),
    or None if the page contains no tables.
    """
    try:
        response = download_with_retry(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Keep the raw HTML in source_files for debugging and reproducibility.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(str(soup))

        all_tables = soup.find_all('table')

        if not all_tables:
            logging.warning(f"No tables found on the page {url}")
            return None

        table_data = []
        for table in all_tables:
            rows = table.find_all('tr')
            table_rows = []
            for row in rows:
                cols = row.find_all(['td', 'th'])
                row_data = [col.text.strip() for col in cols]
                table_rows.append(row_data)
            table_data.append(table_rows)

        # Write all extracted tables to one raw CSV, separated by blank rows.
        with open(file_path2, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            for table in table_data:
                writer.writerows(table)
                writer.writerow([])
        logging.info("Downloaded files")
        return table_data
    except requests.exceptions.RequestException as e:
        # logging.fatal aborts the process on an unrecoverable fetch error.
        logging.fatal(f"Error fetching URL: {e}")


def main():
    logging.info("Process starts")
    all_tables_data = extract_all_table_data(url)

    if all_tables_data:
        # Merge every table on the jurisdiction column. The merge key matches
        # the scraped header text exactly ("State or otherjurisdiction",
        # with no space, as it appears in the source HTML).
        df = None
        for index, table in enumerate(all_tables_data):
            if index == 0:
                df = pd.DataFrame(table[1:], columns=table[0])
            else:
                dfm = pd.DataFrame(table[1:], columns=table[0])
                df = pd.merge(df, dfm, on=['State or otherjurisdiction'])

        # Transpose so years become rows and jurisdictions become columns,
        # then write the file that the processing step consumes.
        transposed_df = df.transpose()
        transposed_df.to_csv(os.path.join(input_folder, 'final_data.csv'),
                             header=False)


if __name__ == "__main__":
    logging.set_verbosity(1)
    main()
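For reference, a minimal sketch (with made-up values) of the reshaping that `main()` performs: each scraped table contributes year columns keyed on the jurisdiction column, and the final transpose turns years into rows and jurisdictions into columns, which is the layout of the sample `final_data.csv` shown below.

```python
import pandas as pd

# Toy stand-ins for two scraped tables; the values are illustrative only.
t1 = pd.DataFrame({'State or otherjurisdiction': ['Alabama', 'Alaska'],
                   '2020': ['...', '10.19']})
t2 = pd.DataFrame({'State or otherjurisdiction': ['Alabama', 'Alaska'],
                   '2021': ['...', '10.34']})

# Same merge key the download script uses.
df = pd.merge(t1, t2, on=['State or otherjurisdiction'])

# Transpose: years become rows, jurisdictions become columns.
print(df.transpose())
#                                   0       1
# State or otherjurisdiction  Alabama  Alaska
# 2020                            ...   10.19
# 2021                            ...   10.34
```
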
@@ -0,0 +1,6 @@
State or otherjurisdiction,Federal (FLSA),Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Florida,Georgia,Hawaii,Idaho,Illinois,Indiana,Iowa,Kansas,Kentucky,Louisiana,Maine,Maryland,Massachusetts,Michigan,Minnesota,Mississippi,Missouri,Montana,Nebraska,Nevada,New Hampshire,New Jersey,New Mexico,New York,North Carolina,North Dakota,Ohio,Oklahoma,Oregon,Pennsylvania,Rhode Island,South Carolina,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,District of Columbia,Guam,Puerto Rico,U.S. Virgin Islands
2020,7.25,...,10.19,12,10.00[c],12,12.02,12,9.25,8.56,5.15(d),10.1,7.25,10.00[c],7.25(e),7.25,7.25,7.25,...,12,11,12.75,9.65(e),10.00(g),...,9.45,8.65(g),9.00[c],8.00-9.00,7.25,11,9,11.8,7.25,7.25,8.7,2.00-7.25(g),12,7.25,11.5,...,9.3,...,7.25,7.25,10.96(e),7.25[c],13.5,8.75,7.25,5.15,15,8.25,5.08-7.25(i),10.50(j)
2021,7.25,...,10.34,12.15,11,13,12.32,13,9.25,10,5.15(d),10.1,7.25,11.00[c],7.25(e),7.25,7.25,7.25,...,12.15,11.75,13.5,9.65(e),10.08(g),...,10.3,8.75(g),9.00[c],8.75-9.75,7.25,12,10.5,12.5,7.25,7.25,8.8,2.00-7.25(g),12.75,7.25,11.5,...,9.45,...,7.25,7.25,11.75,9.5,13.69,8.75,7.25,5.15,15.2,8.75,5.08-7.25(i),10.50(j)
2022,7.25,...,10.34,12.8,11,14,12.56,14,10.5,10,5.15(d),12,7.25,12.00[c],7.25(e),7.25,7.25,7.25,...,12.75,12.5,14.25,9.87(e),10.33(g),...,11.15,9.20(g),9.00[c],9.50/10.50,7.25,13,11.5,13.2,7.25,7.25,9.3,2.00-7.25(g),13.5,7.25,12.25,...,9.95,...,7.25,7.25,12.55,11,14.49,8.75,7.25,5.15,$16.10,9.25,7.25(i),10.50(j)
2023,7.25,...,10.85,13.85,11,15.5,13.65,15,11.75,12,5.15(d),12,7.25,13.00[c],7.25(e),7.25,7.25,7.25,...,13.8,13.25,15,10.10(e),10.59(g),...,12,9.95(g),10.50[c],10.25/11.25,7.25,14.13,12,14.2,7.25,7.25,10.1,2.00-7.25(g),14.2,7.25,13,...,10.8,...,7.25,7.25,13.18,12,15.74,8.75,7.25,5.15,17,9.25,9.5,10.50(j)
2024,7.25,...,11.73,14.35,11,16,14.42,15.69,13.25,12,5.15(d),14,7.25,14.00[c],7.25(e),7.25,7.25,7.25,...,14.15,15,15,10.33(e),10.85(g),...,12.3,10.30(g),12.00[c],12,7.25,15.13,12,15,7.25,7.25,10.45,2.00-7.25(g),14.7,7.25,14,...,11.2,...,7.25,7.25,13.67,12,16.28,8.75,7.25,5.15,$17.50,9.25,9.5,10.50(j)