check_urls.py
# This script exported all published resources from ASpace as EAD.xml files, then scanned those files for any URLs
# they contained, checked each URL's HTTP status, and logged any error response (anything other than 200) in a
# spreadsheet. It was later adapted into the Check URLs custom report plugin for ArchivesSpace.
import csv
import os
import re
from pathlib import Path

import requests
from asnake.client import ASnakeClient
from lxml import etree

# secrets.py is a local file expected to hold the ArchivesSpace API URL and login credentials used by the ASnakeClient
from secrets import *

# Matches the resource identifier fields (id_0, id_1, ...) so their values can be combined into one collection number
id_field_regex = re.compile(r"(^id_+\d)")
# Strips all non-alphanumeric characters from the combined identifier so it can be used in a filename
id_combined_regex = re.compile(r'[\W_]+', re.UNICODE)
# Broad pattern for finding web addresses inside plain-text note content
web_url_regex = re.compile(r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))""")


def create_export_folder():
    try:
        current_directory = os.getcwd()
        for root, directories, files in os.walk(current_directory):
            if "source_eads" in directories:
                return str(Path(os.getcwd(), "source_eads"))
            else:
                raise Exception
    except Exception as source_ead_error:
        print(str(source_ead_error) + "\nNo source_eads folder found, creating new one...", end='', flush=True)
        current_directory = os.getcwd()
        folder = "source_eads"
        source_path = os.path.join(current_directory, folder)
        os.mkdir(source_path)
        print("{} folder created\n".format(folder))
        return str(Path(source_path))


def export_eads(source_path, client):
    repos = client.get("repositories").json()
    for repo in repos:
        print(repo["name"] + "\n")
        repo_id = repo["uri"].split("/")[2]
        resources = client.get("repositories/{}/resources".format(repo_id), params={"all_ids": True}).json()
        for resource_id in resources:
            resource = client.get("repositories/{}/resources/{}".format(repo_id, resource_id))
            combined_id = ""
            for field, value in resource.json().items():
                id_match = id_field_regex.match(field)
                if id_match:
                    combined_id += value + "-"
            combined_id = combined_id[:-1]
            combined_aspace_id_clean = id_combined_regex.sub('', combined_id)
            if resource.json()["publish"] is True:
                if resource.status_code == 200:
                    export_ead = client.get("repositories/{}/resource_descriptions/{}.xml".format(repo_id, resource_id),
                                            params={"include_unpublished": False, "include_daos": True,
                                                    "numbered_cs": True, "print_pdf": False, "ead3": False})
                    filepath = str(Path(source_path, combined_aspace_id_clean)) + ".xml"
                    with open(filepath, "wb") as local_file:
                        local_file.write(export_ead.content)
                    print("Exported: {}".format(combined_id))
                else:
                    print("\nThe following errors were found when exporting {}:\n{}: {}\n".format(combined_id, resource,
                                                                                                  resource.text))
        print("-" * 100)


def write_csv(mode, coll_num, note, err_code, url):
    with open("invalid_links.csv", mode=mode, newline='') as invalid_links:
        file_write = csv.writer(invalid_links, delimiter=",")
        file_write.writerow([coll_num, note, err_code, url])


def check_urls(source_path):
    for file in os.listdir(source_path):
        print(file)
        tree = etree.parse(source_path + "/" + file)
        root = tree.getroot()
        # Replace namespaced tags with their local names so elements can be matched by tag alone
        for element in root.getiterator():
            element.tag = etree.QName(element).localname
        for element in root.findall(".//*"):
            if element.tag == "extref":
                attributes = dict(element.attrib)
                print(element.getparent().getparent().tag)
                for key, value in attributes.items():
                    if key == "{http://www.w3.org/1999/xlink}href":
                        if "www.anb.org" in value or "www.oxforddnb.com" in value or "doi.org" in value or \
                                "http://www.ledger-enquirer.com/2012/07/05/2110319_judge-aaron-cohn-dies-at-95.html?rh=1" \
                                == value:
                            print("Couldn't parse href for: ", file)
                            write_csv("a", str(file), str(element.getparent().getparent().tag),
                                      "Couldn't parse href", str(value))
                        else:
                            try:
                                print(value)
                                test_request = requests.get(value)
                                if test_request.status_code != 200:
                                    print(test_request.status_code)
                                    write_csv("a", str(file), str(element.getparent().getparent().tag),
                                              str(test_request.status_code), str(value))
                                elif "russelldoc" in value or "hmfa" in value:
                                    write_csv("a", str(file), str(element.getparent().getparent().tag),
                                              "Old website URL - consider replacing", str(value))
                            except Exception as e:
                                print(file, e)
                                write_csv("a", str(file), str(element.getparent().getparent().tag), str(e), str(value))
            if element.tag == "dao":
                attributes = dict(element.attrib)
                for key, value in attributes.items():
                    if key == "{http://www.w3.org/1999/xlink}href":
                        print("Digital Object: ", attributes["{http://www.w3.org/1999/xlink}title"])
                        print(value)
                        try:
                            test_request = requests.get(value)
                            if test_request.status_code != 200:
                                print(test_request.status_code)
                                write_csv("a", str(file),
                                          "Digital Object: {}".format(attributes["{http://www.w3.org/1999/xlink}title"]),
                                          str(test_request.status_code), str(value))
                        except Exception as e:
                            print(file, e)
                            write_csv("a", str(file),
                                      "Digital Object: {}".format(attributes["{http://www.w3.org/1999/xlink}title"]),
                                      str(e), str(value))
            else:
                element_words = str(element.text).split(" ")
                filtered_words = list(filter(None, element_words))
                for word in filtered_words:
                    clean_word = word.strip(",.;:`~()<>")
                    match = web_url_regex.match(clean_word)
                    if match:
                        print(element.getparent().tag)
                        print(clean_word)
                        if "www.anb.org" in clean_word or "www.oxforddnb.com" in clean_word or "doi.org" in clean_word \
                                or \
                                "http://www.ledger-enquirer.com/2012/07/05/2110319_judge-aaron-cohn-dies-at-95.html?rh=1" \
                                == clean_word:
                            print("Couldn't parse href for: ", file)
                            write_csv("a", str(file), str(element.getparent().getparent().tag),
                                      "Couldn't parse href", str(clean_word))
                        else:
                            try:
                                test_request = requests.get(clean_word)
                                if test_request.status_code != 200:
                                    print(test_request.status_code)
                                    write_csv("a", str(file), element.getparent().tag,
                                              str(test_request.status_code), str(clean_word))
                            except Exception as e:
                                print(file, e)
                                write_csv("a", str(file), element.getparent().tag, str(e), str(clean_word))
        print("-" * 200)