-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgazette.py
249 lines (207 loc) · 7.82 KB
/
gazette.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
"""
Extracting named entities from the Trove Govt. Gazettes
"""
import re
from typing import List, Any
import spacy
from spacy.matcher import Matcher
import pandas as pd
import datetime
def tag_name_address(doc: "spacy.tokens.Doc", start: int, end: int) -> list:
    """Split a ``LAST, FIRST, ADDRESS`` token run into three tagged spans.

    The tokens ``doc[start:end]`` are scanned left to right.  Tokens before
    the first comma form the last name, tokens between the first and second
    comma form the first name, and everything after the second comma is the
    address.  Commas inside the address belong to the address span; commas
    trailing the last address token do not.

    Args:
        doc: Token sequence to scan; each token must expose ``.text``.
        start: Index of the first token of the segment.
        end: Index one past the last token of the segment.

    Returns:
        ``[(s, e, 'FIRSTNAME'), (s, e, 'LASTNAME'), (s, e, 'ADDR')]`` where
        ``s``/``e`` are token offsets into ``doc``.  A first name or address
        that is never reached (fewer than one / two commas) is reported as
        the empty span ``(0, 0)``.
    """
    # state -> [span start, span end); 'L'ast name, 'F'irst name, 'A'ddress
    spans = {'L': [start, start], 'F': [0, 0], 'A': [0, 0]}
    state = 'L'
    for pos, tok in enumerate(doc[start:end], start):
        if tok.text != ',':
            # Grow the current part up to and including this token.  Using
            # the position (rather than a token count) keeps commas that sit
            # inside the address within the ADDR span.
            spans[state][1] = pos + 1
        elif state == 'L':
            state = 'F'
            spans['F'] = [pos + 1, pos + 1]
        elif state == 'F':
            state = 'A'
            spans['A'] = [pos + 1, pos + 1]
        # A comma while already in the address needs no action: the span end
        # moves past it as soon as the next address token is seen.
    first, last, addr = spans['F'], spans['L'], spans['A']
    return [
        (first[0], first[1], 'FIRSTNAME'),
        (last[0], last[1], 'LASTNAME'),
        (addr[0], addr[1], 'ADDR'),
    ]
def tag_dates(nlp, doc):
    """Find numeric date expressions in a document using token patterns.

    Dates are recognised purely by token shape (``d`` stands for a digit),
    e.g. ``1.2.34`` or ``12.03.1926``, plus a few multi-token forms where
    the parts are separated by whitespace tokens.

    Args:
        nlp: spaCy language pipeline; only ``nlp.vocab`` is used, to build
            the ``Matcher``.
        doc: spaCy document to search.

    Returns:
        A list of ``(start, end, 'DATE')`` tuples, one per matcher hit, with
        ``start``/``end`` taken from the match's token offsets.
    """
    patterns = [
        # single-token dates: day.month.year with 1-2 digit day/month
        [{'SHAPE': 'd.d.dd'}],
        # NOTE(review): comma after the day looks like a deliberate allowance
        # for a mis-scanned full stop -- confirm against the data
        [{'SHAPE': 'd,d.dd'}],
        [{'SHAPE': 'd.dd.dd'}],
        [{'SHAPE': 'dd.d.dd'}],
        [{'SHAPE': 'dd.dd.dd'}],
        [{'SHAPE': 'dd.d.dddd'}],
        [{'SHAPE': 'dd.dd.dddd'}],
        [{'SHAPE': 'd.d.dddd'}],
        [{'SHAPE': 'd.dd.dddd'}],
        # day split from "month.year" by a whitespace token
        [{'SHAPE': 'dd'}, {'IS_SPACE': True}, {'SHAPE': 'd.dd'}],
        [{'SHAPE': 'dd'}, {'IS_SPACE': True}, {'SHAPE': 'dd.dd'}],
        # three digit-only tokens separated by whitespace tokens
        [{'IS_DIGIT': True}, {'IS_SPACE': True}, {'IS_DIGIT': True},
         {'IS_SPACE': True}, {'IS_DIGIT': True}],
    ]
    matcher = Matcher(nlp.vocab)
    # Pass the patterns as one list: this form is accepted by spaCy >= 2.2.2
    # and is the only one accepted by spaCy 3, which dropped the older
    # ``add(key, on_match, *patterns)`` call.
    matcher.add('DATE', patterns)
    # each match is (match_id, start, end); keep the offsets, tag as DATE
    return [(match[1], match[2], 'DATE') for match in matcher(doc)]
def tag_document(nlp, doc):
    """Tag names, addresses and dates in a Govt. Gazette "Naturalisation" doc.

    Dates are located first.  The tokens between the previous date (or the
    start of the document) and each date are then treated as one
    name/address segment, after stepping over leading whitespace and
    punctuation tokens.

    Args:
        nlp: spaCy language pipeline, passed through to ``tag_dates``.
        doc: spaCy document to tag.

    Returns:
        A list of ``(start, end, tag)`` tuples: the name/address tags for
        every segment, followed by all the date tags.
    """
    date_tags = tag_dates(nlp, doc)
    tags = []
    cursor = 0
    for date_start, date_end, _label in date_tags:
        # Advance to the first token that is neither whitespace nor
        # punctuation; if there is none the cursor stays where it was.
        for idx in range(cursor, date_start):
            token = doc[idx]
            if not (token.is_space or token.is_punct):
                cursor = idx
                break
        # the segment before this date holds the name and address
        tags.extend(tag_name_address(doc, cursor, date_start))
        # the next segment begins right after this date
        cursor = date_end
    tags.extend(date_tags)
    return tags
def trove_naturalisation_text(article: dict) -> str:
    """Return the body text of a Trove Naturalisation gazette article.

    The article's ``'articleText'`` markup is reduced to the contents of its
    ``<span>`` elements.  Everything up to and including the first span that
    mentions "Secretary" is treated as the header and dropped; the remaining
    spans (the list of names and addresses) are returned, each preceded by a
    newline.  If no span mentions "Secretary" the result is the empty string.
    """
    spans: List[str] = re.findall('<span>([^<]+)</span>', article['articleText'])
    body_start = None
    for index, span in enumerate(spans):
        if "Secretary" in span:
            # body begins on the line after the header's last line
            body_start = index + 1
            break
    if body_start is None:
        return ""
    return "".join("\n" + span for span in spans[body_start:])
def tag_row(row: pd.Series, nlp: object) -> list:
    """Tag one article row and attach the matched text to every tag.

    Args:
        row: Dataframe row for an article; its ``text`` element is parsed.
        nlp: Callable language pipeline used to turn the text into a document.

    Returns:
        A list of ``(start, end, tag, text)`` tuples where ``text`` is the
        string form of ``doc[start:end]``.
    """
    doc = nlp(row.text)
    return [
        (start, end, label, str(doc[start:end]))
        for start, end, label in tag_document(nlp, doc)
    ]
def extract_records(articles: pd.DataFrame) -> pd.DataFrame:
    """Build a dataframe of name/address records from tagged articles.

    Every row of ``articles`` is converted with ``tags_to_records`` and the
    resulting record dictionaries are gathered into a single dataframe, one
    record per row.
    """
    gathered = [
        record
        for _index, article in articles.iterrows()
        for record in tags_to_records(article)
    ]
    return pd.DataFrame(gathered)
def tags_to_records(row: pd.Series) -> list:
    """Turn the tags of one article into name/address records.

    ``row.tags`` is a list of ``(start, end, tag, text)`` tuples; they are
    processed in sorted order.  FIRSTNAME, LASTNAME and ADDR tags fill in the
    record under construction, and a DATE tag closes it: the record is kept
    only if first name, last name and address have all been seen, and a new
    empty record is started either way.

    Returns:
        A list of dictionaries with the keys ``'id'`` (``row.id``),
        ``'support'`` (the tag tuples used to build the record), ``'first'``,
        ``'last'``, ``'address'``, ``'datestring'`` and ``'date'``.
    """
    name_fields = {'FIRSTNAME': 'first', 'LASTNAME': 'last', 'ADDR': 'address'}

    def blank():
        return {'id': row.id, 'support': []}

    completed = []
    current = blank()
    for entry in sorted(row.tags):
        label = entry[2]
        # tag text on a single line, without surrounding whitespace
        cleaned = entry[3].strip().replace('\n', ' ')
        if label in name_fields:
            current['support'].append(entry)
            current[name_fields[label]] = cleaned
            continue
        if label != 'DATE':
            continue
        # a date terminates the record being assembled
        current['support'].append(entry)
        current['datestring'] = cleaned
        current['date'] = parsedate(cleaned)
        if all(key in current for key in ('first', 'last', 'address')):
            completed.append(current)
        # incomplete records are dropped; start afresh in both cases
        current = blank()
    return completed
def parsedate(datestring: str) -> str:
    """Normalise a ``day month year`` date string to ISO form (1926-03-12).

    Full stops and commas are treated as separators.  A two-digit year is
    taken to be 19xx, and only years starting with "19" are accepted.
    Anything that does not yield a valid calendar date gives the placeholder
    "1900-01-01".
    """
    fallback = "1900-01-01"
    # punctuation separators become spaces so a plain split() does the work
    fields = datestring.translate(str.maketrans(".,", "  ")).split()
    if len(fields) != 3:
        return fallback
    day, month, year = fields
    if len(year) == 2:
        year = "19" + year
    if not year.startswith("19") or len(month) > 2 or len(day) > 2:
        return fallback
    try:
        return datetime.date(int(year), int(month), int(day)).isoformat()
    except ValueError:
        # non-numeric parts or an impossible date
        return fallback
def filter_records(records: list) -> tuple:
    """Partition annotation records into usable and rejected ones.

    A record is rejected when any of these holds:
      - its first or last name contains a digit
      - its date is the "1900-01-01" placeholder (date could not be parsed)
      - its address contains 'formerly' (probably more than one record)
      - its address is longer than 70 characters (probably more than one
        record)

    Returns:
        A tuple ``(filtered, rejected)`` of two lists, each keeping the
        input order.
    """
    def is_bad(entry):
        # checks run in this order and stop at the first one that fires
        if re.search('[0-9]', entry['first'] + entry['last']):
            return True
        if entry['date'] == "1900-01-01":
            return True
        address = entry['address']
        return 'formerly' in address or len(address) > 70

    kept, dropped = [], []
    for entry in records:
        (dropped if is_bad(entry) else kept).append(entry)
    return kept, dropped
def valid_record(record: pd.Series) -> bool:
    """Return True when a record passes all the sanity checks.

    A record is invalid if its first or last name contains a digit, its date
    is the "1900-01-01" placeholder, its address contains 'formerly', or its
    address is longer than 70 characters.
    """
    if re.search('[0-9]', record['first'] + record['last']):
        return False
    if record['date'] == "1900-01-01":
        return False
    address = record['address']
    if 'formerly' in address:
        return False
    return len(address) <= 70
def collect_document_entities(articles: pd.DataFrame, records: pd.DataFrame) -> pd.DataFrame:
    """Combine articles and entity records into SpaCy NER training data.

    Intended to produce a dataframe holding each article's text together
    with all entities found in it.

    NOTE: not implemented yet -- the body is a placeholder and the function
    currently returns None regardless of its arguments.
    """
    return None