## ---- fedspeak
## darren chang
## ---- setup
import pandas as pd
import numpy as np
import os # for changing the working directory
import time #timing
# scraping
from bs4 import BeautifulSoup
from contextlib import closing
import requests #website requests
from requests.exceptions import RequestException
from requests import get
# nlp
import re
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
# from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
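# note: the nltk tokenizer, lemmatizer, and stopword corpus each need their data
# packages downloaded once per machine; if they are missing, something like the
# following should fetch them:
# nltk.download('punkt')      # for word_tokenize
# nltk.download('wordnet')    # for WordNetLemmatizer
# nltk.download('stopwords')  # for the english stopword list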
# importing links, which were created using excel
os.chdir('Documents/_econ/fedspeak/text mining')
links = pd.read_csv('links.csv')
links.head(10)
## ---- scraping
# functions for getting URLs, with error logging
def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return
    the list of <p> tags found on the page, otherwise return None.
    """
    try:
        with closing(get(url, stream = True)) as resp:
            if is_good_response(resp):
                soup = BeautifulSoup(resp.content, 'html.parser')
                results = soup.find_all('p')
                return results
            else:
                return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and content_type.lower().find('html') > -1)
def log_error(e):
    """
    log errors
    """
    print(e)
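# optional sanity check of the helpers on a single url before the full run
# (assumes the first row of links.csv points to a valid speech page):
# sample_paragraphs = simple_get(links['url'].iloc[0])
# print(len(sample_paragraphs) if sample_paragraphs is not None else 'request failed')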
# scraping a set of links
def scrape(links, # dataframe of urls and other info
           ):
    """
    function for scraping a set of links into a dataframe.
    returns a dataframe of raw text stored as lists of <p> tags, one row per url.
    """
    links_use = links['url'].values.tolist() # extract urls as a list
    frames = [] # collect one small dataframe per url
    # this process usually takes 1.5 - 2.5 hrs.
    # iterates across all the urls,
    # grabs the text as a list and adds it to the larger dataframe
    for url in links_use:
        text = simple_get(url)
        frames.append(pd.DataFrame({'url': url, 'text': [text]}))
    fed_text_raw = pd.concat(frames, ignore_index = True) # DataFrame.append is deprecated in newer pandas
    fed_text_raw.columns = fed_text_raw.columns.str.strip() # strip column names
    return fed_text_raw
start = time.time()
fed_text_raw = scrape(links)
# linklist = pd.DataFrame(links.loc[0:20,:]) # for testing
# fed_text_raw = scrape(linklist) # test run
end = time.time() # log time for this process
print(end - start)
## ---- cleaning
# stripping html tags and numbers
# this function for preprocessing can be finicky because of string types and series interactions
def preprocess(df,
               text_field, # field that has the text
               new_text_field_name # name of the field that will hold the normalized text
               ):
    """
    normalizes the text column by removing characters that do not
    contribute to natural-language meaning (handles, stray markup, urls, digits)
    """
    # df[new_text_field_name] = df[text_field].str.lower() # lowercasing happens after tokenization instead
    df[new_text_field_name] = (df[text_field]
                               .apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?",
                                                          "", str(elem)))
                               .apply(lambda elem: re.sub("<.*?>", "", str(elem))) # strips out any remaining tags
                               .apply(lambda elem: re.sub(r"\d+", "", str(elem))) # strips out digits
                               )
    return df
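# illustrative example on made-up data: the regexes strip markup characters,
# urls, and digits, leaving plain words (note the stray leading 'p' left over
# from the stripped <p> tag, which is why the html stop words below exist)
# demo = pd.DataFrame({'text': ['<p>The FOMC raised rates by 25 bps: http://example.com</p>']})
# preprocess(demo, 'text', 'text_clean')['text_clean'][0]
# -> roughly 'pThe FOMC raised rates by  bps '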
# word tokenization
def unnest(df, # line-based dataframe
           column_to_tokenize, # name of the column with the text
           new_token_column_name, # what you want the column of words to be called
           tokenizer_function, # which tokenizer to use = (nltk.word_tokenize)
           original_list): # original list of data
    """
    unnests words from the scraped text and returns a dataframe in long format
    merged with the original list of links, i.e. word tokenization in tidy text format.
    """
    return (df[column_to_tokenize]
            .apply(str)
            .apply(tokenizer_function)
            .apply(pd.Series)
            .stack()
            .reset_index(level=0)
            .set_index('level_0')
            .rename(columns={0: new_token_column_name})
            .join(df.drop(columns=column_to_tokenize), how='left') # positional axis argument is removed in newer pandas
            .reset_index(drop=True)
            .merge(original_list, how = 'outer', on = 'url')
            )
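# illustrative example on made-up data: unnest goes from one row per document
# to one row per token, carrying the url along so the links metadata can be merged back
# demo = pd.DataFrame({'url': ['u1'], 'text': ['monetary policy remains accommodative']})
# demo_links = pd.DataFrame({'url': ['u1'], 'speaker': ['test speaker']})
# unnest(demo, 'text', 'word', nltk.word_tokenize, demo_links)
# -> four rows, one per token, each with url = 'u1' and speaker = 'test speaker'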
fed_text_raw = preprocess(fed_text_raw, 'text', 'text')
fed_text_all = unnest(fed_text_raw, 'text', 'word', nltk.word_tokenize, links)
fed_text_all['word'] = fed_text_all['word'].str.lower() # convert to lowercase
fed_text_all.to_csv('fed_text_all.csv', index = False) # save as csv
# stemming, if you decide to stem.
# snowball = SnowballStemmer(language = 'english')
# def stemming(df,
#              column_to_stem,
#              stem_function, # which stemming function to use = snowball.stem
#              ):
#     """
#     returns the stem of each word in a dataframe column
#     """
#     return (df[column_to_stem]
#             .apply(stem_function)
#             )
#
# stemming(fed_text_all, 'word', snowball.stem)
# lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(df,
              column_to_lemmatize,
              lemmatize_func, # lemmatizer object to use = wordnet_lemmatizer
              ):
    """
    returns the root form of each word, treating words as verbs (pos = "v")
    """
    return (df[column_to_lemmatize]
            .apply(lambda word: lemmatize_func.lemmatize(word, pos = "v"))
            )
# fed_text_test['word'] = lemmatize(fed_text_test, 'word', word_net_lemmatizer) # test code
fed_text_all['word'] = lemmatize(fed_text_all, 'word', wordnet_lemmatizer) # lemmatize
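# quick illustration of the pos = "v" choice: the wordnet lemmatizer maps
# inflected verbs back to their base form, e.g.
# wordnet_lemmatizer.lemmatize('raising', pos = "v") # -> 'raise'
# wordnet_lemmatizer.lemmatize('rates', pos = "v")   # -> 'rate'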
# stopwords
# make custom stop words list
stop_words = stopwords.words("english")
stop_words_html = ['ppwe', 'uspp', 'pwe', 'usp', 'pp', 'usbrp'] # leftover stopwords from converting html to text with 'p' tags
stop_words_econ = ['debt', 'gross', 'crude', 'well', 'maturity', 'work', 'marginally', 'leverage'] # stop words for economic usage
stop_words.extend(stop_words_html)
stop_words.extend(stop_words_econ)
# delete stopwords
def remove_stops(df,
                 column_to_process,
                 stop_words_list, # list of stop words = stop_words
                 index_reset = True # whether you want to reset the index or not
                 ):
    """
    removes stopwords from a pandas dataframe in tidy (long) format, i.e. one word per row,
    as opposed to the typical wide format with lists of words per row.
    """
    if index_reset:
        return (df[~df[column_to_process].isin(stop_words_list)]
                .reset_index(drop = True)) # this line resets the index
    else:
        return df[~df[column_to_process].isin(stop_words_list)]
# remove_stops(fed_text_test, 'word', stop_words) # test code
fed_text_all = remove_stops(fed_text_all, 'word', stop_words)
end = time.time()
print(end - start)
fed_text_all.to_csv('fed_text_all.csv', index = False) # save as csv
# fed_text_all = pd.read_csv('fed_text_all.csv') # read csv
# summary statistics
fed_text_all['word'].value_counts().head(20)
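# a few other quick summaries, assuming the url column survived the merge:
# fed_text_all['word'].nunique() # vocabulary size after cleaning
# fed_text_all.groupby('url')['word'].count().describe() # distribution of words per document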