forked from CAVIND46016/Yelp-Dataset-Analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChristmas_date_trend.py
63 lines (47 loc) · 2.51 KB
/
Christmas_date_trend.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd
import nltk
import csv
import json
import re
import datetime
SOURCE = 'D:\\YELP Dataset\\yelp_academic_dataset_review.json';
def normalize_text(text):
# Remove non-ASCII chars.
text = re.sub('[^\x00-\x7F]+', ' ', text)
return text
def str2Date(dt):
return datetime.datetime.strptime(dt, "%Y-%m-%d").date()
def generateCSV(year, count = None):
DESTINATION = "D:\\YELP Dataset\\wordcloud datasets\\yelp_restaurants_reviews_nouns_{}.csv".format(year);
bus_rest = pd.read_csv("D:\\YELP Dataset\\tobesubmitted\\yelp_academic_dataset_business_restaurants.csv", low_memory=False);
with open("D:\\YELP Dataset\\restaurant related full datasets\\restaurants_businessIds.txt", 'r') as busIds:
id_list = busIds.read();
required_pos = ['NNP', 'NN'] # Common and proper nouns (probable topics during christmas season)
date_range = ['{}-12-01'.format(year), '{}-12-31'.format(year)]
if(count != None):
c = 0
with open(DESTINATION, 'w', encoding = 'utf-8', errors = 'replace') as file:
w = csv.writer(file, lineterminator = '\n')
w.writerow(["name", "city", "text"])
with open(SOURCE, encoding='utf-8') as f:
for line in f:
data = json.loads(line)
if((str2Date(data['date']) > str2Date(date_range[0])) & (str2Date(data['date']) < str2Date(date_range[1]))):
if(data['business_id'] in id_list):
data['text'] = ''.join([normalize_text(text) for text in data['text'].lower()])
if(('christmas' not in data['text']) & ('Christmas' not in data['text'])):
continue;
df = bus_rest[(bus_rest['business_id'] == data['business_id'])];
word_list = nltk.word_tokenize(data['text'])
pos_tag_list = [x[0] for x in nltk.pos_tag(word_list) if x[1] in required_pos]
w.writerow([df['name'].values[0], df['city'].values[0], ' '.join(pos_tag_list)])
if(count != None):
c = c + 1;
if(c == count):
break;
print("{} created successfully.".format(DESTINATION))
def main():
year = 2011;
generateCSV(year)
if(__name__ == "__main__"):
main();