# reddit.py
import praw
import os
import json

# Project-local helpers; if helper.py is unavailable, the sketch below
# provides minimal stand-ins.
try:
    from helper import retain_latest_entries, has_profanity
except ImportError:
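    # helper.py is not included in this file; these stand-ins are minimal
    # sketches inferred from how the two functions are used below. They are
    # assumptions, not the project's real implementations.
    def retain_latest_entries(filename, limit):
        # Keep only the `limit` most recently added entries in the JSON file.
        if not os.path.exists(filename):
            return
        with open(filename, 'r', encoding='utf-8') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                return
        trimmed = dict(list(data.items())[-limit:])
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(trimmed, f, indent=4)

    def has_profanity(text):
        # Crude word-list check; the real helper presumably uses a fuller list.
        blocklist = {'damn', 'hell', 'crap'}
        return any(word.strip('.,!?').lower() in blocklist for word in text.split())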


# Set up PRAW with your Reddit API credentials. Read them from the
# environment rather than hard-coding secrets into source control
# (PRAW can also load credentials from a praw.ini file).
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="idk"
)


def save_posts_to_file(posts, filename):
    # Merge `posts` into the JSON file, preserving each entry's 'processed' flag.
    existing_posts = {}
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            try:
                existing_posts = json.load(file)
            except json.JSONDecodeError:
                print(f"Warning: {filename} is empty or contains invalid JSON. It will be overwritten.")
    for post_id, post_data in posts.items():
        if post_id in existing_posts:
            existing_posts[post_id].update(post_data)
            existing_posts[post_id]['processed'] = existing_posts[post_id].get('processed', False)
        else:
            existing_posts[post_id] = post_data
    # Make sure the target directory exists before writing.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(existing_posts, file, indent=4)
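# Example of the merge semantics above (hypothetical post ID): saving an ID
# that already exists only updates the supplied fields and keeps its
# 'processed' flag, e.g.
#   save_posts_to_file({'abc123': {'post': 'Updated title?'}}, 'Reddit_Posts/demo.json')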


def scrape_questions_and_answers(subreddit_name='AskReddit'):
    subreddit = reddit.subreddit(subreddit_name)
    filename = f'Reddit_Posts/{subreddit_name}_saved_posts.json'
    retain_latest_entries(filename, 100)

    existing_post_ids = set()
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            try:
                posts = json.load(file)
                existing_post_ids = set(posts.keys())
            except json.JSONDecodeError:
                posts = {}
    # Initialize the dictionary that will hold the questions and answers.
    qa_dict = {}
    total_valid_posts = 0

    def calculate_tts_time(char_count, word_count):
        # Rough linear estimate of text-to-speech narration time in seconds.
        return 0.04 * char_count + 0.05 * word_count
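        # For example, a 100-character, 20-word title gives an estimate of
        # 0.04 * 100 + 0.05 * 20 = 5.0 seconds of narration.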
    # Keep fetching until we have 12 usable (non-NSFW, profanity-free) posts.
    post_limit = 0
    while total_valid_posts < 12:
        # Fetch more posts on each pass to make sure we reach the required number.
        post_limit += 75
        top_posts = subreddit.top(limit=post_limit, time_filter="week")
        for post in top_posts:
            # print(post.title)
            if post.id in existing_post_ids:
                print("Skipping existing post...")
                continue  # Skip posts that were already saved or processed.
            # Skip NSFW posts and posts with profanity in the title or body.
            if (not post.over_18) and (not has_profanity(post.title)) and (not has_profanity(post.selftext)):
                desc_char_count = len(post.selftext)
                desc_word_count = len(post.selftext.split())
                post_char_count = len(post.title)
                post_word_count = len(post.title.split())
                print("Desc time: ", calculate_tts_time(desc_char_count, desc_word_count))
                # Check that the title plus the body fit under a one-minute narration.
                if calculate_tts_time(desc_char_count, desc_word_count) > 57 - calculate_tts_time(post_char_count, post_word_count):
                    print("Post desc too long for 1 min video! - Skipping...")
                    continue
                print("New posts found: ", str(total_valid_posts))
                post_id = post.id
                total_valid_posts += 1
                # Track the ID so a later fetch pass does not count this post twice.
                existing_post_ids.add(post_id)
                author_name = "u/[deleted]" if post.author is None else "u/" + str(post.author.name)
                qa_dict[post_id] = {
                    'post': post.title,
                    'body': post.selftext,
                    'user': author_name,
                    'comments': []
                }
                # Resolve "load more comments" placeholders so every item below
                # is a real Comment with a .body attribute.
                post.comments.replace_more(limit=0)
                top_comments = list(post.comments)[:11]
                for comment in top_comments:
                    if (comment.body != "[deleted]") and (not has_profanity(comment.body)):
                        comment_author = "u/[deleted]" if comment.author is None else "u/" + str(comment.author)
                        qa_dict[post_id]['comments'].append({
                            'text': comment.body,
                            'user': comment_author
                        })
            else:
                print("NSFW - Skipping...")
            if total_valid_posts >= 12:
                break  # Exit the loop once we have enough posts.
    save_posts_to_file(qa_dict, filename)
    return qa_dict
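# For reference, the saved JSON maps post IDs to entries shaped like this
# (hypothetical values; the 'processed' flag is added on merge or consumption):
#   {
#       "abc123": {
#           "post": "Question title?",
#           "body": "Optional self-text",
#           "user": "u/someone",
#           "comments": [{"text": "An answer.", "user": "u/other"}],
#           "processed": false
#       }
#   }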


def get_unprocessed_post(subreddit='AskReddit', process=True):
    filename = f'Reddit_Posts/{subreddit}_saved_posts.json'
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as file:
            try:
                posts = json.load(file)
            except json.JSONDecodeError:
                posts = {}
        # Return the first post that has not been marked as processed.
        unprocessed_found = False
        for post_id, post_info in posts.items():
            if ('processed' not in post_info) or (not post_info['processed']):
                unprocessed_found = True
                if process:
                    post_info['processed'] = True
                    save_posts_to_file(posts, filename)
                return post_info
        if not unprocessed_found:
            print("RAN OUT OF POSTS, GETTING NEW ONES...")
            previous_post_count = len(posts)
            scrape_questions_and_answers(subreddit)
            with open(filename, 'r', encoding='utf-8') as file:
                posts = json.load(file)
            # If scraping added nothing new, stop instead of recursing forever.
            if len(posts) == previous_post_count:
                print(f"Found {len(posts)} posts")
                print("ALL FETCHED POSTS ARE THE SAME, STOPPING EXECUTION.")
                return None
            else:
                return get_unprocessed_post(subreddit, process)
    else:
        print("NO POSTS FILE FOUND, GETTING POSTS...")
        scrape_questions_and_answers(subreddit)
        return get_unprocessed_post(subreddit, process)


# RUNS WHEN NOT AN IMPORT:
if __name__ == "__main__":
    # subreddit = "AskReddit"
    subreddit = "offmychest"
    scrape_questions_and_answers(subreddit)
    # post = get_unprocessed_post(subreddit, process=False)
    # if post is not None:
    #     print(post['post'])
    # else:
    #     print("No unprocessed posts available.")