2017年9月4日v3.py
# http://36kr.com/
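"""Multithreaded article scraper for 36kr.com.

A producer thread (href) pages through the site's search API and pushes
article ids onto a queue; ten consumer threads (page) pop ids, fetch each
article page, extract the embedded post JSON, download the images referenced
in the article body to ./pictures/<id>/, and print a dict with the title,
date, author, content, summary, tags, like/comment counts and link.
Article ids that fail to parse are collected in ERROR_list.
"""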
import requests
import re
import time
import math
import threading
import queue
import json
import os
from urllib import request

# Current date/time, used to turn article timestamps into "N minutes/hours ago" strings
now_time = time.strftime("%Y-%m-%d", time.localtime())
hour = int(time.strftime("%H", time.localtime()))
minute = int(time.strftime("%M", time.localtime()))
def href():
    """Producer: walk the search API page by page and push article ids onto the queue."""
    count = 0
    for n in range(1, pages):
        base_url = ('http://36kr.com/api/search/articles/%20?page=' + str(n)
                    + '&pageSize=40&ts=' + str(int(time.time())))
        try:
            articles = request_session.get(base_url, headers=headers, timeout=3).json()
            datalist = articles['data']['data']
            for post in datalist:
                q.put(post['id'])
                count += 1
        except BaseException as g:
            print("ERROR:", g)
    print("Collected %s articles in total." % count)
def page(name):
    """Consumer: take article ids off the queue, fetch each article page and parse it.

    `name` is the worker index passed in from the main block (unused).
    """
    flag = 1
    while True:
        if not q.empty():
            flag = 1
            try:
                post_id = q.get()
                url = 'http://36kr.com/p/' + str(post_id) + '.html'
                content = request_session.get(url, headers=headers, timeout=3).text
                props = re.findall(r'\"detailArticle\|post\":(.*?),\"abTest\|abtest\"', content)
                if props and props[0]:  # skip paywalled articles, whose detail payload is missing
                    post_data = json.loads(props[0])
                    time1 = post_data['created_at']
                    day_time = time1.split()[0]                   # date (Y-m-d)
                    hour_time = time1.split()[1].split(':')[0]    # hour
                    minute_time = time1.split()[1].split(':')[1]  # minute
                    # Build a human-readable "published ... ago" string
                    if now_time == day_time:
                        data1 = hour - int(hour_time)
                        if data1 == 0:
                            minute_time1 = minute - int(minute_time)
                            if minute_time1 == 0:
                                data2 = "1 minute ago"
                            else:
                                data2 = str(minute_time1) + " minutes ago"
                        else:
                            data2 = str(data1) + " hours ago"
                    else:
                        data2 = day_time
                    # extraction_tags is a JSON-encoded string; keep every third
                    # comma-separated field (the tag name) and strip the leading bracket
                    page_tag = post_data['extraction_tags']
                    tag_list = [part.strip("[") for part in page_tag.split(',')[::3]]
                    # Download every image referenced in the article body
                    pict_sour = str(post_data['content'])
                    picture_re = re.compile(r'src="https://(.*?)" ')
                    picture_list = picture_re.findall(pict_sour)
                    if len(picture_list) != 0:
                        os.makedirs(BASE_DIR + "/pictures/" + str(post_id), exist_ok=True)
                        try:
                            pic_num = 1
                            for pic in picture_list:
                                request.urlretrieve('http://' + pic,
                                                    BASE_DIR + '/pictures/%s/%s.jpg' % (str(post_id), pic_num))
                                pic_num += 1
                        except BaseException:
                            pass  # a failed image download should not drop the whole article
                    page_dict = {
                        'title': post_data['title'],                  # title
                        'date': data2,                                # publication date / "ago" string
                        'tag': post_data['column']['name'],           # column tag shown after the timestamp
                        'author': post_data['user']['name'],          # author
                        'content': post_data['content'],              # article body
                        'summary': post_data['summary'],              # summary
                        'extraction_tags': tag_list,                  # tags shown under the article
                        'like': post_data['counters']['like'],        # number of likes
                        'comment': post_data['counters']['comment'],  # number of comments
                        'page_href': url,                             # article link
                    }
                    print(page_dict)
            except BaseException:
                ERROR_list.append(post_id)
                continue
            time.sleep(10)  # throttle each worker between articles
        else:
            # If the queue is empty, wait one second at a time; after 3 minutes of
            # waiting, assume all articles have been fetched and exit.
            if flag < 180:
                print("Queue is empty.")
                time.sleep(1)
                flag += 1
                print("Waited %s seconds" % flag)
            else:
                return "All work is done!"
if __name__ == "__main__":
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory this script lives in
    user_agent = ('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
    headers = {'User-Agent': user_agent}
    request_session = requests.Session()
    # First request is only used to learn how many result pages there are
    Base_url = 'http://36kr.com/api/search/articles/%20?page=1&pageSize=40&ts=' + str(int(time.time()))
    articles = request_session.get(Base_url, headers=headers).json()
    data = articles['data']
    total_count = data['total_count']
    page_size = data['page_size']
    pages = math.ceil(total_count / page_size) + 1
    ERROR_list = []
    if not os.path.exists(BASE_DIR + '/pictures'):
        os.mkdir(BASE_DIR + '/pictures')
    q = queue.Queue(maxsize=10)
    # One producer thread fills the queue with article ids ...
    producer = threading.Thread(target=href)
    producer.start()
    # ... and ten consumer threads fetch and parse the articles
    t_list = []
    for i in range(10):
        worker = threading.Thread(target=page, args=(i,))
        worker.start()
        t_list.append(worker)
    for t in t_list:
        t.join()
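
# Example run (assuming Python 3 with the `requests` package installed):
#   python3 2017年9月4日v3.py
# Each worker prints one dict per article; ids of articles that failed to
# parse remain in ERROR_list for inspection.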