-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUtilsWikipedia.py
92 lines (75 loc) · 3.11 KB
/
UtilsWikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
Author: Lim Zhao Qing
"""
import requests
class WikipediaUtils:
    """Static helpers for querying the Malay Wikipedia (ms.wikipedia.org) API.

    Every method is a ``@staticmethod``; the class is used purely as a
    namespace and holds no instance state.
    """

    # MediaWiki Action API endpoint used by every request in this class.
    API_URL = 'https://ms.wikipedia.org/w/api.php'

    # Seconds to wait for the server before giving up. ``requests`` applies
    # no timeout unless one is passed explicitly, so without this a stalled
    # connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    @staticmethod
    def fetch_search_results(search_term: str = 'bahasa', limit: int = 100) -> list:
        """
        Fetches search results from the Wikipedia API.

        Searches Wikipedia for ``search_term`` and retrieves a list of
        article titles.

        Args:
            search_term (str): The text to search for. Defaults to 'bahasa'.
            limit (int): Maximum number of search results to request.
                Defaults to 100.

        Returns:
            list: A list of article titles from the search results.

        Raises:
            requests.HTTPError: If the API answers with an error status code.
            requests.Timeout: If the server does not respond within
                ``REQUEST_TIMEOUT`` seconds.
            KeyError: If the JSON response has no ``query.search`` section.
        """
        print("Fetching search results from Wikipedia API...")
        response = requests.get(
            WikipediaUtils.API_URL,
            params={
                'action': 'query',
                'format': 'json',
                'list': 'search',
                'utf8': 1,
                'srsearch': search_term,  # Search term
                'srlimit': limit,  # Limit the number of search results
            },
            timeout=WikipediaUtils.REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()

        # Extract titles from search results
        titles = [result['title'] for result in data['query']['search']]
        print(f"Extracted titles: {titles}")
        return titles

    @staticmethod
    def fetch_page_content(title: str) -> dict:
        """
        Fetches page content from the Wikipedia API.

        Retrieves the plain-text introduction of a specific Wikipedia page
        given its title.

        Args:
            title (str): The title of the Wikipedia page.

        Returns:
            dict: The decoded JSON response containing the page content.

        Raises:
            requests.HTTPError: If the API answers with an error status code.
            requests.Timeout: If the server does not respond within
                ``REQUEST_TIMEOUT`` seconds.
        """
        print(f"Fetching page content for title: {title}")
        response = requests.get(
            WikipediaUtils.API_URL,
            params={
                'action': 'query',
                'format': 'json',
                'prop': 'extracts',
                'exintro': True,  # Retrieve only the introduction
                'exlimit': 1,  # Limit to one extract
                'explaintext': True,  # Get plain text content
                'titles': title,
            },
            timeout=WikipediaUtils.REQUEST_TIMEOUT,
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()

    @staticmethod
    def extract_page_info(page_content: dict) -> list:
        """
        Extracts page information (title and content) from the API response.

        Processes the JSON response from the Wikipedia API and extracts the
        title and content of each page that carries an ``extract`` field.
        Pages without an extract are skipped.

        Args:
            page_content (dict): The JSON response from the Wikipedia API.

        Returns:
            list: A list of dictionaries, each with the keys ``'Title'`` and
                ``'Content'``. Empty when the response has no ``query`` or
                ``pages`` section.
        """
        print("Extracting page information...")

        # Use .get() at both levels: a response that lacks 'query' entirely
        # should yield an empty result, just like one that lacks 'pages',
        # instead of raising KeyError.
        pages = page_content.get('query', {}).get('pages', {})

        data_to_write = []
        for page_info in pages.values():
            if 'extract' not in page_info:
                continue
            info = {
                'Title': page_info['title'],
                'Content': page_info['extract'],
            }
            print(f"Extracted page info: {info}")
            data_to_write.append(info)

        print("Completed extracting page information.")
        return data_to_write