-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwiki_page.py
85 lines (63 loc) · 2.04 KB
/
wiki_page.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from dotenv import load_dotenv
import wikitextparser as wtp
import requests
import json
import os
# Runs at import time: load variables from a local .env file (if one exists)
# into the process environment, so the os.getenv('wiki_auth_token') lookups
# in the functions below can find the API token.
load_dotenv()
def search_wikipedia(term, num=5, timeout=10):
    """
    Searches wikipedia for the given term

    The request is authorised with the value of the ``wiki_auth_token``
    environment variable.

    :param term: Term to search on wikipedia
    :param num: Maximum number of pages to ask the API for (default 5)
    :param timeout: Seconds to wait for the API before giving up (default 10)
    :return: Returns a list with the titles of the matching pages
    :raises requests.HTTPError: if the API answers with an error status
    :raises requests.Timeout: if the API does not answer within ``timeout``
    """
    headers = {
        'Authorization': os.getenv('wiki_auth_token')
    }
    parameters = {
        'q': term,
        'limit': num
    }
    url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(
        url, params=parameters, headers=headers, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of hitting a KeyError on the error body.
    response.raise_for_status()
    return [page['title'] for page in response.json()['pages']]
def get_page(title, timeout=10):
    """
    Returns the page object from wikipedia given a title

    The title is first sent to the search endpoint (limit 1); the ``key`` of
    the top hit is then used to request the page itself. Both requests are
    authorised with the value of the ``wiki_auth_token`` environment variable.

    :param title: Title of the page to search on wikipedia
    :param timeout: Seconds to wait for each API call before giving up
        (default 10)
    :return: Page object from wikipedia (the decoded JSON response)
    :raises IndexError: if the search returns no page for ``title``
    :raises requests.HTTPError: if either API call answers with an error status
    :raises requests.Timeout: if either API call exceeds ``timeout``
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    headers = {
        'Authorization': os.getenv('wiki_auth_token')
    }
    search_url = "https://api.wikimedia.org/core/v1/wikipedia/en/search/page"
    parameters = {
        'q': title,
        'limit': 1
    }
    response = requests.get(
        search_url, params=parameters, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    matches = response.json()['pages']
    if not matches:
        # Same exception type the bare ``[0]`` lookup produced, but with a
        # message that tells the caller what actually went wrong.
        raise IndexError(f"no wikipedia page found for title {title!r}")
    page_key = matches[0]["key"]

    # The key goes into the URL path, so characters such as "/", "?" or "#"
    # must be percent-encoded or they would change which resource is hit.
    page_url = (
        "https://api.wikimedia.org/core/v1/wikipedia/en/page/"
        + quote(page_key, safe="")
    )
    response = requests.get(page_url, headers=headers, timeout=timeout)
    # Do not hand an error payload (e.g. a 404 body) back as if it were a page.
    response.raise_for_status()
    return response.json()
def create_sections(wikiText: str):
    """
    Split the wikitext of a page into its sections
    :param wikiText: Text from the wikipedia page
    :return: a list of dictionaries, one per parsed section, each holding the
        section's "title" and its "content"
    """
    return [
        {"title": part.title, "content": part.contents}
        for part in wtp.parse(wikiText).sections
    ]