-
Notifications
You must be signed in to change notification settings - Fork 1
/
extraction_wikipedia.py
64 lines (55 loc) · 2.28 KB
/
extraction_wikipedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# Wikipedia data extraction - using Pageviewapi
import pageviewapi
import os
import datetime
import pandas as pd
from pandas.io.json import json_normalize
# Fetch the pageviews for a specific article
def fetch_timeseries_wikipedia(keyword, save_csv=True, start='20151101', end='20191101'):
    """Fetch the daily pageviews of one English-Wikipedia article.

    Args:
        keyword: Article title passed to ``pageviewapi.per_article``
            (e.g. ``'Donald_Trump'``).
        save_csv: When true, also write the response to
            ``data/wikipedia/<keyword lowercased>_wikipedia_interest.csv``,
            creating the directory if needed.
        start: Start of the requested range, in the same ``YYYYMMDD`` string
            form as the default.
        end: End of the requested range, same format as ``start``.

    Returns:
        Whatever ``pageviewapi.per_article`` returned, or ``None`` when the
        request failed.
    """
    try:
        interest_over_time = pageviewapi.per_article(
            'en.wikipedia', keyword, start, end,
            access='all-access', agent='all-agents', granularity='daily')
    except Exception:
        # Every request failure is reported the same way; `Exception` (rather
        # than a bare `except`) lets Ctrl-C and SystemExit propagate.
        print('The chosen article doesn\'t exist')
        return None

    if save_csv:
        data_path = 'data/wikipedia/'
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(data_path, exist_ok=True)
        file_name = data_path + keyword.lower() + '_wikipedia_interest.csv'
        # The response is handed to DataFrame unchanged so the CSV layout
        # stays exactly what existing consumers of these files expect.
        pd.DataFrame(interest_over_time).to_csv(file_name, index=False, encoding='utf-8')
    return interest_over_time
# Get trending articles as an ordered list, cut them if necessary (updated daily)
def fetch_trending_wikipedia(cut=0):
    """Return yesterday's top-viewed English-Wikipedia article titles.

    Titles are kept in the order the API response lists them, with the
    entries 'Main_Page' and 'Special:Search' filtered out.

    Args:
        cut: Collection stops once ``cut + 1`` titles have been gathered, so
            the default ``0`` yields a single title. A negative value never
            matches and therefore returns every remaining title.

    Returns:
        A list of article titles, or the string ``'Data not ready'`` when the
        request or the parsing of its response fails.
    """
    special_pages = ('Main_Page', 'Special:Search')

    # Month and day are zero-padded to two digits before being passed on.
    # NOTE(review): "yesterday" is computed from the machine's local clock;
    # confirm whether the API's day boundary is UTC.
    yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
    year = yesterday.year
    month = f'{yesterday.month:02d}'
    day = f'{yesterday.day:02d}'

    trends_final = []
    try:
        trends = pageviewapi.top('en.wikipedia', year, month, day, access='all-access')
        for article in trends['items'][0]['articles']:
            title = article['article']
            if title in special_pages:
                continue
            trends_final.append(title)
            if len(trends_final) == cut + 1:
                break
    except Exception:
        # The data may not be ready around midnight. `Exception` (rather than
        # a bare `except`) lets Ctrl-C and SystemExit propagate.
        return 'Data not ready'
    return trends_final
# Run the script only when executed directly, not when imported
if __name__ == '__main__':
    # Ask the user for an article title and fall back to the default when the
    # answer is blank. Surrounding whitespace is stripped so it cannot leak
    # into the API request or the CSV file name.
    keyword = input('\nInsert a keyword (default: Donald_Trump) -> ').strip() or 'Donald_Trump'
    fetch_timeseries_wikipedia(keyword)