twitter_scrape.py
import requests
import os
import json
import csv
import time
import datetime as dt
from os import path
from configparser import ConfigParser

import dateutil.parser
import pandas as pd
# load the ini file, which keeps track of the current collection progress for each ticker
INI_FILE = "twitter.ini"
DATA_FILE = './twitter_data_2020.csv'
config_object = ConfigParser()
config_object.read(INI_FILE)
# make sure the progress section exists, so we can record per-ticker end times below
if 'PREVIOUS_RUN_DATE' not in config_object:
    config_object['PREVIOUS_RUN_DATE'] = {}
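# For reference, the ini file is expected to look roughly like the sketch below
# (inferred from how the 'DATES_OF_INTEREST' and 'PREVIOUS_RUN_DATE' sections are
# read further down; the tickers and timestamps are only illustrative):
#
#   [DATES_OF_INTEREST]
#   end_time = 2020-03-31T00:00:00.000Z
#
#   [PREVIOUS_RUN_DATE]
#   AAPL = 2020-03-15T12:34:56.000Z
#   MSFT = 2020-03-14T09:00:00.000Z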
# do you want to collect data for the S&P 500 or the Russell 1000?
# set exactly one of these to True
SP500_bool = True
R1000_bool = False
# add column names to our output file if it does not exist yet
if not path.exists(DATA_FILE):
    csvFile = open(DATA_FILE, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    # create headers for the columns we want to keep in our dataset
    csvWriter.writerow(['author id', 'created_at', 'id', 'lang', 'like_count', 'quote_count', 'reply_count', 'retweet_count', 'source', 'tweet', 'ticker_searched'])
    csvFile.close()
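# For reference, './S&P500.txt' is expected to be a CSV with at least a 'date' column
# (YYYY-MM-DD) and a 'tickers' column holding that day's constituents as a
# comma-separated string (inferred from how it is parsed below; rows are illustrative):
#
#   date,tickers
#   2011-01-03,"AAPL,MSFT,XOM,..."
#   2011-01-04,"AAPL,MSFT,XOM,..."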
if SP500_bool:
    # load historical S&P 500 membership and build a list of tickers
    # the file covers a long history, so keep only the rows inside the time frame we care about
    SP500 = pd.read_csv('./S&P500.txt')
    SP500['dtdate'] = pd.to_datetime(SP500['date'], format="%Y-%m-%d")
    SP500 = SP500.loc[(SP500['dtdate'] > '2011-01-01') & (SP500['dtdate'] < '2012-03-02')].copy()
    tickers = []
    # each remaining row holds that day's constituents as a comma-separated string
    for i in SP500.index:
        current_tickers = SP500.at[i, 'tickers']
        for ticker in current_tickers.split(','):
            tickers.append(ticker)
    # de-duplicate and sort
    tickers = sorted(set(tickers))
elif R1000_bool:
    R1000 = pd.read_csv('./Stocks in the Russell 1000 Index.csv')
    tickers = R1000['Symbol'].to_numpy()
# bearer token for the Twitter API (fill in your own token)
os.environ['TOKEN'] = ''

def auth():
    return os.getenv('TOKEN')

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
# function to create the search URL and query parameters
def create_url(keyword, start_date, end_date, max_results=10):
    search_url = "https://api.twitter.com/2/tweets/search/all"  # change to the endpoint you want to collect data from
    # change params based on the endpoint you are using
    query_params = {'query': keyword,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,in_reply_to_user_id,geo.place_id',
                    'tweet.fields': 'id,text,author_id,in_reply_to_user_id,geo,conversation_id,created_at,lang,public_metrics,referenced_tweets,reply_settings,source',
                    'user.fields': 'id,name,username,created_at,description,public_metrics,verified',
                    'place.fields': 'full_name,id,country,country_code,geo,name,place_type',
                    'next_token': {}}
    return (search_url, query_params)
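# Example (illustrative only): building a request for the cashtag "$AAPL" over March 2020
# with the page size this script uses further down:
#
#   search_url, params = create_url("$AAPL", "2020-03-01T00:00:00.000Z",
#                                   "2020-03-31T00:00:00.000Z", max_results=500)
#   # params['next_token'] starts empty and is filled in during pagination below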
# function to get data from the API
def connect_to_endpoint(url, headers, params):
    response = requests.request("GET", url, headers=headers, params=params)
    print("Endpoint Response Code: " + str(response.status_code))
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()
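# The JSON returned by connect_to_endpoint is expected to look roughly like this
# (only the fields this script actually reads are shown; all values are illustrative):
#
#   {
#     "data": [
#       {"author_id": "12345", "created_at": "2020-03-15T12:34:56.000Z", "id": "98765",
#        "lang": "en", "source": "Twitter Web App", "text": "$AAPL earnings tomorrow",
#        "public_metrics": {"retweet_count": 1, "reply_count": 0,
#                           "like_count": 3, "quote_count": 0}},
#       ...
#     ],
#     "meta": {"result_count": 10, "next_token": "b26v89c19zqg8o3f..."}
#   }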
# function to save the tweets in a JSON response to the CSV file
def append_to_csv(json_response, fileName, ticker):
    # counter variable
    counter = 0
    # open (or create) the target CSV file
    csvFile = open(fileName, "a", newline="", encoding='utf-8')
    csvWriter = csv.writer(csvFile)
    # loop through each tweet
    for tweet in json_response['data']:
        # pull each field out separately, since some keys might not exist for some tweets
        # 1. Author ID
        author_id = tweet['author_id']
        # 2. Time created
        created_at = dateutil.parser.parse(tweet['created_at'])
        # 3. Tweet ID
        tweet_id = tweet['id']
        # 4. Language
        lang = tweet['lang']
        # 5. Tweet metrics
        retweet_count = tweet['public_metrics']['retweet_count']
        reply_count = tweet['public_metrics']['reply_count']
        like_count = tweet['public_metrics']['like_count']
        quote_count = tweet['public_metrics']['quote_count']
        # 6. Source (not present on every tweet)
        source = tweet.get('source')
        # 7. Tweet text
        text = tweet['text']
        # 8. Ticker searched
        ticker_searched = ticker
        # assemble all data in a list
        res = [author_id, created_at, tweet_id, lang, like_count, quote_count, reply_count, retweet_count, source, text, ticker_searched]
        # append the result to the CSV file
        csvWriter.writerow(res)
        counter += 1
    # when done, close the CSV file
    csvFile.close()
    # print the number of tweets for this iteration
    print("# of Tweets added from this response: ", counter)
#get auth
bearer_token = auth()
headers = create_headers(bearer_token)
#set the start_time (end time is variable)
#start_time = "2011-02-28T00:00:00.000Z"
start_time = "2020-02-29T00:00:00.000Z"
max_results = 500
# loop through all the tickers in the chosen universe
for tickers_ in tickers:
    # if there is a saved end_time for this ticker from a previous run, resume from it;
    # otherwise fall back to the 'DATES_OF_INTEREST' end_time
    prev_run = config_object['PREVIOUS_RUN_DATE']
    try:
        end_time = prev_run[str(tickers_)]
    except KeyError:
        no_prev_run = config_object['DATES_OF_INTEREST']
        end_time = no_prev_run['end_time']
    end_time_compare = pd.to_datetime(end_time)
    print("collecting data for ticker " + tickers_ + ", current end time: " + end_time)
    # set the keyword (search value) to the cashtag
    keyword = "$" + tickers_
    # call the function which creates the search URL and query parameters
    url = create_url(keyword, start_time, end_time, max_results)
    # call the API; if the request fails, log the error, wait a minute and move on to the next ticker
    try:
        json_response = connect_to_endpoint(url[0], headers, url[1])
    except Exception as error:
        print(error)
        file1 = open("errors.txt", "a")
        file1.write(str(error) + "\n")
        file1.close()
        time.sleep(60)
        continue
    # wait 2 seconds (the API limits how fast we can collect data)
    time.sleep(2)
    # if there is no data (we previously reached the end, or the ticker has no tweets),
    # go to the next ticker
    if json_response['meta']['result_count'] == 0:
        continue
    # save the data that we collected from the API
    append_to_csv(json_response, DATA_FILE, tickers_)
    # the API returns tweets from newest to oldest, so the last tweet's timestamp becomes the new end_time
    current_time_compare = pd.to_datetime(dateutil.parser.parse(json_response['data'][-1]['created_at']))
    current_time = current_time_compare.strftime("%Y-%m-%dT%H:%M:%S.000Z")
    # save the new end_time to the ini file, which keeps track of our progress
    prev_run[tickers_] = current_time
    with open(INI_FILE, 'w') as conf:
        config_object.write(conf)
    # the API lets us continue a previous query by passing the 'next_token',
    # so keep paginating while a 'next_token' exists
    while 'next_token' in json_response['meta']:
        # put the 'next_token' into our query parameters
        url[1]['next_token'] = json_response['meta']['next_token']
        # call the API; if the request fails, log the error, wait a minute and stop paginating this ticker
        try:
            json_response = connect_to_endpoint(url[0], headers, url[1])
        except Exception as error:
            print(error)
            file1 = open("errors.txt", "a")
            file1.write(str(error) + "\n")
            file1.close()
            time.sleep(60)
            break
        # sleep 2 seconds
        time.sleep(2)
        # save the data
        append_to_csv(json_response, DATA_FILE, tickers_)
        # get the last (oldest) tweet's timestamp
        current_time_compare = pd.to_datetime(dateutil.parser.parse(json_response['data'][-1]['created_at']))
        current_time = current_time_compare.strftime("%Y-%m-%dT%H:%M:%S.000Z")
        # save the new end_time to the ini file
        prev_run[tickers_] = current_time
        with open(INI_FILE, 'w') as conf:
            config_object.write(conf)