-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchange-data.py
105 lines (86 loc) · 4.85 KB
/
change-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import yfinance as yf
import os
# Step 1: Scrape the S&P 500 constituent changes table from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
# Locate the "Selected changes" table - assuming it's the second wikitable on the page
tables = soup.find_all('table', {'class': 'wikitable'})
changes_table = tables[1] # The second table contains the constituent changes
# Step 2: Parse the table to extract relevant data (Date, Added Ticker, Removed Ticker, etc.)
data = []
for row in changes_table.find_all('tr')[1:]:
cols = [col.text.strip() for col in row.find_all('td')] # Extract text from all columns in the row
if len(cols) < 6:
# Skip rows that don't have the expected 6 columns
continue
# Step 3: Extract relevant columns for tickers, securities, and dates
try:
date = cols[0] # Date of change
added_ticker = cols[1] # Ticker of the added company
added_security = cols[2] # Name of the added company
removed_ticker = cols[3] # Ticker of the removed company
removed_security = cols[4] # Name of the removed company
reason = cols[5] # Reason for the change
# Clean and parse the dates (if they are valid)
date_dt = pd.to_datetime(date, errors='coerce')
data.append([date_dt, added_ticker, added_security, removed_ticker, removed_security, reason])
except Exception as e:
print(f"Error parsing row: {e}")
continue
# Step 4: Create a DataFrame to store the parsed data
sp500_changes_df = pd.DataFrame(data, columns=['Date', 'Added Ticker', 'Added Security', 'Removed Ticker', 'Removed Security', 'Reason'])
# Step 5: Save the DataFrame to a CSV file for future reference
sp500_changes_df.to_csv('data/sp500_changes.csv', index=False)
# Step 6: Create directories for added and removed tickers if they don't already exist
os.makedirs('added_tickers', exist_ok=True)
os.makedirs('removed_tickers', exist_ok=True)
# Step 7: List to keep track of unavailable tickers
unavailable_tickers = []
# Step 8: Function to download stock data using yfinance and save it in the correct folder
def download_stock_data(ticker, start_date, end_date, folder):
try:
stock_data = yf.download(ticker, start=start_date, end=end_date)
if not stock_data.empty:
# Save the data to a CSV file in the appropriate folder
file_name = os.path.join(folder, f"{ticker}_historical_data.csv")
stock_data.to_csv(file_name)
print(f"Downloaded and saved data for {ticker} in {folder}")
else:
print(f"No data found for {ticker}. Adding to unavailable list.")
unavailable_tickers.append(ticker) # Track unavailable tickers
except Exception as e:
print(f"Failed to download data for {ticker}: {e}")
unavailable_tickers.append(ticker) # Track unavailable tickers in case of failure
# Step 9: Set to keep track of tickers that have already been processed
processed_tickers = set()
# Step 10: Iterate over the DataFrame and download historical data for both Added and Removed tickers
for index, row in sp500_changes_df.iterrows():
change_date = row['Date'] # 'Date' column is used for both added and removed tickers
# Ensure there is a valid date
if pd.isna(change_date):
continue
# Download historical data for the 'Removed Ticker' first (if it exists)
removed_ticker = row['Removed Ticker']
if pd.notna(removed_ticker) and removed_ticker not in processed_tickers:
end_date = change_date
start_date = end_date - timedelta(days=365) # Limit to one year before the removal date
download_stock_data(removed_ticker, start_date, end_date, 'removed_tickers')
processed_tickers.add(removed_ticker)
# Download historical data for the 'Added Ticker' (if it hasn't been processed as removed)
added_ticker = row['Added Ticker']
if pd.notna(added_ticker) and added_ticker not in processed_tickers:
end_date = datetime.today() # Use today's date for current tickers
start_date = end_date - timedelta(days=365) # Limit to one year before the added date
download_stock_data(added_ticker, start_date, end_date, 'added_tickers')
processed_tickers.add(added_ticker)
# Step 11: Save the unavailable tickers to a CSV file
if unavailable_tickers:
os.makedirs('data/delisted', exist_ok=True)
unavailable_df = pd.DataFrame(unavailable_tickers, columns=['Ticker'])
unavailable_df.to_csv('data/delisted/delisted.csv', index=False)
print(f"Saved unavailable tickers to data/delisted/delisted.csv")