
Commit

Created a central main.py to collectively perform data extraction, data upload to db and model training

ShivamSingal committed Jan 5, 2024
1 parent f92a935 commit 6a91026
Showing 15 changed files with 233 additions and 83 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -2,8 +2,6 @@ __pycache__/

env
.env
data/*.xlsx
model/*.xlsx
.DS_Store

.ipynb_checkpoints
4 changes: 2 additions & 2 deletions backend/routers/predict.py
@@ -1,5 +1,5 @@
from fastapi import APIRouter
import joblib
from model.model import get_model
import numpy
from backend.pydantic_schemas import Predict

@@ -12,7 +12,7 @@

@router.post("")
def get_prediction(request: Predict.PredictRequestBody):
model = joblib.load("random_forest_model.joblib")
model = get_model()
input = numpy.array(
[
request.bed,
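
The router now gets its model through model.model.get_model instead of loading random_forest_model.joblib from the working directory. model/model.py is not part of this diff; a minimal sketch of what such a helper could look like, assuming the trained model is persisted with joblib next to the module and cached after the first load (the path and the caching are assumptions, not shown in the commit):

import os
import joblib

_MODEL_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.joblib")
_model = None  # cached so repeated prediction requests do not reload the file


def get_model():
    """Load the trained model from disk once and reuse it for later calls."""
    global _model
    if _model is None:
        _model = joblib.load(_MODEL_PATH)
    return _model

Centralizing the load this way also lines up with the new top-level main.py, which writes the retrained model to model/model.joblib.
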
17 changes: 13 additions & 4 deletions backend/services/create.py
@@ -7,6 +7,8 @@
from backend.services.search import get_building_units_by_timestamp, get_building_by_lat_lon
from data.data_cleaner import get_cleaned_df

from .delete import delete_units_by_timestamp


def row_to_building(row, db: Session) -> Building:
building = get_building_by_lat_lon(
@@ -55,23 +57,30 @@ def add_listing_data_to_db(db: Session, df: pd.DataFrame):
building_groups = df.groupby([TableHeaders.LAT.value, TableHeaders.LON.value])
for (lat, lon), building_df in building_groups:
building = get_building_by_lat_lon(db, lat=lat, lon=lon)
timestamp = building_df[TableHeaders.DATE.value].iloc[0]
existing_units = get_building_units_by_timestamp(db, building.id, building_df[TableHeaders.DATE.value].iloc[0]).first()
if existing_units is not None:
print(f"Units for building {building.id} for timestamp {timestamp} already exist")
continue
units = building_df.apply(row_to_unit, args=(building.id,), axis=1)
create_units(db, units)
except Exception as e:
print(f"An error occurred: {e}")
print(f"An error occurred while adding listing data to the database: {e}")
# Delete all units for the current date timestamp
timestamp = df[TableHeaders.DATE.value].iloc[0]
delete_units_by_timestamp(db=db, timestamp=timestamp)
raise e


def create_buildings(db: Session, buildings: list[Building]):
db.bulk_save_objects(buildings)
db.commit()
print(f"Created buildings in db")



def create_units(db: Session, units: list[Unit]):
building_id = units[0].building_id
db.bulk_save_objects(units)
db.commit()


add_listing_data_to_db(get_db(), get_cleaned_df())
print(f"Created units in db for building with id {building_id}")
11 changes: 11 additions & 0 deletions backend/services/delete.py
@@ -0,0 +1,11 @@
from sqlalchemy.orm import Session
from datetime import datetime

from backend.db_models import Unit

def delete_units_by_timestamp(db: Session, timestamp: datetime):
try:
db.query(Unit).filter(Unit.timestamp == timestamp).delete()
db.commit()
except Exception as e:
print(f"An error occurred while deleting units by timestamp: {e}")
Binary file modified data/cleaned_data/01-01-2024_cleaned_listings.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion data/configs.py
@@ -22,7 +22,7 @@ def create_chrome_driver(*, debugging_port):
chrome_options.add_argument(f"user-agent={user_agent}")
chrome_options.add_argument(f"--remote-debugging-port={debugging_port}")
chrome_options.add_argument('--window-size=1920x1080')
chrome_options.add_argument("--headless") # Enable headless mode (does not open browser GUI)
# chrome_options.add_argument("--headless") # Enable headless mode (does not open browser GUI)

# First ChromeService instance
chrome_service = ChromeService(executable_path=chrome_driver_path)
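
With --headless commented out, every scraping run now opens a visible browser window. If both modes are still wanted, one option is to expose it as a keyword argument; the sketch below is illustrative (the headless parameter is not in the commit, and the real function also sets a user agent and an explicit chromedriver path):

from selenium import webdriver


def create_chrome_driver(*, debugging_port: int, headless: bool = False) -> webdriver.Chrome:
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f"--remote-debugging-port={debugging_port}")
    chrome_options.add_argument("--window-size=1920x1080")
    if headless:
        chrome_options.add_argument("--headless")  # no browser GUI when enabled
    return webdriver.Chrome(options=chrome_options)
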
22 changes: 8 additions & 14 deletions data/data_cleaner.py
@@ -68,16 +68,12 @@ def parse_pets_value(pets_value):
pets_value = pets_value.lower()
return 1 if any(pet in pets_value for pet in ['dog', 'cat', 'yes']) else 0

def get_raw_df() -> pd.DataFrame:
current_dir = os.path.dirname(os.path.realpath(__file__))
listings_path = os.path.join(current_dir, "rental_listings.xlsx")
return pd.read_excel(listings_path)

def get_cleaned_df() -> pd.DataFrame:
current_dir = os.path.dirname(os.path.realpath(__file__))
cleaned_listings_path = os.path.join(current_dir, "cleaned_listings.xlsx")
cleaned_df = get_cleaned_data(get_raw_df())
cleaned_df.to_excel(cleaned_listings_path, index=False)
def get_raw_df(raw_filepath: str) -> pd.DataFrame:
return pd.read_excel(raw_filepath)

def get_cleaned_df(raw_filepath: str, cleaned_filepath: str) -> pd.DataFrame:
cleaned_df = get_cleaned_data(get_raw_df(raw_filepath))
cleaned_df.to_excel(cleaned_filepath, index=False)
return cleaned_df

# Main function to process the data
@@ -100,14 +96,12 @@ def get_cleaned_data(df):
df = dummies.groupby(dummies.index).max()

# List of columns to check for NaN values
na_columns_to_drop = [TableHeaders.BUILDING.value, TableHeaders.CITY.value, TableHeaders.BED.value, TableHeaders.SQFT.value, TableHeaders.PRICE.value] # replace with your actual column names
na_columns_to_drop = [TableHeaders.BUILDING.value, TableHeaders.CITY.value, TableHeaders.BED.value, TableHeaders.BATH.value, TableHeaders.SQFT.value, TableHeaders.PRICE.value] # replace with your actual column names

# Remove nulls
df.dropna(subset=na_columns_to_drop, inplace=True)

# Filter out listings with prices greater than $5K - these extreme values are outliers
df = df[df[TableHeaders.PRICE.value] < 5000]

return df

get_cleaned_df()
return df
101 changes: 42 additions & 59 deletions data/main.py
@@ -17,75 +17,58 @@

from datetime import datetime

# Webdriver --------------------------------------------------
PADMAPPER_BASE_URL = "https://www.padmapper.com"

# Initialize WebDriver for retrieving rental listings from landing page
fetch_rental_listings_driver: WebDriver = create_chrome_driver(debugging_port=9222)
def extract_raw_data(filepath: str, listing_urls: list[str]) -> pd.DataFrame:
extracted_listing_data = []

current_dir = os.path.dirname(os.path.realpath(__file__))
listings_path = os.path.join(current_dir, "rental_listings.xlsx")
for listing_url in listing_urls:
# Initialize WebDriver for retrieving rental listings from landing page
fetch_rental_listings_driver: WebDriver = create_chrome_driver(debugging_port=9222)
padmapper_scraper = PadmapperScraper(PADMAPPER_BASE_URL, [listing_url])
padmapper_scraper.fetch_rental_listing_urls(fetch_rental_listings_driver)

# Padmapper --------------------------------------------------
# Close the fetch_rental_listing_driver
fetch_rental_listings_driver.quit()

padmapper_base_url = 'https://www.padmapper.com'
padmapper_complete_urls = [
f'{padmapper_base_url}/apartments/toronto-on',
f'{padmapper_base_url}/apartments/vancouver-bc',
f'{padmapper_base_url}/apartments/winnipeg-mb',
f'{padmapper_base_url}/apartments/edmonton-ab',
f'{padmapper_base_url}/apartments/montreal-qc',
]
padmapper_scraper = PadmapperScraper(padmapper_base_url, padmapper_complete_urls)
# Initialize WebDriver for extracting data from every rental listing
get_rental_data_driver: WebDriver = create_chrome_driver(debugging_port=9223)

# Collect rental listing URLs from main page to scrape
# Log all extracted listings to a txt file for data permanence
with open('listings.txt', 'a') as file:
file.write('\n'.join(padmapper_scraper.urls))

current_100_units = []
current_city_units = []

padmapper_scraper.fetch_rental_listing_urls(fetch_rental_listings_driver)
# Scrape page content of collected URLs to get rental listing data
for url in padmapper_scraper.urls:
try:
# on every 100 listings read, write them to the excel sheet (in case of crash)
if len(current_100_units) >= 100:
current_city_units += current_100_units
extracted_listing_data += current_city_units
extracted_listing_data_df = pd.DataFrame(extracted_listing_data, columns=table_columns)
extracted_listing_data_df.to_excel(filepath, index=False)
current_100_units.clear()
rental_listing_data = padmapper_scraper.get_rental_listing_data(get_rental_data_driver, url)
if rental_listing_data:
current_100_units += rental_listing_data
except:
continue

# Close the fetch_rental_listing_driver
fetch_rental_listings_driver.quit()
# Append remaining padmapper listings to all_units
extracted_listing_data += current_100_units

# Initialize WebDriver for extracting data from every rental listing
get_rental_data_driver: WebDriver = create_chrome_driver(debugging_port=9223)
extracted_listing_data_df = pd.DataFrame(extracted_listing_data, columns=table_columns)

# Log all extracted listings to a txt file for data permanence
with open('listings.txt', 'w') as file:
file.write('\n'.join(padmapper_scraper.urls))
extracted_listing_data_df.to_excel(filepath, index=False)

all_units_df = pd.DataFrame(columns=table_columns)
current_units = []
all_units = []
# Close the get_rental_data_driver
get_rental_data_driver.quit()

# Scrape page content of collected URLs to get rental listing data
for url in padmapper_scraper.urls:
try:
# on every 100 listings read, write them to the excel sheet (in case of crash)
if len(current_units) >= 100:
all_units += current_units
# current_df = pd.DataFrame(current_units, columns=table_columns)
# all_units_df = pd.concat([all_units_df, current_df], ignore_index=True)
all_units_df = pd.DataFrame(all_units, columns=table_columns)
all_units_df.to_excel(listings_path, index=False)
current_units.clear()
rental_listing_data = padmapper_scraper.get_rental_listing_data(get_rental_data_driver, url)
# print(rental_listing_data[0][TableHeaders.CITY.value])
# print(table_columns)
if rental_listing_data:
current_units += rental_listing_data
except:
continue
extracted_listing_data_df[TableHeaders.DATE.value].fillna(datetime.now(), inplace=True)

# Append remaining padmapper listings to all_units
all_units += current_units
extracted_listing_data_df.to_excel(filepath, index=False)

# ------------------------------------------------------------

all_units_df: pd.DataFrame = pd.DataFrame(all_units, columns=table_columns)

all_units_df[TableHeaders.DATE.value] = all_units_df[TableHeaders.DATE.value].fillna(datetime.now())

all_units_df.to_excel(listings_path, index=False)

# Close the get_rental_data_driver
get_rental_data_driver.quit()

# -------------------------------------------------------------
return extracted_listing_data_df
6 changes: 6 additions & 0 deletions data/scrapers.py
@@ -192,10 +192,16 @@ def get_rental_listing_data(self, web_driver: WebDriver, url: str) -> list:
web_driver (webdriver): The Selenium WebDriver to use for scraping.
url (str): URL of the listing page to scrape.
"""

try:
if not self._try_load_page(web_driver, url):
return [] # Skip processing this URL and continue with others

# Wait for a summary table before proceeding
WebDriverWait(web_driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='SummaryTable_']"))
)

is_single_unit = self._process_floorplan_panels(web_driver)
link_html_content = web_driver.page_source
print(f"Processing listing: {url}")
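
The added WebDriverWait(...).until(...) raises selenium.common.exceptions.TimeoutException when no summary table appears within 5 seconds. A hedged sketch of wrapping that wait so callers get a boolean instead of an exception (the helper name is illustrative, not part of the scraper):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_summary_table(web_driver, timeout: int = 5) -> bool:
    """Return True once the listing's summary table is present, False on timeout."""
    try:
        WebDriverWait(web_driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='SummaryTable_']"))
        )
        return True
    except TimeoutException:
        return False
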
59 changes: 59 additions & 0 deletions main.py
@@ -0,0 +1,59 @@
import os
from datetime import datetime
import pandas as pd

from data.main import extract_raw_data
from data.data_cleaner import get_cleaned_df

from backend.services.create import add_listing_data_to_db
from backend.dependencies import get_db

from model.model import train_model

current_dir = os.path.dirname(os.path.realpath(__file__))

# current_timestamp = datetime.now().strftime("%d-%m-%Y")

current_timestamp = "01-01-2024"

raw_filepath = f"{current_dir}/data/raw_data/{current_timestamp}_rental_listings.xlsx"
cleaned_filepath = f"{current_dir}/data/cleaned_data/{current_timestamp}_cleaned_listings.xlsx"
model_filepath = f"{current_dir}/model/model.joblib"
model_archive_filepath = f"{current_dir}/model/model_archives/{current_timestamp}_model.joblib"

# # Extract raw data to acquire the rental listing data for the current month
# try:
# extract_raw_data(
# filepath=raw_filepath,
# listing_urls=[
# "https://www.padmapper.com/apartments/vancouver-bc",
# "https://www.padmapper.com/apartments/winnipeg-mb",
# "https://www.padmapper.com/apartments/toronto-on",
# "https://www.padmapper.com/apartments/edmonton-ab",
# "https://www.padmapper.com/apartments/montreal-qc",
# ]
# )

# cleaned_data_df = get_cleaned_df(
# raw_filepath=raw_filepath, cleaned_filepath=cleaned_filepath
# )

# except Exception as e:
# print("An error occurred while extracting data:", e)
# if os.path.exists(cleaned_filepath):
# os.remove(cleaned_filepath)
# exit()

cleaned_data_df = get_cleaned_df(
raw_filepath=raw_filepath, cleaned_filepath=cleaned_filepath
)

# Push the acquired data to Neon DB
# try:
# add_listing_data_to_db(get_db(), cleaned_data_df)
# except Exception as e:
# print("An error occurred while updated the database:", e)
# exit()

# Retrain the model and update the joblib object containing the model
train_model(df=cleaned_data_df, filepath=model_filepath, archive_filepath=model_archive_filepath)
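
model.model.train_model is imported above, but model/model.py is not part of this diff. A minimal sketch of the signature main.py relies on (dataframe in, model written to both the live path and a dated archive), assuming a scikit-learn random forest as the old random_forest_model.joblib filename suggests; the feature/target split is illustrative and the real column selection lives in model/model.py:

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


def train_model(*, df: pd.DataFrame, filepath: str, archive_filepath: str) -> RandomForestRegressor:
    # "Price" as the target column is an assumption for this sketch.
    X = df.drop(columns=["Price"])
    y = df["Price"]
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    joblib.dump(model, filepath)          # model served by the prediction API
    joblib.dump(model, archive_filepath)  # dated copy for this month's run
    return model
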