
Commit

Created a central main.py to collectively perform data extraction, data upload to db and model training

ShivamSingal committed Jan 5, 2024
1 parent f92a935 commit 6a91026
Showing 15 changed files with 233 additions and 83 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -2,8 +2,6 @@ __pycache__/

env
.env
data/*.xlsx
model/*.xlsx
.DS_Store

.ipynb_checkpoints
4 changes: 2 additions & 2 deletions backend/routers/predict.py
@@ -1,5 +1,5 @@
from fastapi import APIRouter
import joblib
from model.model import get_model
import numpy
from backend.pydantic_schemas import Predict

@@ -12,7 +12,7 @@

@router.post("")
def get_prediction(request: Predict.PredictRequestBody):
model = joblib.load("random_forest_model.joblib")
model = get_model()
input = numpy.array(
[
request.bed,
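
The router now gets its model through model.model.get_model instead of loading random_forest_model.joblib from the working directory. model/model.py is not part of this diff; a minimal sketch of what such a helper could look like, assuming the trained model is persisted with joblib next to the module and cached after the first load (the path and the caching are assumptions, not shown in the commit):

import os
import joblib

_MODEL_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "model.joblib")
_model = None  # cached so repeated prediction requests do not reload the file


def get_model():
    """Load the trained model from disk once and reuse it for later calls."""
    global _model
    if _model is None:
        _model = joblib.load(_MODEL_PATH)
    return _model

Centralizing the load this way also lines up with the new top-level main.py, which writes the retrained model to model/model.joblib.
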
17 changes: 13 additions & 4 deletions backend/services/create.py
@@ -7,6 +7,8 @@
from backend.services.search import get_building_units_by_timestamp, get_building_by_lat_lon
from data.data_cleaner import get_cleaned_df

from .delete import delete_units_by_timestamp


def row_to_building(row, db: Session) -> Building:
building = get_building_by_lat_lon(
@@ -55,23 +57,30 @@ def add_listing_data_to_db(db: Session, df: pd.DataFrame):
building_groups = df.groupby([TableHeaders.LAT.value, TableHeaders.LON.value])
for (lat, lon), building_df in building_groups:
building = get_building_by_lat_lon(db, lat=lat, lon=lon)
timestamp = building_df[TableHeaders.DATE.value].iloc[0]
existing_units = get_building_units_by_timestamp(db, building.id, building_df[TableHeaders.DATE.value].iloc[0]).first()
if existing_units is not None:
print(f"Units for building {building.id} for timestamp {timestamp} already exist")
continue
units = building_df.apply(row_to_unit, args=(building.id,), axis=1)
create_units(db, units)
except Exception as e:
print(f"An error occurred: {e}")
print(f"An error occurred while adding listing data to the database: {e}")
# Delete all units for the current date timestamp
timestamp = df[TableHeaders.DATE.value].iloc[0]
delete_units_by_timestamp(db=db, timestamp=timestamp)
raise e


def create_buildings(db: Session, buildings: list[Building]):
db.bulk_save_objects(buildings)
db.commit()
print(f"Created buildings in db")



def create_units(db: Session, units: list[Unit]):
building_id = units[0].building_id
db.bulk_save_objects(units)
db.commit()


add_listing_data_to_db(get_db(), get_cleaned_df())
print(f"Created units in db for building with id {building_id}")
11 changes: 11 additions & 0 deletions backend/services/delete.py
@@ -0,0 +1,11 @@
from sqlalchemy.orm import Session
from datetime import datetime

from backend.db_models import Unit

def delete_units_by_timestamp(db: Session, timestamp: datetime):
try:
db.query(Unit).filter(Unit.timestamp == timestamp).delete()
db.commit()
except Exception as e:
print(f"An error occurred while deleting units by timestamp: {e}")
Binary file modified data/cleaned_data/01-01-2024_cleaned_listings.xlsx
Binary file not shown.
2 changes: 1 addition & 1 deletion data/configs.py
@@ -22,7 +22,7 @@ def create_chrome_driver(*, debugging_port):
chrome_options.add_argument(f"user-agent={user_agent}")
chrome_options.add_argument(f"--remote-debugging-port={debugging_port}")
chrome_options.add_argument('--window-size=1920x1080')
chrome_options.add_argument("--headless") # Enable headless mode (does not open browser GUI)
# chrome_options.add_argument("--headless") # Enable headless mode (does not open browser GUI)

# First ChromeService instance
chrome_service = ChromeService(executable_path=chrome_driver_path)
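
With --headless commented out, every scraping run now opens a visible browser window. If both modes are still wanted, one option is to expose it as a keyword argument; the sketch below is illustrative (the headless parameter is not in the commit, and the real function also sets a user agent and an explicit chromedriver path):

from selenium import webdriver


def create_chrome_driver(*, debugging_port: int, headless: bool = False) -> webdriver.Chrome:
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument(f"--remote-debugging-port={debugging_port}")
    chrome_options.add_argument("--window-size=1920x1080")
    if headless:
        chrome_options.add_argument("--headless")  # no browser GUI when enabled
    return webdriver.Chrome(options=chrome_options)
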
22 changes: 8 additions & 14 deletions data/data_cleaner.py
@@ -68,16 +68,12 @@ def parse_pets_value(pets_value):
pets_value = pets_value.lower()
return 1 if any(pet in pets_value for pet in ['dog', 'cat', 'yes']) else 0

def get_raw_df() -> pd.DataFrame:
current_dir = os.path.dirname(os.path.realpath(__file__))
listings_path = os.path.join(current_dir, "rental_listings.xlsx")
return pd.read_excel(listings_path)

def get_cleaned_df() -> pd.DataFrame:
current_dir = os.path.dirname(os.path.realpath(__file__))
cleaned_listings_path = os.path.join(current_dir, "cleaned_listings.xlsx")
cleaned_df = get_cleaned_data(get_raw_df())
cleaned_df.to_excel(cleaned_listings_path, index=False)
def get_raw_df(raw_filepath: str) -> pd.DataFrame:
return pd.read_excel(raw_filepath)

def get_cleaned_df(raw_filepath: str, cleaned_filepath: str) -> pd.DataFrame:
cleaned_df = get_cleaned_data(get_raw_df(raw_filepath))
cleaned_df.to_excel(cleaned_filepath, index=False)
return cleaned_df

# Main function to process the data
@@ -100,14 +96,12 @@ def get_cleaned_data(df):
df = dummies.groupby(dummies.index).max()

# List of columns to check for NaN values
na_columns_to_drop = [TableHeaders.BUILDING.value, TableHeaders.CITY.value, TableHeaders.BED.value, TableHeaders.SQFT.value, TableHeaders.PRICE.value] # replace with your actual column names
na_columns_to_drop = [TableHeaders.BUILDING.value, TableHeaders.CITY.value, TableHeaders.BED.value, TableHeaders.BATH.value, TableHeaders.SQFT.value, TableHeaders.PRICE.value] # replace with your actual column names

# Remove nulls
df.dropna(subset=na_columns_to_drop, inplace=True)

# Filter out listings with prices greater than $5K - these extreme values are outliers
df = df[df[TableHeaders.PRICE.value] < 5000]

return df

get_cleaned_df()
return df
101 changes: 42 additions & 59 deletions data/main.py
@@ -17,75 +17,58 @@

from datetime import datetime

# Webdriver --------------------------------------------------
PADMAPPER_BASE_URL = "https://www.padmapper.com"

# Initialize WebDriver for retrieving rental listings from landing page
fetch_rental_listings_driver: WebDriver = create_chrome_driver(debugging_port=9222)
def extract_raw_data(filepath: str, listing_urls: list[str]) -> pd.DataFrame:
extracted_listing_data = []

current_dir = os.path.dirname(os.path.realpath(__file__))
listings_path = os.path.join(current_dir, "rental_listings.xlsx")
for listing_url in listing_urls:
# Initialize WebDriver for retrieving rental listings from landing page
fetch_rental_listings_driver: WebDriver = create_chrome_driver(debugging_port=9222)
padmapper_scraper = PadmapperScraper(PADMAPPER_BASE_URL, [listing_url])
padmapper_scraper.fetch_rental_listing_urls(fetch_rental_listings_driver)

# Padmapper --------------------------------------------------
# Close the fetch_rental_listing_driver
fetch_rental_listings_driver.quit()

padmapper_base_url = 'https://www.padmapper.com'
padmapper_complete_urls = [
f'{padmapper_base_url}/apartments/toronto-on',
f'{padmapper_base_url}/apartments/vancouver-bc',
f'{padmapper_base_url}/apartments/winnipeg-mb',
f'{padmapper_base_url}/apartments/edmonton-ab',
f'{padmapper_base_url}/apartments/montreal-qc',
]
padmapper_scraper = PadmapperScraper(padmapper_base_url, padmapper_complete_urls)
# Initialize WebDriver for extracting data from every rental listing
get_rental_data_driver: WebDriver = create_chrome_driver(debugging_port=9223)

# Collect rental listing URLs from main page to scrape
# Log all extracted listings to a txt file for data permanence
with open('listings.txt', 'a') as file:
file.write('\n'.join(padmapper_scraper.urls))

current_100_units = []
current_city_units = []

padmapper_scraper.fetch_rental_listing_urls(fetch_rental_listings_driver)
# Scrape page content of collected URLs to get rental listing data
for url in padmapper_scraper.urls:
try:
# on every 100 listings read, write them to the excel sheet (in case of crash)
if len(current_100_units) >= 100:
current_city_units += current_100_units
extracted_listing_data += current_city_units
extracted_listing_data_df = pd.DataFrame(extracted_listing_data, columns=table_columns)
extracted_listing_data_df.to_excel(filepath, index=False)
current_100_units.clear()
rental_listing_data = padmapper_scraper.get_rental_listing_data(get_rental_data_driver, url)
if rental_listing_data:
current_100_units += rental_listing_data
except:
continue

# Close the fetch_rental_listing_driver
fetch_rental_listings_driver.quit()
# Append remaining padmapper listings to all_units
extracted_listing_data += current_100_units

# Initialize WebDriver for extracting data from every rental listing
get_rental_data_driver: WebDriver = create_chrome_driver(debugging_port=9223)
extracted_listing_data_df = pd.DataFrame(extracted_listing_data, columns=table_columns)

# Log all extracted listings to a txt file for data permanence
with open('listings.txt', 'w') as file:
file.write('\n'.join(padmapper_scraper.urls))
extracted_listing_data_df.to_excel(filepath, index=False)

all_units_df = pd.DataFrame(columns=table_columns)
current_units = []
all_units = []
# Close the get_rental_data_driver
get_rental_data_driver.quit()

# Scrape page content of collected URLs to get rental listing data
for url in padmapper_scraper.urls:
try:
# on every 100 listings read, write them to the excel sheet (in case of crash)
if len(current_units) >= 100:
all_units += current_units
# current_df = pd.DataFrame(current_units, columns=table_columns)
# all_units_df = pd.concat([all_units_df, current_df], ignore_index=True)
all_units_df = pd.DataFrame(all_units, columns=table_columns)
all_units_df.to_excel(listings_path, index=False)
current_units.clear()
rental_listing_data = padmapper_scraper.get_rental_listing_data(get_rental_data_driver, url)
# print(rental_listing_data[0][TableHeaders.CITY.value])
# print(table_columns)
if rental_listing_data:
current_units += rental_listing_data
except:
continue
extracted_listing_data_df[TableHeaders.DATE.value].fillna(datetime.now(), inplace=True)

# Append remaining padmapper listings to all_units
all_units += current_units
extracted_listing_data_df.to_excel(filepath, index=False)

# ------------------------------------------------------------

all_units_df: pd.DataFrame = pd.DataFrame(all_units, columns=table_columns)

all_units_df[TableHeaders.DATE.value] = all_units_df[TableHeaders.DATE.value].fillna(datetime.now())

all_units_df.to_excel(listings_path, index=False)

# Close the get_rental_data_driver
get_rental_data_driver.quit()

# -------------------------------------------------------------
return extracted_listing_data_df
6 changes: 6 additions & 0 deletions data/scrapers.py
@@ -192,10 +192,16 @@ def get_rental_listing_data(self, web_driver: WebDriver, url: str) -> list:
web_driver (webdriver): The Selenium WebDriver to use for scraping.
url (str): URL of the listing page to scrape.
"""

try:
if not self._try_load_page(web_driver, url):
return [] # Skip processing this URL and continue with others

# Wait for a summary table before proceeding
WebDriverWait(web_driver, 5).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='SummaryTable_']"))
)

is_single_unit = self._process_floorplan_panels(web_driver)
link_html_content = web_driver.page_source
print(f"Processing listing: {url}")
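
The added WebDriverWait(...).until(...) raises selenium.common.exceptions.TimeoutException when no summary table appears within 5 seconds. A hedged sketch of wrapping that wait so callers get a boolean instead of an exception (the helper name is illustrative, not part of the scraper):

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_summary_table(web_driver, timeout: int = 5) -> bool:
    """Return True once the listing's summary table is present, False on timeout."""
    try:
        WebDriverWait(web_driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[class*='SummaryTable_']"))
        )
        return True
    except TimeoutException:
        return False
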
59 changes: 59 additions & 0 deletions main.py
@@ -0,0 +1,59 @@
import os
from datetime import datetime
import pandas as pd

from data.main import extract_raw_data
from data.data_cleaner import get_cleaned_df

from backend.services.create import add_listing_data_to_db
from backend.dependencies import get_db

from model.model import train_model

current_dir = os.path.dirname(os.path.realpath(__file__))

# current_timestamp = datetime.now().strftime("%d-%m-%Y")

current_timestamp = "01-01-2024"

raw_filepath = f"{current_dir}/data/raw_data/{current_timestamp}_rental_listings.xlsx"
cleaned_filepath = f"{current_dir}/data/cleaned_data/{current_timestamp}_cleaned_listings.xlsx"
model_filepath = f"{current_dir}/model/model.joblib"
model_archive_filepath = f"{current_dir}/model/model_archives/{current_timestamp}_model.joblib"

# # Extract raw data to acquire the rental listing data for the current month
# try:
# extract_raw_data(
# filepath=raw_filepath,
# listing_urls=[
# "https://www.padmapper.com/apartments/vancouver-bc",
# "https://www.padmapper.com/apartments/winnipeg-mb",
# "https://www.padmapper.com/apartments/toronto-on",
# "https://www.padmapper.com/apartments/edmonton-ab",
# "https://www.padmapper.com/apartments/montreal-qc",
# ]
# )

# cleaned_data_df = get_cleaned_df(
# raw_filepath=raw_filepath, cleaned_filepath=cleaned_filepath
# )

# except Exception as e:
# print("An error occurred while extracting data:", e)
# if os.path.exists(cleaned_filepath):
# os.remove(cleaned_filepath)
# exit()

cleaned_data_df = get_cleaned_df(
raw_filepath=raw_filepath, cleaned_filepath=cleaned_filepath
)

# Push the acquired data to Neon DB
# try:
# add_listing_data_to_db(get_db(), cleaned_data_df)
# except Exception as e:
# print("An error occurred while updated the database:", e)
# exit()

# Retrain the model and update the joblib object containing the model
train_model(df=cleaned_data_df, filepath=model_filepath, archive_filepath=model_archive_filepath)
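
model.model.train_model is imported above, but model/model.py is not part of this diff. A minimal sketch of the signature main.py relies on (dataframe in, model written to both the live path and a dated archive), assuming a scikit-learn random forest as the old random_forest_model.joblib filename suggests; the feature/target split is illustrative and the real column selection lives in model/model.py:

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor


def train_model(*, df: pd.DataFrame, filepath: str, archive_filepath: str) -> RandomForestRegressor:
    # "Price" as the target column is an assumption for this sketch.
    X = df.drop(columns=["Price"])
    y = df["Price"]
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    joblib.dump(model, filepath)          # model served by the prediction API
    joblib.dump(model, archive_filepath)  # dated copy for this month's run
    return model
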