
Commit ece8b23
Refactored scraping to include date and neighbourhood fields
Co-authored-by: Alyssa D'Souza <[email protected]>
ShivamSingal and alyssadsouza committed Jan 4, 2024
1 parent 3b8c3d6 commit ece8b23
Showing 12 changed files with 63 additions and 43 deletions.
4 changes: 2 additions & 2 deletions backend/app.py
@@ -3,10 +3,10 @@
 from backend.database import engine, Base
 from backend.routers import predict
 from backend.routers import analysis
-# from backend.db_models import Building, Unit
+from backend.db_models import Building, Unit


-# Base.metadata.drop_all(bind=engine,tables=[Unit.__table__, Building.__table__])
+Base.metadata.drop_all(bind=engine,tables=[Unit.__table__, Building.__table__])
 Base.metadata.create_all(engine)

 app = FastAPI()
14 changes: 8 additions & 6 deletions backend/db_models.py
@@ -1,6 +1,7 @@
 from backend.database import Base
-from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Float
+from sqlalchemy import Boolean, Column, ForeignKey, Integer, String, Float, DateTime
 from sqlalchemy.orm import relationship
+from datetime import datetime

 class Unit(Base):
     __tablename__ = "units"
@@ -16,17 +17,18 @@ class Unit(Base):
     hardwood_floor = Column(Boolean, default=False)
     high_ceilings = Column(Boolean, default=False)
     in_unit_laundry = Column(Boolean, default=False)
+    timestamp = Column(DateTime, default=datetime.now())

     building = relationship("Building", back_populates="units")

 class Building(Base):
     __tablename__ = "buildings"
     id = Column(Integer, primary_key=True, index=True, autoincrement=True, unique=True)
-    name = Column(String, primary_key=True, index=True, unique=True)
-    address = Column(String, primary_key=True, index=True)
-    city = Column(String, primary_key=True, index=True)
-    lat = Column(Float)
-    lon = Column(Float)
+    name = Column(String, index=True)
+    address = Column(String, index=True)
+    city = Column(String, index=True)
+    lat = Column(Float, primary_key=True)
+    lon = Column(Float, primary_key=True)
     controlled_access = Column(Boolean, default=False)
     fitness_center = Column(Boolean, default=False)
     outdoor_space = Column(Boolean, default=False)
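
Buildings are now keyed by coordinates rather than by name, and units carry a scrape timestamp. A minimal sketch of how the reworked schema reads from a session (the coordinates are hypothetical, and SessionLocal is assumed to be the session factory in backend.database):

    from datetime import datetime

    from backend.database import SessionLocal
    from backend.db_models import Building, Unit

    session = SessionLocal()

    # The same (lat, lon) pair now always resolves to one Building row,
    # even when the scraped name or address strings differ between runs.
    building = session.query(Building).filter_by(lat=43.4723, lon=-80.5449).first()

    # Caveat: default=datetime.now() is evaluated once, at class-definition
    # time, so rows relying on the default share a single value; the loader
    # in backend/services/create.py passes an explicit timestamp instead.
    if building is not None:
        unit = Unit(building_id=building.id, timestamp=datetime(2024, 1, 4))
        session.add(unit)
        session.commit()
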
2 changes: 1 addition & 1 deletion backend/scripts/seed.py
@@ -5,8 +5,8 @@
 session = SessionLocal()

 # Clear ALL data from ALL tables
-session.query(Building).delete()
 session.query(Unit).delete()
+session.query(Building).delete()

 # Recreate tables after dropping them (if needed)
 Base.metadata.create_all(engine)
22 changes: 14 additions & 8 deletions backend/services/create.py
@@ -4,11 +4,13 @@
 from constants import TableHeaders, UnitAmenities, BuildingAmenities
 from backend.db_models import Building, Unit
 from backend.dependencies import get_db
-from backend.services.search import get_building_by_name
+from backend.services.search import get_building_units_by_timestamp, get_building_by_lat_lon
 from data.data_cleaner import get_cleaned_df


-def row_to_building(row) -> Building:
+def row_to_building(row, db: Session) -> Building:
+    building = get_building_by_lat_lon(
+        db, lat=row[TableHeaders.LAT.value], lon=row[TableHeaders.LON.value])
     return Building(
         name=row[TableHeaders.BUILDING.value],
         address=row[TableHeaders.ADDRESS.value],
@@ -23,7 +25,7 @@ def row_to_building(row) -> Building:
         storage=row[BuildingAmenities.STORAGE.value],
         swimming_pool=row[BuildingAmenities.SWIMMING_POOL.value],
         pets=row[TableHeaders.PETS.value]
-    )
+    ) if building is None else None


 def row_to_unit(row, building_id) -> Unit:
@@ -38,20 +40,24 @@ def row_to_unit(row, building_id) -> Unit:
         furnished=row[UnitAmenities.FURNISHED.value],
         hardwood_floor=row[UnitAmenities.HARDWOOD_FLOOR.value],
         high_ceilings=row[UnitAmenities.HIGH_CEILINGS.value],
-        in_unit_laundry=row[UnitAmenities.IN_UNIT_LAUNDRY.value]
+        in_unit_laundry=row[UnitAmenities.IN_UNIT_LAUNDRY.value],
+        timestamp=row[TableHeaders.DATE.value].to_pydatetime()
     )


 def add_listing_data_to_db(db: Session, df: pd.DataFrame):
     try:
         # First add all Building objects by getting all rows with a unique building and converting each to a Building object
         buildings = df.drop_duplicates(
-            subset=TableHeaders.BUILDING.value, keep='first').apply(row_to_building, axis=1)
+            subset=[TableHeaders.LAT.value, TableHeaders.LON.value], keep='first').apply(row_to_building, args=(db,), axis=1).dropna()
         create_buildings(db, buildings)
         # Now add all Unit objects associated with each building
-        building_groups = df.groupby(TableHeaders.BUILDING.value)
-        for building_name, building_df in building_groups:
-            building = get_building_by_name(db, building_name)
+        building_groups = df.groupby([TableHeaders.LAT.value, TableHeaders.LON.value])
+        for (lat, lon), building_df in building_groups:
+            building = get_building_by_lat_lon(db, lat=lat, lon=lon)
+            existing_units = get_building_units_by_timestamp(db, building.id, building_df[TableHeaders.DATE.value].iloc[0]).first()
+            if existing_units is not None:
+                continue
             units = building_df.apply(row_to_unit, args=(building.id,), axis=1)
             create_units(db, units)
     except Exception as e:
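
With buildings grouped by (lat, lon) and the timestamp check in place, re-loading the same cleaned frame is idempotent per building. A rough usage sketch, assuming SessionLocal comes from backend.database and that get_cleaned_df() yields the Latitude, Longitude, and Date columns the loader expects:

    from backend.database import SessionLocal
    from backend.services.create import add_listing_data_to_db
    from data.data_cleaner import get_cleaned_df

    session = SessionLocal()
    df = get_cleaned_df()

    # The first call inserts new buildings and their units; the second is
    # a no-op, since each building already has units at the frame's Date.
    add_listing_data_to_db(session, df)
    add_listing_data_to_db(session, df)
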
13 changes: 9 additions & 4 deletions backend/services/search.py
@@ -1,9 +1,14 @@
+from datetime import datetime
 import pandas as pd
 from sqlalchemy.orm import Session

 from backend.pydantic_schemas.Building import Building as BuildingObj
-from backend.db_models import Building
+from backend.db_models import Building, Unit


-def get_building_by_name(db: Session, name: str) -> BuildingObj:
+def get_building_by_name(db: Session, name: str):
     return db.query(Building).filter_by(name=name).first()

+def get_building_by_lat_lon(db: Session, lat: float, lon: float):
+    return db.query(Building).filter_by(lat=lat, lon=lon).first()
+
+def get_building_units_by_timestamp(db: Session, building_id: int, timestamp: datetime):
+    return db.query(Unit).filter_by(building_id=building_id, timestamp=timestamp)
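
Note the split in return types: get_building_by_lat_lon returns a row (or None), while get_building_units_by_timestamp returns an unexecuted Query, leaving the terminal call to the caller. A short sketch with hypothetical coordinates and date:

    from datetime import datetime

    from backend.database import SessionLocal
    from backend.services.search import (
        get_building_by_lat_lon,
        get_building_units_by_timestamp,
    )

    session = SessionLocal()
    building = get_building_by_lat_lon(session, lat=43.4723, lon=-80.5449)
    if building is not None:
        query = get_building_units_by_timestamp(session, building.id, datetime(2024, 1, 4))
        already_loaded = query.first() is not None  # cheap existence probe, as in create.py
        units = query.all()                         # or materialize the full list
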
17 changes: 3 additions & 14 deletions constants.py
@@ -2,6 +2,7 @@

 class TableHeaders(Enum):
     BUILDING = 'Building'
+    NEIGHBOURHOOD = 'Neighbourhood'
     ADDRESS = 'Address'
     CITY = 'City'
     LISTING = 'Listing'
@@ -14,6 +15,7 @@ class TableHeaders(Enum):
     PETS = 'Pets'
     LAT = 'Latitude'
     LON = 'Longitude'
+    DATE = 'Date'

 class UnitAmenities(Enum):
     BALCONY = 'Balcony'
@@ -111,17 +113,4 @@ class BuildingAmenities(Enum):
     }
 ]

-table_columns = [
-    TableHeaders.BUILDING.value,
-    TableHeaders.ADDRESS.value,
-    TableHeaders.LISTING.value,
-    TableHeaders.BED.value,
-    TableHeaders.BATH.value,
-    TableHeaders.SQFT.value,
-    TableHeaders.PRICE.value,
-    TableHeaders.UNIT_AMENITIES.value,
-    TableHeaders.BUILDING_AMENITIES.value,
-    TableHeaders.PETS.value,
-    TableHeaders.LAT.value,
-    TableHeaders.LON.value
-]
+table_columns = [table_header.value for table_header in TableHeaders]
Binary file not shown.
Binary file not shown.
7 changes: 6 additions & 1 deletion data/data_cleaner.py
@@ -99,10 +99,15 @@ def get_cleaned_data(df):
     dummies = pd.get_dummies(df_exploded, columns=[TableHeaders.UNIT_AMENITIES.value], prefix='', prefix_sep='', dtype=int)
     df = dummies.groupby(dummies.index).max()

+    # List of columns to check for NaN values
+    na_columns_to_drop = [TableHeaders.BUILDING.value, TableHeaders.CITY.value, TableHeaders.BED.value, TableHeaders.SQFT.value, TableHeaders.PRICE.value] # replace with your actual column names

     # Remove nulls
-    df.dropna(inplace=True)
+    df.dropna(subset=na_columns_to_drop, inplace=True)
+
+    # Filter out listings with prices greater than $5K - these extreme values are outliers
+    df = df[df[TableHeaders.PRICE.value] < 5000]

     return df

 get_cleaned_df()
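
The switch from a blanket dropna() to a column subset keeps rows that are only missing non-critical fields. A toy frame illustrating the difference (the values are made up):

    import pandas as pd

    from constants import TableHeaders

    df = pd.DataFrame({
        TableHeaders.BUILDING.value: ['A Tower', 'B Lofts'],
        TableHeaders.CITY.value: ['Waterloo', 'Waterloo'],
        TableHeaders.BED.value: [1, 2],
        TableHeaders.SQFT.value: [550.0, 800.0],
        TableHeaders.PRICE.value: [None, 2100.0],
        TableHeaders.PETS.value: ['Yes', None],
    })

    critical = [TableHeaders.BUILDING.value, TableHeaders.CITY.value,
                TableHeaders.BED.value, TableHeaders.SQFT.value,
                TableHeaders.PRICE.value]

    # Drops only row 0 (missing Price); row 1 survives despite missing Pets,
    # where the old df.dropna() would have discarded both rows.
    df = df.dropna(subset=critical)

    # The new $5K cut-off then trims outlier prices.
    df = df[df[TableHeaders.PRICE.value] < 5000]
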
12 changes: 9 additions & 3 deletions data/main.py
@@ -2,7 +2,7 @@
 import os

 from constants import (
-    table_columns,
+    table_columns, TableHeaders
 )

 from data.configs import (
@@ -15,6 +15,8 @@

 import pandas as pd

+from datetime import datetime
+
 # Webdriver --------------------------------------------------

 # Initialize WebDriver for retrieving rental listings from landing page
@@ -65,6 +67,8 @@
             all_units_df.to_excel(listings_path, index=False)
             current_units.clear()
         rental_listing_data = padmapper_scraper.get_rental_listing_data(get_rental_data_driver, url)
+        # print(rental_listing_data[0][TableHeaders.CITY.value])
+        # print(table_columns)
         if rental_listing_data:
             current_units += rental_listing_data
     except:
@@ -75,9 +79,11 @@

 # ------------------------------------------------------------

-all_listings_df = all_units_df = pd.DataFrame(all_units, columns=table_columns)
+all_units_df: pd.DataFrame = pd.DataFrame(all_units, columns=table_columns)
+
+all_units_df[TableHeaders.DATE.value] = all_units_df[TableHeaders.DATE.value].fillna(datetime.now())

-all_listings_df.to_excel(listings_path, index=False)
+all_units_df.to_excel(listings_path, index=False)

 # Close the get_rental_data_driver
 get_rental_data_driver.quit()
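
Rows whose Date could not be scraped are backfilled with the run time before the frame is written out. The fillna step in isolation (frame contents are hypothetical):

    from datetime import datetime

    import pandas as pd

    from constants import TableHeaders

    df = pd.DataFrame({TableHeaders.DATE.value: [datetime(2024, 1, 1), None]})
    # Only the missing entry is replaced; dated rows keep their scrape time.
    df[TableHeaders.DATE.value] = df[TableHeaders.DATE.value].fillna(datetime.now())
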
Binary file added data/raw_data/01-01-2024_rental_listings.xlsx
Binary file not shown.
15 changes: 11 additions & 4 deletions data/scrapers.py
@@ -216,7 +216,7 @@ def get_rental_units_data_by_listing(self, link_html_content, is_single_unit):
         # Parse the HTML with Beautiful Soup
         soup = BeautifulSoup(link_html_content, 'html.parser')

-        building_title_text, price_text, bed_text, bath_text, sqft_text, address_text, pets_text, lat_text, lon_text, city_text = DataExtractor.extract_building_details(soup)
+        building_title_text, neighborhood_title_text, price_text, bed_text, bath_text, sqft_text, address_text, pets_text, lat_text, lon_text, city_text = DataExtractor.extract_building_details(soup)

         unit_amenities_text, building_amenities_text = DataExtractor.extract_amenities(soup)

@@ -231,21 +231,24 @@ def get_rental_units_data_by_listing(self, link_html_content, is_single_unit):
                 TableHeaders.SQFT.value: sqft_text,
             }
         ]
+
         rental_listing_units = []

         # Concatenate each row of rental unit data with columns for building and rental unit amenities
         for unit_data in all_units_data:
+            unit_data[TableHeaders.BUILDING.value] = building_title_text
+            unit_data[TableHeaders.NEIGHBOURHOOD.value] = neighborhood_title_text
             unit_data[TableHeaders.PETS.value] = pets_text
             unit_data[TableHeaders.UNIT_AMENITIES.value] = unit_amenities_text
             unit_data[TableHeaders.BUILDING_AMENITIES.value] = building_amenities_text
             unit_data[TableHeaders.ADDRESS.value] = address_text
-            unit_data[TableHeaders.BUILDING.value] = building_title_text
             unit_data[TableHeaders.CITY.value] = city_text
             unit_data[TableHeaders.LAT.value] = lat_text
             unit_data[TableHeaders.LON.value] = lon_text
             rental_listing_units.append(unit_data)

         self.listings += rental_listing_units
-        print(f"Extracted {len(rental_listing_units)} units")
+        print(f"Extracted {len(rental_listing_units)} units in {city_text}")
         print(f"Total units: {len(self.listings)}")
         with open('listings.pkl', 'wb') as file:
             pickle.dump(self.listings, file)
@@ -266,6 +269,10 @@ def extract_building_details(soup: BeautifulSoup) -> tuple:
         building_title = soup.find('h1', class_=lambda cls: cls and 'FullDetail_street_' in cls)
         building_title_text = re.split(r'[^\w ]+', building_title.get_text())[0] if building_title else ""

+        neighborhood_title_sep = soup.find('span', class_=lambda cls: cls and 'FullDetail_cityStateDivider_' in cls)
+        neighborhood_title = neighborhood_title_sep.find_next_sibling('a', class_=lambda cls: cls and 'FullDetail_cityStateLink_' in cls)
+        neighborhood_title_text = re.split(r'[^\w ]+', neighborhood_title.get_text())[0] if neighborhood_title else ""
+
         details = soup.find('div', class_=lambda cls: cls and 'SummaryTable_summaryTable_' in cls)

         [price_text, bed_text, bath_text, sqft_text, address_text, pets_text] = DataExtractor.extract_summary_table(details)
@@ -282,7 +289,7 @@ def extract_building_details(soup: BeautifulSoup) -> tuple:
         city_tag = soup.find('meta', {'name': 'place:locality'})
         city_text = city_tag['content'] if city_tag else ""

-        return (building_title_text, price_text, bed_text, bath_text, sqft_text, address_text, pets_text, lat_text, lon_text, city_text)
+        return (building_title_text, neighborhood_title_text, price_text, bed_text, bath_text, sqft_text, address_text, pets_text, lat_text, lon_text, city_text)

     @staticmethod
     def extract_summary_table(soup: BeautifulSoup) -> list:
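
The neighbourhood comes from the sibling link that follows the city/state divider span. A standalone sketch of that extraction against a hypothetical markup fragment (the class-name suffixes are invented; the scraper matches on prefixes, presumably because PadMapper appends build-specific suffixes):

    import re

    from bs4 import BeautifulSoup

    html = ('<span class="FullDetail_cityStateDivider_a1b2">|</span>'
            '<a class="FullDetail_cityStateLink_a1b2">Northdale, Waterloo, ON</a>')
    soup = BeautifulSoup(html, 'html.parser')

    sep = soup.find('span', class_=lambda cls: cls and 'FullDetail_cityStateDivider_' in cls)
    link = sep.find_next_sibling('a', class_=lambda cls: cls and 'FullDetail_cityStateLink_' in cls) if sep else None

    # Splitting on the first non-word run trims ", Waterloo, ON" to the name.
    neighbourhood = re.split(r'[^\w ]+', link.get_text())[0] if link else ""
    print(neighbourhood)  # -> "Northdale"
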
