Skip to content

Commit

Permalink
web scraping 6 examples working (#91)
Browse files Browse the repository at this point in the history
  • Loading branch information
bmagz authored Nov 26, 2023
1 parent 1f04812 commit d6e3c32
Show file tree
Hide file tree
Showing 11 changed files with 428 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
__pycache__/
/__pycache__/
/.venv/
.venv/
/venv/
/venv3.11/
.env
Expand Down
2 changes: 2 additions & 0 deletions examples/web_scraper/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OPENAI_API_KEY=sk-XXX
USER_AGENT=
46 changes: 46 additions & 0 deletions examples/web_scraper/cars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Car(BaseModel):
    """Structured car details extracted from a scraped cars.com research page."""

    price: float  # listed price (numeric; currency symbol stripped by extraction)
    mpg: str  # miles-per-gallon figure, kept as text (may be a city/highway range)
    seating: int  # seating capacity
    horsepower: int
    weight: int  # curb weight — units not shown in this file; presumably lbs, TODO confirm
    fuel_size: float  # fuel tank size
    warranty_basic: str
    warranty_powertrain: str
    warranty_roadside: str


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording is
# left exactly as written.
@monkey.patch
def extract_car(content: str) -> Optional[Car]:
    """
    Examine the content string and extract the car details for the price, miles per gallon, seating, horsepower,
    weight, fuel tank size, and warranty.
    """


if __name__ == '__main__':
    # Scrape the research page and show the raw text that came back.
    # Alternate target: "https://www.cars.com/research/ford-mustang-2024/"
    page_url = "https://www.cars.com/research/mazda-cx_90-2024/"
    scraped_blocks = scrape_url(url=page_url)
    print(scraped_blocks)

    # Turn the first scraped block into a structured Car via MonkeyPatch.
    extracted_car = extract_car(scraped_blocks[0])
    print(extracted_car)
55 changes: 55 additions & 0 deletions examples/web_scraper/cocktail.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Cocktail(BaseModel):
    """Structured cocktail recipe extracted from a scraped Kindred Cocktails page."""

    name: str  # cocktail name, e.g. "Black Rose"
    ingredients: List[str] = []  # measured ingredient lines, e.g. "2 oz Bourbon"
    instructions: str  # free-text preparation instructions
    similar: List[str] = []  # names of the "Similar cocktails" listed on the page


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording is
# left exactly as written.
@monkey.patch
def extract_cocktail(content: str) -> Optional[Cocktail]:
    """
    Examine the content string and extract the cocktail details for the ingredients, instructions, and similar cocktails.
    """


@monkey.align
def align_extract_cocktail() -> None:
    """Align extract_cocktail with one worked example (raw page text -> Cocktail).

    NOTE(review): monkey.align appears to consume the assert below as the
    alignment example, so the example string and assertion are left untouched.
    """
    print("Aligning...")
    # Raw scraped text of the "Black Rose" page, kept byte-for-byte.
    cocktail = """Black Rose | Kindred Cocktails\n\n\n\n\n\n Skip to main content\n \n\n\n\n\n\nKindred Cocktails\n\n\nToggle navigation\n\n\n\n\n\n\n\n\nMain navigation\n\n\nHome\n\n\nCocktails\n\n\nNew\n\n\nInfo \n\n\nStyle guidelines\n\n\nIngredients\n\n\n\n\n\nMeasurement units\n\n\nHistoric Cocktail Books\n\n\nRecommended Brands\n\n\nAmari & Friends\n\n\nArticles & Reviews\n\n\n\n\n\nAbout us\n\n\nLearn More\n\n\nFAQ\n\n\nTerms of Use\n\n\nContact us\n\n\n\n\nYou \n\n\nLog in\n\n\nSign Up\n\n\nReset your password\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\nCocktails\n\n\n Black Rose\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCopy\n\n\n\n\nBlack Rose\n \n\n\n\n\n\n\n\n\n\n2 oz Bourbon\n\n1 ds Grenadine\n\n2 ds Peychaud's Bitters\n\n1 Lemon peel (flamed, for garnish)\n\n\n\nInstructions\nFill an old-fashioned glass three-quarters full with ice. Add the bourbon, grenadine, and bitters, and stir. Garnish with the lemon peel.\n\n\n\n\n\n\nCocktail summary\n\n\n\nPosted by\nThe Boston Shaker\n on \n4/12/2011\n\n\n\n\nIs of\nunknown authenticity\n\n\nReference\nDale Degroff, The Essential Cocktail, p48\n\n\n\nCurator\nNot yet rated\n\n\nAverage\n3.5 stars (6 ratings)\n\n\n\nYieldsDrink\n\n\nScale\n\n\nBourbon, Peychaud's Bitters, Grenadine, Lemon peel\nPT5M\nPT0M\nCocktail\nCocktail\n1\ncraft, alcoholic\n3.66667\n6\n\n\n\n\n\n\n\n\n\n\nCocktail Book\n\nLog in or sign up to start building your Cocktail Book.\n\n\n\n\nFrom other usersWith a modest grenadine dash, this drink didn't do much for me, but adding a bit more won me over.\nSimilar cocktailsNew Orleans Cocktail — Bourbon, Peychaud's Bitters, Orange Curaçao, Lemon peelOld Fashioned — Bourbon, Bitters, Sugar, Lemon peelBattle of New Orleans — Bourbon, Peychaud's Bitters, Absinthe, Orange bitters, Simple syrupImproved Whiskey Cocktail — Bourbon, Bitters, Maraschino Liqueur, Absinthe, Simple syrup, Lemon peelDerby Cocktail — Bourbon, Bénédictine, BittersMother-In-Law — Bourbon, Orange Curaçao, Maraschino Liqueur, 
Peychaud's Bitters, Bitters, Torani Amer, Simple syrupMint Julep — Bourbon, Rich demerara syrup 2:1, MintThe Journey — Bourbon, Mezcal, Hazelnut liqueurBenton's Old Fashioned — Bourbon, Bitters, Grade B maple syrup, Orange peelFancy Mint Julep — Bourbon, Simple syrup, Mint, Fine sugar\n\nComments\n\n\n\n\n\nLog in or register to post comments\n\n\n\n\n\n\n\n\n© 2010-2023 Dan Chadwick. Kindred Cocktails™ is a trademark of Dan Chadwick."""
    assert extract_cocktail(cocktail) == Cocktail(
        name="Black Rose",
        ingredients=["2 oz Bourbon", "1 ds Grenadine", "2 ds Peychaud's Bitters", "1 Lemon peel (flamed, for garnish)"],
        instructions="Fill an old-fashioned glass three-quarters full with ice. Add the bourbon, grenadine, and bitters, and stir. Garnish with the lemon peel.",
        similar=["New Orleans Cocktail", "Old Fashioned", "Battle of New Orleans", "Improved Whiskey Cocktail", "Derby Cocktail", "Mother-In-Law", "Mint Julep", "The Journey", "Benton's Old Fashioned", "Fancy Mint Julep"],
    )


if __name__ == '__main__':
    # Register the alignment example before the patched function is invoked.
    align_extract_cocktail()

    # Scrape the cocktail page and show the raw text that came back.
    # Alternate target: "https://kindredcocktails.com/cocktail/journey"
    page_url = "https://kindredcocktails.com/cocktail/old-fashioned"
    scraped_blocks = scrape_url(url=page_url)
    print(scraped_blocks)

    # Turn the first scraped block into a structured Cocktail via MonkeyPatch.
    extracted_cocktail = extract_cocktail(scraped_blocks[0])
    print(extracted_cocktail)
56 changes: 56 additions & 0 deletions examples/web_scraper/countries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Country(BaseModel):
    """Structured country facts extracted from a scraped scrapethissite.com block."""

    name: str  # country/territory name, e.g. "U.S. Virgin Islands"
    capital: str
    population: int
    area: float  # area in km2, per the "Area (km2)" label in the scraped text


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording —
# including the "it's" that should read "its" — is left exactly as written.
@monkey.patch
def extract_country(content: str) -> Optional[Country]:
    """
    Examine the content string and extract the country information pertaining to it's
    name, capital, population, and area.
    """


@monkey.align
def align_extract_country() -> None:
    """Align extract_country with one worked example (scraped block -> Country).

    NOTE(review): monkey.align appears to consume the assert below as the
    alignment example, so the example string and assertion are left untouched.
    """
    print("Aligning...")
    country = "\n\n\n U.S. Virgin Islands\n \n\nCapital: Charlotte Amalie\nPopulation: 108708\nArea (km2): 352.0\n\n"
    assert extract_country(country) == Country(
        name="U.S. Virgin Islands",
        capital="Charlotte Amalie",
        population=108708,
        area=352.0,
    )


if __name__ == '__main__':
    # Register the alignment example before the patched function is invoked.
    align_extract_country()

    # Scrape the demo page; each "country"-classed block holds one country.
    page_url = "https://www.scrapethissite.com/pages/simple/"
    country_blocks = scrape_url(url=page_url, class_name="country")

    # Structure a small sample of blocks via MonkeyPatch (demo keeps it to two).
    extracted_countries = [extract_country(block) for block in country_blocks[10:12]]
    print(extracted_countries)
53 changes: 53 additions & 0 deletions examples/web_scraper/jobs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Job(BaseModel):
    """Structured job-posting details extracted from a scraped fake-jobs card."""

    position: str  # job title, e.g. "Ship broker"
    company: str
    location: str  # "City, ST" as shown on the card, e.g. "Michelleville, AP"


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording is
# left exactly as written.
@monkey.patch
def extract_job(content: str) -> Optional[Job]:
    """
    Examine the content string and extract the job details for the position title, company, and location.
    """


@monkey.align
def align_extract_job() -> None:
    """Align extract_job with one worked example (scraped card text -> Job).

    NOTE(review): monkey.align appears to consume the assert below as the
    alignment example, so the example string and assertion are left untouched.
    """
    print("Aligning...")
    job = "\n\n\n\n\n\n\n\n\nShip broker\nFuentes, Walls and Castro\n\n\n\n\n Michelleville, AP\n \n\n2021-04-08\n\n\n\nLearn\nApply\n\n\n"
    assert extract_job(job) == Job(
        position="Ship broker",
        company="Fuentes, Walls and Castro",
        location="Michelleville, AP",
    )


if __name__ == '__main__':
    # Register the alignment example before the patched function is invoked.
    align_extract_job()

    # Scrape the fake-jobs page; each "card"-classed block holds one posting.
    page_url = "https://realpython.github.io/fake-jobs/"
    job_cards = scrape_url(url=page_url, class_name="card")

    # Structure a small sample of cards via MonkeyPatch (demo keeps it to two).
    extracted_jobs = [extract_job(card) for card in job_cards[1:3]]
    print(extracted_jobs)
55 changes: 55 additions & 0 deletions examples/web_scraper/quotes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Quote(BaseModel):
    """Structured quote details extracted from a scraped quotes.toscrape.com block."""

    text: str  # quote body (the caller strips curly quote marks before extraction)
    author: str
    tags: List[str] = []  # tag labels attached to the quote, e.g. ["courage"]


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording is
# left exactly as written.
@monkey.patch
def extract_quote(content: str) -> Optional[Quote]:
    """
    Examine the content string and extract the quote details for the text, author, and tags.
    """


@monkey.align
def align_extract_quote() -> None:
    """Align extract_quote with one worked example (scraped block text -> Quote).

    NOTE(review): monkey.align appears to consume the assert below as the
    alignment example, so the example string and assertion are left untouched.
    """
    print("Aligning...")
    quote = "\nIt takes courage to grow up and become who you really are.\nby E.E. Cummings\n(about)\n\n\n Tags:\n \ncourage\n\n"
    assert extract_quote(quote) == Quote(
        text="It takes courage to grow up and become who you really are.",
        author="E.E. Cummings",
        tags=["courage"],
    )


if __name__ == '__main__':
    # Register the alignment example before the patched function is invoked.
    align_extract_quote()

    # Scrape page 1; each "quote"-classed block holds one quote.
    page_url = "https://quotes.toscrape.com/page/1/"
    quote_blocks = scrape_url(url=page_url, class_name="quote")

    # Drop the decorative curly quote marks, then structure a small sample
    # of blocks via MonkeyPatch (demo keeps it to two).
    extracted_quotes = [
        extract_quote(block.replace('“', '').replace('”', ''))
        for block in quote_blocks[0:2]
    ]
    print(extracted_quotes)
46 changes: 46 additions & 0 deletions examples/web_scraper/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Web Scraping

This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class.

Six examples for web scraping are provided:
- [Quotes](https://quotes.toscrape.com/)
- [Countries](https://www.scrapethissite.com/pages/simple/)
- [Job Postings](https://realpython.github.io/fake-jobs/)
- [Cocktails](https://kindredcocktails.com/cocktail/old-fashioned)
- [Car Specs](https://www.cars.com/research/mazda-cx_90-2024/)
- [StreetEasy Apartments](https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan)

## Configuration

Ensure you have an account with OpenAI to access their underlying models.

Set the following environment variables in your `.env` file:
```
OPENAI_API_KEY=sk-XXX
USER_AGENT=... (Optional and only needed for StreetEasy example)
```

## Install

```bash
python -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
```

## Usage

To align and test, run the following command for the desired example of interest:
```
python quotes.py
python countries.py
python jobs.py
python cocktail.py
python cars.py
python streeteasy.py # make sure to update User-Agent!
```
5 changes: 5 additions & 0 deletions examples/web_scraper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
python-dotenv
openai
monkey-patch.py
pytest
beautifulsoup4
63 changes: 63 additions & 0 deletions examples/web_scraper/streeteasy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from numpy import square
import openai
import os
from dotenv import load_dotenv
from pydantic import BaseModel
from typing import List, Optional

load_dotenv()

from monkey_patch.monkey import Monkey as monkey
from utils import scrape_url


openai.api_key = os.getenv("OPENAI_API_KEY")


class Property(BaseModel):
    """Structured rental-listing details extracted from a scraped StreetEasy card."""

    neighborhood: str  # e.g. "Lincoln Square"
    address: str  # street address including unit, e.g. "229 West 60th Street #7H"
    price: float  # monthly rent as a number ("$7,250" aligns to 7250.0)
    fee: bool  # False when the card is marked "NO FEE"
    beds: float
    bath: float
    listed_by: str  # listing company, e.g. "Algin Management"


# NOTE: monkey.patch generates this function's implementation at call time;
# the docstring below doubles as the runtime prompt/spec, so its wording is
# left exactly as written.
# NOTE(review): the prompt asks for square footage, which Property has no
# field for, and never mentions the `fee` field the model does have — confirm.
@monkey.patch
def extract_property(content: str) -> Optional[Property]:
    """
    Examine the content string and extract the rental property details for the neighborhood, address,
    price, number of beds, number of bathrooms, square footage, and company that is listing the property.
    """

@monkey.align
def align_extract_property() -> None:
    """Align extract_property with one worked example (scraped card -> Property).

    NOTE(review): monkey.align appears to consume the assert below as the
    alignment example, so the example string and assertion are left untouched.
    """
    print("Aligning...")
    unit_one = "Rental Unit in Lincoln Square\n \n\n\n229 West 60th Street #7H\n\n\n\n$7,250\nNO FEE\n\n\n\n\n\n\n\n\n2 Beds\n\n\n\n\n2 Baths\n\n\n\n\n\n 1,386\n square feet\nft²\n\n\n\n\n\n Listing by Algin Management"
    assert extract_property(unit_one) == Property(
        neighborhood="Lincoln Square",
        address="229 West 60th Street #7H",
        price=7250.0,
        fee=False,
        beds=2.0,
        bath=2.0,
        listed_by="Algin Management",
    )


if __name__ == '__main__':
    # Register the alignment example before the patched function is invoked.
    align_extract_property()

    # Scrape page 2 of the Manhattan 2-bed listings and show the raw blocks.
    page_url = "https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan?page=2"
    listing_blocks = scrape_url(url=page_url, class_name="listingCardBottom")
    print(listing_blocks)

    # Structure a small sample of listings via MonkeyPatch (demo keeps it to two).
    extracted_units = [extract_property(block) for block in listing_blocks[1:3]]
    print(extracted_units)
Loading

0 comments on commit d6e3c32

Please sign in to comment.