-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
web scraping 6 examples working (#91)
- Loading branch information
Showing
11 changed files
with
428 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
__pycache__/ | ||
/__pycache__/ | ||
/.venv/ | ||
.venv/ | ||
/venv/ | ||
/venv3.11/ | ||
.env | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
OPENAI_API_KEY=sk-XXX | ||
USER_AGENT= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import List, Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_car: the specification values pulled
# from a scraped car research page. Every field is required (no defaults).
class Car(BaseModel):
    price: float
    mpg: str  # miles per gallon, kept as text rather than a number
    seating: int
    horsepower: int
    weight: int
    fuel_size: float  # fuel tank size
    # The warranty is split into three separate text fields.
    warranty_basic: str
    warranty_powertrain: str
    warranty_roadside: str
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so treat the docstring wording as input.
@monkey.patch
def extract_car(content: str) -> Optional[Car]:
    """
    Examine the content string and extract the car details for the price, miles per gallon, seating, horsepower,
    weight, fuel tank size, and warranty.
    """
|
||
|
||
if __name__ == '__main__':

    # Web scrape the url and extract the car information
    # url = "https://www.cars.com/research/ford-mustang-2024/"
    url = "https://www.cars.com/research/mazda-cx_90-2024/"
    contents = scrape_url(url=url)
    print(contents)

    # Stop with a readable message instead of an IndexError on contents[0]
    # when the scrape came back empty (blocked request, changed markup, ...).
    if not contents:
        raise SystemExit(f"No content could be scraped from {url}")

    # Process the car block using MonkeyPatch
    car = extract_car(contents[0])
    print(car)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import List, Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_cocktail. `ingredients` and `similar`
# default to empty lists, so only `name` and `instructions` are required.
class Cocktail(BaseModel):
    name: str
    ingredients: List[str] = []  # one entry per ingredient line, e.g. "2 oz Bourbon"
    instructions: str
    similar: List[str] = []  # names of the similar cocktails listed on the page
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so treat the docstring wording as input.
@monkey.patch
def extract_cocktail(content: str) -> Optional[Cocktail]:
    """
    Examine the content string and extract the cocktail details for the ingredients, instructions, and similar cocktails.
    """
|
||
|
||
# Alignment example for extract_cocktail: the scraped text of one cocktail
# page paired with the exact Cocktail object expected from it.
# NOTE(review): how @monkey.align consumes the assert is not visible in this
# file; keep the `assert f(x) == Expected(...)` shape unless the library
# documentation says other forms are supported.
@monkey.align
def align_extract_cocktail() -> None:
    print("Aligning...")
    # Sample page text. The triple-quoted literal continues onto a second
    # physical line, so it contains one real line break besides the \n escapes.
    cocktail = """Black Rose | Kindred Cocktails\n\n\n\n\n\n Skip to main content\n \n\n\n\n\n\nKindred Cocktails\n\n\nToggle navigation\n\n\n\n\n\n\n\n\nMain navigation\n\n\nHome\n\n\nCocktails\n\n\nNew\n\n\nInfo \n\n\nStyle guidelines\n\n\nIngredients\n\n\n\n\n\nMeasurement units\n\n\nHistoric Cocktail Books\n\n\nRecommended Brands\n\n\nAmari & Friends\n\n\nArticles & Reviews\n\n\n\n\n\nAbout us\n\n\nLearn More\n\n\nFAQ\n\n\nTerms of Use\n\n\nContact us\n\n\n\n\nYou \n\n\nLog in\n\n\nSign Up\n\n\nReset your password\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\nCocktails\n\n\n Black Rose\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCopy\n\n\n\n\nBlack Rose\n \n\n\n\n\n\n\n\n\n\n2 oz Bourbon\n\n1 ds Grenadine\n\n2 ds Peychaud's Bitters\n\n1 Lemon peel (flamed, for garnish)\n\n\n\nInstructions\nFill an old-fashioned glass three-quarters full with ice. Add the bourbon, grenadine, and bitters, and stir. Garnish with the lemon peel.\n\n\n\n\n\n\nCocktail summary\n\n\n\nPosted by\nThe Boston Shaker\n on \n4/12/2011\n\n\n\n\nIs of\nunknown authenticity\n\n\nReference\nDale Degroff, The Essential Cocktail, p48\n\n\n\nCurator\nNot yet rated\n\n\nAverage\n3.5 stars (6 ratings)\n\n\n\nYieldsDrink\n\n\nScale\n\n\nBourbon, Peychaud's Bitters, Grenadine, Lemon peel\nPT5M\nPT0M\nCocktail\nCocktail\n1\ncraft, alcoholic\n3.66667\n6\n\n\n\n\n\n\n\n\n\n\nCocktail Book\n\nLog in or sign up to start building your Cocktail Book.\n\n\n\n\nFrom other usersWith a modest grenadine dash, this drink didn't do much for me, but adding a bit more won me over.\nSimilar cocktailsNew Orleans Cocktail — Bourbon, Peychaud's Bitters, Orange Curaçao, Lemon peelOld Fashioned — Bourbon, Bitters, Sugar, Lemon peelBattle of New Orleans — Bourbon, Peychaud's Bitters, Absinthe, Orange bitters, Simple syrupImproved Whiskey Cocktail — Bourbon, Bitters, Maraschino Liqueur, Absinthe, Simple syrup, Lemon peelDerby Cocktail — Bourbon, Bénédictine, BittersMother-In-Law — Bourbon, Orange Curaçao, Maraschino Liqueur, 
Peychaud's Bitters, Bitters, Torani Amer, Simple syrupMint Julep — Bourbon, Rich demerara syrup 2:1, MintThe Journey — Bourbon, Mezcal, Hazelnut liqueurBenton's Old Fashioned — Bourbon, Bitters, Grade B maple syrup, Orange peelFancy Mint Julep — Bourbon, Simple syrup, Mint, Fine sugar\n\nComments\n\n\n\n\n\nLog in or register to post comments\n\n\n\n\n\n\n\n\n© 2010-2023 Dan Chadwick. Kindred Cocktails™ is a trademark of Dan Chadwick."""
    # Expected extraction: ingredient lines verbatim, and only the names of
    # the similar cocktails (the text before each dash), not their ingredients.
    assert extract_cocktail(cocktail) == Cocktail(
        name="Black Rose",
        ingredients=["2 oz Bourbon", "1 ds Grenadine", "2 ds Peychaud's Bitters", "1 Lemon peel (flamed, for garnish)"],
        instructions="Fill an old-fashioned glass three-quarters full with ice. Add the bourbon, grenadine, and bitters, and stir. Garnish with the lemon peel.",
        similar=["New Orleans Cocktail", "Old Fashioned", "Battle of New Orleans", "Improved Whiskey Cocktail", "Derby Cocktail", "Mother-In-Law", "Mint Julep", "The Journey", "Benton's Old Fashioned", "Fancy Mint Julep"],
    )
|
||
|
||
if __name__ == '__main__':

    # Align the function
    align_extract_cocktail()

    # Web scrape the url and extract the cocktail information
    url = "https://kindredcocktails.com/cocktail/old-fashioned"
    # url = "https://kindredcocktails.com/cocktail/journey"
    contents = scrape_url(url=url)
    print(contents)

    # Stop with a readable message instead of an IndexError on contents[0]
    # when the scrape came back empty (blocked request, changed markup, ...).
    if not contents:
        raise SystemExit(f"No content could be scraped from {url}")

    # Process the cocktail block using MonkeyPatch
    cocktail = extract_cocktail(contents[0])
    print(cocktail)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_country. Every field is required.
class Country(BaseModel):
    name: str
    capital: str
    population: int
    area: float  # the alignment example takes this from the "Area (km2)" line
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so treat the docstring wording as input.
@monkey.patch
def extract_country(content: str) -> Optional[Country]:
    """
    Examine the content string and extract the country information pertaining to its
    name, capital, population, and area.
    """
|
||
|
||
# Alignment example for extract_country: one scraped country block paired
# with the exact Country object expected from it.
# NOTE(review): how @monkey.align consumes the assert is not visible in this
# file; keep the `assert f(x) == Expected(...)` shape unless the library
# documentation says other forms are supported.
@monkey.align
def align_extract_country() -> None:
    print("Aligning...")
    # Sample block text; the labelled lines map one-to-one onto Country fields.
    country = "\n\n\n U.S. Virgin Islands\n \n\nCapital: Charlotte Amalie\nPopulation: 108708\nArea (km2): 352.0\n\n"
    assert extract_country(country) == Country(
        name="U.S. Virgin Islands",
        capital="Charlotte Amalie",
        population=108708,
        area=352.0,
    )
|
||
|
||
if __name__ == '__main__':

    # Run the alignment example before any real extraction call
    align_extract_country()

    # Scrape the page, asking scrape_url for the "country" class
    url = "https://www.scrapethissite.com/pages/simple/"
    contents = scrape_url(url=url, class_name="country")

    # Demo only: extract just the two blocks at indices 10 and 11
    countries = [extract_country(block) for block in contents[10:12]]
    print(countries)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_job. Every field is required.
class Job(BaseModel):
    position: str  # position title
    company: str
    location: str
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so treat the docstring wording as input.
@monkey.patch
def extract_job(content: str) -> Optional[Job]:
    """
    Examine the content string and extract the job details for the position title, company, and location.
    """
|
||
|
||
# Alignment example for extract_job: one scraped job card paired with the
# exact Job object expected from it.
# NOTE(review): how @monkey.align consumes the assert is not visible in this
# file; keep the `assert f(x) == Expected(...)` shape unless the library
# documentation says other forms are supported.
@monkey.align
def align_extract_job() -> None:
    print("Aligning...")
    # Sample card text. It also contains a date and "Learn"/"Apply" link
    # labels, none of which appear in the expected Job below.
    job = "\n\n\n\n\n\n\n\n\nShip broker\nFuentes, Walls and Castro\n\n\n\n\n Michelleville, AP\n \n\n2021-04-08\n\n\n\nLearn\nApply\n\n\n"
    assert extract_job(job) == Job(
        position="Ship broker",
        company="Fuentes, Walls and Castro",
        location="Michelleville, AP",
    )
|
||
|
||
if __name__ == '__main__':

    # Run the alignment example before any real extraction call
    align_extract_job()

    # Scrape the page, asking scrape_url for the "card" class
    url = "https://realpython.github.io/fake-jobs/"
    contents = scrape_url(url=url, class_name="card")

    # Demo only: extract just the two blocks at indices 1 and 2
    jobs = [extract_job(card) for card in contents[1:3]]
    print(jobs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import List, Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_quote. `tags` defaults to an empty
# list, so only `text` and `author` are required.
class Quote(BaseModel):
    text: str
    author: str
    tags: List[str] = []
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so treat the docstring wording as input.
@monkey.patch
def extract_quote(content: str) -> Optional[Quote]:
    """
    Examine the content string and extract the quote details for the text, author, and tags.
    """
|
||
|
||
# Alignment example for extract_quote: one scraped quote block paired with
# the exact Quote object expected from it.
# NOTE(review): how @monkey.align consumes the assert is not visible in this
# file; keep the `assert f(x) == Expected(...)` shape unless the library
# documentation says other forms are supported.
@monkey.align
def align_extract_quote() -> None:
    print("Aligning...")
    # Sample block text. The expected author drops the leading "by ", and the
    # "(about)" link label and "Tags:" heading are not part of any field.
    quote = "\nIt takes courage to grow up and become who you really are.\nby E.E. Cummings\n(about)\n\n\n Tags:\n \ncourage\n\n"
    assert extract_quote(quote) == Quote(
        text="It takes courage to grow up and become who you really are.",
        author="E.E. Cummings",
        tags=["courage"],
    )
|
||
|
||
if __name__ == '__main__':

    # Run the alignment example before any real extraction call
    align_extract_quote()

    # Scrape the page, asking scrape_url for the "quote" class
    url = "https://quotes.toscrape.com/page/1/"
    contents = scrape_url(url=url, class_name="quote")

    # Demo only: extract just the first two blocks, stripping the curly
    # quotation marks from each block before it is handed to extract_quote
    quotes = [
        extract_quote(raw.replace('“', '').replace('”', ''))
        for raw in contents[0:2]
    ]
    print(quotes)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Web Scraping | ||
|
||
This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class. | ||
|
||
Six examples for web scraping are provided: | ||
- [Quotes](https://quotes.toscrape.com/) | ||
- [Countries](https://www.scrapethissite.com/pages/simple/) | ||
- [Job Postings](https://realpython.github.io/fake-jobs/) | ||
- [Cocktails](https://kindredcocktails.com/cocktail/old-fashioned) | ||
- [Car Specs](https://www.cars.com/research/mazda-cx_90-2024/) | ||
- [StreetEasy Apartments](https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan) | ||
|
||
## Configuration | ||
|
||
Ensure you have an account with OpenAI to access their underlying models. | ||
|
||
Set the following environment variables in your `.env` file: | ||
``` | ||
OPENAI_API_KEY=sk-XXX | ||
USER_AGENT=... (Optional and only needed for StreetEasy example) | ||
``` | ||
|
||
## Install | ||
|
||
```bash | ||
python -m venv .venv | ||
source .venv/bin/activate | ||
pip install -r requirements.txt | ||
``` | ||
|
||
## Usage | ||
|
||
To align and test, run the following command for the example of interest: | ||
``` | ||
python quotes.py | ||
python countries.py | ||
python jobs.py | ||
python cocktail.py | ||
python cars.py | ||
python streeteasy.py # make sure to update User-Agent! | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
python-dotenv | ||
openai | ||
monkey-patch.py | ||
pytest | ||
beautifulsoup4 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
from numpy import square | ||
import openai | ||
import os | ||
from dotenv import load_dotenv | ||
from pydantic import BaseModel | ||
from typing import List, Optional | ||
|
||
load_dotenv() | ||
|
||
from monkey_patch.monkey import Monkey as monkey | ||
from utils import scrape_url | ||
|
||
|
||
openai.api_key = os.getenv("OPENAI_API_KEY") | ||
|
||
|
||
# Structured result returned by extract_property: one rental listing.
# Every field is required (no defaults).
class Property(BaseModel):
    neighborhood: str
    address: str
    price: float
    fee: bool  # the alignment example maps a "NO FEE" listing to False
    beds: float  # float rather than int; presumably to allow fractional counts -- confirm
    bath: float
    listed_by: str  # company listing the property
|
||
|
||
# Stub only: the body is just a docstring. The @monkey.patch decorator is
# expected to supply the behavior at call time (presumably by prompting the
# OpenAI model with this signature and docstring -- confirm against the
# monkey_patch documentation), so the docstring should name exactly the
# fields that Property declares.
@monkey.patch
def extract_property(content: str) -> Optional[Property]:
    """
    Examine the content string and extract the rental property details for the neighborhood, address,
    price, whether a fee is charged, number of beds, number of bathrooms, and company that is listing the property.
    """
|
||
# Alignment example for extract_property: one scraped listing card paired
# with the exact Property object expected from it.
# NOTE(review): how @monkey.align consumes the assert is not visible in this
# file; keep the `assert f(x) == Expected(...)` shape unless the library
# documentation says other forms are supported.
@monkey.align
def align_extract_property() -> None:
    print("Aligning...")
    # Sample card text. It includes a square-footage figure that has no
    # corresponding field in the expected Property below.
    unit_one = "Rental Unit in Lincoln Square\n \n\n\n229 West 60th Street #7H\n\n\n\n$7,250\nNO FEE\n\n\n\n\n\n\n\n\n2 Beds\n\n\n\n\n2 Baths\n\n\n\n\n\n 1,386\n square feet\nft²\n\n\n\n\n\n Listing by Algin Management"
    assert extract_property(unit_one) == Property(
        neighborhood="Lincoln Square",
        address="229 West 60th Street #7H",
        price=7250.0,
        fee=False,  # "NO FEE" in the sample text
        beds=2.0,
        bath=2.0,
        listed_by="Algin Management",
    )
|
||
|
||
if __name__ == '__main__':

    # Run the alignment example before any real extraction call
    align_extract_property()

    # Scrape the search results, asking scrape_url for the "listingCardBottom" class
    url = "https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan?page=2"
    contents = scrape_url(url=url, class_name="listingCardBottom")
    print(contents)

    # Demo only: extract just the two blocks at indices 1 and 2
    units = [extract_property(listing) for listing in contents[1:3]]
    print(units)
Oops, something went wrong.