web scraping 6 examples working (#91)

Tanuki · Nov 26, 2023 · d6e3c32 · d6e3c32
1 parent 1f04812
commit d6e3c32
Show file tree

Hide file tree

Showing 11 changed files with 428 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,6 +1,7 @@
 __pycache__/
 /__pycache__/
 /.venv/
+.venv/
 /venv/
 /venv3.11/
 .env

diff --git a/examples/web_scraper/.env.example b/examples/web_scraper/.env.example
@@ -0,0 +1,2 @@
+OPENAI_API_KEY=sk-XXX
+USER_AGENT=
diff --git a/examples/web_scraper/cars.py b/examples/web_scraper/cars.py
@@ -0,0 +1,46 @@
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import List, Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Car(BaseModel):
+    price: float
+    mpg: str
+    seating: int
+    horsepower: int
+    weight: int
+    fuel_size: float
+    warranty_basic: str
+    warranty_powertrain: str
+    warranty_roadside: str
+
+
+@monkey.patch
+def extract_car(content: str) -> Optional[Car]:
+    """
+    Examine the content string and extract the car details for the price, miles per gallon, seating, horsepower,
+    weight, fuel tank size, and warranty.
+    """
+
+
+if __name__ == '__main__':
+
+    # Web scrape the url and extract the car information
+    # url = "https://www.cars.com/research/ford-mustang-2024/"
+    url = "https://www.cars.com/research/mazda-cx_90-2024/"
+    contents = scrape_url(url=url)
+    print(contents)
+
+    # Process the car block using MonkeyPatch
+    car = extract_car(contents[0])
+    print(car)
diff --git a/examples/web_scraper/cocktail.py b/examples/web_scraper/cocktail.py
@@ -0,0 +1,55 @@
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import List, Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Cocktail(BaseModel):
+    name: str
+    ingredients: List[str] = []
+    instructions: str
+    similar: List[str] = []
+
+
+@monkey.patch
+def extract_cocktail(content: str) -> Optional[Cocktail]:
+    """
+    Examine the content string and extract the cocktail details for the ingredients, instructions, and similar cocktails.
+    """
+
+
+@monkey.align
+def align_extract_cocktail() -> None:
+    print("Aligning...")
+    cocktail = """Black Rose | Kindred Cocktails\n\n\n\n\n\n      Skip to main content\n    \n\n\n\n\n\nKindred Cocktails\n\n\nToggle navigation\n\n\n\n\n\n\n\n\nMain navigation\n\n\nHome\n\n\nCocktails\n\n\nNew\n\n\nInfo \n\n\nStyle guidelines\n\n\nIngredients\n\n\n\n\n\nMeasurement units\n\n\nHistoric Cocktail Books\n\n\nRecommended Brands\n\n\nAmari & Friends\n\n\nArticles & Reviews\n\n\n\n\n\nAbout us\n\n\nLearn More\n\n\nFAQ\n\n\nTerms of Use\n\n\nContact us\n\n\n\n\nYou \n\n\nLog in\n\n\nSign Up\n\n\nReset your password\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nHome\n\n\nCocktails\n\n\n                  Black Rose\n              \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nCopy\n\n\n\n\nBlack Rose\n \n\n\n\n\n\n\n\n\n\n2 oz Bourbon\n\n1 ds Grenadine\n\n2 ds Peychaud's Bitters\n\n1  Lemon peel (flamed, for garnish)\n\n\n\nInstructions\nFill an old-fashioned glass three-quarters full with ice.  Add the bourbon, grenadine, and bitters, and stir.  Garnish with the lemon peel.\n\n\n\n\n\n\nCocktail summary\n\n\n\nPosted by\nThe Boston Shaker\n on \n4/12/2011\n\n\n\n\nIs of\nunknown authenticity\n\n\nReference\nDale Degroff, The Essential Cocktail, p48\n\n\n\nCurator\nNot yet rated\n\n\nAverage\n3.5 stars (6 ratings)\n\n\n\nYieldsDrink\n\n\nScale\n\n\nBourbon, Peychaud's Bitters, Grenadine, Lemon peel\nPT5M\nPT0M\nCocktail\nCocktail\n1\ncraft, alcoholic\n3.66667\n6\n\n\n\n\n\n\n\n\n\n\nCocktail Book\n\nLog in or sign up to start building your Cocktail Book.\n\n\n\n\nFrom other usersWith a modest grenadine dash, this drink didn't do much for me, but adding a bit more won me over.\nSimilar cocktailsNew Orleans Cocktail — Bourbon, Peychaud's Bitters, Orange Curaçao, Lemon peelOld Fashioned — Bourbon, Bitters, Sugar, Lemon peelBattle of New Orleans — Bourbon, Peychaud's Bitters, Absinthe, Orange bitters, Simple syrupImproved Whiskey Cocktail — Bourbon, Bitters, Maraschino Liqueur, Absinthe, Simple syrup, Lemon peelDerby Cocktail — Bourbon, Bénédictine, BittersMother-In-Law — 
Bourbon, Orange Curaçao, Maraschino Liqueur, Peychaud's Bitters, Bitters, Torani Amer, Simple syrupMint Julep — Bourbon, Rich demerara syrup 2:1, MintThe Journey — Bourbon, Mezcal, Hazelnut liqueurBenton's Old Fashioned — Bourbon, Bitters, Grade B maple syrup, Orange peelFancy Mint Julep — Bourbon, Simple syrup, Mint, Fine sugar\n\nComments\n\n\n\n\n\nLog in or register to post comments\n\n\n\n\n\n\n\n\n© 2010-2023 Dan Chadwick. Kindred Cocktails™ is a trademark of Dan Chadwick."""
+    assert extract_cocktail(cocktail) == Cocktail(
+        name="Black Rose",
+        ingredients=["2 oz Bourbon", "1 ds Grenadine", "2 ds Peychaud's Bitters", "1  Lemon peel (flamed, for garnish)"],
+        instructions="Fill an old-fashioned glass three-quarters full with ice.  Add the bourbon, grenadine, and bitters, and stir.  Garnish with the lemon peel.",
+        similar=["New Orleans Cocktail", "Old Fashioned", "Battle of New Orleans", "Improved Whiskey Cocktail", "Derby Cocktail", "Mother-In-Law", "Mint Julep", "The Journey", "Benton's Old Fashioned", "Fancy Mint Julep"],
+    )
+
+
+if __name__ == '__main__':
+
+    # Align the function
+    align_extract_cocktail()
+
+    # Web scrape the url and extract the cocktail information
+    url = "https://kindredcocktails.com/cocktail/old-fashioned"
+    # url = "https://kindredcocktails.com/cocktail/journey"
+    contents = scrape_url(url=url)
+    print(contents)
+
+    # Process the cocktail block using MonkeyPatch
+    cocktail = extract_cocktail(contents[0])
+    print(cocktail)
diff --git a/examples/web_scraper/countries.py b/examples/web_scraper/countries.py
@@ -0,0 +1,56 @@
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Country(BaseModel):
+    name: str
+    capital: str
+    population: int
+    area: float
+
+
+@monkey.patch
+def extract_country(content: str) -> Optional[Country]:
+    """
+    Examine the content string and extract the country information pertaining to its
+    name, capital, population, and area.
+    """
+
+
+@monkey.align
+def align_extract_country() -> None:
+    print("Aligning...")
+    country = "\n\n\n                            U.S. Virgin Islands\n                        \n\nCapital: Charlotte Amalie\nPopulation: 108708\nArea (km2): 352.0\n\n"
+    assert extract_country(country) == Country(
+        name="U.S. Virgin Islands",
+        capital="Charlotte Amalie",
+        population=108708,
+        area=352.0,
+    )
+
+
+if __name__ == '__main__':
+
+    # Run the alignment function before any extraction call
+    align_extract_country()
+
+    # Web scrape the url, passing class_name="country" to get the list of countries
+    url = "https://www.scrapethissite.com/pages/simple/"
+    contents = scrape_url(url=url, class_name="country")
+
+    # Extract from a two-item sample only (indices 10 and 11) to keep the demo short
+    countries = [extract_country(block) for block in contents[10:12]]
+    print(countries)
diff --git a/examples/web_scraper/jobs.py b/examples/web_scraper/jobs.py
@@ -0,0 +1,53 @@
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Job(BaseModel):
+    position: str
+    company: str
+    location: str
+
+
+@monkey.patch
+def extract_job(content: str) -> Optional[Job]:
+    """
+    Examine the content string and extract the job details for the position title, company, and location.
+    """
+
+
+@monkey.align
+def align_extract_job() -> None:
+    print("Aligning...")
+    job = "\n\n\n\n\n\n\n\n\nShip broker\nFuentes, Walls and Castro\n\n\n\n\n        Michelleville, AP\n      \n\n2021-04-08\n\n\n\nLearn\nApply\n\n\n"
+    assert extract_job(job) == Job(
+        position="Ship broker",
+        company="Fuentes, Walls and Castro",
+        location="Michelleville, AP",
+    )
+
+
+if __name__ == '__main__':
+
+    # Run the alignment function before any extraction call
+    align_extract_job()
+
+    # Web scrape the url, passing class_name="card" to get the list of jobs
+    url = "https://realpython.github.io/fake-jobs/"
+    contents = scrape_url(url=url, class_name="card")
+
+    # Extract from a two-item sample only (indices 1 and 2) to keep the demo short
+    jobs = [extract_job(block) for block in contents[1:3]]
+    print(jobs)
diff --git a/examples/web_scraper/quotes.py b/examples/web_scraper/quotes.py
@@ -0,0 +1,55 @@
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import List, Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Quote(BaseModel):
+    text: str
+    author: str
+    tags: List[str] = []
+
+
+@monkey.patch
+def extract_quote(content: str) -> Optional[Quote]:
+    """
+    Examine the content string and extract the quote details for the text, author, and tags.
+    """
+
+
+@monkey.align
+def align_extract_quote() -> None:
+    print("Aligning...")
+    quote = "\nIt takes courage to grow up and become who you really are.\nby E.E. Cummings\n(about)\n\n\n            Tags:\n            \ncourage\n\n"
+    assert extract_quote(quote) == Quote(
+        text="It takes courage to grow up and become who you really are.",
+        author="E.E. Cummings",
+        tags=["courage"],
+    )
+
+
+if __name__ == '__main__':
+
+    # Run the alignment function before any extraction call
+    align_extract_quote()
+
+    # Web scrape the url, passing class_name="quote" to get the list of quotes
+    url = "https://quotes.toscrape.com/page/1/"
+    contents = scrape_url(url=url, class_name="quote")
+
+    # Extract from the first two blocks only (demo sample), with the curly quotation marks stripped
+    quotes = [
+        extract_quote(block.replace('“', '').replace('”', ''))
+        for block in contents[0:2]
+    ]
+    print(quotes)
diff --git a/examples/web_scraper/readme.md b/examples/web_scraper/readme.md
@@ -0,0 +1,46 @@
+# Web Scraping
+
+This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class.
+
+Six examples for web scraping are provided:
+- [Quotes](https://quotes.toscrape.com/)
+- [Countries](https://www.scrapethissite.com/pages/simple/)
+- [Job Postings](https://realpython.github.io/fake-jobs/)
+- [Cocktails](https://kindredcocktails.com/cocktail/old-fashioned)
+- [Car Specs](https://www.cars.com/research/mazda-cx_90-2024/)
+- [StreetEasy Apartments](https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan)
+
+## Configuration
+
+Ensure you have an account with OpenAI to access their underlying models.
+
+Set the following environment variables in your `.env` file:
+```
+OPENAI_API_KEY=sk-XXX
+USER_AGENT=... (Optional and only needed for StreetEasy example)
+```
+
+## Install
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Usage
+
+To align and test, run the following command for the example of interest:
+```
+python quotes.py
+
+python countries.py
+
+python jobs.py
+
+python cocktail.py
+
+python cars.py
+
+python streeteasy.py   # make sure to update User-Agent!
+```
diff --git a/examples/web_scraper/requirements.txt b/examples/web_scraper/requirements.txt
@@ -0,0 +1,5 @@
+python-dotenv
+openai
+monkey-patch.py
+pytest
+beautifulsoup4
diff --git a/examples/web_scraper/streeteasy.py b/examples/web_scraper/streeteasy.py
@@ -0,0 +1,63 @@
+from numpy import square  # NOTE(review): `square` is never used in this file and numpy is not listed in requirements.txt; this import looks accidental and will raise ImportError unless numpy is installed some other way — consider removing
+import openai
+import os
+from dotenv import load_dotenv
+from pydantic import BaseModel
+from typing import List, Optional
+
+load_dotenv()
+
+from monkey_patch.monkey import Monkey as monkey
+from utils import scrape_url
+
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+
+
+class Property(BaseModel):
+    neighborhood: str
+    address: str
+    price: float
+    fee: bool
+    beds: float
+    bath: float
+    listed_by: str
+
+
+# NOTE(review): the docstring below is presumably what MonkeyPatch uses as the extraction
+# instruction, so it should name exactly the fields declared on Property — confirm against the library.
+@monkey.patch
+def extract_property(content: str) -> Optional[Property]:
+    """
+    Examine the content string and extract the rental property details for the neighborhood, address,
+    price, whether a fee is charged, number of beds, number of bathrooms, and company that is listing the property.
+    """
+
+@monkey.align
+def align_extract_property() -> None:
+    print("Aligning...")
+    unit_one = "Rental Unit in Lincoln Square\n      \n\n\n229 West 60th Street #7H\n\n\n\n$7,250\nNO FEE\n\n\n\n\n\n\n\n\n2 Beds\n\n\n\n\n2 Baths\n\n\n\n\n\n                1,386\n                square feet\nft²\n\n\n\n\n\n        Listing by Algin Management"
+    assert extract_property(unit_one) == Property(
+        neighborhood="Lincoln Square",
+        address="229 West 60th Street #7H",
+        price=7250.0,
+        fee=False,
+        beds=2.0,
+        bath=2.0,
+        listed_by="Algin Management",
+    )
+
+
+if __name__ == '__main__':
+
+    # Run the alignment function before any extraction call
+    align_extract_property()
+
+    # Web scrape the url, passing class_name="listingCardBottom", and show the raw blocks
+    url = "https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan?page=2"
+    contents = scrape_url(url=url, class_name="listingCardBottom")
+    print(contents)
+
+    # Extract rental property details from a two-item sample only (indices 1 and 2)
+    units = [extract_property(block) for block in contents[1:3]]
+    print(units)