Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

selenium webscraping example #93

Merged
merged 2 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions examples/web_scraper/airbnb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import openai
import os
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pydantic import BaseModel
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from typing import Optional

# Load variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

# NOTE(review): tanuki is imported after load_dotenv(), presumably so that any
# environment-based configuration is already present at import time — confirm
# before moving this import up with the others.
import tanuki


# Hand the API key from the environment to the openai module; os.getenv
# returns None when OPENAI_API_KEY is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")


# Structured record for a single Airbnb listing; this is the type that
# extract_airbnb returns (or None when nothing can be extracted).
class AirBnb(BaseModel):
    city: str  # city of the listing, e.g. "Caroga Lake"
    state: str  # state of the listing, e.g. "New York"
    dates: str  # dates available, kept as display text, e.g. "Dec 3 - 8"
    price: float  # nightly price, e.g. 200.0
    stars: float  # star rating, e.g. 4.99


# NOTE(review): the body is intentionally just a docstring. Tanuki describes
# patched functions as stubs that are replaced by an LLM transformation at
# runtime, and the docstring presumably acts as the instruction for it, so
# treat its wording as behavior rather than as ordinary documentation.
@tanuki.patch
def extract_airbnb(content: str) -> Optional[AirBnb]:
    """
    Examine the content string and extract the airbnb details for the city, state,
    dates available, nightly price, and stars rating.
    """


# Declares one worked example (raw listing text -> expected AirBnb) for
# extract_airbnb; the __main__ block calls this before any extraction.
# NOTE(review): the sample text contains an en dash ("Dec 3 – 8") while the
# expected value uses a plain hyphen ("Dec 3 - 8") — presumably a deliberate
# normalisation the model should learn; confirm.
@tanuki.align
def align_extract_airbnb() -> None:
    print("Aligning...")
    # Text of one listing card with all fields run together; \xa0 is a
    # non-breaking space between the price and "night".
    airbnb1 = "Caroga Lake, New YorkRoyal Mountain Ski ResortDec 3 – 8$200\xa0night$200 per night4.99"
    assert extract_airbnb(airbnb1) == AirBnb(
        city="Caroga Lake",
        state="New York",
        dates="Dec 3 - 8",
        price=200.0,
        stars=4.99,
    )


# Airbnb search page that this example scrapes when no other URL is given.
_AIRBNB_SEARCH_URL = "https://www.airbnb.com/?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&search_mode=flex_destinations_search&flexible_trip_lengths%5B%5D=one_week&location_search=MIN_MAP_BOUNDS&monthly_start_date=2023-12-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&search_type=category_change&price_filter_num_nights=5&category_tag=Tag%3A5366"


def selenium_driver(url: str = _AIRBNB_SEARCH_URL) -> str:
    """Use selenium to scrape the airbnb url and return the page source.

    Args:
        url: Page to load. Defaults to the Airbnb search used by this example,
            so existing calls without arguments behave as before.

    Returns:
        The page source after the page has been loaded, refreshed and
        partially scrolled.

    The Chrome instance is always shut down, including when loading or
    scripting the page raises, so a failed run does not leave a browser
    process behind.
    """

    # configure webdriver
    options = Options()
    # options.add_argument('--headless')  # Enable headless mode
    # options.add_argument('--disable-gpu')  # Disable GPU acceleration

    # launch driver for the page
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        time.sleep(3)

        # refresh the page to remove the dialog modal
        driver.refresh()
        time.sleep(3)

        # Scroll 40% of the scrollable height down the page to get the rest
        # of the listings to load
        scroll_position = driver.execute_script(
            "return (document.body.scrollHeight - window.innerHeight) * 0.4;"
        )
        driver.execute_script(f"window.scrollTo(0, {scroll_position});")
        time.sleep(3)

        # extract the page source and return
        return driver.page_source
    finally:
        # Runs on success and on error alike; quit() ends the browser session.
        driver.quit()


if __name__ == "__main__":
    # Register the alignment example before any extraction is attempted.
    align_extract_airbnb()

    # Fetch the rendered page with Selenium, then hand it to Beautiful Soup.
    html = selenium_driver()
    parsed_page = BeautifulSoup(html, "html.parser")
    candidates = parsed_page.find_all("div", class_="dir dir-ltr")

    # Keep only the text of elements that look like listings: non-empty text
    # that shows a price.
    texts = (node.text for node in candidates)
    contents = [text for text in texts if text != "" and "$" in text]
    print(contents)

    # Run the Tanuki extraction on the 2nd and 3rd candidates only.
    # NOTE(review): the [1:3] slice presumably keeps the demo short and skips
    # a non-listing first match — confirm.
    print("Tanuki Time!")
    airbnbs = [extract_airbnb(content) for content in contents[1:3]]
    print(airbnbs)
13 changes: 9 additions & 4 deletions examples/web_scraper/readme.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
# Web Scraping

This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class.
This example shows how Tanuki can be used with web scraping to easily populate the desired values into a structured class.

Six examples for web scraping are provided:
Six examples for web scraping with BeautifulSoup are provided:
- [Quotes](https://quotes.toscrape.com/)
- [Countries](https://www.scrapethissite.com/pages/simple/)
- [Job Postings](https://realpython.github.io/fake-jobs/)
- [Cocktails](https://kindredcocktails.com/cocktail/old-fashioned)
- [Car Specs](https://www.cars.com/research/mazda-cx_90-2024/)
- [StreetEasy Apartments](https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan)

An additional example has been provided showing how to use Selenium with BeautifulSoup for scraping:
- [AirBnb](https://www.airbnb.com/?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&search_mode=flex_destinations_search&flexible_trip_lengths%5B%5D=one_week&location_search=MIN_MAP_BOUNDS&monthly_start_date=2023-12-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&category_tag=Tag%3A5366&search_type=category_change)

## Configuration

Ensure you have an account with OpenAI to access their underlying models.
Make sure you have an account with OpenAI to access their underlying models.

Set the following environment variables in your `.env` file:
```
OPENAI_API_KEY=sk-XXX
USER_AGENT=... (Optional and only needed for StreetEasy example)
USER_AGENT=... (Optional: only needed for StreetEasy example)
```

## Install
Expand All @@ -43,4 +46,6 @@ python cocktail.py
python cars.py

python streeteasy.py # make sure to update User-Agent!

python airbnb.py
```
3 changes: 2 additions & 1 deletion examples/web_scraper/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@ python-dotenv
openai
monkey-patch.py
pytest
beautifulsoup4
beautifulsoup4
selenium
210 changes: 210 additions & 0 deletions src/tanuki.py.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
Metadata-Version: 2.1
Name: tanuki.py
Version: 0.0.1a1.dev14+gd12f5dc.d20231101
Summary: The easiest way to build scalable LLM-powered applications, which gets cheaper and faster over time.
Home-page: https://github.com/tanuki/tanuki.py
Author: Jack Hopkins
Author-email: [email protected]
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.6
Description-Content-Type: text/markdown
License-File: LICENSE.txt
Requires-Dist: pydantic>=1.8.2
Requires-Dist: appdirs~=1.4.4
Requires-Dist: openai==0.28.1
Requires-Dist: numpy~=1.24.4
Requires-Dist: python-dotenv==1.0.0
Requires-Dist: bitarray==2.8.2
Requires-Dist: pydantic==2.4.2

# 🙈 Tanuki

The fastest and easiest way to build LLM features in python.

A *Tanuki Function* is a piece of code which replaces function stubs with LLM transformations at runtime, enabling you to drop well-typed, production-ready capabilities into your app in seconds. No more prompt wrangling. No surprises. The more often you call the function, the faster it gets.

```
@tanuki.patch
def some_function(input: TypedInput) -> TypedOutput:
"""
This is where you include the description of how your function will be used.
"""

@tanuki.align
def test_some_function(example_typed_input: TypedInput,
example_typed_output: TypedOutput):

assert similar_to(some_function(example_typed_input), example_typed_output)

```

## How it works

When you initially call a Tanuki Function during development, an LLM in a zero-shot configuration is invoked to generate the typed response. This response can be passed through to the rest of your app / stored in the DB / displayed to the user.

We register the input and outputs of the function during execution, ensuring that the outputs are correctly typed.
As your data volume increases, we distil, deploy and manage smaller models that are able to capture the desired behaviour at a lower computational cost and lower latency.

You can align the model to your use-case through *test-driven alignment*, in which you create tests which declare the desired behaviour of your function.

## Examples

In this example, we define a simple classification function, and several alignment assertions which guide the desired behaviour of the function.

```sentiment
@tanuki.patch
def classify_sentiment(msg: str) -> Optional[Literal['Good', 'Bad']]:
"""
Classifies a message from the user into Good, Bad or None.
"""

@tanuki.align
def align_classify_sentiment():
assert classify_sentiment("I love you") == 'Good'
assert classify_sentiment("I hate you") == 'Bad'
assert not classify_sentiment("Wednesdays are in the middle of the week")
```


```
@tanuki.patch
def score_sentiment(input: str) -> Annotated[int, Field(ge=0, le=10)]:
"""
Scores the input between 0-10
"""

@tanuki.align
def align_score_sentiment():
"""Register several examples to align your function"""

assert score_sentiment("I love you") == 10
assert score_sentiment("I hate you") == 0
assert score_sentiment("You're okay I guess") == 5

# This is a normal test that can be invoked
def test_score_sentiment():
"""We can test the function as normal using Pytest or Unittest"""
assert score_sentiment("I like you") == 7
```

You can define standard Pydantic classes as your output, and can optionally add descriptions using _Field_ to help inform the purpose of each field.

```python
@dataclass
class ActionItem:
goal: str = Field(description="What task must be completed")
deadline: datetime = Field(description="The date the goal needs to be achieved")

@tanuki.patch
def action_items(input: str) -> List[ActionItem]:
"""Generate a list of Action Items"""

@tanuki.align
def align_action_items():
goal = "Can you please get the presentation to me by Tuesday?"
next_tuesday = (datetime.now() + timedelta((1 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0)

assert action_items(goal) == ActionItem(goal="Prepare the presentation", deadline=next_tuesday)
```

## Test-Driven Alignment

To align the behaviour of your patched function to your needs, decorate a function with `@tanuki.align`.

```python
def test_score_sentiment():
"""We can test the function as normal using Pytest or Unittest"""
assert score_sentiment("I like you") == 7
```

You assert the behaviour of your patched function either by declaring the desired output through equality or inequality checks, or by asserting on the downstream behaviour of functions that consume its output:


(HOW CAN WE ACHIEVE THIS?)
```python
def test_score_sentiment():
"""We can test the function as normal using Pytest or Unittest"""
assert multiply_by_two(score_sentiment("I like you")) == 14
assert 2*score_sentiment("I like you") == 14
```




## Simple ToDo List App

```
from datetime import datetime
from typing import Optional, List
from pydantic import Field
from fastapi import FastAPI
import tanuki

app = FastAPI()

@dataclass
class TodoItem:
goal: str = Field(description="What task must be completed")
deadline: datetime = Field(description="The date the goal needs to be achieved")
priority: str = Field(description="Priority level of the task")
people_involved: List[str] = Field(description="Names of people involved")


@tanuki.patch
def generate_todo(input: str) -> TodoItem:
"""
Generate a TodoItem based on the natural language input.
"""

@tanuki.align
def align_generate_todo():
next_tuesday = (datetime.now() + timedelta((1 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0)
next_friday = (datetime.now() + timedelta((4 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0)

# First example
assert generate_todo("Prepare the presentation for John by next Tuesday, high priority") == TodoItem(
goal="Prepare the presentation",
deadline=next_tuesday,
priority="high",
people_involved=["John"]
)

# Second example: Different priority and deadline
assert generate_todo("Complete the report by Friday, medium priority") == TodoItem(
goal="Complete the report",
deadline=next_friday,
priority="medium",
people_involved=[]
)

# Third example: Multiple people involved
assert generate_todo("Organize the team meeting with Emily and Sarah for next Tuesday") == TodoItem(
goal="Organize the team meeting",
deadline=next_tuesday,
priority="",
people_involved=["Emily", "Sarah"]
)

# Fourth example: No deadline
assert generate_todo("Buy groceries, low priority") == TodoItem(
goal="Buy groceries",
deadline=None,
priority="low",
people_involved=[]
)

# Fifth example: No priority or people involved
assert generate_todo("Read the new book") == TodoItem(
goal="Read the new book",
deadline=None,
priority="",
people_involved=[]
)

@app.post("/todo/", response_model=TodoItem)
async def create_todo(input: str):
return generate_todo(input)

```
Loading