From 18306597c953b12649e36268bcc3336f668d52d2 Mon Sep 17 00:00:00 2001 From: Ben Magolan Date: Mon, 27 Nov 2023 02:12:19 -0500 Subject: [PATCH 1/2] selenium webscraping example --- examples/web_scraper/airbnb.py | 98 +++++++++++++++++++++++++++ examples/web_scraper/readme.md | 11 ++- examples/web_scraper/requirements.txt | 3 +- 3 files changed, 108 insertions(+), 4 deletions(-) create mode 100644 examples/web_scraper/airbnb.py diff --git a/examples/web_scraper/airbnb.py b/examples/web_scraper/airbnb.py new file mode 100644 index 0000000..2c812af --- /dev/null +++ b/examples/web_scraper/airbnb.py @@ -0,0 +1,98 @@ +import openai +import os +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from pydantic import BaseModel +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +import time +from typing import Optional + +load_dotenv() + +from monkey_patch.monkey import Monkey as monkey + + +openai.api_key = os.getenv("OPENAI_API_KEY") + + +class AirBnb(BaseModel): + city: str + state: str + dates: str + price: float + stars: float + + +@monkey.patch +def extract_airbnb(content: str) -> Optional[AirBnb]: + """ + Examine the content string and extract the airbnb details for the city, state, + dates available, nightly price, and stars rating. + """ + + +@monkey.align +def align_extract_airbnb() -> None: + print("Aligning...") + airbnb1 = "Caroga Lake, New YorkRoyal Mountain Ski ResortDec 3 – 8$200\xa0night$200 per night4.99" + assert extract_airbnb(airbnb1) == AirBnb( + city="Caroga Lake", + state="New York", + dates="Dec 3 - 8", + price=200.0, + stars=4.99, + ) + + +def selenium_driver() -> str: + """Use selenium to scrape the airbnb url and return the page source.""" + + # configure webdriver + options = Options() + # options.add_argument('--headless') # Enable headless mode + # options.add_argument('--disable-gpu') # Disable GPU acceleration + + # launch driver for the page + driver = webdriver.Chrome(options=options) + driver.get("https://www.airbnb.com/?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&search_mode=flex_destinations_search&flexible_trip_lengths%5B%5D=one_week&location_search=MIN_MAP_BOUNDS&monthly_start_date=2023-12-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&search_type=category_change&price_filter_num_nights=5&category_tag=Tag%3A5366") + time.sleep(3) + + # refresh the page to remove the dialog modal + driver.refresh() + time.sleep(3) + + # Scroll halfway down page to get rest of listings to load + scroll_position = driver.execute_script("return (document.body.scrollHeight - window.innerHeight) * 0.4;") + driver.execute_script(f"window.scrollTo(0, {scroll_position});") + time.sleep(3) + + # extract the page source and return + page_source = driver.page_source + driver.quit() + return page_source + + +if __name__ == '__main__': + + # Align the function + align_extract_airbnb() + + # Selenium driver to scrape the url and extract the airbnb information + page_source = selenium_driver() + + # Beautiful Soup to parse the page source + soup = BeautifulSoup(page_source, 'html.parser') + entities = soup.find_all('div', class_="dir dir-ltr") + + # Remove entries that are not airbnb listings + contents = [entity.text for entity in entities if entity.text != ""] + contents = [c for c in contents if "$" in c] + print(contents) + + # Tanuki to extract the airbnb information + print("Tanuki Time!") + airbnbs = [] + for content in contents[1:3]: + airbnbs.append(extract_airbnb(content)) + print(airbnbs) diff --git a/examples/web_scraper/readme.md b/examples/web_scraper/readme.md index a953c84..2f59986 100644 --- a/examples/web_scraper/readme.md +++ b/examples/web_scraper/readme.md @@ -2,7 +2,7 @@ This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class. -Six examples for web scraping are provided: +Six examples for web scraping with BeautifulSoup are provided: - [Quotes](https://quotes.toscrape.com/) - [Countries](https://www.scrapethissite.com/pages/simple/) - [Job Postings](https://realpython.github.io/fake-jobs/) @@ -10,14 +10,17 @@ Six examples for web scraping are provided: - [Car Specs](https://www.cars.com/research/mazda-cx_90-2024/) - [StreetEasy Apartments](https://streeteasy.com/2-bedroom-apartments-for-rent/manhattan) +An additional example has been provided showing how to use Selenium with BeautifulSoup for scraping: +- [AirBnb](https://www.airbnb.com/?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&search_mode=flex_destinations_search&flexible_trip_lengths%5B%5D=one_week&location_search=MIN_MAP_BOUNDS&monthly_start_date=2023-12-01&monthly_length=3&price_filter_input_type=0&channel=EXPLORE&category_tag=Tag%3A5366&search_type=category_change) + ## Configuration -Ensure you have an account with OpenAI to access their underlying models. +Make sure you have an account with OpenAI to access their underlying models. Set the following environment variables in your `.env` file: ``` OPENAI_API_KEY=sk-XXX -USER_AGENT=... (Optional and only needed for StreetEasy example) +USER_AGENT=... (Optional: only needed for StreetEasy example) ``` ## Install @@ -43,4 +46,6 @@ python cocktail.py python cars.py python streeteasy.py # make sure to update User-Agent! + +python airbnb.py ``` diff --git a/examples/web_scraper/requirements.txt b/examples/web_scraper/requirements.txt index 1a444ae..de62692 100644 --- a/examples/web_scraper/requirements.txt +++ b/examples/web_scraper/requirements.txt @@ -2,4 +2,5 @@ python-dotenv openai monkey-patch.py pytest -beautifulsoup4 \ No newline at end of file +beautifulsoup4 +selenium \ No newline at end of file From f4a2e2ea9bd515b36189250cf7f9600f1470b902 Mon Sep 17 00:00:00 2001 From: Jack Hopkins Date: Mon, 27 Nov 2023 13:05:40 +0000 Subject: [PATCH 2/2] Refactor MonkeyPatch to use Tanuki Description: --- examples/web_scraper/airbnb.py | 6 +- examples/web_scraper/readme.md | 2 +- src/tanuki.py.egg-info/PKG-INFO | 210 ++++++++++++++++++++ src/tanuki.py.egg-info/SOURCES.txt | 60 ++++++ src/tanuki.py.egg-info/dependency_links.txt | 1 + src/tanuki.py.egg-info/requires.txt | 7 + src/tanuki.py.egg-info/top_level.txt | 1 + 7 files changed, 283 insertions(+), 4 deletions(-) create mode 100644 src/tanuki.py.egg-info/PKG-INFO create mode 100644 src/tanuki.py.egg-info/SOURCES.txt create mode 100644 src/tanuki.py.egg-info/dependency_links.txt create mode 100644 src/tanuki.py.egg-info/requires.txt create mode 100644 src/tanuki.py.egg-info/top_level.txt diff --git a/examples/web_scraper/airbnb.py b/examples/web_scraper/airbnb.py index 2c812af..c680f59 100644 --- a/examples/web_scraper/airbnb.py +++ b/examples/web_scraper/airbnb.py @@ -10,7 +10,7 @@ load_dotenv() -from monkey_patch.monkey import Monkey as monkey +import tanuki openai.api_key = os.getenv("OPENAI_API_KEY") @@ -24,7 +24,7 @@ class AirBnb(BaseModel): stars: float -@monkey.patch +@tanuki.patch def extract_airbnb(content: str) -> Optional[AirBnb]: """ Examine the content string and extract the airbnb details for the city, state, @@ -32,7 +32,7 @@ def extract_airbnb(content: str) -> Optional[AirBnb]: """ -@monkey.align +@tanuki.align def align_extract_airbnb() -> None: print("Aligning...") airbnb1 = "Caroga Lake, New YorkRoyal Mountain Ski ResortDec 3 – 8$200\xa0night$200 per night4.99" diff --git a/examples/web_scraper/readme.md b/examples/web_scraper/readme.md index 2f59986..462d668 100644 --- a/examples/web_scraper/readme.md +++ b/examples/web_scraper/readme.md @@ -1,6 +1,6 @@ # Web Scraping -This example shows how MonkeyPatch can be used with web scraping to easily populate the desired values into a structured class. +This example shows how Tanuki can be used with web scraping to easily populate the desired values into a structured class. Six examples for web scraping with BeautifulSoup are provided: - [Quotes](https://quotes.toscrape.com/) diff --git a/src/tanuki.py.egg-info/PKG-INFO b/src/tanuki.py.egg-info/PKG-INFO new file mode 100644 index 0000000..6b436cf --- /dev/null +++ b/src/tanuki.py.egg-info/PKG-INFO @@ -0,0 +1,210 @@ +Metadata-Version: 2.1 +Name: tanuki.py +Version: 0.0.1a1.dev14+gd12f5dc.d20231101 +Summary: The easiest way to build scalable LLM-powered applications, which gets cheaper and faster over time. +Home-page: https://github.com/tanuki/tanuki.py +Author: Jack Hopkins +Author-email: jack.hopkins@me.com +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Requires-Python: >=3.6 +Description-Content-Type: text/markdown +License-File: LICENSE.txt +Requires-Dist: pydantic>=1.8.2 +Requires-Dist: appdirs~=1.4.4 +Requires-Dist: openai==0.28.1 +Requires-Dist: numpy~=1.24.4 +Requires-Dist: python-dotenv==1.0.0 +Requires-Dist: bitarray==2.8.2 +Requires-Dist: pydantic==2.4.2 + +# 🙈 Tanuki + +The fastest and easiest way to build LLM features in python. + +A *Tanuki Function* is a piece of code which replaces function stubs with LLM transformations at runtime, enabling you to drop in well-typed, production-ready capabilities into your app in seconds. No more prompt wrangling. No surprises. The more often you call the function, the faster it gets. + +``` +@tanuki.patch +def some_function(input: TypedInput) -> TypedOutput: + """ + This is where you include the description of how your function will be used. + """ + +@tanuki.align +def test_some_function(example_typed_input: TypedInput, + example_typed_output: TypedOutput): + + assert similar_to(some_function(example_typed_input), example_typed_output) + +``` + +## How it works + +When you initially call a Tanuki Function during development, an LLM in a zero-shot configuration is invoked to generate the typed response. This response can be passed through to the rest of your app / stored in the DB / displayed to the user. + +We register the input and outputs of the function during execution, ensuring that the outputs are correctly typed. +As your data volume increases, we distil, deploy and manage smaller models that are able to capture the desired behaviour at a lower computational cost and lower latency. + +You can align the model to your use-case through *test-driven alignment*, in which you create tests which declare the desired behaviour of your function. + +## Examples + +In this example, we define a simple classification function, and several alignment assertions which guide the desired behaviour of the function. + +```sentiment +@tanuki.patch +def classify_sentiment(msg: str) -> Optional[Literal['Good', 'Bad']]: + """ + Classifies a message from the user into Good, Bad or None. + """ + +@tanuki.align +def align_classify_sentiment(): + assert classify_sentiment("I love you") == 'Good' + assert classify_sentiment("I hate you") == 'Bad' + assert not classify_sentiment("Wednesdays are in the middle of the week") +``` + + +``` +@tanuki.patch +def score_sentiment(input: str) -> Annotated[int, Field(gt=0, lt=10)]: + """ + Scores the input between 0-10 + """ + +@tanuki.align +def align_score_sentiment(): + """Register several examples to align your function""" + + assert score_sentiment("I love you") == 10 + assert score_sentiment("I hate you") == 0 + assert score_sentiment("You're okay I guess") == 5 + +# This is a normal test that can be invoked +def test_score_sentiment(): + """We can test the function as normal using Pytest or Unittest""" + assert score_sentiment("I like you") == 7 +``` + +You can define standard Pydantic classes as your output, and can optionally add descriptions using _Field_ to help inform the purpose of each field. + +```python +@dataclass +class ActionItem: + goal: str = Field(description="What task must be completed") + deadline: datetime = Field(description="The date the goal needs to be achieved") + +@tanuki.patch +def action_items(input: str) -> List[ActionItem]: + """Generate a list of Action Items""" + +@tanuki.align +def align_action_items(): + goal = "Can you please get the presentation to me by Tuesday?" + next_tuesday = (datetime.now() + timedelta((1 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0) + + assert action_items(goal) == ActionItem(goal="Prepare the presentation", deadline=next_tuesday) +``` + +## Test-Driven Alignment + +To align the behaviour of your patched function to your needs, decorate a function with `@align`. + +```python +def test_score_sentiment(): + """We can test the function as normal using Pytest or Unittest""" + assert score_sentiment("I like you") == 7 +``` + +You assert the behaviour of your patched function either declaring the desired output through equality or inequality checking, or the downstream behaviour of consequent functions: + + +(HOW CAN WE ACHIEVE THIS?) +```python +def test_score_sentiment(): + """We can test the function as normal using Pytest or Unittest""" + assert multiply_by_two(score_sentiment("I like you")) == 14 + assert 2*score_sentiment("I like you") == 14 +``` + + + + +## Simple ToDo List App + +``` +from datetime import datetime +from typing import Optional, List +from pydantic import Field +from fastapi import FastAPI +import munk + +app = FastAPI() + +@dataclass +class TodoItem: + goal: str = Field(description="What task must be completed") + deadline: datetime = Field(description="The date the goal needs to be achieved") + priority: str = Field(description="Priority level of the task") + people_involved: List[str] = Field(description="Names of people involved") + + +@tanuki.func +def generate_todo(input: str) -> TodoItem: + """ + Generate a TodoItem based on the natural language input. + """ + +@tanuki.align +def align_generate_todo(): + next_tuesday = (datetime.now() + timedelta((1 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0) + next_friday = (datetime.now() + timedelta((4 - datetime.now().weekday() + 7) % 7)).replace(hour=0, minute=0, second=0, microsecond=0) + + # First example + assert generate_todo("Prepare the presentation for John by next Tuesday, high priority") == TodoItem( + goal="Prepare the presentation", + deadline=next_tuesday, + priority="high", + people_involved=["John"] + ) + + # Second example: Different priority and deadline + assert generate_todo("Complete the report by Friday, medium priority") == TodoItem( + goal="Complete the report", + deadline=next_friday, + priority="medium", + people_involved=[] + ) + + # Third example: Multiple people involved + assert generate_todo("Organize the team meeting with Emily and Sarah for next Tuesday") == TodoItem( + goal="Organize the team meeting", + deadline=next_tuesday, + priority="", + people_involved=["Emily", "Sarah"] + ) + + # Fourth example: No deadline + assert generate_todo("Buy groceries, low priority") == TodoItem( + goal="Buy groceries", + deadline=None, + priority="low", + people_involved=[] + ) + + # Fifth example: No priority or people involved + assert generate_todo("Read the new book") == TodoItem( + goal="Read the new book", + deadline=None, + priority="", + people_involved=[] + ) + +@app.post("/todo/", response_model=TodoItem) +async def create_todo(input: str): + return generate_todo(input) + +``` diff --git a/src/tanuki.py.egg-info/SOURCES.txt b/src/tanuki.py.egg-info/SOURCES.txt new file mode 100644 index 0000000..a659d1f --- /dev/null +++ b/src/tanuki.py.egg-info/SOURCES.txt @@ -0,0 +1,60 @@ +.gitignore +LICENSE.txt +pyproject.toml +readme.md +requirements.txt +setup.cfg +.github/workflows/publish.yml +.idea/.gitignore +.idea/misc.xml +.idea/modules.xml +.idea/monkeyFunctions.iml +.idea/vcs.xml +.idea/inspectionProfiles/Project_Default.xml +.idea/inspectionProfiles/profiles_settings.xml +examples/clean_language/main.py +examples/score_sentiment/main.py +examples/semantic_sql/main.py +examples/stock_winners/main.py +examples/todolist/backend/main.py +examples/todolist/backend/requirements.txt +examples/todolist/backend/start.sh +examples/todolist/backend/todo_item.py +examples/todolist/backend/__pycache__/main.cpython-39.pyc +examples/todolist/backend/__pycache__/todo_item.cpython-311.pyc +examples/todolist/frontend/src.js +src/tanuki/__init__.py +src/tanuki/assertion_visitor.py +src/tanuki/bloom_filter.py +src/tanuki/function_modeler.py +src/tanuki/register.py +src/tanuki/repair.py +src/tanuki/utils.py +src/tanuki/validator.py +src/tanuki.py.egg-info/PKG-INFO +src/tanuki.py.egg-info/SOURCES.txt +src/tanuki.py.egg-info/dependency_links.txt +src/tanuki.py.egg-info/requires.txt +src/tanuki.py.egg-info/top_level.txt +src/tanuki/models/__init__.py +src/tanuki/models/function_description.py +src/tanuki/models/function_example.py +src/tanuki/models/language_model_output.py +tests/test_bloom_filter.py +tests/test_hash.py +tests/test_token_counter.py +tests/test_align/test_align_class.py +tests/test_align/test_align_global.py +tests/test_align/.align/b01fbd89d35b7eef4015fe0c6e363969 +tests/test_align/.align/c364f14f40084510581a3418aaab4037 +tests/test_assertion_visitor/test_mock.py +tests/test_load/test_load_trackers.py +tests/test_patch/test_classification.py +tests/test_patch/test_finance.py +tests/test_patch/test_regression.py +tests/test_validator/test_instantiate.py +tests/test_validator/test_is_base_type.py +tests/test_validator/test_validate_base_type.py +tests/test_validator/test_validate_output.py +tests/test_validator/test_validate_value.py +tests/test_validator/test_validator.py \ No newline at end of file diff --git a/src/tanuki.py.egg-info/dependency_links.txt b/src/tanuki.py.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/tanuki.py.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/tanuki.py.egg-info/requires.txt b/src/tanuki.py.egg-info/requires.txt new file mode 100644 index 0000000..e192fdf --- /dev/null +++ b/src/tanuki.py.egg-info/requires.txt @@ -0,0 +1,7 @@ +pydantic>=1.8.2 +appdirs~=1.4.4 +openai==0.28.1 +numpy~=1.24.4 +python-dotenv==1.0.0 +bitarray==2.8.2 +pydantic==2.4.2 diff --git a/src/tanuki.py.egg-info/top_level.txt b/src/tanuki.py.egg-info/top_level.txt new file mode 100644 index 0000000..d9bd6db --- /dev/null +++ b/src/tanuki.py.egg-info/top_level.txt @@ -0,0 +1 @@ +tanuki