fix: better playwright installation handling

PeriniM committed Jan 6, 2025
1 parent e374e05 commit f6009d1
Showing 6 changed files with 204 additions and 143 deletions.
87 changes: 47 additions & 40 deletions README.md
@@ -24,21 +24,6 @@ Just say which information you want to extract and the library will do it for you!
<img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/sgai-hero.png" alt="ScrapeGraphAI Hero" style="width: 100%;">
</p>

-## 🔗 ScrapeGraph API & SDKs
-If you are looking for a quick solution to integrate ScrapeGraph in your system, check out our powerful API [here!](https://dashboard.scrapegraphai.com/login)
-
-<p align="center">
-  <img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/api-banner.png" alt="ScrapeGraph API Banner" style="width: 100%;">
-</p>
-
-We offer SDKs in both Python and Node.js, making it easy to integrate into your projects. Check them out below:
-
-| SDK | Language | GitHub Link |
-|-----------|----------|-----------------------------------------------------------------------------|
-| Python SDK | Python | [scrapegraph-py](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-py) |
-| Node.js SDK | Node.js | [scrapegraph-js](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-js) |
-
-The Official API Documentation can be found [here](https://docs.scrapegraphai.com/).

## 🚀 Quick install

@@ -47,6 +32,7 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai

+# IMPORTANT (to fetch webpage content)
playwright install
```
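If you only need the default browser, Playwright can also fetch a single engine; a lighter variant (Chromium is the default `browser_name` used by the loader):

```bash
# Optional lighter install: fetch only the Chromium build,
# which ScrapeGraphAI drives by default.
playwright install chromium
```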

@@ -84,13 +70,12 @@ The most common one is the `SmartScraperGraph`, which extracts information from


```python
-import json
from scrapegraphai.graphs import SmartScraperGraph

# Define the configuration for the scraping pipeline
graph_config = {
    "llm": {
-        "api_key": "YOUR_OPENAI_APIKEY",
+        "api_key": "YOUR_OPENAI_API_KEY",
"model": "openai/gpt-4o-mini",
},
"verbose": True,
@@ -99,33 +84,45 @@ graph_config = {

# Create the SmartScraperGraph instance
smart_scraper_graph = SmartScraperGraph(
-    prompt="Extract me all the news from the website",
-    source="https://www.wired.com",
+    prompt="Extract useful information from the webpage, including a description of what the company does, founders and social media links",
+    source="https://scrapegraphai.com/",
config=graph_config
)

# Run the pipeline
result = smart_scraper_graph.run()

+import json
print(json.dumps(result, indent=4))
```

The output will be a dictionary like the following:

```python
-"result": {
-    "news": [
-        {
-            "title": "The New Jersey Drone Mystery May Not Actually Be That Mysterious",
-            "link": "https://www.wired.com/story/new-jersey-drone-mystery-maybe-not-drones/",
-            "author": "Lily Hay Newman"
-        },
-        {
-            "title": "Former ByteDance Intern Accused of Sabotage Among Winners of Prestigious AI Award",
-            "link": "https://www.wired.com/story/bytedance-intern-best-paper-neurips/",
-            "author": "Louise Matsakis"
-        },
-        ...
-    ]
+{
+    "description": "ScrapeGraphAI transforms websites into clean, organized data for AI agents and data analytics. It offers an AI-powered API for effortless and cost-effective data extraction.",
+    "founders": [
+        {
+            "name": "Marco Perini",
+            "role": "Founder & Technical Lead",
+            "linkedin": "https://www.linkedin.com/in/perinim/"
+        },
+        {
+            "name": "Marco Vinciguerra",
+            "role": "Founder & Software Engineer",
+            "linkedin": "https://www.linkedin.com/in/marco-vinciguerra-7ba365242/"
+        },
+        {
+            "name": "Lorenzo Padoan",
+            "role": "Founder & Product Engineer",
+            "linkedin": "https://www.linkedin.com/in/lorenzo-padoan-4521a2154/"
+        }
+    ],
+    "social_media_links": {
+        "linkedin": "https://www.linkedin.com/company/101881123",
+        "twitter": "https://x.com/scrapegraphai",
+        "github": "https://github.com/ScrapeGraphAI/Scrapegraph-ai"
+    }
+}
```
There are other pipelines that can be used to extract information from multiple pages, generate Python scripts, or even generate audio files.
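For instance, scraping several pages at once follows the same pattern (a minimal sketch, assuming the `SmartScraperMultiGraph` pipeline and the `graph_config` defined above; the URLs are illustrative):

```python
from scrapegraphai.graphs import SmartScraperMultiGraph

# Same prompt/config pattern as SmartScraperGraph, but `source` is a list.
multi_scraper_graph = SmartScraperMultiGraph(
    prompt="Extract the page title and a one-sentence summary",
    source=[
        "https://scrapegraphai.com/",
        "https://github.com/ScrapeGraphAI/Scrapegraph-ai",
    ],
    config=graph_config,
)

result = multi_scraper_graph.run()
print(result)
```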
@@ -145,20 +142,30 @@ It is possible to use different LLM through APIs, such as **OpenAI**, **Groq**,

Remember to have [Ollama](https://ollama.com/) installed and download the models using the **ollama pull** command, if you want to use local models.
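As an example, switching the pipeline to a local model is only a config change (a sketch, assuming the model was pulled with `ollama pull llama3.2` and that Ollama serves on its default port):

```python
# Local-model variant of graph_config; the model tag and endpoint below
# are assumptions based on Ollama's defaults.
graph_config = {
    "llm": {
        "model": "ollama/llama3.2",            # any locally pulled model
        "base_url": "http://localhost:11434",  # default Ollama endpoint
    },
    "verbose": True,
}
```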

-## 🔍 Demo
-Official streamlit demo:
-
-[![My Skills](https://skillicons.dev/icons?i=react)](https://scrapegraph-demo-demo.streamlit.app)
-
-Try it directly on the web using Google Colab:
+## 📖 Documentation

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1sEZBonBMGP44CtO6GQTwAlL0BGJXjtfd?usp=sharing)

-## 📖 Documentation

The documentation for ScrapeGraphAI can be found [here](https://scrapegraph-ai.readthedocs.io/en/latest/).
Check out also the Docusaurus [here](https://docs-oss.scrapegraphai.com/).

+## 🔗 ScrapeGraph API & SDKs
+If you are looking for a quick solution to integrate ScrapeGraph in your system, check out our powerful API [here!](https://dashboard.scrapegraphai.com/login)
+
+<p align="center">
+  <img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/api-banner.png" alt="ScrapeGraph API Banner" style="width: 100%;">
+</p>
+
+We offer SDKs in both Python and Node.js, making it easy to integrate into your projects. Check them out below:
+
+| SDK | Language | GitHub Link |
+|-----------|----------|-----------------------------------------------------------------------------|
+| Python SDK | Python | [scrapegraph-py](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-py) |
+| Node.js SDK | Node.js | [scrapegraph-js](https://github.com/ScrapeGraphAI/scrapegraph-sdk/tree/main/scrapegraph-js) |
+
+The Official API Documentation can be found [here](https://docs.scrapegraphai.com/).
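As a taste of the Python SDK (a sketch based on its quick-start; `Client` and `smartscraper` are the documented entry points, but treat exact names as assumptions if your SDK version differs):

```python
from scrapegraph_py import Client

client = Client(api_key="your-sgai-api-key")  # placeholder key

# One hosted call replaces running the whole local pipeline.
response = client.smartscraper(
    website_url="https://scrapegraphai.com/",
    user_prompt="Extract a description of what the company does",
)
print(response)
```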

## 🏆 Sponsors
<div style="text-align: center;">
<a href="https://2ly.link/1zaXG">
4 changes: 0 additions & 4 deletions pyproject.toml
@@ -32,13 +32,9 @@ dependencies = [
    "fastembed>=0.3.6",
    "semchunk>=2.2.0",
    "transformers>=4.44.2",
-    "transformers>=4.44.2",
    "googlesearch-python>=1.2.5",
-    "async-timeout>=4.0.3",
-    "transformers>=4.44.2",
-    "googlesearch-python>=1.2.5",
"simpleeval>=1.0.0",
"async_timeout>=4.0.3",
"scrapegraph-py>=1.7.0"
]

76 changes: 39 additions & 37 deletions scrapegraphai/docloaders/chromium.py
@@ -23,9 +23,6 @@ class ChromiumLoader(BaseLoader):
requires_js_support: Flag to determine if JS rendering is required.
"""

-    RETRY_LIMIT = 3
-    TIMEOUT = 10

def __init__(
self,
urls: List[str],
@@ -37,6 +34,8 @@ def __init__(
requires_js_support: bool = False,
storage_state: Optional[str] = None,
browser_name: str = "chromium", #default chromium
+        retry_limit: int = 1,
+        timeout: int = 10,
**kwargs: Any,
):
"""Initialize the loader with a list of URL paths.
@@ -47,6 +46,8 @@ def __init__(
proxy: A dictionary containing proxy information; None disables protection.
urls: A list of URLs to scrape content from.
requires_js_support: Whether to use JS rendering for scraping.
+            retry_limit: Maximum number of retry attempts for scraping. Defaults to 1.
+            timeout: Maximum time in seconds to wait for scraping. Defaults to 10.
kwargs: A dictionary containing additional browser kwargs.
Raises:
@@ -68,12 +69,17 @@ def __init__(
self.requires_js_support = requires_js_support
self.storage_state = storage_state
self.browser_name = browser_name
+        self.retry_limit = retry_limit
+        self.timeout = timeout

async def scrape(self, url:str) -> str:
if self.backend == "playwright":
return await self.ascrape_playwright(url)
elif self.backend == "selenium":
-            return await self.ascrape_undetected_chromedriver(url)
+            try:
+                return await self.ascrape_undetected_chromedriver(url)
+            except Exception as e:
+                raise ValueError(f"Failed to scrape with undetected chromedriver: {e}")
else:
raise ValueError(f"Unsupported backend: {self.backend}")

@@ -97,9 +103,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
results = ""
attempt = 0

-        while attempt < self.RETRY_LIMIT:
+        while attempt < self.retry_limit:
try:
-                async with async_timeout.timeout(self.TIMEOUT):
+                async with async_timeout.timeout(self.timeout):
# Handling browser selection
if self.backend == "selenium":
if self.browser_name == "chromium":
@@ -134,9 +140,9 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str:
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
-                if attempt == self.RETRY_LIMIT:
+                if attempt == self.retry_limit:
                    results = (
-                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                        f"Error: Network error after {self.retry_limit} attempts - {e}"
)
finally:
driver.quit()
@@ -204,7 +210,7 @@ async def ascrape_playwright_scroll(
results = ""
attempt = 0

-        while attempt < self.RETRY_LIMIT:
+        while attempt < self.retry_limit:
try:
async with async_playwright() as p:
browser = None
@@ -268,8 +274,8 @@ async def ascrape_playwright_scroll(
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
-                if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                if attempt == self.retry_limit:
+                    results = f"Error: Network error after {self.retry_limit} attempts - {e}"
finally:
await browser.close()

@@ -283,7 +289,11 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
url (str): The URL to scrape.
Returns:
-            str: The scraped HTML content or an error message if an exception occurs.
+            str: The scraped HTML content
+
+        Raises:
+            RuntimeError: When retry limit is reached without successful scraping
+            ValueError: When an invalid browser name is provided
"""
from playwright.async_api import async_playwright
from undetected_playwright import Malenia
@@ -292,9 +302,9 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
results = ""
attempt = 0

-        while attempt < self.RETRY_LIMIT:
+        while attempt < self.retry_limit:
try:
-                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                async with async_playwright() as p, async_timeout.timeout(self.timeout):
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
@@ -315,41 +325,37 @@ async def ascrape_playwright(self, url: str, browser_name: str = "chromium") ->
await page.wait_for_load_state(self.load_state)
results = await page.content()
logger.info("Content scraped")
-                    break
+                    return results
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
-                if attempt == self.RETRY_LIMIT:
-                    results = f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
+                if attempt == self.retry_limit:
+                    raise RuntimeError(f"Failed to scrape after {self.retry_limit} attempts: {str(e)}")
            finally:
+                if "browser" in locals():
+                    await browser.close()


-        return results


-        await browser.close()

-    async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium") -> str:
+    async def ascrape_with_js_support(self, url: str, browser_name: str = "chromium") -> str:
"""
Asynchronously scrape the content of a given URL by rendering JavaScript using Playwright.
Args:
url (str): The URL to scrape.
Returns:
-            str: The fully rendered HTML content after JavaScript execution,
-            or an error message if an exception occurs.
+            str: The fully rendered HTML content
+
+        Raises:
+            RuntimeError: When retry limit is reached without successful scraping
+            ValueError: When an invalid browser name is provided
"""
from playwright.async_api import async_playwright

logger.info(f"Starting scraping with JavaScript support for {url}...")
results = ""
attempt = 0

-        while attempt < self.RETRY_LIMIT:
+        while attempt < self.retry_limit:
try:
-                async with async_playwright() as p, async_timeout.timeout(self.TIMEOUT):
+                async with async_playwright() as p, async_timeout.timeout(self.timeout):
browser = None
if browser_name == "chromium":
browser = await p.chromium.launch(
Expand All @@ -368,19 +374,15 @@ async def ascrape_with_js_support(self, url: str , browser_name:str = "chromium"
await page.goto(url, wait_until="networkidle")
results = await page.content()
logger.info("Content scraped after JavaScript rendering")
-                    break
+                    return results
except (aiohttp.ClientError, asyncio.TimeoutError, Exception) as e:
attempt += 1
logger.error(f"Attempt {attempt} failed: {e}")
-                if attempt == self.RETRY_LIMIT:
-                    results = (
-                        f"Error: Network error after {self.RETRY_LIMIT} attempts - {e}"
-                    )
+                if attempt == self.retry_limit:
+                    raise RuntimeError(f"Failed to scrape after {self.retry_limit} attempts: {str(e)}")
finally:
await browser.close()

-        return results

def lazy_load(self) -> Iterator[Document]:
"""
Lazily load text content from the provided URLs.
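With the constants now instance parameters, retry behavior is tunable per loader; a minimal usage sketch against the API shown above (URL and values are illustrative):

```python
import asyncio

from scrapegraphai.docloaders.chromium import ChromiumLoader

# retry_limit and timeout replace the old class-level RETRY_LIMIT / TIMEOUT
# constants, so each loader instance can tune its own retry behavior.
loader = ChromiumLoader(
    ["https://example.com"],  # urls to load
    backend="playwright",
    headless=True,
    retry_limit=3,  # retry each page up to 3 times
    timeout=30,     # allow 30 seconds per attempt
)

# ascrape_playwright now raises RuntimeError once the retry limit is exhausted.
html = asyncio.run(loader.ascrape_playwright("https://example.com"))
print(html[:200])
```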
4 changes: 1 addition & 3 deletions scrapegraphai/nodes/generate_answer_node.py
@@ -12,10 +12,8 @@
from langchain_community.chat_models import ChatOllama
from tqdm import tqdm
from .base_node import BaseNode
-from ..utils.output_parser import get_structured_output_parser, get_pydantic_output_parser
+from ..utils.output_parser import get_pydantic_output_parser
from requests.exceptions import Timeout
-from langchain.callbacks.manager import CallbackManager
-from langchain.callbacks import get_openai_callback
from ..prompts import (
TEMPLATE_CHUNKS, TEMPLATE_NO_CHUNKS, TEMPLATE_MERGE,
TEMPLATE_CHUNKS_MD, TEMPLATE_NO_CHUNKS_MD, TEMPLATE_MERGE_MD
3 changes: 1 addition & 2 deletions scrapegraphai/utils/llm_callback_manager.py
@@ -7,8 +7,7 @@

import threading
from contextlib import contextmanager
-from langchain_community.callbacks import get_openai_callback
-from langchain_community.callbacks.manager import get_bedrock_anthropic_callback
+from langchain_community.callbacks.manager import get_openai_callback, get_bedrock_anthropic_callback
from langchain_openai import ChatOpenAI, AzureChatOpenAI
from langchain_aws import ChatBedrock
from .custom_callback import get_custom_callback
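The consolidated import keeps the usual context-manager pattern intact; a short sketch of how such a callback is typically consumed (the model name is illustrative, and an OpenAI key is assumed in the environment):

```python
from langchain_community.callbacks.manager import get_openai_callback
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o-mini")  # assumes OPENAI_API_KEY is set

# The callback meters token usage and cost for calls made inside the block.
with get_openai_callback() as cb:
    llm.invoke("Say hi in one word.")
print(cb.total_tokens, cb.total_cost)
```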