chore: made some libs optional
PeriniM committed Jan 6, 2025
1 parent 54c69a2 commit 5cdf055
Showing 17 changed files with 65 additions and 1,393 deletions.
26 changes: 1 addition & 25 deletions README.md
@@ -32,36 +32,12 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai

# IMPORTANT (to fetch webpage content)
# IMPORTANT (to fetch websites content)
playwright install
```

**Note**: it is recommended to install the library in a virtual environment to avoid conflicts with other libraries 🐱

<details>
<summary><b>Optional Dependencies</b></summary>
Additional dependencies can be added while installing the library:

- <b>More Language Models</b>: additional language models are installed, such as Fireworks, Groq, Anthropic, Hugging Face, and Nvidia AI Endpoints.

This group allows you to use additional language models like Fireworks, Groq, Anthropic, Together AI, Hugging Face, and Nvidia AI Endpoints.
```bash
pip install scrapegraphai[other-language-models]
```
- <b>Semantic Options</b>: this group includes tools for advanced semantic processing, such as Graphviz.

```bash
pip install scrapegraphai[more-semantic-options]
```

- <b>Browsers Options</b>: this group includes additional browser management tools/services, such as Browserbase.

```bash
pip install scrapegraphai[more-browser-options]
```

</details>


## 💻 Usage
There are multiple standard scraping pipelines that can be used to extract information from a website (or local file).
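For readers following along, the virtual-environment note kept in this README section amounts to something like the following (a minimal sketch; the environment name `.venv` is an arbitrary choice):

```bash
python -m venv .venv          # create an isolated environment
source .venv/bin/activate     # activate it (on Windows: .venv\Scripts\activate)
pip install scrapegraphai     # base install
playwright install            # download the browsers needed to fetch websites
```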
25 changes: 0 additions & 25 deletions docs/turkish.md
@@ -31,31 +31,6 @@ playwright install

**Not**: Diğer kütüphanelerle çakışmaları önlemek için kütüphaneyi sanal bir ortamda kurmanız önerilir 🐱

<details>
<summary><b>Opsiyonel Bağımlılıklar</b></summary>
Kütüphaneyi kurarken ek bağımlılıklar ekleyebilirsiniz:

- **Daha Fazla Dil Modeli**: Fireworks, Groq, Anthropic, Hugging Face ve Nvidia AI Endpoints gibi ek dil modelleri kurulur.

Bu grup, Fireworks, Groq, Anthropic, Together AI, Hugging Face ve Nvidia AI Endpoints gibi ek dil modellerini kullanmanızı sağlar.

```bash
pip install scrapegraphai[other-language-models]
```

- **Semantik Seçenekler**: Graphviz gibi gelişmiş semantik işleme araçlarını içerir.

```bash
pip install scrapegraphai[more-semantic-options]
```

- **Tarayıcı Seçenekleri**: Browserbase gibi ek tarayıcı yönetim araçları/hizmetlerini içerir.

```bash
pip install scrapegraphai[more-browser-options]
```

</details>

## 💻 Kullanım

2 changes: 1 addition & 1 deletion examples/openai/depth_search_graph_openai.py
@@ -7,7 +7,7 @@

load_dotenv()

openai_key = os.getenv("OPENAI_APIKEY")
openai_key = os.getenv("OPENAI_API_KEY")

graph_config = {
"llm": {
2 changes: 1 addition & 1 deletion examples/openai/search_graph_openai.py
@@ -11,7 +11,7 @@
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")
openai_key = os.getenv("OPENAI_API_KEY")

graph_config = {
"llm": {
2 changes: 1 addition & 1 deletion examples/openai/speech_graph_openai.py
@@ -20,7 +20,7 @@
# Define the configuration for the graph
# ************************************************

openai_key = os.getenv("OPENAI_APIKEY")
openai_key = os.getenv("OPENAI_API_KEY")

graph_config = {
"llm": {
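The three OpenAI examples above now read the standard OPENAI_API_KEY variable (previously OPENAI_APIKEY) via python-dotenv; a matching `.env` entry would look roughly like this, with a placeholder value:

```bash
# .env (placeholder value, substitute your own key)
OPENAI_API_KEY=sk-your-key-here
```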
83 changes: 0 additions & 83 deletions funding.json

This file was deleted.

49 changes: 6 additions & 43 deletions pyproject.toml
@@ -11,12 +11,11 @@ authors = [

dependencies = [
"langchain>=0.3.0",
"langchain-google-genai>=1.0.7",
"langchain-openai>=0.1.22",
"langchain-mistralai>=0.1.12",
"langchain_community>=0.2.9",
"langchain-aws>=0.1.3",
"mistral-common>=1.4.0",
"langchain-ollama>=0.1.3",
"html2text>=2024.2.26",
"beautifulsoup4>=4.12.3",
"python-dotenv>=1.0.1",
@@ -26,16 +25,11 @@ dependencies = [
"free-proxy>=1.1.1",
"playwright>=1.43.0",
"undetected-playwright>=0.3.0",
"langchain-ollama>=0.1.3",
"semchunk>=2.2.0",
"qdrant-client>=1.11.3",
"fastembed>=0.3.6",

"transformers>=4.44.2",
"googlesearch-python>=1.2.5",
"async-timeout>=4.0.3",
"simpleeval>=1.0.0",
"scrapegraph-py>=1.7.0"
"jsonschema>=4.23.0",
]

readme = "README.md"
@@ -73,30 +67,7 @@ requires-python = ">=3.10,<4.0"
[project.optional-dependencies]
burr = ["burr[start]==0.22.1"]
docs = ["sphinx==6.0", "furo==2024.5.6"]

# Group 1: Other Language Models
other-language-models = [
"langchain-google-vertexai>=1.0.7",
"langchain-fireworks>=0.1.3",
"langchain-groq>=0.1.3",
"langchain-anthropic>=0.1.11",
"langchain-huggingface>=0.0.3",
"langchain-nvidia-ai-endpoints>=0.1.6",
"langchain_together>=0.2.0"
]

# Group 2: More Semantic Options
more-semantic-options = [
"graphviz>=0.20.3",
]

# Group 3: More Browser Options
more-browser-options = [
"browserbase>=0.3.0",
]

# Group 4: Surya Library
screenshot_scraper = [
ocr = [
"surya-ocr>=0.5.0",
"matplotlib>=3.7.2",
"ipywidgets>=8.1.0",
@@ -105,21 +76,13 @@ screenshot_scraper = [

[build-system]
requires = ["hatchling==1.26.3"]

build-backend = "hatchling.build"

[dependency-groups]
dev = [
"burr[start]==0.22.1",
"sphinx==6.0",
"furo==2024.5.6",
]

[tool.uv]
dev-dependencies = [
"poethepoet>=0.31.1",
"pytest==8.0.0",
"pytest-mock==3.14.0",
"pytest>=8.0.0",
"pytest-mock>=3.14.0",
"pytest-asyncio>=0.25.0",
"pylint>=3.2.5",
]

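With the trimmed `[project.optional-dependencies]` table above, the extras that survive this commit can still be installed explicitly, for example:

```bash
pip install scrapegraphai[burr]   # Burr integration
pip install scrapegraphai[docs]   # Sphinx documentation toolchain
pip install scrapegraphai[ocr]    # Surya-based OCR group (formerly screenshot_scraper)
```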
5 changes: 4 additions & 1 deletion scrapegraphai/builders/graph_builder.py
@@ -4,7 +4,6 @@
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_extraction_chain
from langchain_community.chat_models import ErnieBotChat
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from ..helpers import nodes_metadata, graph_schema

@@ -70,6 +69,10 @@ def _create_llm(self, llm_config: dict):
if "gpt-" in llm_params["model"]:
return ChatOpenAI(llm_params)
elif "gemini" in llm_params["model"]:
try:
from langchain_google_genai import ChatGoogleGenerativeAI
except ImportError:
raise ImportError("langchain_google_genai is not installed. Please install it using 'pip install langchain-google-genai'.")
return ChatGoogleGenerativeAI(llm_params)
elif "ernie" in llm_params["model"]:
return ErnieBotChat(llm_params)
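The guarded import added above is the pattern this commit applies across the codebase; a stripped-down sketch of the idea (illustrative only, the function name and `**` unpacking are not the library's exact code):

```python
def load_gemini_chat_model(llm_params: dict):
    """Build a Gemini chat model, importing the optional dependency lazily."""
    try:
        # Imported inside the function so the base install works without the package.
        from langchain_google_genai import ChatGoogleGenerativeAI
    except ImportError as exc:
        raise ImportError(
            "langchain_google_genai is not installed. "
            "Please install it using 'pip install langchain-google-genai'."
        ) from exc
    return ChatGoogleGenerativeAI(**llm_params)
```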
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/abstract_graph.py
@@ -234,15 +234,15 @@ def _create_llm(self, llm_config: dict) -> object:
                    from langchain_together import ChatTogether
                except ImportError:
                    raise ImportError("""The langchain_together module is not installed.
                                      Please install it using `pip install scrapegraphai[other-language-models]`.""")
                                      Please install it using `pip install langchain-together`.""")
                return ChatTogether(**llm_params)

            elif model_provider == "nvidia":
                try:
                    from langchain_nvidia_ai_endpoints import ChatNVIDIA
                except ImportError:
                    raise ImportError("""The langchain_nvidia_ai_endpoints module is not installed.
                                      Please install it using `pip install scrapegraphai[other-language-models]`.""")
                                      Please install it using `pip install langchain-nvidia-ai-endpoints`.""")
                return ChatNVIDIA(**llm_params)

except Exception as e:
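After this change the ImportError messages in abstract_graph.py point at the individual provider packages instead of the removed `other-language-models` extra, i.e. roughly:

```bash
pip install langchain-together                # Together AI models
pip install langchain-nvidia-ai-endpoints     # NVIDIA AI Endpoints models
```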
9 changes: 6 additions & 3 deletions scrapegraphai/graphs/smart_scraper_graph.py
@@ -3,8 +3,6 @@
"""
from typing import Optional
from pydantic import BaseModel
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
from .base_graph import BaseGraph
from .abstract_graph import AbstractGraph
from ..nodes import (
@@ -67,7 +65,12 @@ def _create_graph(self) -> BaseGraph:
BaseGraph: A graph instance representing the web scraping workflow.
"""
        if self.llm_model == "scrapegraphai/smart-scraper":

            try:
                from scrapegraph_py import Client
                from scrapegraph_py.logger import sgai_logger
            except ImportError:
                raise ImportError("scrapegraph_py is not installed. Please install it using 'pip install scrapegraph-py'.")

            sgai_logger.set_logging(level="INFO")

            # Initialize the client with explicit API key
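Likewise, the hosted smart-scraper client is now only required when `llm_model` is set to "scrapegraphai/smart-scraper"; installing it on demand is a one-liner:

```bash
pip install scrapegraph-py
```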
24 changes: 1 addition & 23 deletions scrapegraphai/nodes/fetch_node.py
@@ -10,7 +10,6 @@
from ..utils.cleanup_html import cleanup_html
from ..docloaders import ChromiumLoader
from ..utils.convert_to_md import convert_to_md
from ..utils.logging import get_logger
from .base_node import BaseNode

class FetchNode(BaseNode):
@@ -79,24 +78,6 @@ def __init__(
            None if node_config is None else node_config.get("storage_state", None)
        )

    def is_valid_url(self, source: str) -> bool:
        """
        Validates if the source string is a valid URL using regex.
        Parameters:
            source (str): The URL string to validate
        Raises:
            ValueError: If the URL is invalid
        """
        import re

        url_pattern = r"^https?://[^\s/$.?#].[^\s]*$"
        if not bool(re.match(url_pattern, source)):
            raise ValueError(
                f"Invalid URL format: {source}. URL must start with http(s):// and contain a valid domain."
            )
        return True

    def execute(self, state):
        """
@@ -129,12 +110,9 @@ def execute(self, state):
            elif self.input == "pdf_dir":
                return state

            # For web sources, validate URL before proceeding
            try:
                if self.is_valid_url(source):
                    return self.handle_web_source(state, source)
                return self.handle_web_source(state, source)
            except ValueError as e:
                # Re-raise the exception from is_valid_url
                raise

            return self.handle_local_source(state, source)
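Since FetchNode no longer pre-validates URLs, callers that relied on the removed `is_valid_url` check can reproduce it on their side; a minimal standalone sketch reusing the deleted regex (the helper name is hypothetical):

```python
import re

# Regex taken from the removed FetchNode.is_valid_url helper.
URL_PATTERN = r"^https?://[^\s/$.?#].[^\s]*$"

def looks_like_url(source: str) -> bool:
    """Best-effort check that `source` is an http(s) URL."""
    return bool(re.match(URL_PATTERN, source))

# Example: looks_like_url("https://example.com") -> True
```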
