Skip to content

Commit

Permalink
Merge pull request #780 from ScrapeGraphAI/pre/beta
Browse files Browse the repository at this point in the history
Pre/beta
  • Loading branch information
VinciGit00 authored Nov 1, 2024
2 parents ea2ff50 + 7e3598d commit 9f0ba35
Show file tree
Hide file tree
Showing 9 changed files with 348 additions and 259 deletions.
43 changes: 43 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,30 @@
## [1.28.0-beta.2](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.28.0-beta.1...v1.28.0-beta.2) (2024-10-31)


### Features

* update generate answer ([7172b32](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/7172b32a0f37f547edccab7bd09406e73c9ec5b2))

## [1.28.0-beta.1](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0...v1.28.0-beta.1) (2024-10-30)


### Features

* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952))
* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c))


### Bug Fixes

* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)


### CI

* **release:** 1.27.0-beta.11 [skip ci] ([3b2cadc](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/3b2cadce1a93f31bd7a8fda64f7afcf802ada9e2))
* **release:** 1.27.0-beta.12 [skip ci] ([62369e3](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/62369e3e2886eb8cc09f6ef64865140a87a28b60))
* **release:** 1.27.0-beta.13 [skip ci] ([deed355](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/deed355551d01d92dde11f8c0b373bdd43f8b8cf)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.26.7...v1.27.0) (2024-10-26)


Expand All @@ -13,6 +40,7 @@
* refactoring of ScrapeGraph to SmartScraperLiteGraph ([52b6bf5](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/52b6bf5fb8c570aa8ef026916230c5d52996f887))



### Bug Fixes

* fix export function ([c8a000f](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/c8a000f1d943734a921b34e91498b2f29c8c9422))
Expand Down Expand Up @@ -44,6 +72,21 @@
* **release:** 1.27.0-beta.7 [skip ci] ([407f1ce](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/407f1ce4eb22fb284ef0624dd3f7bf7ba432fa5c))
* **release:** 1.27.0-beta.8 [skip ci] ([4f1ed93](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/4f1ed939e671e46bb546b6b605db87e87c0d66ee))
* **release:** 1.27.0-beta.9 [skip ci] ([fd57cc7](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/fd57cc7c126658960e33b7214c2cc656ea032d8f))
* **AbstractGraph:** manually select model tokens ([f79f399](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/f79f399ee0d660f162e0cb96d9faba48ecdc88b2)), closes [#768](https://github.com/ScrapeGraphAI/Scrapegraph-ai/issues/768)

## [1.27.0-beta.12](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.11...v1.27.0-beta.12) (2024-10-28)


### Features

* refactoring of the base_graph ([12a6c18](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/12a6c18f6ac205b744d1de92e217cfc2dfc3486c))

## [1.27.0-beta.11](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.10...v1.27.0-beta.11) (2024-10-27)


### Features

* add new mistral models ([6914170](https://github.com/ScrapeGraphAI/Scrapegraph-ai/commit/691417089014b5b0b64a1b26687cbb0cba693952))

## [1.27.0-beta.10](https://github.com/ScrapeGraphAI/Scrapegraph-ai/compare/v1.27.0-beta.9...v1.27.0-beta.10) (2024-10-25)

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
[project]
name = "scrapegraphai"

version = "1.27.0"

version = "1.28.0b2"



Expand Down
15 changes: 9 additions & 6 deletions scrapegraphai/graphs/abstract_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,15 @@ def _create_llm(self, llm_config: dict) -> object:
raise ValueError(f"""Provider {llm_params['model_provider']} is not supported.
If possible, try to use a model instance instead.""")

try:
self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
except KeyError:
print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found,
using default token size (8192)""")
self.model_token = 8192
if "model_tokens" not in llm_params:
try:
self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
except KeyError:
print(f"""Model {llm_params['model_provider']}/{llm_params['model']} not found,
using default token size (8192)""")
self.model_token = 8192
else:
self.model_token = llm_params["model_tokens"]

try:
if llm_params["model_provider"] not in \
Expand Down
254 changes: 149 additions & 105 deletions scrapegraphai/graphs/base_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,21 +98,116 @@ def _set_conditional_node_edges(self):
except:
node.false_node_name = None

def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
"""
Executes the graph by traversing nodes starting from the
entry point using the standard method.
def _get_node_by_name(self, node_name: str):
"""Returns a node instance by its name."""
return next(node for node in self.nodes if node.node_name == node_name)

Args:
initial_state (dict): The initial state to pass to the entry point node.
def _update_source_info(self, current_node, state):
"""Updates source type and source information from FetchNode."""
source_type = None
source = []
prompt = None

if current_node.__class__.__name__ == "FetchNode":
source_type = list(state.keys())[1]
if state.get("user_prompt", None):
prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None

if source_type == "local_dir":
source_type = "html_dir"
elif source_type == "url":
if isinstance(state[source_type], list):
source.extend(url for url in state[source_type] if isinstance(url, str))
elif isinstance(state[source_type], str):
source.append(state[source_type])

return source_type, source, prompt

def _get_model_info(self, current_node):
"""Extracts LLM and embedder model information from the node."""
llm_model = None
llm_model_name = None
embedder_model = None

Returns:
Tuple[dict, list]: A tuple containing the final state and a list of execution info.
if hasattr(current_node, "llm_model"):
llm_model = current_node.llm_model
if hasattr(llm_model, "model_name"):
llm_model_name = llm_model.model_name
elif hasattr(llm_model, "model"):
llm_model_name = llm_model.model
elif hasattr(llm_model, "model_id"):
llm_model_name = llm_model.model_id

if hasattr(current_node, "embedder_model"):
embedder_model = current_node.embedder_model
if hasattr(embedder_model, "model_name"):
embedder_model = embedder_model.model_name
elif hasattr(embedder_model, "model"):
embedder_model = embedder_model.model

return llm_model, llm_model_name, embedder_model

def _get_schema(self, current_node):
"""Extracts schema information from the node configuration."""
if not hasattr(current_node, "node_config"):
return None

if not isinstance(current_node.node_config, dict):
return None

schema_config = current_node.node_config.get("schema")
if not schema_config or isinstance(schema_config, dict):
return None

try:
return schema_config.schema()
except Exception:
return None

def _execute_node(self, current_node, state, llm_model, llm_model_name):
"""Executes a single node and returns execution information."""
curr_time = time.time()

with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
result = current_node.execute(state)
node_exec_time = time.time() - curr_time

cb_data = None
if cb is not None:
cb_data = {
"node_name": current_node.node_name,
"total_tokens": cb.total_tokens,
"prompt_tokens": cb.prompt_tokens,
"completion_tokens": cb.completion_tokens,
"successful_requests": cb.successful_requests,
"total_cost_USD": cb.total_cost,
"exec_time": node_exec_time,
}

return result, node_exec_time, cb_data

def _get_next_node(self, current_node, result):
"""Determines the next node to execute based on current node type and result."""
if current_node.node_type == "conditional_node":
node_names = {node.node_name for node in self.nodes}
if result in node_names:
return result
elif result is None:
return None
raise ValueError(
f"Conditional Node returned a node name '{result}' that does not exist in the graph"
)

return self.edges.get(current_node.node_name)

def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
"""
Executes the graph by traversing nodes starting from the entry point using the standard method.
"""
current_node_name = self.entry_point
state = initial_state

# variables for tracking execution info
# Tracking variables
total_exec_time = 0.0
exec_info = []
cb_total = {
Expand All @@ -134,104 +229,51 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
schema = None

while current_node_name:
curr_time = time.time()
current_node = next(node for node in self.nodes if node.node_name == current_node_name)

if current_node.__class__.__name__ == "FetchNode":
source_type = list(state.keys())[1]
if state.get("user_prompt", None):
prompt = state["user_prompt"] if isinstance(state["user_prompt"], str) else None

if source_type == "local_dir":
source_type = "html_dir"
elif source_type == "url":
if isinstance(state[source_type], list):
for url in state[source_type]:
if isinstance(url, str):
source.append(url)
elif isinstance(state[source_type], str):
source.append(state[source_type])

if hasattr(current_node, "llm_model") and llm_model is None:
llm_model = current_node.llm_model
if hasattr(llm_model, "model_name"):
llm_model_name = llm_model.model_name
elif hasattr(llm_model, "model"):
llm_model_name = llm_model.model
elif hasattr(llm_model, "model_id"):
llm_model_name = llm_model.model_id

if hasattr(current_node, "embedder_model") and embedder_model is None:
embedder_model = current_node.embedder_model
if hasattr(embedder_model, "model_name"):
embedder_model = embedder_model.model_name
elif hasattr(embedder_model, "model"):
embedder_model = embedder_model.model

if hasattr(current_node, "node_config"):
if isinstance(current_node.node_config,dict):
if current_node.node_config.get("schema", None) and schema is None:
if not isinstance(current_node.node_config["schema"], dict):
try:
schema = current_node.node_config["schema"].schema()
except Exception as e:
schema = None

with self.callback_manager.exclusive_get_callback(llm_model, llm_model_name) as cb:
try:
result = current_node.execute(state)
except Exception as e:
error_node = current_node.node_name
graph_execution_time = time.time() - start_time
log_graph_execution(
graph_name=self.graph_name,
source=source,
prompt=prompt,
schema=schema,
llm_model=llm_model_name,
embedder_model=embedder_model,
source_type=source_type,
execution_time=graph_execution_time,
error_node=error_node,
exception=str(e)
)
raise e
node_exec_time = time.time() - curr_time
current_node = self._get_node_by_name(current_node_name)

# Update source information if needed
if source_type is None:
source_type, source, prompt = self._update_source_info(current_node, state)

# Get model information if needed
if llm_model is None:
llm_model, llm_model_name, embedder_model = self._get_model_info(current_node)

# Get schema if needed
if schema is None:
schema = self._get_schema(current_node)

try:
result, node_exec_time, cb_data = self._execute_node(
current_node, state, llm_model, llm_model_name
)
total_exec_time += node_exec_time

if cb is not None:
cb_data = {
"node_name": current_node.node_name,
"total_tokens": cb.total_tokens,
"prompt_tokens": cb.prompt_tokens,
"completion_tokens": cb.completion_tokens,
"successful_requests": cb.successful_requests,
"total_cost_USD": cb.total_cost,
"exec_time": node_exec_time,
}

if cb_data:
exec_info.append(cb_data)

cb_total["total_tokens"] += cb_data["total_tokens"]
cb_total["prompt_tokens"] += cb_data["prompt_tokens"]
cb_total["completion_tokens"] += cb_data["completion_tokens"]
cb_total["successful_requests"] += cb_data["successful_requests"]
cb_total["total_cost_USD"] += cb_data["total_cost_USD"]

if current_node.node_type == "conditional_node":
node_names = {node.node_name for node in self.nodes}
if result in node_names:
current_node_name = result
elif result is None:
current_node_name = None
else:
raise ValueError(f"Conditional Node returned a node name '{result}' that does not exist in the graph")

elif current_node_name in self.edges:
current_node_name = self.edges[current_node_name]
else:
current_node_name = None

for key in cb_total:
cb_total[key] += cb_data[key]

current_node_name = self._get_next_node(current_node, result)

except Exception as e:
error_node = current_node.node_name
graph_execution_time = time.time() - start_time
log_graph_execution(
graph_name=self.graph_name,
source=source,
prompt=prompt,
schema=schema,
llm_model=llm_model_name,
embedder_model=embedder_model,
source_type=source_type,
execution_time=graph_execution_time,
error_node=error_node,
exception=str(e)
)
raise e

# Add total results to execution info
exec_info.append({
"node_name": "TOTAL RESULT",
"total_tokens": cb_total["total_tokens"],
Expand All @@ -242,6 +284,7 @@ def _execute_standard(self, initial_state: dict) -> Tuple[dict, list]:
"exec_time": total_exec_time,
})

# Log final execution results
graph_execution_time = time.time() - start_time
response = state.get("answer", None) if source_type == "url" else None
content = state.get("parsed_doc", None) if response is not None else None
Expand Down Expand Up @@ -300,3 +343,4 @@ def append_node(self, node):
self.raw_edges.append((last_node, node))
self.nodes.append(node)
self.edges = self._create_edges({e for e in self.raw_edges})

4 changes: 3 additions & 1 deletion scrapegraphai/helpers/models_tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,12 @@
"llama3.2:1b": 128000,
"scrapegraph": 8192,
"mistral": 8192,
"mistral-small": 128000,
"mistral-openorca": 32000,
"mistral-large": 128000,
"grok-1": 8192,
"llava": 4096,
"mixtral:8x22b-instruct": 65536,
"mistral-openorca": 32000,
"nomic-embed-text": 8192,
"nous-hermes2:34b": 4096,
"orca-mini": 2048,
Expand Down
Loading

0 comments on commit 9f0ba35

Please sign in to comment.