Skip to content

Commit

Permalink
⚡️ Speed up method URLComponent.as_dataframe by 137% in PR #6051 (`…cz/url-improve`)
Browse files Browse the repository at this point in the history

### Explanation of Optimizations.
  • Loading branch information
codeflash-ai[bot] authored Feb 12, 2025
1 parent a43d82d commit 4a0c30d
Showing 1 changed file with 28 additions and 24 deletions.
52 changes: 28 additions & 24 deletions src/backend/base/langflow/components/data/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,38 +120,18 @@ def fetch_content(self) -> list[Data]:
"""Fetch content based on selected format."""
urls = list({self.ensure_url(url.strip()) for url in self.urls if url.strip()})

no_urls_msg = "No valid URLs provided."
if not urls:
raise ValueError(no_urls_msg)

# If JSON format is selected, validate JSON content first
if self.format == "JSON":
for url in urls:
is_json = asyncio.run(self.validate_json_content(url))
if not is_json:
error_msg = "Invalid JSON content from URL - " + url
raise ValueError(error_msg)
raise ValueError("No valid URLs provided.")

Check failure on line 124 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (TRY003)

src/backend/base/langflow/components/data/url.py:124:19: TRY003 Avoid specifying long messages outside the exception class

Check failure on line 124 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (EM101)

src/backend/base/langflow/components/data/url.py:124:30: EM101 Exception must not use a string literal, assign to variable first

# Choose appropriate loader based on the format
if self.format == "Raw HTML":
loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
else:
if self.format == "JSON":
return asyncio.run(self.fetch_json_content(urls))
loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

docs = loader.load()

if self.format == "JSON":
data = []
for doc in docs:
try:
json_content = json.loads(doc.page_content)
data_dict = {"text": json.dumps(json_content, indent=2), **json_content, **doc.metadata}
data.append(Data(**data_dict))
except json.JSONDecodeError as err:
source = doc.metadata.get("source", "unknown URL")
error_msg = "Invalid JSON content from " + source
raise ValueError(error_msg) from err
return data

return [Data(text=doc.page_content, **doc.metadata) for doc in docs]

def fetch_content_text(self) -> Message:
Expand All @@ -173,3 +153,27 @@ def fetch_content_text(self) -> Message:
def as_dataframe(self) -> DataFrame:
    """Build a DataFrame from whatever ``fetch_content`` returns."""
    fetched = self.fetch_content()
    return DataFrame(fetched)

async def fetch_json_content(self, urls: list[str]) -> list[Data]:
"""Fetch and validate JSON content from URLs asynchronously."""
tasks = [self.validate_json_content(url) for url in urls]
results = await asyncio.gather(*tasks)

invalid_urls = [url for url, is_valid in zip(urls, results, strict=False) if not is_valid]
if invalid_urls:
raise ValueError(f"Invalid JSON content from URLs - {', '.join(invalid_urls)}")

Check failure on line 164 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (TRY003)

src/backend/base/langflow/components/data/url.py:164:19: TRY003 Avoid specifying long messages outside the exception class

Check failure on line 164 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (EM102)

src/backend/base/langflow/components/data/url.py:164:30: EM102 Exception must not use an f-string literal, assign to variable first

loader = WebBaseLoader(web_paths=urls, encoding="utf-8")
docs = loader.load()

data = []
for doc in docs:
try:
json_content = json.loads(doc.page_content)
data_dict = {"text": json.dumps(json_content, indent=2), **json_content, **doc.metadata}
data.append(Data(**data_dict))
except json.JSONDecodeError as err:
source = doc.metadata.get("source", "unknown URL")
raise ValueError(f"Invalid JSON content from {source}") from err

Check failure on line 177 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (TRY003)

src/backend/base/langflow/components/data/url.py:177:23: TRY003 Avoid specifying long messages outside the exception class

Check failure on line 177 in src/backend/base/langflow/components/data/url.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (EM102)

src/backend/base/langflow/components/data/url.py:177:34: EM102 Exception must not use an f-string literal, assign to variable first

return data

0 comments on commit 4a0c30d

Please sign in to comment.