Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add JSON field extraction and enhanced URL validation #6051

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
4ca1bf1
URL component improvement - JSON URL
Cristhianzl Jan 31, 2025
6006b07
[autofix.ci] apply automated fixes
autofix-ci[bot] Jan 31, 2025
2b93e89
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jan 31, 2025
7b44f8b
♻️ (url.py): refactor URLComponent class to simplify data_dict creati…
Cristhianzl Jan 31, 2025
3d0e49f
📝 (url.py): import json module for JSON operations
Cristhianzl Jan 31, 2025
e9f4cc9
[autofix.ci] apply automated fixes
autofix-ci[bot] Jan 31, 2025
fc37187
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Jan 31, 2025
b307dff
📝 (url.py): improve formatting of info string for DropdownInput in UR…
Cristhianzl Feb 1, 2025
5df7bb6
[autofix.ci] apply automated fixes
autofix-ci[bot] Feb 1, 2025
075a4b8
Merge branch 'main' into cz/url-improve
Cristhianzl Feb 3, 2025
e183959
✨ (url.py): Add BoolInput and StrInput to support new features in URL…
Cristhianzl Feb 4, 2025
a28723b
[autofix.ci] apply automated fixes
autofix-ci[bot] Feb 4, 2025
ec94e39
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Feb 4, 2025
6232a4d
♻️ (url.py): remove unnecessary comments and improve code readability…
Cristhianzl Feb 4, 2025
225670e
Merge branch 'cz/url-improve' of https://github.com/langflow-ai/langf…
Cristhianzl Feb 4, 2025
39154fd
[autofix.ci] apply automated fixes
autofix-ci[bot] Feb 4, 2025
70be801
merge fix
Cristhianzl Feb 10, 2025
cda054e
📝 (url.py): improve readability by splitting long description and inf…
Cristhianzl Feb 12, 2025
060aa15
🔧 (Blog Writer.json, Custom Component Maker.json, Graph Vector Store …
Cristhianzl Feb 12, 2025
950fee3
[autofix.ci] apply automated fixes
autofix-ci[bot] Feb 12, 2025
c3e5ed6
Merge branch 'main' into cz/url-improve
Cristhianzl Feb 12, 2025
da22270
merge fix
Cristhianzl Feb 12, 2025
d93b6f9
Merge branch 'cz/url-improve' of https://github.com/langflow-ai/langf…
Cristhianzl Feb 12, 2025
89bbc2c
Merge branch 'main' into cz/url-improve
Cristhianzl Feb 12, 2025
c63a1d7
🐛 (url.py): fix validation of JSON content from URLs to ensure correc…
Cristhianzl Feb 12, 2025
9bddfc3
Merge branch 'cz/url-improve' of https://github.com/langflow-ai/langf…
Cristhianzl Feb 12, 2025
a942ef5
[autofix.ci] apply automated fixes
autofix-ci[bot] Feb 12, 2025
a43d82d
[autofix.ci] apply automated fixes (attempt 2/3)
autofix-ci[bot] Feb 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 114 additions & 35 deletions src/backend/base/langflow/components/data/url.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
import asyncio
import json
import re
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
import re
import re
from functools import cache


import aiohttp
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader

from langflow.custom import Component
from langflow.helpers.data import data_to_text
from langflow.io import DropdownInput, MessageTextInput, Output
from langflow.io import BoolInput, DropdownInput, MessageTextInput, Output, StrInput
from langflow.schema import Data
from langflow.schema.dataframe import DataFrame
from langflow.schema.message import Message


class URLComponent(Component):
display_name = "URL"
description = "Load and retrieve data from specified URLs."
description = (
"Load and retrieve data from specified URLs. Supports output in plain text, raw HTML, "
"or JSON, with options for cleaning and separating multiple outputs."
)
icon = "layout-template"
name = "URL"

Expand All @@ -28,69 +33,143 @@ class URLComponent(Component):
DropdownInput(
name="format",
display_name="Output Format",
info="Output Format. Use 'Text' to extract the text from the HTML or 'Raw HTML' for the raw HTML content.",
options=["Text", "Raw HTML"],
info=(
"Output Format. Use 'Text' to extract text from the HTML, 'Raw HTML' for the raw HTML "
"content, or 'JSON' to extract JSON from the HTML."
),
options=["Text", "Raw HTML", "JSON"],
value="Text",
real_time_refresh=True,
),
StrInput(
name="separator",
display_name="Separator",
value="\n\n",
show=True,
info=(
"Specify the separator to use between multiple outputs. Default for Text is '\\n\\n'. "
"Default for Raw HTML is '\\n<!-- Separator -->\\n'."
),
),
BoolInput(
name="clean_extra_whitespace",
display_name="Clean Extra Whitespace",
value=True,
show=True,
info="Whether to clean excessive blank lines in the text output. Only applies to 'Text' format.",
),
]

outputs = [
Output(display_name="Data", name="data", method="fetch_content"),
Output(display_name="Message", name="text", method="fetch_content_text"),
Output(display_name="Text", name="text", method="fetch_content_text"),
Output(display_name="DataFrame", name="dataframe", method="as_dataframe"),
]

def ensure_url(self, string: str) -> str:
"""Ensures the given string is a URL by adding 'http://' if it doesn't start with 'http://' or 'https://'.

Raises an error if the string is not a valid URL.
async def validate_json_content(self, url: str, timeout_seconds: float = 30.0) -> bool:
    """Return True if *url* answers HTTP 200 with a body that parses as JSON.

    Args:
        url: The fully-qualified URL to probe.
        timeout_seconds: Total request timeout. The original code caught
            asyncio.TimeoutError but never configured a timeout, so a slow
            server could stall for aiohttp's default (~5 minutes); an
            explicit ClientTimeout makes the except clause meaningful.

    Returns:
        True only for a 200 response whose body is valid JSON; False for any
        other status, non-JSON body, network error, or timeout.
    """
    try:
        client_timeout = aiohttp.ClientTimeout(total=timeout_seconds)
        async with aiohttp.ClientSession(timeout=client_timeout) as session, session.get(url) as response:
            http_ok = 200
            if response.status != http_ok:
                return False

            content = await response.text()
            try:
                json.loads(content)
            except json.JSONDecodeError:
                return False
            else:
                return True
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # Network failures and timeouts are reported as "not JSON" rather
        # than raised, so callers can treat validation as best-effort.
        return False

def update_build_config(self, build_config: dict, field_value: str, field_name: str | None = None) -> dict:
"""Dynamically update fields based on selected format."""
if field_name == "format":
is_text_mode = field_value == "Text"
is_json_mode = field_value == "JSON"
build_config["separator"]["value"] = "\n\n" if is_text_mode else "\n<!-- Separator -->\n"
build_config["clean_extra_whitespace"]["show"] = is_text_mode
build_config["separator"]["show"] = not is_json_mode
return build_config

Parameters:
string (str): The string to be checked and possibly modified.

Returns:
str: The modified string that is ensured to be a URL.

Raises:
ValueError: If the string is not a valid URL.
"""
def ensure_url(self, string: str) -> str:
    """Normalize *string* into a URL and validate its shape.

    Prepends "http://" when no scheme is present, then checks the result
    against a lightweight pattern (scheme, optional www, host, optional
    TLD, port, and path). Not a full RFC 3986 validator.

    Raises:
        ValueError: if the normalized string does not look like a URL.
    """
    if not string.startswith(("http://", "https://")):
        string = "http://" + string

    pattern = re.compile(
        r"^(https?:\/\/)?(www\.)?([a-zA-Z0-9.-]+)(\.[a-zA-Z]{2,})?(:\d+)?(\/[^\s]*)?$",
        re.IGNORECASE,
    )

    if pattern.match(string) is None:
        raise ValueError("Invalid URL - " + string)

    return string

def fetch_content(self) -> list[Data]:
    """Fetch content from the configured URLs in the selected output format.

    Returns:
        A list of Data records, one per fetched document. In JSON mode the
        record text is the pretty-printed JSON and the top-level JSON fields
        plus loader metadata are flattened into the record.

    Raises:
        ValueError: when no valid URLs are provided, or when JSON format is
            selected and a URL does not serve valid JSON.
    """
    # Deduplicate while preserving the user's URL order: the previous
    # set-comprehension (list({...})) made the output ordering
    # non-deterministic across runs.
    urls = list(dict.fromkeys(self.ensure_url(url.strip()) for url in self.urls if url.strip()))

    no_urls_msg = "No valid URLs provided."
    if not urls:
        raise ValueError(no_urls_msg)

    # For JSON output, verify up front that every URL actually serves JSON
    # so the user gets a clear per-URL error before any loading happens.
    if self.format == "JSON":
        for url in urls:
            is_json = asyncio.run(self.validate_json_content(url))
            if not is_json:
                error_msg = "Invalid JSON content from URL - " + url
                raise ValueError(error_msg)

    if self.format == "Raw HTML":
        loader = AsyncHtmlLoader(web_path=urls, encoding="utf-8")
    else:
        # Both "Text" and "JSON" read the response body via WebBaseLoader.
        loader = WebBaseLoader(web_paths=urls, encoding="utf-8")

    docs = loader.load()

    if self.format == "JSON":
        data = []
        for doc in docs:
            try:
                json_content = json.loads(doc.page_content)
                data_dict = {"text": json.dumps(json_content, indent=2), **json_content, **doc.metadata}
                data.append(Data(**data_dict))
            except json.JSONDecodeError as err:
                source = doc.metadata.get("source", "unknown URL")
                error_msg = "Invalid JSON content from " + source
                raise ValueError(error_msg) from err
        return data

    return [Data(text=doc.page_content, **doc.metadata) for doc in docs]

def fetch_content_text(self) -> Message:
    """Fetch content and return it joined into a single text Message.

    JSON mode joins records with a newline; other modes use the configured
    separator, optionally collapsing runs of 3+ newlines in Text mode.
    """
    records = self.fetch_content()
    texts = [record.text for record in records]

    if self.format == "JSON":
        result = "\n".join(texts)
    else:
        if self.format == "Text" and self.clean_extra_whitespace:
            texts = [re.sub(r"\n{3,}", "\n\n", text) for text in texts]
        result = self.separator.join(texts)

    self.status = result
    return Message(text=result)

def as_dataframe(self) -> DataFrame:
    """Return the fetched content records wrapped in a DataFrame."""
    records = self.fetch_content()
    return DataFrame(records)
Loading
Loading