Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

⚡️ Speed up method URLComponent.as_dataframe by 137% in PR #6051 (cz/url-improve) #6307

Open
wants to merge 2 commits into
base: cz/url-improve
Choose a base branch
from

Conversation

codeflash-ai[bot]
Copy link
Contributor

@codeflash-ai codeflash-ai bot commented Feb 12, 2025

⚡️ This pull request contains optimizations for PR #6051

If you approve this dependent PR, these changes will be merged into the original PR branch cz/url-improve.

This PR will be automatically closed if the original PR is merged.


📄 137% (1.37x) speedup for URLComponent.as_dataframe in src/backend/base/langflow/components/data/url.py

⏱️ Runtime : 96.9 milliseconds → 40.9 milliseconds (best of 6 runs)

📝 Explanation and details

Explanation of Optimizations.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 8 Passed
🌀 Generated Regression Tests 11 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage undefined
⚙️ Existing Unit Tests Details
- components/data/test_url_component.py
🌀 Generated Regression Tests Details
import asyncio
import json
from unittest.mock import MagicMock, patch

import pandas as pd
# imports
import pytest  # used for our unit tests
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
from langflow.components.data.url import URLComponent
from langflow.custom import Component
from langflow.schema import Data
from langflow.schema.dataframe import DataFrame


class DataFrame(pd.DataFrame):
    """A pandas DataFrame subclass specialized for handling collections of Data objects.

    This class extends pandas.DataFrame to provide seamless integration between
    Langflow's Data objects and pandas' powerful data manipulation capabilities.

    Args:
        data: Input data in various formats:
            - List[Data]: List of Data objects
            - List[Dict]: List of dictionaries
            - Dict: Dictionary of arrays/lists
            - pandas.DataFrame: Existing DataFrame
            - Any format supported by pandas.DataFrame
        **kwargs: Additional arguments passed to pandas.DataFrame constructor

    Raises:
        ValueError: If ``data`` is a list whose items are neither all Data
            objects nor all dictionaries.

    Examples:
        >>> # From Data objects
        >>> dataset = DataFrame([Data(data={"name": "John"}), Data(data={"name": "Jane"})])

        >>> # From dictionaries
        >>> dataset = DataFrame([{"name": "John"}, {"name": "Jane"}])

        >>> # From dictionary of lists
        >>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
    """

    def __init__(self, data: list[dict] | list[Data] | pd.DataFrame | None = None, **kwargs):
        if data is None:
            super().__init__(**kwargs)
            return

        if isinstance(data, list):
            if all(isinstance(x, Data) for x in data):
                # Unwrap each Data object into the dict it carries.
                data = [d.data for d in data if hasattr(d, "data")]
            elif not all(isinstance(x, dict) for x in data):
                msg = "List items must be either all Data objects or all dictionaries"
                raise ValueError(msg)

        # Every other input (dict, DataFrame, ndarray, Series, ...) is handed to
        # pandas unchanged. Previously such inputs were silently discarded and an
        # empty frame was returned, contradicting the documented contract.
        kwargs["data"] = data
        super().__init__(**kwargs)

    def to_data_list(self) -> list[Data]:
        """Converts the DataFrame back to a list of Data objects, one per row."""
        list_of_dicts = self.to_dict(orient="records")
        return [Data(data=row) for row in list_of_dicts]

    def add_row(self, data: dict | Data) -> "DataFrame":
        """Adds a single row to the dataset.

        Args:
            data: Either a Data object or a dictionary to add as a new row

        Returns:
            DataFrame: A new DataFrame with the added row

        Example:
            >>> dataset = DataFrame([{"name": "John"}])
            >>> dataset = dataset.add_row({"name": "Jane"})
        """
        if isinstance(data, Data):
            data = data.data
        new_df = self._constructor([data])
        # ignore_index=True renumbers the rows 0..n-1 in the returned frame.
        return pd.concat([self, new_df], ignore_index=True)

    def add_rows(self, data: list[dict | Data]) -> "DataFrame":
        """Adds multiple rows to the dataset.

        Args:
            data: List of Data objects or dictionaries to add as new rows

        Returns:
            DataFrame: A new DataFrame with the added rows
        """
        processed_data = [item.data if isinstance(item, Data) else item for item in data]
        new_df = self._constructor(processed_data)
        return pd.concat([self, new_df], ignore_index=True)

    @property
    def _constructor(self):
        """Factory pandas uses to build results, so they stay instances of this class."""

        def _c(*args, **kwargs):
            # __finalize__ copies pandas metadata from this frame onto the new one.
            return DataFrame(*args, **kwargs).__finalize__(self)

        return _c

    def __bool__(self):
        """Truth value testing for the DataFrame.

        Returns True if the DataFrame has at least one row, False otherwise.
        """
        return not self.empty

# unit tests
class TestAsDataFrame:
    """Container for the as_dataframe unit tests and their shared helper."""

    def create_mock_data(self, text, metadata):
        """Return a MagicMock exposing ``page_content`` and ``metadata``.

        Args:
            text: Value stored on the mock's ``page_content`` attribute.
            metadata: Value stored on the mock's ``metadata`` attribute.
        """
        # Keyword arguments to MagicMock are set as attributes on the new mock.
        return MagicMock(page_content=text, metadata=metadata)

    @patch('langchain_community.document_loaders.AsyncHtmlLoader.load')
    

import asyncio
import json
from typing import cast
from unittest.mock import AsyncMock, patch

import pandas as pd
# imports
import pytest  # used for our unit tests
from langchain_community.document_loaders import AsyncHtmlLoader, WebBaseLoader
from langflow.components.data.url import URLComponent
from langflow.custom import Component
from langflow.schema import Data
from langflow.schema.dataframe import DataFrame


class DataFrame(pd.DataFrame):
    """A pandas DataFrame subclass specialized for handling collections of Data objects.

    This class extends pandas.DataFrame to provide seamless integration between
    Langflow's Data objects and pandas' powerful data manipulation capabilities.

    Args:
        data: Input data in various formats:
            - List[Data]: List of Data objects
            - List[Dict]: List of dictionaries
            - Dict: Dictionary of arrays/lists
            - pandas.DataFrame: Existing DataFrame
            - Any format supported by pandas.DataFrame
        **kwargs: Additional arguments passed to pandas.DataFrame constructor

    Raises:
        ValueError: If ``data`` is a list whose items are neither all Data
            objects nor all dictionaries.

    Examples:
        >>> # From Data objects
        >>> dataset = DataFrame([Data(data={"name": "John"}), Data(data={"name": "Jane"})])

        >>> # From dictionaries
        >>> dataset = DataFrame([{"name": "John"}, {"name": "Jane"}])

        >>> # From dictionary of lists
        >>> dataset = DataFrame({"name": ["John", "Jane"], "age": [30, 25]})
    """

    def __init__(self, data: list[dict] | list[Data] | pd.DataFrame | None = None, **kwargs):
        if data is None:
            super().__init__(**kwargs)
            return

        if isinstance(data, list):
            if all(isinstance(x, Data) for x in data):
                # Unwrap each Data object into the dict it carries.
                data = [d.data for d in data if hasattr(d, "data")]
            elif not all(isinstance(x, dict) for x in data):
                msg = "List items must be either all Data objects or all dictionaries"
                raise ValueError(msg)

        # Every other input (dict, DataFrame, ndarray, Series, ...) is handed to
        # pandas unchanged. Previously such inputs were silently discarded and an
        # empty frame was returned, contradicting the documented contract.
        kwargs["data"] = data
        super().__init__(**kwargs)

    def to_data_list(self) -> list[Data]:
        """Converts the DataFrame back to a list of Data objects, one per row."""
        list_of_dicts = self.to_dict(orient="records")
        return [Data(data=row) for row in list_of_dicts]

    def add_row(self, data: dict | Data) -> "DataFrame":
        """Adds a single row to the dataset.

        Args:
            data: Either a Data object or a dictionary to add as a new row

        Returns:
            DataFrame: A new DataFrame with the added row

        Example:
            >>> dataset = DataFrame([{"name": "John"}])
            >>> dataset = dataset.add_row({"name": "Jane"})
        """
        if isinstance(data, Data):
            data = data.data
        new_df = self._constructor([data])
        # cast() only informs the type checker; it does nothing at runtime.
        # ignore_index=True renumbers the rows 0..n-1 in the returned frame.
        return cast("DataFrame", pd.concat([self, new_df], ignore_index=True))

    def add_rows(self, data: list[dict | Data]) -> "DataFrame":
        """Adds multiple rows to the dataset.

        Args:
            data: List of Data objects or dictionaries to add as new rows

        Returns:
            DataFrame: A new DataFrame with the added rows
        """
        processed_data = [item.data if isinstance(item, Data) else item for item in data]
        new_df = self._constructor(processed_data)
        return cast("DataFrame", pd.concat([self, new_df], ignore_index=True))

    @property
    def _constructor(self):
        """Factory pandas uses to build results, so they stay instances of this class."""

        def _c(*args, **kwargs):
            # __finalize__ copies pandas metadata from this frame onto the new one.
            return DataFrame(*args, **kwargs).__finalize__(self)

        return _c

    def __bool__(self):
        """Truth value testing for the DataFrame.

        Returns True if the DataFrame has at least one row, False otherwise.
        """
        return not self.empty

# unit tests

# Mock Component class to use in tests
class MockComponent(URLComponent):
    """URLComponent subclass with URL normalisation and JSON validation stubbed out.

    The constructor only records ``urls`` and ``format`` on the instance; it
    does not call the parent initializer.
    """

    def __init__(self, urls, format):
        self.urls, self.format = urls, format

    def ensure_url(self, url):
        """Return ``url`` exactly as given."""
        return url

    async def validate_json_content(self, url):
        """Always return True, whatever ``url`` is."""
        return True

# Basic Functionality Tests


def test_empty_url_list():
    """Expect as_dataframe to raise ValueError when the component has no URLs."""
    empty_component = MockComponent(urls=[], format="Raw HTML")
    expected_failure = pytest.raises(ValueError, match="No valid URLs provided.")
    with expected_failure:
        empty_component.as_dataframe()







def test_empty_dataframe():
    """A DataFrame built without data has no rows and is falsy."""
    df = DataFrame()
    assert df.empty
    # DataFrame.__bool__ is defined as `not self.empty`, so an empty frame is falsy.
    assert not df

def test_dataframe_from_list_of_data_objects():
    """A list of Data objects becomes one row per object, in order."""
    data_objects = [Data(data={"name": "John"}), Data(data={"name": "Jane"})]
    df = DataFrame(data_objects)
    assert len(df) == 2
    assert df["name"].tolist() == ["John", "Jane"]

def test_dataframe_from_list_of_dictionaries():
    """A list of dictionaries becomes one row per dictionary, in order."""
    data_dicts = [{"name": "John"}, {"name": "Jane"}]
    df = DataFrame(data_dicts)
    assert list(df.columns) == ["name"]
    assert df["name"].tolist() == ["John", "Jane"]

def test_dataframe_from_dictionary_of_lists():
    """A dictionary of lists becomes one column per key."""
    data_dict = {"name": ["John", "Jane"], "age": [30, 25]}
    df = DataFrame(data_dict)
    assert df.shape == (2, 2)
    assert df["name"].tolist() == ["John", "Jane"]
    assert df["age"].tolist() == [30, 25]

# Error Handling Tests
def test_no_valid_urls_provided():
    """With an empty ``urls`` list, as_dataframe is expected to raise ValueError."""
    url_less = MockComponent(format="Raw HTML", urls=[])
    with pytest.raises(ValueError, match="No valid URLs provided."):
        url_less.as_dataframe()

Codeflash

…cz/url-improve`)

### Explanation of Optimizations.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Feb 12, 2025
@dosubot dosubot bot added the size:M This PR changes 30-99 lines, ignoring generated files. label Feb 12, 2025
@dosubot dosubot bot added the enhancement New feature or request label Feb 12, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
⚡️ codeflash Optimization PR opened by Codeflash AI enhancement New feature or request size:M This PR changes 30-99 lines, ignoring generated files.
Projects
None yet
Development

Successfully merging this pull request may close these issues.

0 participants