Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Added GLiNER link extractor and correlated unit test #5738

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from .conversation import ConversationChainComponent
from .csv_agent import CSVAgentComponent
from .fake_embeddings import FakeEmbeddingsComponent
from .gliner_link_extractor import GLiNERLinkExtractorComponent

Check failure on line 5 in src/backend/base/langflow/components/langchain_utilities/__init__.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (F401)

src/backend/base/langflow/components/langchain_utilities/__init__.py:5:36: F401 `.gliner_link_extractor.GLiNERLinkExtractorComponent` imported but unused; consider removing, adding to `__all__`, or using a redundant alias
from .html_link_extractor import HtmlLinkExtractorComponent
from .json_agent import JsonAgentComponent
from .json_document_builder import JSONDocumentBuilder
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from collections.abc import Iterable
from typing import Any, Union

from langchain_community.graph_vectorstores.extractors import LinkExtractorTransformer
from langchain_community.graph_vectorstores.extractors.link_extractor import LinkExtractor
from langchain_community.graph_vectorstores.links import Link
from langchain_core.documents import BaseDocumentTransformer, Document
from loguru import logger

from langflow.base.document_transformers.model import LCDocumentTransformerComponent
from langflow.custom.custom_component.component_with_cache import ComponentWithCache
from langflow.inputs import DataInput, DictInput, StrInput
from langflow.services.cache.utils import CacheMiss

# Input type accepted by the GLiNER link extractor in this module: either raw text
# or a LangChain `Document` (whose `page_content` is the text that gets analysed).
GLiNERInput = Union[str, Document]

Check failure on line 15 in src/backend/base/langflow/components/langchain_utilities/gliner_link_extractor.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (UP007)

src/backend/base/langflow/components/langchain_utilities/gliner_link_extractor.py:15:15: UP007 Use `X | Y` for type annotations


class GLiNERLinkExtractorComponent(LCDocumentTransformerComponent, ComponentWithCache):
    """Langflow component that adds named-entity links to documents using GLiNER.

    The component loads a GLiNER model (re-using one stored in the shared component
    cache when possible) and builds a `LinkExtractorTransformer` wrapping a
    `GLiNERLinkExtractor` configured from the component inputs.
    """

    display_name = "GliNER Link Extractor"
    description = "Extract named entities links from documents using GLiNER"
    documentation = "https://python.langchain.com/api_reference/community/graph_vectorstores/langchain_community.graph_vectorstores.extractors.gliner_link_extractor.GLiNERLinkExtractor.html"
    name = "GLiNERLinkExtractor"

    inputs = [
        # `labels` is a plain string input (not a list input); it is split on commas in
        # `build_document_transformer`.
        StrInput(name="labels", display_name="Comma separated list of kinds of entities to extract", required=True),
        StrInput(name="kind", display_name="Kind of edge", value="entity", required=True),
        StrInput(
            name="model_name", display_name="GLiNER model to use", value="urchade/gliner_mediumv2.1", required=True
        ),
        DictInput(
            name="extract_kwargs",
            display_name="Arguments to pass to GLiNER.",
            is_list=True,
            advanced=True,
        ),
        DataInput(
            name="data_input",
            display_name="Input",
            info="The texts from which to extract links.",
            input_types=["Document", "Data"],
        ),
    ]

    def load_model(self) -> Any:
        """Return the GLiNER model named by `model_name`, loading it only when needed.

        The model is kept in the shared component cache under ``"gliner_model"``. The name
        of the cached model is stored next to it so that a component configured with a
        different `model_name` reloads instead of silently re-using the wrong model.

        Returns:
            The loaded GLiNER model (also stored on `self.embedding_model`).

        Raises:
            ImportError: If the `gliner` package is not installed.
        """
        # Only the import is guarded: an ImportError raised while the model itself is being
        # loaded must not be reported as "gliner is not installed".
        try:
            from gliner import GLiNER
        except ImportError:
            msg = "gliner is required for GlinerLinkExtractor. Please install it with `pip install gliner`."
            raise ImportError(msg) from None

        model_key = "gliner_model"
        name_key = "gliner_model_name"

        model = self._shared_component_cache.get(model_key)
        cached_name = self._shared_component_cache.get(name_key)
        if isinstance(model, CacheMiss) or cached_name != self.model_name:
            logger.debug(f"Loading GLiNER model {self.model_name}")
            model = GLiNER.from_pretrained(self.model_name)
            self._shared_component_cache.set(model_key, model)
            self._shared_component_cache.set(name_key, self.model_name)
        else:
            logger.debug(f"GLiNER already loaded {self.model_name}")

        self.embedding_model = model
        return model

    def get_data_input(self) -> Any:
        """Return the value of the `data_input` input."""
        return self.data_input

    def build_document_transformer(self) -> BaseDocumentTransformer:
        """Build the transformer that attaches GLiNER entity links to documents.

        Returns:
            A `LinkExtractorTransformer` containing a single `GLiNERLinkExtractor`.
        """
        model = self.load_model()

        # This is a hack to convert the comma separated `labels` value into a list of
        # labels; it should be handled by the UI. The value may arrive as one string or as
        # a list of strings. It is normalised into a local variable (not written back to
        # `self.labels`) so that calling this method repeatedly gives the same labels.
        chunks = self.labels if isinstance(self.labels, list) else [self.labels]
        labels = [label.strip() for chunk in chunks for label in chunk.split(",") if label.strip()]

        return LinkExtractorTransformer(
            [GLiNERLinkExtractor(labels, kind=self.kind, model=model, extract_kwargs=self.extract_kwargs)]
        )


class GLiNERLinkExtractor(LinkExtractor[GLiNERInput]):
    """Link extractor that converts GLiNER entity predictions into bidirectional links."""

    def __init__(
        self,
        labels: list[str],
        *,
        kind: str = "entity",
        model,
        extract_kwargs: dict[str, Any] | None = None,
    ):
        """Remember the model and the extraction settings.

        Args:
            labels: Entity labels handed to the model when predicting.
            kind: Prefix used for the kind of every produced link.
            model: Object exposing `batch_predict_entities`; used to find the entities.
            extract_kwargs: Extra keyword arguments forwarded to `batch_predict_entities`.
        """
        self._labels = labels
        self._kind = kind
        self._glinermodel = model
        self._extract_kwargs = extract_kwargs if extract_kwargs else {}

    def extract_one(self, input: GLiNERInput) -> set[Link]:  # noqa: A002
        """Return the links found in a single input."""
        link_sets = iter(self.extract_many([input]))
        return next(link_sets)

    def extract_many(
        self,
        inputs: Iterable[GLiNERInput],
    ) -> Iterable[set[Link]]:
        """Yield one set of links per input, in input order.

        Strings are analysed as they are; for any other input its `page_content` is used.
        """
        texts = [item.page_content if not isinstance(item, str) else item for item in inputs]
        predictions = self._glinermodel.batch_predict_entities(texts, self._labels, **self._extract_kwargs)
        for found in predictions:
            links: set[Link] = set()
            for entity in found:
                links.add(Link.bidir(kind=f"{self._kind}:{entity['label']}", tag=entity["text"]))
            yield links
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import pytest

Check failure on line 1 in src/backend/tests/unit/components/langchain_utilities/test_gliner_link_extractor.py

View workflow job for this annotation

GitHub Actions / Ruff Style Check (3.12)

Ruff (INP001)

src/backend/tests/unit/components/langchain_utilities/test_gliner_link_extractor.py:1:1: INP001 File `src/backend/tests/unit/components/langchain_utilities/test_gliner_link_extractor.py` is part of an implicit namespace package. Add an `__init__.py`.
from langchain_community.graph_vectorstores.links import Link
from langchain_core.documents import Document
from langflow.components.langchain_utilities.gliner_link_extractor import GLiNERLinkExtractorComponent
from langflow.schema import Data
from langflow.services.cache.utils import CacheMiss
from tests.base import ComponentTestBaseWithClient


@pytest.mark.usefixtures("client")
class TestGlinerLinkExtractorComponent(ComponentTestBaseWithClient):
    """Tests for `GLiNERLinkExtractorComponent`: link extraction, frontend node and model caching."""

    @pytest.fixture
    def component_class(self):
        """Return the component class under test."""
        return GLiNERLinkExtractorComponent

    @pytest.fixture
    def default_kwargs(self, test_text):
        """Return the keyword arguments used to build the component in most tests."""
        return {
            "_session_id": "123",
            "kind": "entity",
            "labels": "people, places, dates, events",
            "model_name": "urchade/gliner_mediumv2.1",
            "extract_kwargs": {},
            "data_input": test_text,
        }

    @pytest.fixture
    def file_names_mapping(self):
        """Return the version-to-file mapping entries for this component."""
        return [
            {"version": "1.0.19", "module": "langchain_utilities", "file_name": "gliner_link_extractor"},
            {"version": "1.1.0", "module": "langchain_utilities", "file_name": "gliner_link_extractor"},
            {"version": "1.1.1", "module": "langchain_utilities", "file_name": "gliner_link_extractor"},
        ]

    @pytest.fixture
    def test_text(self):
        """Return the sample text as one `Document` per blank-line separated paragraph."""
        # The paragraphs are deliberately kept on single long lines: re-wrapping them would
        # change the text handed to the model. The `noqa` on the closing quotes silences
        # E501 for the whole literal.
        text = """
The life of Alexander the Great, one of the most renowned military leaders in history, is a tale of ambition, conquest, and cultural integration. Born in 356 BCE in Pella, the capital of Macedonia, Alexander was the son of King Philip II and Queen Olympias. He was tutored by the philosopher Aristotle, who instilled in him a love for learning and culture.

Early Life and Education
Alexander III of Macedon, later known as Alexander the Great, was born on July 20, 356 BCE, in Pella, the capital of Macedonia. His father, King Philip II, united most of the Greek city-states under Macedonian rule. His mother, Queen Olympias, claimed descent from Achilles. Educated by the philosopher Aristotle from age 13 to 16, Alexander studied subjects such as philosophy, politics, and science, developing a lifelong appreciation for knowledge and culture.

In 340 BCE, at age 16, Alexander acted as regent while Philip campaigned and demonstrated his military aptitude by suppressing a rebellion and founding a city, Alexandropolis.

Ascension to Power
When Philip II was assassinated in 336 BCE, Alexander ascended to the throne at age 20. He quickly consolidated power by eliminating rivals and securing loyalty from the Greek city-states, reaffirming Macedonian hegemony through a campaign against Thebes in 335 BCE, which he destroyed as a warning to others.

Military Campaigns and Conquests
Conquest of the Persian Empire (334–330 BCE):

334 BCE: Alexander crossed the Hellespont into Asia Minor and defeated the Persians at the Battle of Granicus.
333 BCE: At the Battle of Issus, he routed the forces of Darius III, capturing Darius’s family.
332 BCE: After a lengthy siege, Alexander took Tyre, and in Egypt, he was hailed as a liberator and declared the son of the god Amun. He founded Alexandria, the first of many cities bearing his name.
331 BCE: In the decisive Battle of Gaugamela, Alexander defeated Darius III and effectively took control of the Persian Empire.
Expansion into Central Asia and India (329–325 BCE):

Alexander pursued Darius III, who was ultimately killed by his own men in 330 BCE. Alexander executed Darius's murderers and declared himself "King of Asia".
In 327 BCE, he invaded the Indian subcontinent, winning the Battle of Hydaspes River against King Porus in 326 BCE. Despite the victory, his troops refused to march further east, prompting their return.
Return to Babylon and Death (324–323 BCE):

On the way back, Alexander endured grueling campaigns in the Gedrosian Desert (present-day Iran). He attempted to consolidate his empire, promoting unity between Macedonians and Persians through marriages and governance reforms.
Alexander died in Babylon on June 10 or 11, 323 BCE, under mysterious circumstances, possibly fever, poisoning, or malaria.
Cultural Integration and Legacy
Alexander sought to merge Greek and local cultures, a process known as Hellenization, which left a lasting impact on regions spanning Greece, Egypt, Persia, and India. He founded over 20 cities, such as Alexandria in Egypt, which became a major center of learning and commerce. His policies included adopting local customs and appointing non-Macedonians to administrative roles.

Associations and Mythology
Alexander was deified by many cultures during and after his reign. In Islamic tradition, he is often associated with "Iskandar Dhul-Qarnayn" (Alexander the Two-Horned) and linked to legends such as the building of the iron wall against Gog and Magog.
His death spurred the fragmentation of his empire among his generals, the Diadochi, who divided it into the Ptolemaic, Seleucid, and Antigonid kingdoms.
Tomb and Mysteries
Alexander's burial site remains a mystery. Historical sources suggest it was in Alexandria, Egypt, but the exact location is unknown. His sarcophagus and associated treasures have been sought by archaeologists and explorers for centuries\u200b

Alexander’s conquests reshaped the ancient world, blending cultures and setting the stage for the Hellenistic period, marked by advancements in art, science, and philosophy
"""  # noqa: E501

        return [Document(page_content=paragraph) for paragraph in text.strip().split("\n\n") if paragraph]

    @pytest.fixture
    def all_tags(self):
        """Return every tag that an extracted link is allowed to carry."""
        return [
            "Pella",
            "Macedonia",
            "356 BCE",
            "356 BCE",
            "Pella",
            "Macedonia",
            "July 20",
            "Greek city-states",
            "340 BCE",
            "rebellion",
            "Alexandropolis",
            "Thebes",
            "335 BCE",
            "336 BCE",
            "334–330 BCE",
            "Persians",
            "Alexandria",
            "Egypt",
            "332 BCE",
            "Battle of Granicus",
            "334 BCE",
            "Asia Minor",
            "Battle of Issus",
            "331 BCE",
            "Tyre",
            "Battle of Gaugamela",
            "333 BCE",
            "Hellespont",
            "Indian subcontinent",
            "330 BCE",
            "324–323 BCE",
            "Babylon",
            "327 BCE",
            "Battle of Hydaspes River",
            "326 BCE",
            "Persians",
            "Alexandria",
            "Greece",
            "Egypt",
            "Babylon",
            "323 BCE",
            "Hellenization",
            "June 10 or 11",
            "Gedrosian Desert",
            "India",
            "Persia",
            "Macedonians",
            "Alexandria, Egypt",
            "Diadochi",
            "Hellenistic period",
            "ancient world",
        ]

    def test_link_extraction(self, component_class, default_kwargs, all_tags):
        """Test link extraction with the GLiNERLinkExtractorComponent.

        This test verifies that the component correctly processes data and extracts
        links with the expected tags.

        Args:
            component_class (type): The class of the component to be tested.
            default_kwargs (dict): The default keyword arguments to initialize the component.
            all_tags (list): The tags that extracted links are allowed to carry.

        Asserts:
            - The component is an instance of GLiNERLinkExtractorComponent.
            - The transformed data is not None.
            - The length of the transformed data is 10.
            - Each datum in the transformed data is an instance of Data.
            - Each datum contains a 'links' key that is not None.
            - Each link in the 'links' list is an instance of Link.
            - Each link's tag is one of the expected tags from all_tags.
        """
        component = component_class(**default_kwargs)
        assert isinstance(component, GLiNERLinkExtractorComponent)
        data = component.transform_data()
        assert data is not None
        assert len(data) == 10
        for datum in data:
            assert isinstance(datum, Data)
            links = datum.data["links"]
            assert links is not None
            for link in links:
                assert isinstance(link, Link)
                assert link.tag in all_tags

    def test_post_code_processing(self, component_class, default_kwargs):
        """Test the post-processing of code in the component class.

        This test verifies that the component class correctly processes the code
        and converts it to a frontend node with the expected structure and values.

        Args:
            component_class (class): The class of the component to be tested.
            default_kwargs (dict): The default keyword arguments to initialize the component.

        Asserts:
            - The node data is not None.
            - The 'value' of 'labels' in the 'template' of node data is "people, places, dates, events".
            - The string "alexander" is present in the 'page_content' of the first item in 'data_input' of 'template'.
        """
        component = component_class(**default_kwargs)
        frontend_node = component.to_frontend_node()
        node_data = frontend_node["data"]["node"]
        assert node_data is not None
        assert node_data["template"]["labels"]["value"] == "people, places, dates, events"
        assert "alexander" in node_data["template"]["data_input"]["value"][0]["page_content"].lower()

    def test_model_caching(self, test_text):
        """Test the model caching in the GLiNERLinkExtractorComponent.

        This test verifies that the model is cached and that a second component
        sees the cached model.

        Asserts:
            - Nothing is cached before the first load.
            - The first component loads a model.
            - A second component finds the model in the shared cache and can load it.
        """
        component = GLiNERLinkExtractorComponent(
            _session_id="123",
            kind="entity",
            labels="people, places, dates, events",
            model_name="urchade/gliner_mediumv2.1",
            extract_kwargs={},
            data_input=test_text,
        )

        model = component._shared_component_cache.get("gliner_model")
        assert isinstance(model, CacheMiss)
        assert component.load_model() is not None

        another_component = GLiNERLinkExtractorComponent(
            _session_id="123",
            kind="entity",
            labels="people, places, dates, events",
            model_name="urchade/gliner_mediumv2.1",
            extract_kwargs={},
            data_input=test_text,
        )

        # The cache is inspected through the second component: it must see what the first stored.
        model = another_component._shared_component_cache.get("gliner_model")
        assert not isinstance(model, CacheMiss)
        assert another_component.load_model() is not None
Loading