
Commit

Merge branch 'feat-jina-rerank' of https://github.com/JoanFM/langchain into feat-jina-rerank
JoanFM committed Apr 8, 2024
2 parents 41bdb01 + 455d320 commit d50d611
Showing 28 changed files with 825 additions and 594 deletions.
2 changes: 1 addition & 1 deletion cookbook/rewrite.ipynb
@@ -245,7 +245,7 @@
"\n",
"\n",
"def _parse(text):\n",
" return text.strip(\"**\")"
" return text.strip('\"').strip(\"**\")"
]
},
{
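The new `_parse` strips a surrounding double quote before stripping the asterisks, so a rewritten query that the model wraps in quotes is cleaned as well. A minimal illustration of the behavior change (the sample string is illustrative):

    def _parse_old(text):
        return text.strip("**")

    def _parse_new(text):
        return text.strip('"').strip("**")

    raw = '"**rewritten query**"'
    # Old: the leading quote blocks the asterisk strip entirely.
    print(_parse_old(raw))  # "**rewritten query**"
    # New: quotes come off first, then the asterisks.
    print(_parse_new(raw))  # rewritten query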
5 changes: 2 additions & 3 deletions docs/docs/integrations/retrievers/jina-reranker.ipynb
@@ -41,6 +41,7 @@
"source": [
"# Helper function for printing docs\n",
"\n",
"\n",
"def pretty_print_docs(docs):\n",
" print(\n",
" f\"\\n{'-' * 100}\\n\".join(\n",
@@ -125,9 +126,7 @@
"text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embedding = JinaEmbeddings(\n",
" model_name=\"jina-embeddings-v2-base-en\"\n",
")\n",
"embedding = JinaEmbeddings(model_name=\"jina-embeddings-v2-base-en\")\n",
"retriever = FAISS.from_documents(texts, embedding).as_retriever(search_kwargs={\"k\": 20})\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
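Elsewhere in this notebook, the FAISS retriever built above is presumably paired with the Jina reranker this branch introduces. A sketch of that pairing, assuming the `JinaRerank` compressor added by this branch and a JINA_API_KEY in the environment (names inferred from the branch, not shown in this hunk):

    from langchain.retrievers import ContextualCompressionRetriever
    from langchain_community.document_compressors import JinaRerank

    # Rerank the 20 retrieved chunks and keep only the most relevant ones.
    compressor = JinaRerank()
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    compressed_docs = compression_retriever.invoke(query)
    pretty_print_docs(compressed_docs)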
12 changes: 7 additions & 5 deletions docs/docs/integrations/toolkits/pandas.ipynb
@@ -34,7 +34,9 @@
"import pandas as pd\n",
"from langchain_openai import OpenAI\n",
"\n",
"df = pd.read_csv(\"titanic.csv\")"
"df = pd.read_csv(\n",
" \"https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv\"\n",
")"
]
},
{
@@ -116,7 +118,7 @@
}
],
"source": [
"agent.run(\"how many rows are there?\")"
"agent.invoke(\"how many rows are there?\")"
]
},
{
@@ -154,7 +156,7 @@
}
],
"source": [
"agent.run(\"how many people have more than 3 siblings\")"
"agent.invoke(\"how many people have more than 3 siblings\")"
]
},
{
@@ -204,7 +206,7 @@
}
],
"source": [
"agent.run(\"whats the square root of the average age?\")"
"agent.invoke(\"whats the square root of the average age?\")"
]
},
{
@@ -264,7 +266,7 @@
],
"source": [
"agent = create_pandas_dataframe_agent(OpenAI(temperature=0), [df, df1], verbose=True)\n",
"agent.run(\"how many rows in the age column are different?\")"
"agent.invoke(\"how many rows in the age column are different?\")"
]
},
{
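This notebook swaps the deprecated `Chain.run` for `invoke`. Note the return type differs: `run` returned the agent's final answer as a bare string, while `invoke` returns a dict of the chain's inputs and outputs. A short sketch, assuming the agent's output key is the standard "output":

    result = agent.invoke("how many rows are there?")
    # invoke returns something like {"input": ..., "output": ...};
    # take "output" to recover what run() used to return.
    print(result["output"])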
2 changes: 1 addition & 1 deletion libs/cli/langchain_cli/utils/git.py
@@ -155,7 +155,7 @@ def _get_repo_path(gitstring: str, ref: Optional[str], repo_dir: Path) -> Path:
removed_protocol = gitstring.split("://")[-1]
removed_basename = re.split(r"[/:]", removed_protocol, 1)[-1]
removed_extras = removed_basename.split("#")[0]
foldername = re.sub(r"[^a-zA-Z0-9_]", "_", removed_extras)
foldername = re.sub(r"\W", "_", removed_extras)

directory_name = f"{foldername}_{hashed}"
return repo_dir / directory_name
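The replacement pattern is behavior-preserving for ASCII input: `\W` is the complement of `\w`, which matches `[a-zA-Z0-9_]` plus, in Python 3, non-ASCII word characters. The only difference is that Unicode letters in a repo path now survive sanitization. A quick check (illustrative strings):

    import re

    for s in ["github.com/JoanFM/langchain", "repo-name#ref"]:
        assert re.sub(r"[^a-zA-Z0-9_]", "_", s) == re.sub(r"\W", "_", s)

    # Unicode is where the two diverge: \w and \W are Unicode-aware by default.
    print(re.sub(r"[^a-zA-Z0-9_]", "_", "café"))  # caf_
    print(re.sub(r"\W", "_", "café"))             # café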
2 changes: 1 addition & 1 deletion libs/community/langchain_community/chat_models/cohere.py
@@ -244,4 +244,4 @@ async def _agenerate(

def get_num_tokens(self, text: str) -> int:
"""Calculate number of tokens."""
- return len(self.client.tokenize(text).tokens)
+ return len(self.client.tokenize(text=text).tokens)
libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -94,6 +94,8 @@ def __init__(
headers: Optional[dict] = None,
check_response_status: bool = False,
continue_on_failure: bool = True,
+ *,
+ base_url: Optional[str] = None,
) -> None:
"""Initialize with URL to crawl and any subdirectories to exclude.
@@ -120,6 +122,7 @@ def __init__(
URLs with error responses (400-599).
continue_on_failure: If True, continue if getting or parsing a link raises
an exception. Otherwise, raise the exception.
+ base_url: The base url to check for outside links against.
"""

self.url = url
@@ -146,6 +149,7 @@ def __init__(
self.headers = headers
self.check_response_status = check_response_status
self.continue_on_failure = continue_on_failure
+ self.base_url = base_url if base_url is not None else url

def _get_child_links_recursive(
self, url: str, visited: Set[str], *, depth: int = 0
@@ -187,7 +191,7 @@ def _get_child_links_recursive(
sub_links = extract_sub_links(
response.text,
url,
- base_url=self.url,
+ base_url=self.base_url,
pattern=self.link_regex,
prevent_outside=self.prevent_outside,
exclude_prefixes=self.exclude_dirs,
@@ -273,7 +277,7 @@ async def _async_get_child_links_recursive(
sub_links = extract_sub_links(
text,
url,
- base_url=self.url,
+ base_url=self.base_url,
pattern=self.link_regex,
prevent_outside=self.prevent_outside,
exclude_prefixes=self.exclude_dirs,
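The `base_url` option added above makes the outside-link check configurable: sub-links used to be filtered against the starting `url`, and are now filtered against `base_url`, which defaults to `url`, so existing behavior is unchanged. That lets a crawl start on a deep page while still following links anywhere under a broader prefix. A sketch, assuming this is `RecursiveUrlLoader` from `langchain_community` (the file header is inferred; URLs are illustrative):

    from langchain_community.document_loaders import RecursiveUrlLoader

    # Start from a deep page, but treat everything under /3.9/ as inside
    # the crawl instead of pruning links that leave the start page's subtree.
    loader = RecursiveUrlLoader(
        "https://docs.python.org/3.9/library/index.html",
        prevent_outside=True,
        base_url="https://docs.python.org/3.9/",
    )
    docs = loader.load()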
2 changes: 1 addition & 1 deletion libs/community/langchain_community/llms/llamacpp.py
@@ -344,11 +344,11 @@ def _stream(
text=part["choices"][0]["text"],
generation_info={"logprobs": logprobs},
)
- yield chunk
if run_manager:
run_manager.on_llm_new_token(
token=chunk.text, verbose=self.verbose, log_probs=logprobs
)
+ yield chunk

def get_num_tokens(self, text: str) -> int:
tokenized_text = self.client.tokenize(text.encode("utf-8"))
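The llama.cpp fix reorders streaming so the `on_llm_new_token` callback fires before the chunk is yielded; previously a consumer could receive a token before callback handlers had seen it. The same ordering concern in miniature (hypothetical names):

    def stream_tokens(tokens, on_new_token=None):
        for token in tokens:
            # Notify the callback before handing the token to the consumer,
            # so handlers have seen it even if the consumer stops early.
            if on_new_token is not None:
                on_new_token(token)
            yield token

    for t in stream_tokens(["Hello", ",", " world"], on_new_token=print):
        if t == ",":
            break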
libs/community/langchain_community/vectorstores/docarray/hnsw.py
@@ -14,7 +14,7 @@ class DocArrayHnswSearch(DocArrayIndex):
"""`HnswLib` storage using `DocArray` package.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
You can install it with `pip install "docarray[hnswlib]"`.
"""

@classmethod
74 changes: 73 additions & 1 deletion libs/core/langchain_core/prompts/chat.py
@@ -551,7 +551,10 @@ def pretty_print(self) -> None:

MessageLikeRepresentation = Union[
MessageLike,
- Tuple[Union[str, Type], Union[str, List[dict], List[object]]],
+ Tuple[
+     Union[str, Type],
+     Union[str, List[dict], List[object]],
+ ],
str,
]

@@ -590,6 +593,45 @@ class ChatPromptTemplate(BaseChatPromptTemplate):
# ]
#)
Messages Placeholder:
.. code-block:: python
# In addition to Human/AI/Tool/Function messages,
# you can initialize the template with a MessagesPlaceholder
# either using the class directly or with the shorthand tuple syntax:
template = ChatPromptTemplate.from_messages([
("system", "You are a helpful AI bot."),
# Means the template will receive an optional list of messages under
# the "conversation" key
("placeholder", "{conversation}")
# Equivalently:
# MessagesPlaceholder(variable_name="conversation", optional=True)
])
prompt_value = template.invoke(
{
"conversation": [
("human", "Hi!"),
("ai", "How can I assist you today?"),
("human", "Can you make me an ice cream sundae?"),
("ai", "No.")
]
}
)
# Output:
# ChatPromptValue(
# messages=[
# SystemMessage(content='You are a helpful AI bot.'),
# HumanMessage(content='Hi!'),
# AIMessage(content='How can I assist you today?'),
# HumanMessage(content='Can you make me an ice cream sundae?'),
# AIMessage(content='No.'),
# ]
#)
Single-variable template:
If your prompt has only a single input variable (i.e., 1 instance of "{variable_name}"),
@@ -949,6 +991,36 @@ def _create_template_from_message_type(
message = AIMessagePromptTemplate.from_template(cast(str, template))
elif message_type == "system":
message = SystemMessagePromptTemplate.from_template(cast(str, template))
elif message_type == "placeholder":
if isinstance(template, str):
if template[0] != "{" or template[-1] != "}":
raise ValueError(
f"Invalid placeholder template: {template}."
" Expected a variable name surrounded by curly braces."
)
var_name = template[1:-1]
message = MessagesPlaceholder(variable_name=var_name, optional=True)
elif len(template) == 2 and isinstance(template[1], bool):
var_name_wrapped, is_optional = template
if not isinstance(var_name_wrapped, str):
raise ValueError(
"Expected variable name to be a string." f" Got: {var_name_wrapped}"
)
if var_name_wrapped[0] != "{" or var_name_wrapped[-1] != "}":
raise ValueError(
f"Invalid placeholder template: {var_name_wrapped}."
" Expected a variable name surrounded by curly braces."
)
var_name = var_name_wrapped[1:-1]

message = MessagesPlaceholder(variable_name=var_name, optional=is_optional)
else:
raise ValueError(
"Unexpected arguments for placeholder message type."
" Expected either a single string variable name"
" or a list of [variable_name: str, is_optional: bool]."
f" Got: {template}"
)
else:
raise ValueError(
f"Unexpected message type: {message_type}. Use one of 'human',"
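Taken together, the docstring and parsing changes mean ("placeholder", "{var}") creates an optional MessagesPlaceholder, while the two-element list form controls optionality explicitly. A short sketch mirroring the unit test added below:

    from langchain_core.prompts import ChatPromptTemplate

    # String form: optional, so formatting without "history" yields no messages.
    optional = ChatPromptTemplate.from_messages([("placeholder", "{history}")])
    print(optional.format_messages())  # []

    # List form with is_optional=False: "history" is now required, and
    # formatting without it raises a KeyError.
    required = ChatPromptTemplate.from_messages(
        [("placeholder", ["{history}", False])]
    )
    print(required.format_messages(history=[("human", "hi")]))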
100 changes: 100 additions & 0 deletions libs/core/langchain_core/utils/function_calling.py
@@ -3,6 +3,7 @@
from __future__ import annotations

import inspect
+ import uuid
from typing import (
TYPE_CHECKING,
Any,
@@ -20,6 +21,12 @@
from typing_extensions import TypedDict

from langchain_core._api import deprecated
+ from langchain_core.messages import (
+     AIMessage,
+     BaseMessage,
+     HumanMessage,
+     ToolMessage,
+ )
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.utils.json_schema import dereference_refs

@@ -332,3 +339,96 @@ def convert_to_openai_tool(
return tool
function = convert_to_openai_function(tool)
return {"type": "function", "function": function}


def tool_example_to_messages(
input: str, tool_calls: List[BaseModel], tool_outputs: Optional[List[str]] = None
) -> List[BaseMessage]:
"""Convert an example into a list of messages that can be fed into an LLM.
This code is an adapter that converts a single example to a list of messages
that can be fed into a chat model.
The list of messages per example corresponds to:
1) HumanMessage: contains the content from which information should be extracted.
2) AIMessage: contains the extracted information from the model
3) ToolMessage: contains confirmation to the model that the tool was
requested correctly.
The ToolMessage is required because some chat models are hyper-optimized for agents
rather than for an extraction use case.
Arguments:
input: string, the user input
tool_calls: List[BaseModel], a list of tool calls represented as Pydantic
BaseModels
tool_outputs: Optional[List[str]], a list of tool call outputs.
Does not need to be provided. If not provided, a placeholder value
will be inserted.
Returns:
A list of messages
Examples:
.. code-block:: python
from typing import List, Optional
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
class Person(BaseModel):
'''Information about a person.'''
name: Optional[str] = Field(..., description="The name of the person")
hair_color: Optional[str] = Field(
..., description="The color of the peron's eyes if known"
)
height_in_meters: Optional[str] = Field(
..., description="Height in METERs"
)
examples = [
(
"The ocean is vast and blue. It's more than 20,000 feet deep.",
Person(name=None, height_in_meters=None, hair_color=None),
),
(
"Fiona traveled far from France to Spain.",
Person(name="Fiona", height_in_meters=None, hair_color=None),
),
]
messages = []
for txt, tool_call in examples:
messages.extend(
tool_example_to_messages(txt, [tool_call])
)
"""
messages: List[BaseMessage] = [HumanMessage(content=input)]
openai_tool_calls = []
for tool_call in tool_calls:
openai_tool_calls.append(
{
"id": str(uuid.uuid4()),
"type": "function",
"function": {
# The name of the function right now corresponds to the name
# of the pydantic model. This is implicit in the API right now,
# and will be improved over time.
"name": tool_call.__class__.__name__,
"arguments": tool_call.json(),
},
}
)
messages.append(
AIMessage(content="", additional_kwargs={"tool_calls": openai_tool_calls})
)
tool_outputs = tool_outputs or ["You have correctly called this tool."] * len(
openai_tool_calls
)
for output, tool_call_dict in zip(tool_outputs, openai_tool_calls):
messages.append(ToolMessage(content=output, tool_call_id=tool_call_dict["id"])) # type: ignore
return messages
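When real tool outputs are available, pass them as `tool_outputs` so each `ToolMessage` carries actual content instead of the placeholder acknowledgement. A sketch continuing the docstring's `Person` example (the output string is hypothetical):

    msgs = tool_example_to_messages(
        "Fiona traveled far from France to Spain.",
        [Person(name="Fiona", height_in_meters=None, hair_color=None)],
        tool_outputs=["Recorded: Person(name='Fiona')."],
    )
    # msgs == [HumanMessage, AIMessage(with tool_calls), ToolMessage], ready
    # to prepend to a chat history as a few-shot extraction example.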
19 changes: 19 additions & 0 deletions libs/core/tests/unit_tests/prompts/test_chat.py
@@ -535,6 +535,25 @@ def test_chat_prompt_message_placeholder_partial() -> None:
assert prompt.format_messages() == [SystemMessage(content="foo")]


def test_chat_prompt_message_placeholder_tuple() -> None:
prompt = ChatPromptTemplate.from_messages([("placeholder", "{convo}")])
assert prompt.format_messages(convo=[("user", "foo")]) == [
HumanMessage(content="foo")
]

assert prompt.format_messages() == []

# Is optional = False
optional_prompt = ChatPromptTemplate.from_messages(
[("placeholder", ["{convo}", False])]
)
assert optional_prompt.format_messages(convo=[("user", "foo")]) == [
HumanMessage(content="foo")
]
with pytest.raises(KeyError):
assert optional_prompt.format_messages() == []


def test_messages_prompt_accepts_list() -> None:
prompt = ChatPromptTemplate.from_messages([MessagesPlaceholder("history")])
value = prompt.invoke([("user", "Hi there")]) # type: ignore
(The remaining changed files were not loaded in this view.)
