Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various changes #7

Merged
merged 3 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.graphs import Neo4jGraph

from dotenv import load_dotenv

load_dotenv(".env")
Expand All @@ -29,6 +31,21 @@

logger = get_logger(__name__)


neo4j_graph = Neo4jGraph(url=url, username=username, password=password)

def create_vector_index(dimension):
    """Create the vector indexes used for similarity search, if missing.

    Creates a 'stackoverflow' index on Question.embedding and a
    'top_answers' index on Answer.embedding, both using cosine similarity.

    Parameters
    ----------
    dimension : int
        Dimensionality of the embedding vectors (depends on the
        embedding model selected at startup).
    """
    # (index name, node label) pairs; the indexed property is always
    # 'embedding'. Index/label names cannot be query parameters, hence
    # the f-string; both values are fixed literals, not user input.
    index_specs = [
        ("stackoverflow", "Question"),
        ("top_answers", "Answer"),
    ]
    for index_name, node_label in index_specs:
        index_query = (
            "CALL db.index.vector.createNodeIndex("
            f"'{index_name}', '{node_label}', 'embedding', $dimension, 'cosine')"
        )
        try:
            neo4j_graph.query(index_query, {"dimension": dimension})
        except Exception:
            # Neo4j raises when the index already exists; that is expected
            # on every run after the first, so continue best-effort.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pass

class StreamHandler(BaseCallbackHandler):
def __init__(self, container, initial_text=""):
self.container = container
Expand All @@ -40,14 +57,19 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:

if embedding_model_name == "ollama":
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="llama2")
dimension = 4096
logger.info("Embedding: Using Ollama")
elif embedding_model_name == "openai":
embeddings = OpenAIEmbeddings()
dimension = 1536
logger.info("Embedding: Using OpenAI")
else:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
dimension = 384
logger.info("Embedding: Using SentenceTransformer")

create_vector_index(dimension)

if llm_name == "gpt-4":
llm = ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True)
logger.info("LLM: Using GPT-4")
Expand All @@ -62,7 +84,7 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:

# LLM only response
template = """
You are a helpful assistant that helps with programming questions.
You are a helpful assistant that helps a support agent with answering programming questions.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
"""
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
Expand Down Expand Up @@ -90,11 +112,11 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
username=username,
password=password,
database="neo4j", # neo4j by default
index_name="stackoverflow", # vector by default
index_name="top_answers", # vector by default
text_node_property="body", # text by default
retrieval_query="""
OPTIONAL MATCH (node)<-[:ANSWERS]-(a)
RETURN node.title + '\n' + node.body + '\n' + coalesce(a.body,"") AS text, score, {source:node.link} AS metadata LIMIT 1
OPTIONAL MATCH (node)-[:ANSWERS]->(question)
RETURN question.title + '\n' + question.body + '\n' + coalesce(node.body,"") AS text, score, {source:question.link} AS metadata
""",
)

Expand All @@ -120,7 +142,7 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
combine_docs_chain_kwargs={"prompt": qa_prompt},
)

# Rag + KG
# Rag + Knowledge Graph response
kg = Neo4jVector.from_existing_index(
embedding=embeddings,
url=url,
Expand All @@ -137,7 +159,7 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
WITH collect(a.body)[..2] as answers
RETURN reduce(str='', text IN answers | str + text + '\n') as answerTexts
}
RETURN node.body + '\n' + answerTexts AS text, score, {source:node.link} AS metadata
RETURN node.title + '\n' + node.body + '\n' + answerTexts AS text, score, {source:node.link} AS metadata
""",
)

Expand Down
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ services:
ports:
- 7687:7687
- 7474:7474
volumes:
- $PWD/data:/data
environment:
- NEO4J_AUTH=${NEO4J_USERNAME-neo4j}/${NEO4J_PASSWORD-password}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_db_tx__log_rotation_retention__policy=false
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider localhost:7474 || exit 1"]
interval: 5s
Expand All @@ -28,6 +31,10 @@ services:
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
- EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
- LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
- LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
- LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
- LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
networks:
- net
depends_on:
Expand Down Expand Up @@ -55,6 +62,10 @@ services:
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
- LLM=${LLM-gpt-3.5}
- EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
- LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
- LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
- LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
- LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
networks:
- net
depends_on:
Expand Down
9 changes: 7 additions & 2 deletions example.env
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
OPENAI_API_KEY=sk-...
OLLAMA_BASE_URL=http://host.docker.internal:11434
#OPENAI_API_KEY=sk-...
#OLLAMA_BASE_URL=http://host.docker.internal:11434
#NEO4J_URI=neo4j://localhost:7687
#NEO4J_USERNAME=neo4j
#NEO4J_PASSWORD=password
LLM=ollama #or gpt-4 or gpt-3.5
EMBEDDING_MODEL=sentence_transformer #or openai or ollama

LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_TRACING_V2=true # false
LANGCHAIN_PROJECT=#your-project-name
LANGCHAIN_API_KEY=#your-api-key ls_...
26 changes: 19 additions & 7 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from langchain.graphs import Neo4jGraph

import streamlit as st
from streamlit.logger import get_logger

load_dotenv(".env")

Expand All @@ -21,18 +22,20 @@

os.environ["NEO4J_URL"] = url

logger = get_logger(__name__)

if embedding_model_name == "ollama":
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="llama2")
dimension = 4096
print("Embedding: Using Ollama")
logger.info("Embedding: Using Ollama")
elif embedding_model_name == "openai":
embeddings = OpenAIEmbeddings()
dimension = 1536
print("Embedding: Using OpenAI")
logger.info("Embedding: Using OpenAI")
else:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
dimension = 384
print("Embedding: Using SentenceTransformer")
logger.info("Embedding: Using SentenceTransformer")

neo4j_graph = Neo4jGraph(url=url, username=username, password=password)

Expand All @@ -56,12 +59,16 @@ def create_constraints():


def create_vector_index(dimension):
    """Create the vector indexes used for similarity search, if missing.

    Creates a 'stackoverflow' index on Question.embedding and a
    'top_answers' index on Answer.embedding, both using cosine similarity.

    Parameters
    ----------
    dimension : int
        Dimensionality of the embedding vectors (depends on the
        embedding model selected at startup).
    """
    # TODO use Neo4jVector Code from LangChain on the existing graph
    # (index name, node label) pairs; the indexed property is always
    # 'embedding'. Index/label names cannot be query parameters, hence
    # the f-string; both values are fixed literals, not user input.
    index_specs = [
        ("stackoverflow", "Question"),
        ("top_answers", "Answer"),
    ]
    for index_name, node_label in index_specs:
        index_query = (
            "CALL db.index.vector.createNodeIndex("
            f"'{index_name}', '{node_label}', 'embedding', $dimension, 'cosine')"
        )
        try:
            neo4j_graph.query(index_query, {"dimension": dimension})
        except Exception:
            # Neo4j raises when the index already exists; that is expected
            # on every run after the first, so continue best-effort.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pass


create_vector_index(dimension)
Expand All @@ -74,9 +81,13 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
"&site=stackoverflow&filter=!51dU0b1n(WTdqj5MH1iGsNShY6BhXXwJ)xwV5b"
)
data = requests.get(base_url + parameters).json()
# Convert html to text and calculate embedding values
# Calculate embedding values for questions and answers
for q in data["items"]:
q["embedding"] = embeddings.embed_query(q["title"] + " " + q["body_markdown"])
question_text = q["title"] + "\n" + q["body_markdown"]
q["embedding"] = embeddings.embed_query(question_text)
for a in q["answers"]:
a["embedding"] = embeddings.embed_query(question_text + "\n" + a["body_markdown"])


import_query = """
UNWIND $data AS q
Expand All @@ -93,7 +104,8 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
SET answer.is_accepted = a.is_accepted,
answer.score = a.score,
answer.creation_date = datetime({epochSeconds:a.creation_date}),
answer.body = a.body_markdown
answer.body = a.body_markdown,
answer.embedding = a.embedding
MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")})
ON CREATE SET answerer.display_name = a.owner.display_name,
answerer.reputation= a.owner.reputation
Expand Down