Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various changes #7

Merged
merged 3 commits into from
Sep 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 28 additions & 6 deletions bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.graphs import Neo4jGraph

from dotenv import load_dotenv

load_dotenv(".env")
Expand All @@ -29,6 +31,21 @@

logger = get_logger(__name__)


neo4j_graph = Neo4jGraph(url=url, username=username, password=password)

def create_vector_index(dimension):
    """Create the vector indexes used for similarity search, if missing.

    Creates a 'stackoverflow' index on Question.embedding and a
    'top_answers' index on Answer.embedding, both using cosine similarity.

    Parameters
    ----------
    dimension : int
        Dimensionality of the embedding vectors (depends on the
        embedding model selected at startup).
    """
    # (index name, node label) pairs; the indexed property is always
    # 'embedding'. Index/label names cannot be query parameters, hence
    # the f-string; both values are fixed literals, not user input.
    index_specs = [
        ("stackoverflow", "Question"),
        ("top_answers", "Answer"),
    ]
    for index_name, node_label in index_specs:
        index_query = (
            "CALL db.index.vector.createNodeIndex("
            f"'{index_name}', '{node_label}', 'embedding', $dimension, 'cosine')"
        )
        try:
            neo4j_graph.query(index_query, {"dimension": dimension})
        except Exception:
            # Neo4j raises when the index already exists; that is expected
            # on every run after the first, so continue best-effort.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pass

class StreamHandler(BaseCallbackHandler):
def __init__(self, container, initial_text=""):
self.container = container
Expand All @@ -40,14 +57,19 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:

if embedding_model_name == "ollama":
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="llama2")
dimension = 4096
logger.info("Embedding: Using Ollama")
elif embedding_model_name == "openai":
embeddings = OpenAIEmbeddings()
dimension = 1536
logger.info("Embedding: Using OpenAI")
else:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
dimension = 384
logger.info("Embedding: Using SentenceTransformer")

create_vector_index(dimension)

if llm_name == "gpt-4":
llm = ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True)
logger.info("LLM: Using GPT-4")
Expand All @@ -62,7 +84,7 @@ def on_llm_new_token(self, token: str, **kwargs) -> None:

# LLM only response
template = """
You are a helpful assistant that helps with programming questions.
You are a helpful assistant that helps a support agent with answering programming questions.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
"""
system_message_prompt = SystemMessagePromptTemplate.from_template(template)
Expand Down Expand Up @@ -90,11 +112,11 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
username=username,
password=password,
database="neo4j", # neo4j by default
index_name="stackoverflow", # vector by default
index_name="top_answers", # vector by default
text_node_property="body", # text by default
retrieval_query="""
OPTIONAL MATCH (node)<-[:ANSWERS]-(a)
RETURN node.title + '\n' + node.body + '\n' + coalesce(a.body,"") AS text, score, {source:node.link} AS metadata LIMIT 1
OPTIONAL MATCH (node)-[:ANSWERS]->(question)
RETURN question.title + '\n' + question.body + '\n' + coalesce(node.body,"") AS text, score, {source:question.link} AS metadata
""",
)

Expand All @@ -120,7 +142,7 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
combine_docs_chain_kwargs={"prompt": qa_prompt},
)

# Rag + KG
# Rag + Knowledge Graph response
kg = Neo4jVector.from_existing_index(
embedding=embeddings,
url=url,
Expand All @@ -137,7 +159,7 @@ def generate_llm_output(user_input: str, callbacks: List[Any]) -> str:
WITH collect(a.body)[..2] as answers
RETURN reduce(str='', text IN answers | str + text + '\n') as answerTexts
}
RETURN node.body + '\n' + answerTexts AS text, score, {source:node.link} AS metadata
RETURN node.title + '\n' + node.body + '\n' + answerTexts AS text, score, {source:node.link} AS metadata
""",
)

Expand Down
11 changes: 11 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ services:
ports:
- 7687:7687
- 7474:7474
volumes:
- $PWD/data:/data
environment:
- NEO4J_AUTH=${NEO4J_USERNAME-neo4j}/${NEO4J_PASSWORD-password}
- NEO4J_PLUGINS=["apoc"]
- NEO4J_db_tx__log_rotation_retention__policy=false
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider localhost:7474 || exit 1"]
interval: 5s
Expand All @@ -28,6 +31,10 @@ services:
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
- EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
- LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
- LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
- LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
- LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
networks:
- net
depends_on:
Expand Down Expand Up @@ -55,6 +62,10 @@ services:
- OLLAMA_BASE_URL=${OLLAMA_BASE_URL-http://host.docker.internal:11434}
- LLM=${LLM-gpt-3.5}
- EMBEDDING_MODEL=${EMBEDDING_MODEL-sentence_transformer}
- LANGCHAIN_ENDPOINT=${LANGCHAIN_ENDPOINT-"https://api.smith.langchain.com"}
- LANGCHAIN_TRACING_V2=${LANGCHAIN_TRACING_V2-false}
- LANGCHAIN_PROJECT=${LANGCHAIN_PROJECT}
- LANGCHAIN_API_KEY=${LANGCHAIN_API_KEY}
networks:
- net
depends_on:
Expand Down
9 changes: 7 additions & 2 deletions example.env
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
OPENAI_API_KEY=sk-...
OLLAMA_BASE_URL=http://host.docker.internal:11434
#OPENAI_API_KEY=sk-...
#OLLAMA_BASE_URL=http://host.docker.internal:11434
#NEO4J_URI=neo4j://localhost:7687
#NEO4J_USERNAME=neo4j
#NEO4J_PASSWORD=password
LLM=ollama #or gpt-4 or gpt-3.5
EMBEDDING_MODEL=sentence_transformer #or openai or ollama

LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
LANGCHAIN_TRACING_V2=true # false
LANGCHAIN_PROJECT=#your-project-name
LANGCHAIN_API_KEY=#your-api-key ls_...
26 changes: 19 additions & 7 deletions loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from langchain.graphs import Neo4jGraph

import streamlit as st
from streamlit.logger import get_logger

load_dotenv(".env")

Expand All @@ -21,18 +22,20 @@

os.environ["NEO4J_URL"] = url

logger = get_logger(__name__)

if embedding_model_name == "ollama":
embeddings = OllamaEmbeddings(base_url=ollama_base_url, model="llama2")
dimension = 4096
print("Embedding: Using Ollama")
logger.info("Embedding: Using Ollama")
elif embedding_model_name == "openai":
embeddings = OpenAIEmbeddings()
dimension = 1536
print("Embedding: Using OpenAI")
logger.info("Embedding: Using OpenAI")
else:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
dimension = 384
print("Embedding: Using SentenceTransformer")
logger.info("Embedding: Using SentenceTransformer")

neo4j_graph = Neo4jGraph(url=url, username=username, password=password)

Expand All @@ -56,12 +59,16 @@ def create_constraints():


def create_vector_index(dimension):
    """Create the vector indexes used for similarity search, if missing.

    Creates a 'stackoverflow' index on Question.embedding and a
    'top_answers' index on Answer.embedding, both using cosine similarity.

    Parameters
    ----------
    dimension : int
        Dimensionality of the embedding vectors (depends on the
        embedding model selected at startup).
    """
    # TODO use Neo4jVector Code from LangChain on the existing graph
    # (index name, node label) pairs; the indexed property is always
    # 'embedding'. Index/label names cannot be query parameters, hence
    # the f-string; both values are fixed literals, not user input.
    index_specs = [
        ("stackoverflow", "Question"),
        ("top_answers", "Answer"),
    ]
    for index_name, node_label in index_specs:
        index_query = (
            "CALL db.index.vector.createNodeIndex("
            f"'{index_name}', '{node_label}', 'embedding', $dimension, 'cosine')"
        )
        try:
            neo4j_graph.query(index_query, {"dimension": dimension})
        except Exception:
            # Neo4j raises when the index already exists; that is expected
            # on every run after the first, so continue best-effort.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            pass


create_vector_index(dimension)
Expand All @@ -74,9 +81,13 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
"&site=stackoverflow&filter=!51dU0b1n(WTdqj5MH1iGsNShY6BhXXwJ)xwV5b"
)
data = requests.get(base_url + parameters).json()
# Convert html to text and calculate embedding values
# Calculate embedding values for questions and answers
for q in data["items"]:
q["embedding"] = embeddings.embed_query(q["title"] + " " + q["body_markdown"])
question_text = q["title"] + "\n" + q["body_markdown"]
q["embedding"] = embeddings.embed_query(question_text)
for a in q["answers"]:
a["embedding"] = embeddings.embed_query(question_text + "\n" + a["body_markdown"])


import_query = """
UNWIND $data AS q
Expand All @@ -93,7 +104,8 @@ def load_so_data(tag: str = "neo4j", page: int = 1) -> None:
SET answer.is_accepted = a.is_accepted,
answer.score = a.score,
answer.creation_date = datetime({epochSeconds:a.creation_date}),
answer.body = a.body_markdown
answer.body = a.body_markdown,
answer.embedding = a.embedding
MERGE (answerer:User {id:coalesce(a.owner.user_id, "deleted")})
ON CREATE SET answerer.display_name = a.owner.display_name,
answerer.reputation= a.owner.reputation
Expand Down