Skip to content

Commit

Permalink
Merge pull request #2 from dtbuchholz/dtb/improve-setup-process
Browse files Browse the repository at this point in the history
Improve setup process
  • Loading branch information
peterw authored Apr 25, 2023
2 parents 3cd90ad + 849a204 commit 92e57e6
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 24 deletions.
2 changes: 0 additions & 2 deletions .env

This file was deleted.

7 changes: 7 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
OPENAI_API_KEY=your_api_key
ACTIVELOOP_TOKEN=your_api_key
DEEPLAKE_USERNAME=your_username
DEEPLAKE_DATASET_PATH=your_hub_path
DEEPLAKE_REPO_NAME=your_repo_name
REPO_URL=your_github_repo_url
SITE_TITLE="Your Site Title"
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gumroad
.env
.vscode
27 changes: 17 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
# Chat-with-Github-Repo

This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and DeepLake.
This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.

The chatbot searches a dataset stored in DeepLake to find relevant information and generates responses based on the user's input.
The chatbot searches a dataset stored in Deep Lake to find relevant information and generates responses based on the user's input.

## Files
github.py: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.

chat.py: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.
`github.py`: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.

`chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.

## Setup
To set up and run this project, follow these steps:

Run the github.py script to embed the github repo first
Before getting started, be sure to sign up for an [Activeloop](https://www.activeloop.ai/) and [OpenAI](https://openai.com/) account and create API keys. You'll also want to create a Deep Lake dataset, which will generate a dataset path in the format `hub://{username}/{repo_name}` (where you define the `repo_name`).

This is how you run the streamlit app:
To set up and run this project, follow these steps:

```
streamlit run chat.py
```
1. Install the required packages with `pip`:
```
pip install -r requirements.txt
```
2. Copy the `.env.example` file to `.env` and replace the variables, including API keys, GitHub URL, and site / Deep Lake information.
3. Run the `github.py` script to embed the GitHub repo, thus, storing the data in the specified Activeloop Deep Lake.
4. Run the Streamlit chat app, which should default to `http://localhost:8502` and allow you to ask questions about the repo:
```
streamlit run chat.py
```
15 changes: 10 additions & 5 deletions chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,20 @@
from langchain.agents import create_csv_agent

# Load environment variables from a .env file FIRST, before any
# os.environ.get(...) call below. (The diffed code read SITE_TITLE before
# load_dotenv(), so the title was always None unless the variable was
# exported in the real environment — this reorders the two.)
load_dotenv()

# Set the title for the Streamlit app from the configured site title
st.title(os.environ.get('SITE_TITLE'))

# Set the OpenAI API key from the environment variable
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Deep Lake dataset path, e.g. "hub://{username}/{repo_name}"
active_loop_data_set_path = os.environ.get('DEEPLAKE_DATASET_PATH')

# Create an instance of OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Open the Deep Lake dataset read-only; chat.py only queries, github.py writes
db = DeepLake(dataset_path=active_loop_data_set_path,
              read_only=True, embedding_function=embeddings)


def generate_response(prompt):
# Generate a response using OpenAI's ChatCompletion API and the specified prompt
Expand All @@ -47,11 +48,13 @@ def generate_response(prompt):
response = completion.choices[0].message.content
return response


def get_text():
    """Render an unlabeled Streamlit text input and return what the user typed."""
    return st.text_input("", key="input")


def search_db(query):
# Create a retriever from the DeepLake instance
retriever = db.as_retriever()
Expand All @@ -67,6 +70,7 @@ def search_db(query):
# Return the result of the query
return qa.run(query)


# Initialize the session state for generated responses and past inputs
if 'generated' not in st.session_state:
st.session_state['generated'] = ['i am ready to help you ser']
Expand All @@ -86,5 +90,6 @@ def search_db(query):
# If there are generated responses, display the conversation using Streamlit messages
if st.session_state['generated']:
for i in range(len(st.session_state['generated'])):
message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
message(st.session_state['past'][i],
is_user=True, key=str(i) + '_user')
message(st.session_state["generated"][i], key=str(i))
21 changes: 14 additions & 7 deletions github.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,44 @@
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')


def clone_repository(repo_url, local_path):
    """Clone the git repository at *repo_url* into *local_path*.

    Raises subprocess.CalledProcessError if the clone fails (bad URL,
    missing credentials, existing non-empty directory), instead of
    silently continuing and embedding an empty directory.
    """
    subprocess.run(["git", "clone", repo_url, local_path], check=True)


def load_docs(root_dir):
    """Walk *root_dir* and load every readable file as split text documents.

    Returns a list of langchain documents. Files that cannot be loaded as
    UTF-8 text (binaries, permission errors) are skipped silently — this is
    deliberate best-effort behavior, since a cloned repo routinely contains
    non-text files.
    """
    docs = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            try:
                loader = TextLoader(os.path.join(
                    dirpath, filename), encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception:
                # Best-effort: unreadable/binary files are skipped.
                pass
    return docs


def split_docs(docs):
    """Split loaded documents into ~1000-character chunks with no overlap."""
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    split_documents = splitter.split_documents(docs)
    return split_documents


def main(repo_url, root_dir, repo_name, username):
    """Clone *repo_url* into *root_dir*, embed its text files, and store
    the embeddings in the Deep Lake dataset hub://{username}/{repo_name}.
    """
    clone_repository(repo_url, root_dir)
    docs = load_docs(root_dir)
    texts = split_docs(docs)
    embeddings = OpenAIEmbeddings()

    db = DeepLake(
        dataset_path=f"hub://{username}/{repo_name}", embedding_function=embeddings)
    db.add_documents(texts)


if __name__ == "__main__":
    # All run-time configuration comes from the environment (.env file);
    # see .env.example for the expected variable names.
    repo_url = os.environ.get('REPO_URL')
    root_dir = "./gumroad"  # local checkout directory (git-ignored)
    deeplake_repo_name = os.environ.get('DEEPLAKE_REPO_NAME')
    deeplake_username = os.environ.get('DEEPLAKE_USERNAME')

    main(repo_url, root_dir, deeplake_repo_name, deeplake_username)
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
streamlit
streamlit_chat
langchain
openai
python-dotenv
deeplake

0 comments on commit 92e57e6

Please sign in to comment.