diff --git a/.env b/.env
deleted file mode 100644
index 0ced6fa..0000000
--- a/.env
+++ /dev/null
@@ -1,2 +0,0 @@
-OPENAI_API_KEY=""
-ACTIVELOOP_TOKEN=""
diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..b52e1fd
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,7 @@
+OPENAI_API_KEY=your_api_key
+ACTIVELOOP_TOKEN=your_api_key
+DEEPLAKE_USERNAME=your_username
+DEEPLAKE_DATASET_PATH=your_hub_path
+DEEPLAKE_REPO_NAME=your_repo_name
+REPO_URL=your_github_repo_url
+SITE_TITLE="Your Site Title"
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..dac1fb3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+gumroad
+.env
+.vscode
\ No newline at end of file
diff --git a/README.md b/README.md
index 50e6f0e..1921e78 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,28 @@
 # Chat-with-Github-Repo
 
-This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and DeepLake.
+This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.
 
-The chatbot searches a dataset stored in DeepLake to find relevant information and generates responses based on the user's input.
+The chatbot searches a dataset stored in Deep Lake to find relevant information and generates responses based on the user's input.
 
 ## Files
 
-github.py: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.
-chat.py: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.
+`github.py`: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a Deep Lake instance.
+
+`chat.py`: This script creates a Streamlit web application that interacts with the user and the Deep Lake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.
 
 ## Setup
 
-To set up and run this project, follow these steps:
-Run the github.py script to embed the github repo first
+Before getting started, be sure to sign up for [Activeloop](https://www.activeloop.ai/) and [OpenAI](https://openai.com/) accounts and create API keys. You'll also want to create a Deep Lake dataset, which will generate a dataset path in the format `hub://{username}/{repo_name}` (where you define the `repo_name`).
 
-This is how you run the streamlit app:
+To set up and run this project, follow these steps:
 
-```
-streamlit run chat.py
-```
+1. Install the required packages with `pip`:
+   ```
+   pip install -r requirements.txt
+   ```
+2. Copy the `.env.example` file to `.env` and replace the variables, including the API keys, the GitHub repo URL, and the site and Deep Lake settings.
+3. Run the `github.py` script to embed the GitHub repo, thus storing the data in the specified Activeloop Deep Lake dataset.
+4. Run the Streamlit chat app, which should default to `http://localhost:8502` and allow you to ask questions about the repo:
+   ```
+   streamlit run chat.py
+   ```
diff --git a/chat.py b/chat.py
index d379464..79575d8 100644
--- a/chat.py
+++ b/chat.py
@@ -23,19 +23,20 @@ from langchain.agents import create_csv_agent
 
 # Set the title for the Streamlit app
-st.title("🤖 pwang_szn Github bot")
-
+st.title(os.environ.get('SITE_TITLE'))
 # Load environment variables from a .env file (containing OPENAI_API_KEY)
 load_dotenv()
 
 # Set the OpenAI API key from the environment variable
 openai.api_key = os.environ.get('OPENAI_API_KEY')
 
-active_loop_data_set_path = "hub://pwangszn/gumroad" #change this to your dataset path
+active_loop_data_set_path = os.environ.get('DEEPLAKE_DATASET_PATH')
 
 # Create an instance of OpenAIEmbeddings
 embeddings = OpenAIEmbeddings()
 
 # Create an instance of DeepLake with the specified dataset path and embeddings
-db = DeepLake(dataset_path=active_loop_data_set_path, read_only=True, embedding_function=embeddings)
+db = DeepLake(dataset_path=active_loop_data_set_path,
+              read_only=True, embedding_function=embeddings)
+
 
 def generate_response(prompt):
@@ -47,11 +48,13 @@ def generate_response(prompt):
     response = completion.choices[0].message.content
     return response
 
+
 def get_text():
     # Create a Streamlit input field and return the user's input
     input_text = st.text_input("", key="input")
     return input_text
 
+
 def search_db(query):
     # Create a retriever from the DeepLake instance
     retriever = db.as_retriever()
@@ -67,6 +70,7 @@ def search_db(query):
     # Return the result of the query
     return qa.run(query)
 
+
 # Initialize the session state for generated responses and past inputs
 if 'generated' not in st.session_state:
     st.session_state['generated'] = ['i am ready to help you ser']
@@ -86,5 +90,6 @@ def search_db(query):
 # If there are generated responses, display the conversation using Streamlit messages
 if st.session_state['generated']:
     for i in range(len(st.session_state['generated'])):
-        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
+        message(st.session_state['past'][i],
+                is_user=True, key=str(i) + '_user')
         message(st.session_state["generated"][i], key=str(i))
diff --git a/github.py b/github.py
index 358bdf4..e3be8c0 100644
--- a/github.py
+++ b/github.py
@@ -10,37 +10,44 @@
 load_dotenv()
 openai.api_key = os.environ.get('OPENAI_API_KEY')
 
+
 def clone_repository(repo_url, local_path):
     subprocess.run(["git", "clone", repo_url, local_path])
 
+
 def load_docs(root_dir):
     docs = []
     for dirpath, dirnames, filenames in os.walk(root_dir):
         for file in filenames:
-            try:
-                loader = TextLoader(os.path.join(dirpath, file), encoding='utf-8')
+            try:
+                loader = TextLoader(os.path.join(
+                    dirpath, file), encoding='utf-8')
                 docs.extend(loader.load_and_split())
-            except Exception as e:
+            except Exception as e:
                 pass
     return docs
 
+
 def split_docs(docs):
     text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
     return text_splitter.split_documents(docs)
 
+
 def main(repo_url, root_dir, repo_name, username):
     clone_repository(repo_url, root_dir)
     docs = load_docs(root_dir)
     texts = split_docs(docs)
     embeddings = OpenAIEmbeddings()
-    db = DeepLake(dataset_path=f"hub://{username}/{repo_name}", embedding_function=embeddings)
+    db = DeepLake(
+        dataset_path=f"hub://{username}/{repo_name}", embedding_function=embeddings)
     db.add_documents(texts)
 
+
 if __name__ == "__main__":
-    repo_url = "https://github.com/peterw/Gumroad-Landing-Page-Generator"
+    repo_url = os.environ.get('REPO_URL')
     root_dir = "./gumroad"
-    deeplake_repo_name = "gumroad"
-    deeplake_username = "pwangszn"  # replace with your username from app.activeloop.ai
+    deeplake_repo_name = os.environ.get('DEEPLAKE_REPO_NAME')
+    deeplake_username = os.environ.get('DEEPLAKE_USERNAME')
     main(repo_url, root_dir, deeplake_repo_name, deeplake_username)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..120809b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+streamlit
+streamlit_chat
+langchain
+openai
+python-dotenv
+deeplake
\ No newline at end of file