Skip to content

Commit

Permalink
Merge pull request #2 from dtbuchholz/dtb/improve-setup-process
Browse files Browse the repository at this point in the history
Improve setup process
  • Loading branch information
peterw authored Apr 25, 2023
2 parents 3cd90ad + 849a204 commit 92e57e6
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 24 deletions.
2 changes: 0 additions & 2 deletions .env

This file was deleted.

7 changes: 7 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
OPENAI_API_KEY=your_api_key
ACTIVELOOP_TOKEN=your_api_key
DEEPLAKE_USERNAME=your_username
DEEPLAKE_DATASET_PATH=your_hub_path
DEEPLAKE_REPO_NAME=your_repo_name
REPO_URL=your_github_repo_url
SITE_TITLE="Your Site Title"
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gumroad
.env
.vscode
27 changes: 17 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,21 +1,28 @@
# Chat-with-Github-Repo

This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and DeepLake.
This repository contains two Python scripts that demonstrate how to create a chatbot using Streamlit, OpenAI GPT-3.5-turbo, and Activeloop's Deep Lake.

The chatbot searches a dataset stored in DeepLake to find relevant information and generates responses based on the user's input.
The chatbot searches a dataset stored in Deep Lake to find relevant information and generates responses based on the user's input.

## Files
github.py: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.

chat.py: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.
`github.py`: This script clones a git repository, processes the text documents, computes embeddings using OpenAIEmbeddings, and stores the embeddings in a DeepLake instance.

`chat.py`: This script creates a Streamlit web application that interacts with the user and the DeepLake instance to generate chatbot responses using OpenAI GPT-3.5-turbo.

## Setup
To set up and run this project, follow these steps:

Run the github.py script to embed the github repo first
Before getting started, be sure to sign up for an [Activeloop](https://www.activeloop.ai/) and [OpenAI](https://openai.com/) account and create API keys. You'll also want to create a Deep Lake dataset, which will generate a dataset path in the format `hub://{username}/{repo_name}` (where you define the `repo_name`).

This is how you run the streamlit app:
To set up and run this project, follow these steps:

```
streamlit run chat.py
```
1. Install the required packages with `pip`:
```
pip install -r requirements.txt
```
2. Copy the `.env.example` file to `.env` and replace the variables, including API keys, GitHub URL, and site / Deep Lake information.
3. Run the `github.py` script to embed the GitHub repo, thus, storing the data in the specified Activeloop Deep Lake.
4. Run the Streamlit chat app, which should default to `http://localhost:8502` and allow you to ask questions about the repo:
```
streamlit run chat.py
```
15 changes: 10 additions & 5 deletions chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,20 @@
from langchain.agents import create_csv_agent

# Load environment variables from a .env file FIRST, before any
# os.environ.get(...) call below. (The diffed code read SITE_TITLE before
# load_dotenv(), so the title was always None unless the variable was
# exported in the real environment — this reorders the two.)
load_dotenv()

# Set the title for the Streamlit app from the configured site title
st.title(os.environ.get('SITE_TITLE'))

# Set the OpenAI API key from the environment variable
openai.api_key = os.environ.get('OPENAI_API_KEY')
# Deep Lake dataset path, e.g. "hub://{username}/{repo_name}"
active_loop_data_set_path = os.environ.get('DEEPLAKE_DATASET_PATH')

# Create an instance of OpenAIEmbeddings
embeddings = OpenAIEmbeddings()

# Open the Deep Lake dataset read-only; chat.py only queries, github.py writes
db = DeepLake(dataset_path=active_loop_data_set_path,
              read_only=True, embedding_function=embeddings)


def generate_response(prompt):
# Generate a response using OpenAI's ChatCompletion API and the specified prompt
Expand All @@ -47,11 +48,13 @@ def generate_response(prompt):
response = completion.choices[0].message.content
return response


def get_text():
    """Render an unlabeled Streamlit text input and return what the user typed."""
    return st.text_input("", key="input")


def search_db(query):
# Create a retriever from the DeepLake instance
retriever = db.as_retriever()
Expand All @@ -67,6 +70,7 @@ def search_db(query):
# Return the result of the query
return qa.run(query)


# Initialize the session state for generated responses and past inputs
if 'generated' not in st.session_state:
st.session_state['generated'] = ['i am ready to help you ser']
Expand All @@ -86,5 +90,6 @@ def search_db(query):
# If there are generated responses, display the conversation using Streamlit messages
if st.session_state['generated']:
for i in range(len(st.session_state['generated'])):
message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
message(st.session_state['past'][i],
is_user=True, key=str(i) + '_user')
message(st.session_state["generated"][i], key=str(i))
21 changes: 14 additions & 7 deletions github.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,44 @@
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')


def clone_repository(repo_url, local_path):
    """Clone the git repository at *repo_url* into *local_path*.

    Raises subprocess.CalledProcessError if the clone fails (bad URL,
    missing credentials, existing non-empty directory), instead of
    silently continuing and embedding an empty directory.
    """
    subprocess.run(["git", "clone", repo_url, local_path], check=True)


def load_docs(root_dir):
    """Walk *root_dir* and load every readable file as split text documents.

    Returns a list of langchain documents. Files that cannot be loaded as
    UTF-8 text (binaries, permission errors) are skipped silently — this is
    deliberate best-effort behavior, since a cloned repo routinely contains
    non-text files.
    """
    docs = []
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            try:
                loader = TextLoader(os.path.join(
                    dirpath, filename), encoding='utf-8')
                docs.extend(loader.load_and_split())
            except Exception:
                # Best-effort: unreadable/binary files are skipped.
                pass
    return docs


def split_docs(docs):
    """Split loaded documents into ~1000-character chunks with no overlap."""
    splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    split_documents = splitter.split_documents(docs)
    return split_documents


def main(repo_url, root_dir, repo_name, username):
    """Clone *repo_url* into *root_dir*, embed its text files, and store
    the embeddings in the Deep Lake dataset hub://{username}/{repo_name}.
    """
    clone_repository(repo_url, root_dir)
    docs = load_docs(root_dir)
    texts = split_docs(docs)
    embeddings = OpenAIEmbeddings()

    db = DeepLake(
        dataset_path=f"hub://{username}/{repo_name}", embedding_function=embeddings)
    db.add_documents(texts)


if __name__ == "__main__":
    # All run-time configuration comes from the environment (.env file);
    # see .env.example for the expected variable names.
    repo_url = os.environ.get('REPO_URL')
    root_dir = "./gumroad"  # local checkout directory (git-ignored)
    deeplake_repo_name = os.environ.get('DEEPLAKE_REPO_NAME')
    deeplake_username = os.environ.get('DEEPLAKE_USERNAME')

    main(repo_url, root_dir, deeplake_repo_name, deeplake_username)
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
streamlit
streamlit_chat
langchain
openai
python-dotenv
deeplake

0 comments on commit 92e57e6

Please sign in to comment.