diff --git a/README.md b/README.md
index 29a583e..326c9b4 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,7 @@ The purpose of this package is to offer a convenient question-answering system w
 * Supports the "Retrieve and Re-rank" strategy for semantic search, see - https://www.sbert.net/examples/applications/retrieve_rerank/README.html.
 * Supports HyDE (Hypothetical Document Embeddings) - https://arxiv.org/pdf/2212.10496.pdf
+    * WARNING: Enabling HyDE can significantly alter the quality of the results. Please make sure to read the paper before enabling.
 * Allows interaction with embedded documents, supporting the following models and methods (including locally hosted):
     * OpenAI models (ChatGPT 3.5/4 and Azure OpenAI).
@@ -83,7 +84,8 @@ pip install . # or `pip install -e .` for development
 
 To create a configuration file in YAML format, you can refer to the example template provided in `sample_templates/config_template.yaml`.
 
-The sample configuration file specifies how to load one of the supported locally hosted models, downloaded from Huggingface - https://huggingface.co/TheBloke/wizardLM-13B-1.0-GGML/resolve/main/WizardLM-13B-1.0.ggmlv3.q5_K_S.bin
+The sample configuration file specifies how to load one of the supported locally hosted models, downloaded from Huggingface -
+https://huggingface.co/TheBloke/airoboros-l2-13B-gpt4-1.4.1-GGUF/resolve/main/airoboros-l2-13b-gpt4-1.4.1.Q4_K_M.gguf
 
 As an alternative uncomment the llm section for OpenAI model.
 
@@ -130,7 +132,7 @@ llmsearch interact llm -c /path/to/config.yaml
 
 Based on the example configuration provided in the sample configuration file, the following actions will take place:
 
-- The system will load a quantized GGML model using the LlamaCpp framework. The model file is located at `/storage/llm/cache/WizardLM-13B-1.0-GGML/WizardLM-13B-1.0.ggmlv3.q5_K_S.bin`.
+- The system will load a quantized GGUF model using the LlamaCpp framework. The model file is located at `/storage/llm/cache/airoboros-l2-13b-gpt4-1.4.1.Q4_K_M.gguf`
 - The model will be partially loaded into the GPU (30 layers) and partially into the CPU (remaining layers). The `n_gpu_layers` parameter can be adjusted according to the hardware limitations.
 - Additional LlamaCpp specific parameters specified in `model_kwargs` from the `llm->params` section will be passed to the model.
 - The system will query the embeddings database using hybrid search algorithm using sparse and dense embeddings. It will provide the most relevant context from different documents, up to a maximum context size of 4096 characters (`max_char_size` in `semantic_search`).
diff --git a/sample_templates/config_template.yaml b/sample_templates/config_template.yaml
index ed37562..0945f9a 100644
--- a/sample_templates/config_template.yaml
+++ b/sample_templates/config_template.yaml
@@ -69,7 +69,7 @@ persist_response_db_path: "/path/to/responses.db" # optional sqlite database fi
 llm:
   type: llamacpp
   params:
-    model_path: /storage/llm/cache/WizardLM-13B-1.0-GGML/WizardLM-13B-1.0.ggmlv3.q5_K_S.bin
+    model_path: /storage/llm/cache/airoboros-l2-13b-gpt4-1.4.1.Q4_K_M.gguf
     prompt_template: |
       ### Instruction:
       Use the following pieces of context to answer the question at the end. If answer isn't in the context, say that you don't know, don't try to make up an answer.
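
For reference, the `llm` section described in the README changes above would look roughly like the sketch below. The `model_path` matches the template; the `model_kwargs` entries (`n_gpu_layers`, `n_ctx`) are illustrative assumptions rather than values taken from the template, and per the README they are passed straight through to LlamaCpp.

```yaml
# Hypothetical excerpt of sample_templates/config_template.yaml; values are illustrative.
llm:
  type: llamacpp
  params:
    model_path: /storage/llm/cache/airoboros-l2-13b-gpt4-1.4.1.Q4_K_M.gguf
    model_kwargs:
      n_gpu_layers: 30   # layers offloaded to the GPU; reduce on smaller cards
      n_ctx: 4096        # assumed context window; adjust to the model being loaded
```
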
diff --git a/src/llmsearch/webapp.py b/src/llmsearch/webapp.py
index a38aa6a..be39ca2 100644
--- a/src/llmsearch/webapp.py
+++ b/src/llmsearch/webapp.py
@@ -54,7 +54,7 @@ def get_bundle(config):
 
 
 @st.cache_data
-def generate_response(question: str, _config: Config, _bundle, label_filter: str = ""):
+def generate_response(question: str, use_hyde: bool, _config: Config, _bundle, label_filter: str = ""):
     # _config and _bundle are under scored so paratemeters aren't hashed
     output = get_and_parse_response(query=question, config=_config, llm_bundle=_bundle, label=label_filter)
@@ -68,9 +68,11 @@ def generate_response(question: str, _config: Config, _bundle, label_filter: str
 if args.cli_config_path:
     config_file = args.cli_config_path
 else:
-    config_file = st.sidebar.file_uploader(
-        "Select tempate to load", type=["yml", "yaml"]
-    )
+    config_file = st.sidebar.file_uploader("Select template to load", type=["yml", "yaml"])
+
+# Initialize state for historical results
+if "messages" not in st.session_state:
+    st.session_state["messages"] = []
 
 if config_file is not None:
@@ -82,29 +84,48 @@ def generate_response(question: str, _config: Config, _bundle, label_filter: str
 
     st.sidebar.write(f"**Model type:** {config.llm.type}")
-    st.sidebar.write(
-        f"**Document path**: {config.embeddings.document_settings[0].doc_path}"
-    )
+    st.sidebar.write(f"**Document path**: {config.embeddings.document_settings[0].doc_path}")
     st.sidebar.write(f"**Embedding path:** {config.embeddings.embeddings_path}")
-    st.sidebar.write(
-        f"**Max char size (semantic search):** {config.semantic_search.max_char_size}"
-    )
+    st.sidebar.write(f"**Max char size (semantic search):** {config.semantic_search.max_char_size}")
 
     label_filter = ""
     if config.embeddings.labels:
-        label_filter = st.sidebar.selectbox(label="Filter by label", options = ["-"] + config.embeddings.labels)
-        if label_filter is None or label_filter == '-':
+        label_filter = st.sidebar.selectbox(label="Filter by label", options=["-"] + config.embeddings.labels)
+        if label_filter is None or label_filter == "-":
             label_filter = ""
-    
+
     llm_bundle = get_bundle(config)
 
     text = st.chat_input("Enter text")
-    is_hyde = st.sidebar.checkbox(label = "Use HyDE (cost: 2 api calls)", value=llm_bundle.hyde_enabled)
+    is_hyde = st.sidebar.checkbox(label="Use HyDE (cost: 2 api calls)", value=llm_bundle.hyde_enabled)
     if text:
-        # Dynamically switch hyde
         llm_bundle.hyde_enabled = is_hyde
-        output = generate_response(question=text, _bundle=llm_bundle, _config=config, label_filter = label_filter)
+        output = generate_response(
+            question=text,
+            use_hyde=llm_bundle.hyde_enabled,
+            _bundle=llm_bundle,
+            _config=config,
+            label_filter=label_filter,
+        )
+
+        # Add assistant response to chat history
+        st.session_state["messages"].append(
+            {
+                "question": text,
+                "response": output.response,
+                "links": [f'{s.chunk_link}' for s in output.semantic_search[::-1]],
+                "quality": f"{output.average_score:.2f}",
+            }
+        )
+        for h_response in st.session_state["messages"]:
+            with st.expander(label=f":question: **{h_response['question']}**", expanded=False):
+                st.markdown(f"##### {h_response['question']}")
+                st.write(h_response["response"])
+                st.markdown(f"\n---\n##### Search Quality Score: {h_response['quality']}")
+                st.markdown("##### Links")
+                for link in h_response["links"]:
+                    st.write("\t* " + link, unsafe_allow_html=True)
 
         for source in output.semantic_search[::-1]:
             source_path = source.metadata.pop("source")
@@ -119,12 +140,13 @@ def generate_response(question: str, _config: Config, _bundle, label_filter: str
st.text(f"\n\n{source.chunk_text}") if llm_bundle.hyde_enabled: - with st.expander(label=':octagonal_sign: **HyDE Reponse**', expanded=False): + with st.expander(label=":octagonal_sign: **HyDE Reponse**", expanded=False): st.write(output.question) with chat_message("assistant"): st.write(f"**Search results quality score: {output.average_score:.2f}**\n") - st.write(output.response) + st.write(output.response) # Add user message to chat history + else: st.info("Choose a configuration template to start...")