Embed api #15

Merged 4 commits on Jan 9, 2025
17 changes: 9 additions & 8 deletions README.md
@@ -18,18 +18,19 @@ and runs API LLMs using the also fantastic [LiteLLM](https://github.com/BerriAI/

## New Features

* pipx support for Ubuntu 24.04
* Automatically disable CGRAG for filesets smaller than 4x the LLM context
* API embedding support with the new `ACTIVE_EMBED_IS_LOCAL = false` setting
* Updated default local model to `QwQ-LCoT-7B-Instruct`
* Improved prompt robustness and efficiency

## Quickstart

This section contains recipes for running `dir-assistant` in a basic capacity to get you started quickly.

### Quickstart with Local Default Model (Phi 3 128k)
### Quickstart with Local Default Model

To get started locally, you can download a default llm model. Default configuration with this model requires
11GB of memory on most hardware or 8GB on nvidia GPUs due to flash attention availability (enabled by default).
You will be able to adjust the configuration to fit higher or lower memory requirements. To run via CPU:
8GB of memory on most hardware. You will be able to adjust the configuration to fit higher or lower memory
requirements. To run via CPU:

```shell
pip install dir-assistant
@@ -205,7 +206,7 @@ However, in most cases you will need to modify other options when changing APIs.

## Local LLM Model Download

If you want to use a local LLM, you can download a low requirements default model (Phi 3 128k) with:
If you want to use a local LLM, you can download a low requirements default model with:

```shell
dir-assistant models download-llm
@@ -359,7 +360,7 @@ please see [CONTRIBUTORS.md](CONTRIBUTORS.md).

## Limitations

- Only tested on Ubuntu 22.04. Please let us know if you run it successfully on other platforms by submitting an issue.
- Only tested on Ubuntu 22.04 and 24.04. Please let us know if you run it successfully on other platforms by submitting an issue.
- Dir-assistant only detects and reads text files at this time.

## Todos
@@ -373,8 +374,8 @@ please see [CONTRIBUTORS.md](CONTRIBUTORS.md).
- ~~Single-step pip install~~
- ~~Model download~~
- ~~Commit to git~~
- ~~API Embedding models~~
- Web search
- API Embedding models
- Simple mode for better compatibility with external script automations

## Additional Credits
31 changes: 20 additions & 11 deletions dir_assistant/assistant/base_assistant.py
@@ -1,6 +1,4 @@
import copy
import sys
from os import write

import numpy as np
from colorama import Fore, Style
@@ -23,20 +21,25 @@ def __init__(
context_file_ratio,
output_acceptance_retries
):
self.system_instructions = system_instructions
self.embed = embed
self.index = index
self.chunks = chunks
self.context_file_ratio = context_file_ratio
system_instructions_tokens = self.embed.count_tokens(system_instructions)
self.context_size = 8192
self.output_acceptance_retries = output_acceptance_retries

def initialize_history(self):
# This initialization occurs separately from the constructor because child classes need to initialize
# before count_tokens can be called.
system_instructions_tokens = self.count_tokens(self.system_instructions)
self.chat_history = [
{
"role": "system",
"content": system_instructions,
"content": self.system_instructions,
"tokens": system_instructions_tokens,
}
]
self.context_size = 8192
self.output_acceptance_retries = output_acceptance_retries

def call_completion(self, chat_history):
# unimplemented on base class
@@ -46,12 +49,18 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
# unimplemented on base class
raise NotImplementedError

def count_tokens(self, text):
# unimplemented on base class
raise NotImplementedError

def build_relevant_full_text(self, user_input):
relevant_chunks = search_index(self.embed, self.index, user_input, self.chunks)
relevant_full_text = ""
chunk_total_tokens = 0
for i, relevant_chunk in enumerate(relevant_chunks, start=1):
chunk_total_tokens += relevant_chunk["tokens"]
# Note: relevant_chunk["tokens"] is created with the embedding model, not the LLM, so it will
# not be accurate for the purposes of maximizing the context of the LLM.
chunk_total_tokens += self.count_tokens(relevant_chunk["text"] + "\n\n")
if chunk_total_tokens >= self.context_size * self.context_file_ratio:
break
relevant_full_text += relevant_chunk["text"] + "\n\n"
@@ -82,10 +91,10 @@ def cull_history(self):
self.cull_history_list(self.chat_history)

def cull_history_list(self, history_list):
sum_of_tokens = sum([message["tokens"] for message in history_list])
sum_of_tokens = sum([self.count_tokens(message["content"]) for message in history_list])
while sum_of_tokens > self.context_size:
history_list.pop(0)
sum_of_tokens = sum([message["tokens"] for message in history_list])
sum_of_tokens = sum([self.count_tokens(message["content"]) for message in history_list])

def create_empty_history(self, role="assistant"):
return {"role": role, "content": "", "tokens": 0}
@@ -94,7 +103,7 @@ def create_one_off_prompt_history(self, prompt):
return [{
"role": "user",
"content": prompt,
"tokens": self.embed.count_tokens(prompt),
"tokens": self.count_tokens(prompt),
}]

def create_prompt(self, user_input):
@@ -163,7 +172,7 @@ def run_basic_chat_stream(self, user_input, relevant_full_text, write_to_stdout)
sys.stdout.flush()

# Add the completion to the chat history
output_history["tokens"] = self.embed.count_tokens(output_history["content"])
output_history["tokens"] = self.count_tokens(output_history["content"])
self.chat_history.append(output_history)
return output_history["content"]

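As a reading aid (not part of this PR), here is a self-contained sketch of the culling strategy the diff above introduces: messages are recounted with the active model's tokenizer rather than trusting stored token counts, and the oldest messages are dropped until the history fits the context window. The whitespace tokenizer below is a stand-in for the real `count_tokens` implementations added in the subclasses.

```python
def cull_history_list(history_list, count_tokens, context_size):
    """Drop the oldest messages until the recounted total fits the context."""
    total = sum(count_tokens(message["content"]) for message in history_list)
    while total > context_size and history_list:
        history_list.pop(0)
        total = sum(count_tokens(message["content"]) for message in history_list)
    return history_list


# Example run with a crude whitespace tokenizer standing in for the real one.
history = [
    {"role": "user", "content": "old question " * 50},
    {"role": "assistant", "content": "old answer " * 50},
    {"role": "user", "content": "latest question"},
]
culled = cull_history_list(history, lambda text: len(text.split()), context_size=120)
print(len(culled))  # 2: the oldest message was culled
```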
26 changes: 14 additions & 12 deletions dir_assistant/assistant/cgrag_assistant.py
@@ -31,19 +31,18 @@ def __init__(

def write_assistant_thinking_message(self):
# Disable CGRAG if the fileset is smaller than 4x the LLM context
total_tokens = sum(chunk['tokens'] for chunk in self.chunks)
self.fileset_larger_than_4x_context = total_tokens > self.context_size * 4
# total_tokens = sum(chunk['tokens'] for chunk in self.chunks)

# Display the assistant thinking message
if self.use_cgrag and self.print_cgrag and self.fileset_larger_than_4x_context:
if self.use_cgrag and self.print_cgrag:
sys.stdout.write(
f"{Style.BRIGHT}{Fore.BLUE}\nCGRAG Guidance: \n\n{Style.RESET_ALL}"
)
else:
sys.stdout.write(
f"{Style.BRIGHT}{Fore.GREEN}\nAssistant: \n\n{Style.RESET_ALL}"
)
if self.use_cgrag and self.fileset_larger_than_4x_context:
if self.use_cgrag:
sys.stdout.write(
f"{Style.BRIGHT}{Fore.WHITE}\r(generating contextual guidance...){Style.RESET_ALL}"
)
@@ -59,7 +58,7 @@ def print_cgrag_output(self, cgrag_output):
sys.stdout.write(
Style.BRIGHT
+ Fore.WHITE
+ f'\r{cgrag_output}\n'
+ f'\r{cgrag_output}\n\n'
+ Style.RESET_ALL
)
sys.stdout.write(
@@ -80,23 +79,26 @@ def create_cgrag_prompt(self, base_prompt):
response will be used to create an LLM embedding that will be used in a RAG to find the appropriate files which are
needed to answer the user prompt. There may be many files not currently included which have more relevant information,
so your response must include the most important concepts and information required to accurately answer the user
prompt. It is okay if the list is very long or short, but err on the side of a longer list so the RAG has more
information to work with. If the prompt is referencing code, list specific class, function, and variable names.
prompt. Keep the list length to around 20 items. If the prompt is referencing code, list specific class,
function, and variable names as applicable to answering the user prompt.
"""

def run_stream_processes(self, user_input, write_to_stdout):
prompt = self.create_prompt(user_input)
relevant_full_text = self.build_relevant_full_text(prompt)
if self.use_cgrag and self.fileset_larger_than_4x_context:
cgrag_prompt = self.create_cgrag_prompt(prompt)
if self.use_cgrag:
relevant_full_text = self.build_relevant_full_text(user_input)
cgrag_prompt = self.create_cgrag_prompt(user_input)
cgrag_content = relevant_full_text + cgrag_prompt
cgrag_history = copy.deepcopy(self.chat_history)
cgrag_history.append(self.create_user_history(cgrag_content, cgrag_content))
cgrag_prompt_history = self.create_user_history(cgrag_content, cgrag_content)
cgrag_history.append(cgrag_prompt_history)
self.cull_history_list(cgrag_history)
cgrag_generator = self.call_completion(cgrag_history)
output_history = self.create_empty_history()
output_history = self.run_completion_generator(cgrag_generator, output_history, False)
relevant_full_text = self.build_relevant_full_text(output_history["content"])
self.print_cgrag_output(output_history["content"])
sys.stdout.flush()
else:
relevant_full_text = self.build_relevant_full_text(user_input)
prompt = self.create_prompt(user_input)
return self.run_basic_chat_stream(prompt, relevant_full_text, write_to_stdout)
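As a reading aid (not part of the PR), here is a condensed sketch of the two-pass flow `run_stream_processes` now follows when CGRAG is enabled: retrieve on the raw prompt, ask the LLM which concepts matter, retrieve again using that guidance, then answer. The helper names and toy data below are hypothetical stand-ins, not dir-assistant APIs.

```python
def cgrag_answer(user_input, search_index, ask_llm):
    """Two-pass retrieval: ask the LLM for guidance, then retrieve again."""
    # Pass 1: retrieve chunks keyed on the raw user prompt.
    first_pass_context = search_index(user_input)
    # Ask the LLM which concepts, files, and identifiers matter for the prompt.
    guidance = ask_llm(
        first_pass_context
        + "\nList the information needed to answer this prompt: "
        + user_input
    )
    # Pass 2: retrieve again, keyed on the guidance instead of the raw prompt.
    second_pass_context = search_index(guidance)
    # The final answer uses the better-targeted context.
    return ask_llm(second_pass_context + "\n" + user_input)


# Toy stand-ins so the sketch runs on its own.
demo_chunks = {"auth": "def login(): ...", "db": "def connect(): ..."}

def toy_search(query):
    return "\n".join(text for key, text in demo_chunks.items() if key in query.lower())

def toy_llm(prompt):
    return "auth db" if "List the information" in prompt else "ANSWER based on: " + prompt[:40]

print(cgrag_answer("How does auth work?", toy_search, toy_llm))
```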
4 changes: 2 additions & 2 deletions dir_assistant/assistant/git_assistant.py
@@ -1,6 +1,5 @@
import os
import sys
import tempfile
from colorama import Style, Fore
from prompt_toolkit import prompt
from dir_assistant.assistant.cgrag_assistant import CGRAGAssistant
@@ -36,7 +35,7 @@ def create_prompt(self, user_input):
else:
# Ask the LLM if a diff commit is appropriate
should_diff_output = self.run_one_off_completion(f"""Does the prompt below request changes to files?
Respond only with "YES" or "NO". Do not respond with additional characters.
Respond only with one word: "YES" or "NO". Do not respond with additional words or characters, only "YES" or "NO".
User prompt:
{user_input}
""")
@@ -61,6 +60,7 @@ def create_prompt(self, user_input):
/home/user/hello_project/hello_world.py
if __name__ == "__main__":
print("Hello, World!")

Real response:
"""
else:
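The tightened wording above asks for a bare "YES" or "NO", but models still occasionally add punctuation or filler, so a caller may want to normalize the reply. A small hypothetical sketch of that normalization (not part of this PR):

```python
def parse_yes_no(response: str) -> bool:
    """Treat anything that does not clearly start with YES as NO."""
    normalized = response.strip().strip('"').strip("'").strip().upper()
    return normalized.startswith("YES")


assert parse_yes_no("YES")
assert parse_yes_no(' "yes." ')
assert not parse_yes_no("No, it does not.")
```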
5 changes: 4 additions & 1 deletion dir_assistant/assistant/lite_llm_assistant.py
@@ -1,7 +1,7 @@
import sys

from colorama import Fore, Style
from litellm import completion
from litellm import completion, token_counter

from dir_assistant.assistant.git_assistant import GitAssistant

@@ -69,3 +69,6 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
sys.stdout.write(delta["content"])
sys.stdout.flush()
return output_message

def count_tokens(self, text):
return token_counter(model=self.lite_llm_model, messages=[{"role": "user", "content": text}])
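A quick standalone check of the `token_counter` call the new `count_tokens` relies on, assuming `litellm` is installed; the model name here is only an illustration, not the project's default.

```python
from litellm import token_counter

count = token_counter(
    model="gpt-4o-mini",  # placeholder model name for illustration
    messages=[{"role": "user", "content": "How many tokens is this?"}],
)
print(count)
```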
23 changes: 23 additions & 0 deletions dir_assistant/assistant/lite_llm_embed.py
@@ -0,0 +1,23 @@
from time import sleep

from litellm import embedding, token_counter

from dir_assistant.assistant.base_embed import BaseEmbed


class LiteLlmEmbed(BaseEmbed):
def __init__(self, lite_llm_embed_model, chunk_size=8192, delay=0):
self.lite_llm_model = lite_llm_embed_model
self.chunk_size = chunk_size
self.delay = delay

def create_embedding(self, text):
if self.delay:
sleep(self.delay)
return embedding(model=self.lite_llm_model, input=text, timeout=600)["data"][0]["embedding"]

def get_chunk_size(self):
return self.chunk_size

def count_tokens(self, text):
return token_counter(model=self.lite_llm_model, messages=[{"role": "user", "content": text}])
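A hypothetical usage of the new `LiteLlmEmbed` class, using the API-embedding defaults that config.py introduces later in this PR (`gemini/text-embedding-004`, 2048-token chunks). It assumes `GEMINI_API_KEY` is set in the environment and that network access is available.

```python
from dir_assistant.assistant.lite_llm_embed import LiteLlmEmbed

embed = LiteLlmEmbed("gemini/text-embedding-004", chunk_size=2048, delay=0)
vector = embed.create_embedding("def hello():\n    print('Hello, World!')")
print(len(vector), embed.get_chunk_size())
```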
3 changes: 3 additions & 0 deletions dir_assistant/assistant/llama_cpp_assistant.py
@@ -54,3 +54,6 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
sys.stdout.write(delta["content"])
sys.stdout.flush()
return output_message

def count_tokens(self, text):
return len(self.llm.tokenize(bytes(text, "utf-8")))
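A quick hypothetical check of the llama-cpp token counting above; the model path is a placeholder for a local GGUF file, and the `n_ctx` value matches the default this PR sets in config.py. Assumes `llama-cpp-python` is installed.

```python
from llama_cpp import Llama

llm = Llama(model_path="/path/to/model.gguf", n_ctx=10000, verbose=False)
print(len(llm.tokenize(bytes("Hello, World!", "utf-8"))))
```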
6 changes: 5 additions & 1 deletion dir_assistant/cli/config.py
@@ -20,6 +20,7 @@
],
"CONTEXT_FILE_RATIO": 0.9,
"ACTIVE_MODEL_IS_LOCAL": False,
"ACTIVE_EMBED_IS_LOCAL": False,
"OUTPUT_ACCEPTANCE_RETRIES": 1,
"USE_CGRAG": True,
"PRINT_CGRAG": False,
@@ -28,7 +29,7 @@
"EMBED_MODEL": "",
"LLM_MODEL": "",
"LLAMA_CPP_OPTIONS": {
"n_ctx": 9200,
"n_ctx": 10000,
"verbose": False,
},
"LLAMA_CPP_EMBED_OPTIONS": {
@@ -45,6 +46,9 @@
"LITELLM_CONTEXT_SIZE": 500000,
"LITELLM_MODEL_USES_SYSTEM_MESSAGE": False,
"LITELLM_PASS_THROUGH_CONTEXT_SIZE": False,
"LITELLM_EMBED_MODEL": "gemini/text-embedding-004",
"LITELLM_EMBED_CHUNK_SIZE": 2048,
"LITELLM_EMBED_REQUEST_DELAY": 0,
"LITELLM_API_KEYS": {
"GEMINI_API_KEY": "",
"OPENAI_API_KEY": "",
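To tie the new keys together, here is a hypothetical helper (not part of the PR) that switches a loaded config to API embeddings. The key names and `save_config` come from this diff; how `config_dict` is loaded, and the helper itself, are assumptions mirroring the pattern used in models.py below.

```python
from dir_assistant.cli.config import save_config

def enable_api_embedding(config_dict):
    """Point dir-assistant at an API embedding model instead of a local one."""
    section = config_dict["DIR_ASSISTANT"]
    section["ACTIVE_EMBED_IS_LOCAL"] = False
    section["LITELLM_EMBED_MODEL"] = "gemini/text-embedding-004"
    section["LITELLM_EMBED_CHUNK_SIZE"] = 2048
    section["LITELLM_EMBED_REQUEST_DELAY"] = 0  # seconds between embedding calls
    section["LITELLM_API_KEYS"]["GEMINI_API_KEY"] = "your-key-here"
    save_config(config_dict)
```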
5 changes: 3 additions & 2 deletions dir_assistant/cli/models.py
@@ -5,9 +5,9 @@
from dir_assistant.cli.config import get_file_path, save_config

MODELS_DEFAULT_EMBED = "nomic-embed-text-v1.5.Q5_K_M.gguf"
MODELS_DEFAULT_LLM = "Phi-3.1-mini-128k-instruct-Q5_K_L.gguf"
MODELS_DEFAULT_LLM = "QwQ-LCoT-7B-Instruct-Q4_0.gguf"
MODELS_DEFAULT_EMBED_URL = f"https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/{MODELS_DEFAULT_EMBED}?download=true"
MODELS_DEFAULT_LLM_URL = f"https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF/resolve/main/{MODELS_DEFAULT_LLM}?download=true"
MODELS_DEFAULT_LLM_URL = f"https://huggingface.co/bartowski/QwQ-LCoT-7B-Instruct-GGUF/resolve/main/{MODELS_DEFAULT_LLM}?download=true"


def open_directory(path):
@@ -37,6 +37,7 @@ def models_download_embed(args, config_dict):
config_dict["DIR_ASSISTANT"]["MODELS_PATH"], MODELS_DEFAULT_EMBED
)
run(["wget", "-O", model_path, MODELS_DEFAULT_EMBED_URL])
config_dict["DIR_ASSISTANT"]["ACTIVE_EMBED_IS_LOCAL"] = True
config_dict["DIR_ASSISTANT"]["EMBED_MODEL"] = MODELS_DEFAULT_EMBED
save_config(config_dict)
