From 629b3bb78c6b51c45269e1889bb3f024df106b44 Mon Sep 17 00:00:00 2001
From: Chase Adams
Date: Thu, 9 Jan 2025 11:22:06 -0600
Subject: [PATCH 1/4] initial testing of api embedding

---
 dir_assistant/assistant/base_assistant.py     |  2 --
 dir_assistant/assistant/git_assistant.py      |  1 -
 dir_assistant/assistant/lite_llm_assistant.py |  5 +++-
 dir_assistant/assistant/lite_llm_embed.py     | 23 +++++++++++++++++++
 .../assistant/llama_cpp_assistant.py          |  3 +++
 dir_assistant/cli/start.py                    |  1 -
 6 files changed, 30 insertions(+), 5 deletions(-)
 create mode 100644 dir_assistant/assistant/lite_llm_embed.py

diff --git a/dir_assistant/assistant/base_assistant.py b/dir_assistant/assistant/base_assistant.py
index e68d11e..4411d80 100644
--- a/dir_assistant/assistant/base_assistant.py
+++ b/dir_assistant/assistant/base_assistant.py
@@ -1,6 +1,4 @@
-import copy
 import sys
-from os import write
 
 import numpy as np
 from colorama import Fore, Style
diff --git a/dir_assistant/assistant/git_assistant.py b/dir_assistant/assistant/git_assistant.py
index 3f13047..47c0b77 100644
--- a/dir_assistant/assistant/git_assistant.py
+++ b/dir_assistant/assistant/git_assistant.py
@@ -1,6 +1,5 @@
 import os
 import sys
-import tempfile
 from colorama import Style, Fore
 from prompt_toolkit import prompt
 from dir_assistant.assistant.cgrag_assistant import CGRAGAssistant
diff --git a/dir_assistant/assistant/lite_llm_assistant.py b/dir_assistant/assistant/lite_llm_assistant.py
index 06c8320..aaf8d56 100644
--- a/dir_assistant/assistant/lite_llm_assistant.py
+++ b/dir_assistant/assistant/lite_llm_assistant.py
@@ -1,7 +1,7 @@
 import sys
 
 from colorama import Fore, Style
-from litellm import completion
+from litellm import completion, token_counter
 
 from dir_assistant.assistant.git_assistant import GitAssistant
 
@@ -69,3 +69,6 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
                 sys.stdout.write(delta["content"])
                 sys.stdout.flush()
         return output_message
+
+    def count_tokens(self, text):
+        return token_counter(model=self.lite_llm_model, messages=[text])
\ No newline at end of file
diff --git a/dir_assistant/assistant/lite_llm_embed.py b/dir_assistant/assistant/lite_llm_embed.py
new file mode 100644
index 0000000..621cd62
--- /dev/null
+++ b/dir_assistant/assistant/lite_llm_embed.py
@@ -0,0 +1,23 @@
+from time import sleep
+
+from litellm import embedding, token_counter
+
+from dir_assistant.assistant.base_embed import BaseEmbed
+
+
+class LiteLlmEmbed(BaseEmbed):
+    def __init__(self, lite_llm_model, chunk_size=8192, delay=0):
+        self.lite_llm_model = lite_llm_model
+        self.chunk_size = chunk_size
+        self.delay = delay
+
+    def create_embedding(self, text):
+        if self.delay:
+            sleep(self.delay)
+        return embedding(model=self.lite_llm_model, input=text, timeout=600)["data"][0]["embedding"]
+
+    def get_chunk_size(self):
+        return self.chunk_size
+
+    def count_tokens(self, text):
+        return token_counter(model=self.lite_llm_model, messages=[text])
\ No newline at end of file
diff --git a/dir_assistant/assistant/llama_cpp_assistant.py b/dir_assistant/assistant/llama_cpp_assistant.py
index 8af14be..4cc705f 100644
--- a/dir_assistant/assistant/llama_cpp_assistant.py
+++ b/dir_assistant/assistant/llama_cpp_assistant.py
@@ -54,3 +54,6 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
                 sys.stdout.write(delta["content"])
                 sys.stdout.flush()
         return output_message
+
+    def count_tokens(self, text):
+        return len(self.llm.tokenize(bytes(text, "utf-8")))
\ No newline at end of file
diff --git a/dir_assistant/cli/start.py b/dir_assistant/cli/start.py
index a7305c7..ef6e989 100644
--- a/dir_assistant/cli/start.py
+++ b/dir_assistant/cli/start.py
@@ -1,7 +1,6 @@
 import os
 import sys
 from colorama import Fore, Style
-from llama_cpp import Llama
 from prompt_toolkit import prompt
 from prompt_toolkit.history import InMemoryHistory
 from dir_assistant.assistant.file_watcher import start_file_watcher

From a83c06596b1cd73fe2bb9900e679fd6755e5f3c9 Mon Sep 17 00:00:00 2001
From: Chase Adams
Date: Thu, 9 Jan 2025 16:00:51 -0600
Subject: [PATCH 2/4] working api embedding

---
 README.md                                     | 12 ++--
 dir_assistant/assistant/base_assistant.py     | 29 +++++++---
 dir_assistant/assistant/cgrag_assistant.py    | 26 +++++----
 dir_assistant/assistant/git_assistant.py      |  3 +-
 dir_assistant/assistant/lite_llm_assistant.py |  2 +-
 dir_assistant/assistant/lite_llm_embed.py     |  6 +-
 dir_assistant/cli/config.py                   |  6 +-
 dir_assistant/cli/models.py                   |  5 +-
 dir_assistant/cli/start.py                    | 58 ++++++++++++++-----
 9 files changed, 98 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 0927a77..a17cf5c 100644
--- a/README.md
+++ b/README.md
@@ -18,18 +18,18 @@ and runs API LLMS using the also fantastic [LiteLLM](https://github.com/BerriAI/
 
 ## New Features
 
-* pipx support for Ubuntu 24.04
-* Automatically disable CGRAG for filesets smaller than 4x the LLM context
+* API embedding support with the new `ACTIVE_EMBED_IS_LOCAL = false` setting
+* Updated default local model to `QWQ-LCoT-7B-Instruct`
 
 ## Quickstart
 
 In this section are recipes to run `dir-assistant` in basic capacity to get you started quickly.
 
-### Quickstart with Local Default Model (Phi 3 128k)
+### Quickstart with Local Default Model
 
 To get started locally, you can download a default llm model. Default configuration with this model requires
-11GB of memory on most hardware or 8GB on nvidia GPUs due to flash attention availability (enabled by default).
-You will be able to adjust the configuration to fit higher or lower memory requirements. To run via CPU:
+8GB of memory on most hardware. You will be able to adjust the configuration to fit higher or lower memory
+requirements. To run via CPU:
 
 ```shell
 pip install dir-assistant
@@ -205,7 +205,7 @@ However, in most cases you will need to modify other options when changing APIs.
 
 ## Local LLM Model Download
 
-If you want to use a local LLM, you can download a low requirements default model (Phi 3 128k) with:
+If you want to use a local LLM, you can download a low requirements default model with:
 
 ```shell
 dir-assistant models download-llm
diff --git a/dir_assistant/assistant/base_assistant.py b/dir_assistant/assistant/base_assistant.py
index 4411d80..a9c00dd 100644
--- a/dir_assistant/assistant/base_assistant.py
+++ b/dir_assistant/assistant/base_assistant.py
@@ -21,20 +21,25 @@ def __init__(
         context_file_ratio,
         output_acceptance_retries
     ):
+        self.system_instructions = system_instructions
         self.embed = embed
         self.index = index
         self.chunks = chunks
         self.context_file_ratio = context_file_ratio
-        system_instructions_tokens = self.embed.count_tokens(system_instructions)
+        self.context_size = 8192
+        self.output_acceptance_retries = output_acceptance_retries
+
+    def initialize_history(self):
+        # This initialization occurs separately from the constructor because child classes need to initialize
+        # before count_tokens can be called.
+        system_instructions_tokens = self.count_tokens(self.system_instructions)
         self.chat_history = [
             {
                 "role": "system",
-                "content": system_instructions,
+                "content": self.system_instructions,
                 "tokens": system_instructions_tokens,
             }
         ]
-        self.context_size = 8192
-        self.output_acceptance_retries = output_acceptance_retries
 
     def call_completion(self, chat_history):
         # unimplemented on base class
@@ -44,12 +49,18 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
         # unimplemented on base class
         raise NotImplementedError
 
+    def count_tokens(self, text):
+        # unimplemented on base class
+        raise NotImplementedError
+
     def build_relevant_full_text(self, user_input):
         relevant_chunks = search_index(self.embed, self.index, user_input, self.chunks)
         relevant_full_text = ""
         chunk_total_tokens = 0
         for i, relevant_chunk in enumerate(relevant_chunks, start=1):
-            chunk_total_tokens += relevant_chunk["tokens"]
+            # Note: relevant_chunk["tokens"] is created with the embedding model, not the LLM, so it will
+            # not be accurate for the purposes of maximizing the context of the LLM.
+            chunk_total_tokens += self.count_tokens(relevant_chunk["text"] + "\n\n")
             if chunk_total_tokens >= self.context_size * self.context_file_ratio:
                 break
             relevant_full_text += relevant_chunk["text"] + "\n\n"
@@ -80,10 +91,10 @@ def cull_history(self):
         self.cull_history_list(self.chat_history)
 
     def cull_history_list(self, history_list):
-        sum_of_tokens = sum([message["tokens"] for message in history_list])
+        sum_of_tokens = sum([self.count_tokens(message["content"]) for message in history_list])
         while sum_of_tokens > self.context_size:
             history_list.pop(0)
-            sum_of_tokens = sum([message["tokens"] for message in history_list])
+            sum_of_tokens = sum([self.count_tokens(message["content"]) for message in history_list])
 
     def create_empty_history(self, role="assistant"):
         return {"role": role, "content": "", "tokens": 0}
@@ -92,7 +103,7 @@ def create_one_off_prompt_history(self, prompt):
         return [{
             "role": "user",
             "content": prompt,
-            "tokens": self.embed.count_tokens(prompt),
+            "tokens": self.count_tokens(prompt),
         }]
 
     def create_prompt(self, user_input):
@@ -161,7 +172,7 @@ def run_basic_chat_stream(self, user_input, relevant_full_text, write_to_stdout)
             sys.stdout.flush()
 
         # Add the completion to the chat history
-        output_history["tokens"] = self.embed.count_tokens(output_history["content"])
+        output_history["tokens"] = self.count_tokens(output_history["content"])
         self.chat_history.append(output_history)
         return output_history["content"]
diff --git a/dir_assistant/assistant/cgrag_assistant.py b/dir_assistant/assistant/cgrag_assistant.py
index 3c59ff3..04ef719 100644
--- a/dir_assistant/assistant/cgrag_assistant.py
+++ b/dir_assistant/assistant/cgrag_assistant.py
@@ -31,11 +31,10 @@ def __init__(
 
     def write_assistant_thinking_message(self):
         # Disable CGRAG if the fileset is smaller than 4x the LLM context
-        total_tokens = sum(chunk['tokens'] for chunk in self.chunks)
-        self.fileset_larger_than_4x_context = total_tokens > self.context_size * 4
+        # total_tokens = sum(chunk['tokens'] for chunk in self.chunks)
 
         # Display the assistant thinking message
-        if self.use_cgrag and self.print_cgrag and self.fileset_larger_than_4x_context:
+        if self.use_cgrag and self.print_cgrag:
             sys.stdout.write(
                 f"{Style.BRIGHT}{Fore.BLUE}\nCGRAG Guidance: \n\n{Style.RESET_ALL}"
             )
@@ -43,7 +42,7 @@ def write_assistant_thinking_message(self):
             sys.stdout.write(
                 f"{Style.BRIGHT}{Fore.GREEN}\nAssistant: \n\n{Style.RESET_ALL}"
             )
-        if self.use_cgrag and self.fileset_larger_than_4x_context:
+        if self.use_cgrag:
             sys.stdout.write(
                 f"{Style.BRIGHT}{Fore.WHITE}\r(generating contextual guidance...){Style.RESET_ALL}"
             )
@@ -59,7 +58,7 @@ def print_cgrag_output(self, cgrag_output):
             sys.stdout.write(
                 Style.BRIGHT
                 + Fore.WHITE
-                + f'\r{cgrag_output}\n'
+                + f'\r{cgrag_output}\n\n'
                 + Style.RESET_ALL
             )
             sys.stdout.write(
@@ -80,18 +79,18 @@ def create_cgrag_prompt(self, base_prompt):
 response will be used to create an LLM embedding that will be used in a RAG to find the appropriate files which
 are needed to answer the user prompt. There may be many files not currently included which have more relevant
 information, so your response must include the most important concepts and information required to accurately answer the user
-prompt. It is okay if the list is very long or short, but err on the side of a longer list so the RAG has more
-information to work with. If the prompt is referencing code, list specific class, function, and variable names.
+prompt. Keep the list length to around 20 items. If the prompt is referencing code, list specific class,
+function, and variable names as applicable to answering the user prompt.
 """
 
     def run_stream_processes(self, user_input, write_to_stdout):
-        prompt = self.create_prompt(user_input)
-        relevant_full_text = self.build_relevant_full_text(prompt)
-        if self.use_cgrag and self.fileset_larger_than_4x_context:
-            cgrag_prompt = self.create_cgrag_prompt(prompt)
+        if self.use_cgrag:
+            relevant_full_text = self.build_relevant_full_text(user_input)
+            cgrag_prompt = self.create_cgrag_prompt(user_input)
             cgrag_content = relevant_full_text + cgrag_prompt
             cgrag_history = copy.deepcopy(self.chat_history)
-            cgrag_history.append(self.create_user_history(cgrag_content, cgrag_content))
+            cgrag_prompt_history = self.create_user_history(cgrag_content, cgrag_content)
+            cgrag_history.append(cgrag_prompt_history)
             self.cull_history_list(cgrag_history)
             cgrag_generator = self.call_completion(cgrag_history)
             output_history = self.create_empty_history()
@@ -99,4 +98,7 @@ def run_stream_processes(self, user_input, write_to_stdout):
             relevant_full_text = self.build_relevant_full_text(output_history["content"])
             self.print_cgrag_output(output_history["content"])
             sys.stdout.flush()
+        else:
+            relevant_full_text = self.build_relevant_full_text(user_input)
+        prompt = self.create_prompt(user_input)
         return self.run_basic_chat_stream(prompt, relevant_full_text, write_to_stdout)
diff --git a/dir_assistant/assistant/git_assistant.py b/dir_assistant/assistant/git_assistant.py
index 47c0b77..7b0bb60 100644
--- a/dir_assistant/assistant/git_assistant.py
+++ b/dir_assistant/assistant/git_assistant.py
@@ -35,7 +35,7 @@ def create_prompt(self, user_input):
         else:
             # Ask the LLM if a diff commit is appropriate
             should_diff_output = self.run_one_off_completion(f"""Does the prompt below request changes to files?
-Respond only with "YES" or "NO". Do not respond with additional characters.
+Respond only with one word: "YES" or "NO". Do not respond with additional words or characters, only "YES" or "NO".
 
 User prompt:
 {user_input}
 """)
@@ -60,6 +60,7 @@ def create_prompt(self, user_input):
 /home/user/hello_project/hello_world.py
 if __name__ == "__main__":
     print("Hello, World!")
+
 Real response:
 """
         else:
diff --git a/dir_assistant/assistant/lite_llm_assistant.py b/dir_assistant/assistant/lite_llm_assistant.py
index aaf8d56..20c626c 100644
--- a/dir_assistant/assistant/lite_llm_assistant.py
+++ b/dir_assistant/assistant/lite_llm_assistant.py
@@ -71,4 +71,4 @@ def run_completion_generator(self, completion_output, output_message, write_to_s
         return output_message
 
     def count_tokens(self, text):
-        return token_counter(model=self.lite_llm_model, messages=[text])
\ No newline at end of file
+        return token_counter(model=self.lite_llm_model, messages=[{"role": "user", "content": text}])
\ No newline at end of file
diff --git a/dir_assistant/assistant/lite_llm_embed.py b/dir_assistant/assistant/lite_llm_embed.py
index 621cd62..d78f1e0 100644
--- a/dir_assistant/assistant/lite_llm_embed.py
+++ b/dir_assistant/assistant/lite_llm_embed.py
@@ -6,8 +6,8 @@
 
 
 class LiteLlmEmbed(BaseEmbed):
-    def __init__(self, lite_llm_model, chunk_size=8192, delay=0):
-        self.lite_llm_model = lite_llm_model
+    def __init__(self, lite_llm_embed_model, chunk_size=8192, delay=0):
+        self.lite_llm_model = lite_llm_embed_model
         self.chunk_size = chunk_size
         self.delay = delay
 
@@ -20,4 +20,4 @@ def get_chunk_size(self):
         return self.chunk_size
 
     def count_tokens(self, text):
-        return token_counter(model=self.lite_llm_model, messages=[text])
\ No newline at end of file
+        return token_counter(model=self.lite_llm_model, messages=[{"role": "user", "content": text}])
\ No newline at end of file
diff --git a/dir_assistant/cli/config.py b/dir_assistant/cli/config.py
index aca1753..a455c27 100644
--- a/dir_assistant/cli/config.py
+++ b/dir_assistant/cli/config.py
@@ -20,6 +20,7 @@
         ],
         "CONTEXT_FILE_RATIO": 0.9,
         "ACTIVE_MODEL_IS_LOCAL": False,
+        "ACTIVE_EMBED_IS_LOCAL": False,
         "OUTPUT_ACCEPTANCE_RETRIES": 1,
         "USE_CGRAG": True,
         "PRINT_CGRAG": False,
@@ -28,7 +29,7 @@
         "EMBED_MODEL": "",
         "LLM_MODEL": "",
         "LLAMA_CPP_OPTIONS": {
-            "n_ctx": 9200,
+            "n_ctx": 10000,
             "verbose": False,
         },
         "LLAMA_CPP_EMBED_OPTIONS": {
@@ -45,6 +46,9 @@
         "LITELLM_CONTEXT_SIZE": 500000,
         "LITELLM_MODEL_USES_SYSTEM_MESSAGE": False,
         "LITELLM_PASS_THROUGH_CONTEXT_SIZE": False,
+        "LITELLM_EMBED_MODEL": "gemini/text-embedding-004",
+        "LITELLM_EMBED_CHUNK_SIZE": 2048,
+        "LITELLM_EMBED_REQUEST_DELAY": 0,
         "LITELLM_API_KEYS": {
             "GEMINI_API_KEY": "",
             "OPENAI_API_KEY": "",
diff --git a/dir_assistant/cli/models.py b/dir_assistant/cli/models.py
index 46d025c..a5af830 100644
--- a/dir_assistant/cli/models.py
+++ b/dir_assistant/cli/models.py
@@ -5,9 +5,9 @@
 from dir_assistant.cli.config import get_file_path, save_config
 
 MODELS_DEFAULT_EMBED = "nomic-embed-text-v1.5.Q5_K_M.gguf"
-MODELS_DEFAULT_LLM = "Phi-3.1-mini-128k-instruct-Q5_K_L.gguf"
+MODELS_DEFAULT_LLM = "QwQ-LCoT-7B-Instruct-Q4_0.gguf"
 MODELS_DEFAULT_EMBED_URL = f"https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF/resolve/main/{MODELS_DEFAULT_EMBED}?download=true"
-MODELS_DEFAULT_LLM_URL = f"https://huggingface.co/bartowski/Phi-3.1-mini-128k-instruct-GGUF/resolve/main/{MODELS_DEFAULT_LLM}?download=true"
+MODELS_DEFAULT_LLM_URL = f"https://huggingface.co/bartowski/QwQ-LCoT-7B-Instruct-GGUF/resolve/main/{MODELS_DEFAULT_LLM}?download=true"
 
 
 def open_directory(path):
@@ -37,6 +37,7 @@ def models_download_embed(args, config_dict):
         config_dict["DIR_ASSISTANT"]["MODELS_PATH"], MODELS_DEFAULT_EMBED
     )
     run(["wget", "-O", model_path, MODELS_DEFAULT_EMBED_URL])
+    config_dict["DIR_ASSISTANT"]["ACTIVE_EMBED_IS_LOCAL"] = True
     config_dict["DIR_ASSISTANT"]["EMBED_MODEL"] = MODELS_DEFAULT_EMBED
     save_config(config_dict)
 
diff --git a/dir_assistant/cli/start.py b/dir_assistant/cli/start.py
index ef6e989..915ff22 100644
--- a/dir_assistant/cli/start.py
+++ b/dir_assistant/cli/start.py
@@ -6,6 +6,7 @@
 from dir_assistant.assistant.file_watcher import start_file_watcher
 from dir_assistant.assistant.index import create_file_index
 from dir_assistant.assistant.lite_llm_assistant import LiteLLMAssistant
+from dir_assistant.assistant.lite_llm_embed import LiteLlmEmbed
 from dir_assistant.assistant.llama_cpp_assistant import LlamaCppAssistant
 from dir_assistant.assistant.llama_cpp_embed import LlamaCppEmbed
 from dir_assistant.cli.config import get_file_path
@@ -38,17 +39,22 @@ def display_startup_art(commit_to_git):
     sys.stdout.write("\n")
 
 
 def start(args, config_dict):
-    # Load settings
+    # Main settings
+    active_model_is_local = config_dict["ACTIVE_MODEL_IS_LOCAL"]
+    active_embed_is_local = config_dict["ACTIVE_EMBED_IS_LOCAL"]
+    context_file_ratio = config_dict["CONTEXT_FILE_RATIO"]
+    system_instructions = config_dict["SYSTEM_INSTRUCTIONS"]
+
+    # Llama.cpp settings
     llm_model_file = get_file_path(config_dict["MODELS_PATH"], config_dict["LLM_MODEL"])
     embed_model_file = get_file_path(
         config_dict["MODELS_PATH"], config_dict["EMBED_MODEL"]
     )
-    context_file_ratio = config_dict["CONTEXT_FILE_RATIO"]
-    system_instructions = config_dict["SYSTEM_INSTRUCTIONS"]
     llama_cpp_options = config_dict["LLAMA_CPP_OPTIONS"]
     llama_cpp_embed_options = config_dict["LLAMA_CPP_EMBED_OPTIONS"]
     llama_cpp_completion_options = config_dict["LLAMA_CPP_COMPLETION_OPTIONS"]
-    active_model_is_local = config_dict["ACTIVE_MODEL_IS_LOCAL"]
+
+    # LiteLLM settings
     lite_llm_model = config_dict["LITELLM_MODEL"]
     lite_llm_context_size = config_dict["LITELLM_CONTEXT_SIZE"]
     lite_llm_model_uses_system_message = config_dict[
@@ -57,17 +63,17 @@ def start(args, config_dict):
     lite_llm_pass_through_context_size = config_dict[
         "LITELLM_PASS_THROUGH_CONTEXT_SIZE"
     ]
+    lite_llm_embed_model = config_dict["LITELLM_EMBED_MODEL"]
+    lite_llm_embed_chunk_size = config_dict["LITELLM_EMBED_CHUNK_SIZE"]
+    lite_llm_embed_request_delay = float(config_dict["LITELLM_EMBED_REQUEST_DELAY"])
+
+    # Assistant settings
     use_cgrag = config_dict["USE_CGRAG"]
     print_cgrag = config_dict["PRINT_CGRAG"]
     output_acceptance_retries = config_dict["OUTPUT_ACCEPTANCE_RETRIES"]
     commit_to_git = config_dict["COMMIT_TO_GIT"]
 
-    if config_dict["EMBED_MODEL"] == "":
-        print(
-            """You must specify EMBED_MODEL. Use 'dir-assistant config open' and \
-see readme for more information. Exiting..."""
-        )
-        exit(1)
+    # Check for basic missing model configs
     if active_model_is_local:
         if config_dict["LLM_MODEL"] == "":
             print(
@@ -82,15 +88,38 @@ def start(args, config_dict):
         )
         exit(1)
 
+    # Check for basic missing embedding model configs
+    if active_embed_is_local:
+        if config_dict["EMBED_MODEL"] == "":
+            print(
+                """You must specify EMBED_MODEL. Use 'dir-assistant config open' and \
+see readme for more information. Exiting..."""
+            )
+            exit(1)
+    elif lite_llm_embed_model == "":
+        print(
+            """You must specify LITELLM_EMBED_MODEL. Use 'dir-assistant config open' and \
+see readme for more information. Exiting..."""
+        )
+        exit(1)
+
     ignore_paths = args.i__ignore if args.i__ignore else []
     ignore_paths.extend(config_dict["GLOBAL_IGNORES"])
 
     # Initialize the embedding model
     print(f"{Fore.LIGHTBLACK_EX}Loading embedding model...{Style.RESET_ALL}")
-    embed = LlamaCppEmbed(
-        model_path=embed_model_file, embed_options=llama_cpp_embed_options
-    )
-    embed_chunk_size = embed.get_chunk_size()
+    if active_embed_is_local:
+        embed = LlamaCppEmbed(
+            model_path=embed_model_file, embed_options=llama_cpp_embed_options
+        )
+        embed_chunk_size = embed.get_chunk_size()
+    else:
+        embed = LiteLlmEmbed(
+            lite_llm_embed_model=lite_llm_embed_model,
+            chunk_size=lite_llm_embed_chunk_size,
+            delay=lite_llm_embed_request_delay,
+        )
+        embed_chunk_size = lite_llm_embed_chunk_size
 
     # Create the file index
     print(f"{Fore.LIGHTBLACK_EX}Creating file embeddings and index...{Style.RESET_ALL}")
@@ -135,6 +164,7 @@ def start(args, config_dict):
         print_cgrag,
         commit_to_git,
     )
+    llm.initialize_history()
 
     # Start file watcher
     watcher = start_file_watcher(

From d6c6796abbbfb81af7361c84711ee3ffa70936d7 Mon Sep 17 00:00:00 2001
From: Chase Adams
Date: Thu, 9 Jan 2025 16:08:24 -0600
Subject: [PATCH 3/4] readme fixes

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index a17cf5c..a144a4a 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ and runs API LLMS using the also fantastic [LiteLLM](https://github.com/BerriAI/
 
 * API embedding support with the new `ACTIVE_EMBED_IS_LOCAL = false` setting
 * Updated default local model to `QWQ-LCoT-7B-Instruct`
+* Improved prompt robustness and efficiency
 
 ## Quickstart
 
@@ -359,7 +360,7 @@ please see [CONTRIBUTORS.md](CONTRIBUTORS.md).
 
 ## Limitations
 
-- Only tested on Ubuntu 22.04. Please let us know if you run it successfully on other platforms by submitting an issue.
+- Only tested on Ubuntu 22.04 and 24.04. Please let us know if you run it successfully on other platforms by submitting an issue.
 - Dir-assistant only detects and reads text files at this time.
 
 ## Todos
@@ -373,8 +374,8 @@ please see [CONTRIBUTORS.md](CONTRIBUTORS.md).
 - ~~Single-step pip install~~
 - ~~Model download~~
 - ~~Commit to git~~
+- ~~API Embedding models~~
 - Web search
-- API Embedding models
 - Simple mode for better compatibility with external script automations
 
 ## Additional Credits

From 69549eb0921470298e53ac69299e279b6a47f1ff Mon Sep 17 00:00:00 2001
From: Chase Adams
Date: Thu, 9 Jan 2025 16:10:39 -0600
Subject: [PATCH 4/4] version bump

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 577219b..9a977f6 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name="dir-assistant",
-    version="1.1.2",
+    version="1.2.0",
     description="Chat with your current directory's files using a local or API LLM.",
     long_description=README,
     long_description_content_type="text/markdown",