RAFT Enhancements: Improved robustness, logging, checkpointing, threading, Llama support, Azure auth and eval #604

Merged · 81 commits · Aug 27, 2024

Commits
44e2ca7
Resolve default logging conf file relative to logconf.py
cedricvidal May 10, 2024
bc24be6
Format support for eval format
cedricvidal May 13, 2024
e12174c
RAFT Fix wrong chunk being checkpointed
cedricvidal May 14, 2024
3aec2a9
RAFT format.py Fix jsonl input type
cedricvidal May 14, 2024
b7ad22b
RAFT format.py more logs
cedricvidal May 14, 2024
20b3349
RAFT format.py Field names can be customized
cedricvidal May 14, 2024
6a4239e
RAFT format.py support for eval format
cedricvidal May 14, 2024
79ef352
RAFT format.py renamed answer to gold_answer
cedricvidal May 14, 2024
7cb803f
RAFT eval notebook works end to end
cedricvidal May 15, 2024
6222cb5
RAFT raft.py add --checkpoint-size arg
cedricvidal May 15, 2024
98d01e3
RAFT raft.py fix chunk and progress logging
cedricvidal May 15, 2024
26dbaa4
RAFT raft.py saving checkpoints in a single directory
cedricvidal May 15, 2024
de2bc2e
RAFT raft.py disable datasets progress bars when saving checkpoints
cedricvidal May 15, 2024
56f469b
RAFT raft.py add llama template
cedricvidal May 15, 2024
ede07e0
RAFT raft.py llama prompt test 1
cedricvidal May 15, 2024
ade26e9
RAFT raft.py fixed missed change to formatter
cedricvidal May 15, 2024
b7422c7
RAFT raft.py specific template for llama to generate questions
cedricvidal May 15, 2024
2c5ac21
RAFT format.py stop keyword
cedricvidal May 16, 2024
b509584
RAFT eval.py stop keyword
cedricvidal May 16, 2024
3a125f5
RAFT raft.py gpt qa template formatting
cedricvidal May 16, 2024
b48e390
RAFT format.py final_answer
cedricvidal May 16, 2024
c550d7a
RAFT raft.py save chunks to checkpoints folder
cedricvidal May 17, 2024
2de88b9
RAFT raft.py moved chunks checkpoint support to method
cedricvidal May 17, 2024
249a375
RAFT raft.py rename chunks func
cedricvidal May 18, 2024
a587d25
RAFT format.py more logging
cedricvidal May 18, 2024
a29ac6f
RAFT raft.py Moved checkpointing logic
cedricvidal May 18, 2024
c32a199
RAFT raft.py more checkpoint refactoring
cedricvidal May 18, 2024
3a09923
RAFT raft.py relying on checkpoint directories instead of state file
cedricvidal May 18, 2024
8b65cb6
RAFT raft.py removed global ds
cedricvidal May 18, 2024
3f214ee
RAFT raft.py multi threading
cedricvidal May 18, 2024
9d5de71
RAFT raft.py --workers param
cedricvidal May 18, 2024
fc38956
RAFT raft.py ready to use
cedricvidal May 18, 2024
874fe06
RAFT raft.py tqdm for chunking
cedricvidal May 19, 2024
369d5c8
RAFT raft.py removed --fast param to simplify code
cedricvidal May 19, 2024
9f3c15f
RAFT raft.py formatting
cedricvidal May 19, 2024
9abc8d4
RAFT raft.py caching questions with HF's map
cedricvidal May 19, 2024
d2ed535
RAFT raft.py cot_answer multi threading
cedricvidal May 19, 2024
20c3636
RAFT raft.py renaming
cedricvidal May 19, 2024
82bddfa
RAFT raft.py cot answers now checkpointed
cedricvidal May 19, 2024
d2b699f
RAFT raft.py tuned completion max tokens
cedricvidal May 19, 2024
18370d2
RAFT raft.py checkpointing questions and answers in the same chunk loop
cedricvidal May 19, 2024
0aeeaa3
RAFT ignore output dir
cedricvidal May 19, 2024
f144bbc
RAFT ignore datasets dir
cedricvidal May 19, 2024
357002c
RAFT raft.py moved checkpointing stuff and removed hf fingerprint code
cedricvidal May 19, 2024
e1cec66
RAFT raft.py ChatCompleter stats
cedricvidal May 19, 2024
a769582
RAFT raft.py tok/s
cedricvidal May 19, 2024
aec786e
RAFT raft.py renamed UsageStats
cedricvidal May 20, 2024
5289f10
RAFT raft.py setting initial in tqdm to avoid skewing the stats
cedricvidal May 20, 2024
5e19c15
RAFT update openai version
cedricvidal May 20, 2024
04950b1
RAFT raft.py avg tok/s
cedricvidal May 20, 2024
d1cf608
RAFT raft.py --auto-clean-checkpoints
cedricvidal May 20, 2024
68515fb
RAFT format.py filter out empty rows and added descriptions to HF map…
cedricvidal May 20, 2024
6827862
RAFT format.py EvalDatasetFormatter
cedricvidal May 21, 2024
fba6cb2
RAFT raft.py Final log
cedricvidal May 21, 2024
7b753dd
RAFT raft.py more logging at the end
cedricvidal May 21, 2024
c13660a
RAFT eval.py retry and --workers param
cedricvidal May 22, 2024
44c11ab
RAFT client_utils.py stats support for completion
cedricvidal May 23, 2024
965c2e6
RAFT eval.py more robust
cedricvidal May 23, 2024
dced542
RAFT raft.py tqdm postfix
cedricvidal May 23, 2024
f6039ea
RAFT eval.py logging retry stats
cedricvidal May 23, 2024
d767416
RAFT eval.py fixed main thread silent fail in case of exception
cedricvidal May 23, 2024
fe2dfbe
RAFT format.py 'answer' column is optional
cedricvidal May 23, 2024
47c23c5
RAFT Display PDFs in notebooks
cedricvidal May 29, 2024
4c9f963
RAFT Support for chat and completion models
cedricvidal Jun 18, 2024
70723f0
Ignore config.json
cedricvidal Jul 18, 2024
219a256
Updated README with new CLI parameters
cedricvidal Jul 19, 2024
87003af
Display default values in help + some help cleanup
cedricvidal Jul 19, 2024
af53b91
Display default values in format.py help
cedricvidal Jul 19, 2024
7535756
Moving notebooks to separate repo
cedricvidal Jul 19, 2024
57dc942
Fixed response Choice format
cedricvidal Jul 30, 2024
212476b
Logging resolved prefixed OPENAI env vars
cedricvidal Jul 31, 2024
52274f7
eval.py now takes the env prefix as param
cedricvidal Jul 31, 2024
fc8a645
Skipping chunks that raise content safety alerts
cedricvidal Aug 6, 2024
45a825e
More content filtering support in the question generation step
cedricvidal Aug 7, 2024
e58ca85
Fixed chat format and added default chat system prompt
cedricvidal Aug 7, 2024
8acf9c0
Fixed bug when format is completion
cedricvidal Aug 7, 2024
4a5c5cc
Support for Azure OpenAI Keyless and Managed Identity authentication
cedricvidal Aug 23, 2024
d99bbdb
Early stopping after QA threshold is met
cedricvidal Aug 25, 2024
e95d6f4
Trimming the generated dataset to qa_threshold if set
cedricvidal Aug 25, 2024
f1b84f6
Using questions to track progress if qa_threshold is set
cedricvidal Aug 25, 2024
b2b41f8
Merge branch 'main' into upstream-merge-prep
ShishirPatil Aug 27, 2024
1 change: 1 addition & 0 deletions raft/.gitignore
@@ -1 +1,2 @@
.venv/
output/
32 changes: 29 additions & 3 deletions raft/README.md
@@ -21,11 +21,13 @@ pip install -r requirements.txt
```

Arguments:
- `--datapath` - the path at which the document is located
- `--datapath` - if a file, the path at which the document is located; if a folder, the folder from which all documents are loaded
- `--output` - the path at which to save the dataset
- `--output-format` - the format of the output dataset. Defaults to `hf` for HuggingFace. Can be one of `hf`, `completion`, `chat`.
- `--output-format` - the format of the output dataset. Defaults to `hf` for HuggingFace. Can be one of `hf`, `completion`, `chat`, `eval`.
- `--output-type` - the type of the output dataset file. Defaults to `jsonl`. Can be one of `jsonl`, `parquet`.
- `--output-chat-system-prompt` - The system prompt to use when the output format is `chat`. Optional.
- `--output-completion-prompt-column` - The column (json field name) for the `prompt` / `instruction` when using the `completion` output format. Defaults to `prompt`.
- `--output-completion-completion-column` - The column (json field name) for the `completion` when using the `completion` output format. Defaults to `completion`.
- `--distractors` - the number of distractor documents to include per data point / triplet
- `--doctype` - the type of the document, must be one of the accepted doctypes
- currently accepted doctypes: `pdf`, `txt`, `json`, `api`
@@ -37,8 +39,11 @@ Arguments:
- `--openai_key` - your OpenAI key used to make queries to GPT-3.5 or GPT-4
- `--embedding-model` - The embedding model to use to encode documents chunks. Defaults to `text-embedding-ada-002`.
- `--completion-model` - The model to use to generate questions and answers. Defaults to `gpt-4`.
- `--fast` - Fast mode flag. By default, this flag is not included and the script runs in safe mode, where it saves checkpoint datasets, allowing the script to recover and continue where it left off in the case of an interruption. Include this flag to run RAFT without recovery.
- `--system-prompt-key` - The system prompt key to use to generate the dataset. Defaults to `gpt`. Can be one of `gpt`, `llama`.
- `--workers` - The number of worker threads to use to generate the dataset. Defaults to 2.
- `--auto-clean-checkpoints` - Whether to auto clean the checkpoints after the dataset is generated. Defaults to `false`.

*Note*: The `--fast` mode flag has been removed; checkpointing is now always active.
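
For illustration only, here is one way the generation script might be invoked with the newer options. The input folder, output path, prompt text, and model name below are placeholders rather than defaults, and credentials are assumed to come from environment variables or a `.env` file:

```
python3 raft.py \
  --datapath ./sample_docs \
  --doctype pdf \
  --output ./output \
  --output-format chat \
  --output-chat-system-prompt "You are a helpful domain assistant." \
  --completion-model gpt-4 \
  --embedding-model text-embedding-ada-002 \
  --system-prompt-key llama \
  --workers 4
```

Because checkpointing is always active, re-running the same command after an interruption resumes from the last saved checkpoint.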

## Usage

@@ -219,6 +224,27 @@ python3 format.py --input output/data-00000-of-00001.arrow --output output.compl

```
python3 format.py --help

usage: format.py [-h] --input INPUT [--input-type {arrow,jsonl}] --output OUTPUT --output-format {hf,completion,chat,eval} [--output-type {parquet,jsonl}] [--output-chat-system-prompt OUTPUT_CHAT_SYSTEM_PROMPT] [--output-completion-prompt-column OUTPUT_COMPLETION_PROMPT_COLUMN] [--output-completion-completion-column OUTPUT_COMPLETION_COMPLETION_COLUMN] [--output-completion-stop OUTPUT_COMPLETION_STOP]

options:
-h, --help show this help message and exit
--input INPUT Input HuggingFace dataset file (default: None)
--input-type {arrow,jsonl}
Format of the input dataset. Defaults to arrow. (default: arrow)
--output OUTPUT Output file (default: None)
--output-format {hf,completion,chat,eval}
Format to convert the dataset to (default: None)
--output-type {parquet,jsonl}
Type to export the dataset to. Defaults to jsonl. (default: jsonl)
--output-chat-system-prompt OUTPUT_CHAT_SYSTEM_PROMPT
The system prompt to use when the output format is chat (default: None)
--output-completion-prompt-column OUTPUT_COMPLETION_PROMPT_COLUMN
The prompt column name to use for the completion format (default: prompt)
--output-completion-completion-column OUTPUT_COMPLETION_COMPLETION_COLUMN
The completion column name to use for the completion format (default: completion)
--output-completion-stop OUTPUT_COMPLETION_STOP
The stop keyword to use for the completion format (default: <STOP>)
```

**Note**: If fine-tuning a chat model, you need to use `--output-format chat` and can optionally add the `--output-chat-system-prompt` parameter to configure the system prompt included in the dataset.
77 changes: 77 additions & 0 deletions raft/checkpointing.py
@@ -0,0 +1,77 @@
from dataclasses import dataclass
from pathlib import Path
from typing import List
from datasets import Dataset, concatenate_datasets
import logging
import shutil

logger = logging.getLogger("raft")

@dataclass
class Checkpoint:
path: Path
num: int

def load(self) -> Dataset:
return Dataset.load_from_disk(self.path)

def __lt__(self, other: 'Checkpoint') -> bool:
return self.num < other.num

def __eq__(self, other: 'Checkpoint') -> bool:
return self.num == other.num

def __hash__(self) -> int:
return hash(self.num)

class Checkpointing:

def __init__(self, checkpoints_dir: Path) -> None:
self.checkpoints_dir = checkpoints_dir

def missing_checkpoints(self, num) -> List[int]:
return [n for n in range(0, num) if not (self.checkpoints_dir / f"checkpoint-{n}").exists()]

def save_checkpoint(self, ds: Dataset, num: int):
checkpoint_path = self.checkpoints_dir / ("checkpoint-" + str(num))
ds.save_to_disk(checkpoint_path)

def load_checkpoint(self, num: int):
checkpoint_path = self.checkpoints_dir / ("checkpoint-" + str(num))
if checkpoint_path.exists():
return Dataset.load_from_disk(checkpoint_path)
return None

def get_checkpoints(self) -> List[Checkpoint]:
checkpoints = []
if not self.checkpoints_dir.exists():
return checkpoints
for dir_path in self.checkpoints_dir.iterdir():
if dir_path.is_dir() and dir_path.name.startswith("checkpoint-"):
num = int(dir_path.name.split("-")[1])
checkpoints.append(Checkpoint(dir_path, num))
return checkpoints

def has_checkpoints(self) -> bool:
return len(self.get_checkpoints()) > 0

def collect_checkpoints(self) -> Dataset:
ds_list = list([checkpoint.load() for checkpoint in self.get_checkpoints()])
ds = concatenate_datasets(ds_list)
return ds

def delete_checkpoints(self):
shutil.rmtree(self.checkpoints_dir)

def checkpointed(checkpointing: Checkpointing):
def wrapped(func):
def wrapper(chunk_id, *args, **kwargs):
ds = checkpointing.load_checkpoint(chunk_id)
if ds:
return ds
ds = func(chunk_id=chunk_id, *args, **kwargs)
if ds.num_rows > 0:
checkpointing.save_checkpoint(ds, chunk_id)
return ds
return wrapper
return wrapped
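
To make the recovery flow concrete, here is a minimal usage sketch of `Checkpointing` and the `checkpointed` decorator from the file above. The checkpoint directory, the `build_qa_for_chunk` helper, and the dataset contents are hypothetical; in `raft.py` the wrapped function would call the completion model.

```python
from pathlib import Path
from datasets import Dataset
from checkpointing import Checkpointing, checkpointed

checkpointing = Checkpointing(Path("output/checkpoints"))  # placeholder location

@checkpointed(checkpointing)
def build_qa_for_chunk(chunk_id: int) -> Dataset:
    # Hypothetical worker: returns one row per chunk for illustration.
    return Dataset.from_dict({"chunk_id": [chunk_id], "question": [f"Q for chunk {chunk_id}"]})

# Only chunks whose checkpoint directory is missing are processed;
# already-checkpointed chunks are loaded from disk by the decorator.
for chunk_id in checkpointing.missing_checkpoints(4):
    build_qa_for_chunk(chunk_id=chunk_id)

# Concatenate every checkpoint-N dataset into the final dataset.
ds = checkpointing.collect_checkpoints()
print(ds.num_rows)
```

Re-running the loop after an interruption skips completed chunks, which is what replaces the old `--fast` / state-file mechanism.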
128 changes: 119 additions & 9 deletions raft/client_utils.py
@@ -1,24 +1,29 @@
from abc import ABC
from typing import Any
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings, AzureOpenAIEmbeddings
from openai import AzureOpenAI, OpenAI
import logging
from env_config import read_env_config, set_env
from os import environ
from os import environ, getenv
import time
from threading import Lock
from azure.identity import DefaultAzureCredential, ManagedIdentityCredential
from azure.identity import get_bearer_token_provider

logger = logging.getLogger("client_utils")

load_dotenv() # take environment variables from .env.
logger = logging.getLogger("client_utils")

def build_openai_client(**kwargs: Any) -> OpenAI:
def build_openai_client(env_prefix : str = "COMPLETION", **kwargs: Any) -> OpenAI:
"""
Build OpenAI client based on the environment variables.
"""

env = read_env_config("COMPLETION")
kwargs = _remove_empty_values(kwargs)
env = read_env_config(env_prefix)
with set_env(**env):
if is_azure():
client = AzureOpenAI(**kwargs)
auth_args = _get_azure_auth_client_args()
client = AzureOpenAI(**auth_args, **kwargs)
else:
client = OpenAI(**kwargs)
return client
@@ -28,19 +33,124 @@ def build_langchain_embeddings(**kwargs: Any) -> OpenAIEmbeddings:
Build OpenAI embeddings client based on the environment variables.
"""

kwargs = _remove_empty_values(kwargs)
env = read_env_config("EMBEDDING")

with set_env(**env):
if is_azure():
client = AzureOpenAIEmbeddings(**kwargs)
auth_args = _get_azure_auth_client_args()
client = AzureOpenAIEmbeddings(**auth_args, **kwargs)
else:
client = OpenAIEmbeddings(**kwargs)
return client

def _remove_empty_values(d: dict) -> dict:
return {k: v for k, v in d.items() if v is not None}

def _get_azure_auth_client_args() -> dict:
"""Handle Azure OpenAI Keyless, Managed Identity and Key based authentication
https://techcommunity.microsoft.com/t5/microsoft-developer-community/using-keyless-authentication-with-azure-openai/ba-p/4111521
"""
client_args = {}
if getenv("AZURE_OPENAI_KEY"):
logger.info("Using Azure OpenAI Key based authentication")
client_args["api_key"] = getenv("AZURE_OPENAI_KEY")
else:
if client_id := getenv("AZURE_OPENAI_CLIENT_ID"):
# Authenticate using a user-assigned managed identity on Azure
logger.info("Using Azure OpenAI Managed Identity Keyless authentication")
azure_credential = ManagedIdentityCredential(client_id=client_id)
else:
# Authenticate using the default Azure credential chain
logger.info("Using Azure OpenAI Default Azure Credential Keyless authentication")
azure_credential = DefaultAzureCredential()

client_args["azure_ad_token_provider"] = get_bearer_token_provider(
azure_credential, "https://cognitiveservices.azure.com/.default")
client_args["api_version"] = getenv("AZURE_OPENAI_API_VERSION") or "2024-02-15-preview"
client_args["azure_endpoint"] = getenv("AZURE_OPENAI_ENDPOINT")
client_args["azure_deployment"] = getenv("AZURE_OPENAI_DEPLOYMENT")
return client_args

def is_azure():
azure = "AZURE_OPENAI_ENDPOINT" in environ or "AZURE_OPENAI_KEY" in environ or "AZURE_OPENAI_AD_TOKEN" in environ
if azure:
logger.debug("Using Azure OpenAI environment variables")
else:
logger.debug("Using OpenAI environment variables")
return azure

def safe_min(a: Any, b: Any) -> Any:
if a is None:
return b
if b is None:
return a
return min(a, b)

def safe_max(a: Any, b: Any) -> Any:
if a is None:
return b
if b is None:
return a
return max(a, b)

class UsageStats:
def __init__(self) -> None:
self.start = time.time()
self.completion_tokens = 0
self.prompt_tokens = 0
self.total_tokens = 0
self.end = None
self.duration = 0
self.calls = 0

def __add__(self, other: 'UsageStats') -> 'UsageStats':
stats = UsageStats()
stats.start = safe_min(self.start, other.start)
stats.end = safe_max(self.end, other.end)
stats.completion_tokens = self.completion_tokens + other.completion_tokens
stats.prompt_tokens = self.prompt_tokens + other.prompt_tokens
stats.total_tokens = self.total_tokens + other.total_tokens
stats.duration = self.duration + other.duration
stats.calls = self.calls + other.calls
return stats

class StatsCompleter(ABC):
def __init__(self, create_func):
self.create_func = create_func
self.stats = None
self.lock = Lock()

def __call__(self, *args: Any, **kwds: Any) -> Any:
response = self.create_func(*args, **kwds)
self.lock.acquire()
try:
if not self.stats:
self.stats = UsageStats()
self.stats.completion_tokens += response.usage.completion_tokens
self.stats.prompt_tokens += response.usage.prompt_tokens
self.stats.total_tokens += response.usage.total_tokens
self.stats.calls += 1
return response
finally:
self.lock.release()

def get_stats_and_reset(self) -> UsageStats:
self.lock.acquire()
try:
end = time.time()
stats = self.stats
if stats:
stats.end = end
stats.duration = end - self.stats.start
self.stats = None
return stats
finally:
self.lock.release()

class ChatCompleter(StatsCompleter):
def __init__(self, client):
super().__init__(client.chat.completions.create)

class CompletionsCompleter(StatsCompleter):
def __init__(self, client):
super().__init__(client.completions.create)
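
A hedged sketch of how these pieces compose: build a client from environment variables (key-based, or keyless on Azure when no `AZURE_OPENAI_KEY` is set), wrap its chat endpoint in `ChatCompleter` to accumulate token usage across threads, then read and reset the counters. The model name, message, and print statements are illustrative assumptions; `build_openai_client`, `ChatCompleter`, and `get_stats_and_reset` come from the file above.

```python
from client_utils import build_openai_client, ChatCompleter

# Resolves OPENAI_* / AZURE_OPENAI_* variables with the COMPLETION prefix applied.
client = build_openai_client(env_prefix="COMPLETION")
chat_completer = ChatCompleter(client)

# Drop-in replacement for client.chat.completions.create, with usage tracking.
response = chat_completer(
    model="gpt-4",  # assumed model / Azure deployment name; adjust for your setup
    messages=[{"role": "user", "content": "Generate one question about RAFT."}],
)
print(response.choices[0].message.content)

# Thread-safe snapshot of accumulated usage since the last reset.
stats = chat_completer.get_stats_and_reset()
if stats:
    print(f"{stats.calls} calls, {stats.total_tokens} tokens in {stats.duration:.1f}s")
```

In `raft.py` this is what feeds the tok/s figures shown in the tqdm postfix and the final log.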
19 changes: 19 additions & 0 deletions raft/env_config.py
@@ -1,12 +1,30 @@
import contextlib
import os
import logging

logger = logging.getLogger("env_config")

# List of environment variables prefixes that are allowed to be used for configuration.
env_prefix_whitelist = [
'OPENAI',
'AZURE_OPENAI'
]

def _obfuscate(secret):
l = len(secret)
return '.' * (l - 4) + secret[-4:]

def _log_env(use_prefix: str, env: dict):
"""
Logs each name value pair of the given environment. If the name indicates that it might store a secret such as an API key, then obfuscate the value.
"""
log_prefix = f"'{use_prefix}'" if use_prefix else "no"
logger.info(f"Resolved OpenAI env vars with {log_prefix} prefix:")
for key, value in env.items():
if any(prefix in key for prefix in ['KEY', 'SECRET', 'TOKEN']):
value = _obfuscate(value)
logger.info(f" - {key}={value}")

def read_env_config(use_prefix: str, env: dict = os.environ) -> str:
"""
Read whitelisted environment variables and return them in a dictionary.
@@ -15,6 +33,7 @@ def read_env_config(use_prefix: str, env: dict = os.environ) -> str:
config = {}
for prefix in [None, use_prefix]:
read_env_config_prefixed(prefix, config, env)
_log_env(use_prefix, config)
return config

def read_env_config_prefixed(use_prefix: str, config: dict, env: dict = os.environ) -> str:
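
To illustrate the intent, a small sketch of how the prefixed resolution is used elsewhere in the repo (see `client_utils.py` above). The body of `read_env_config_prefixed` is outside this diff, so the assumption here, that `COMPLETION_OPENAI_API_KEY` takes precedence over `OPENAI_API_KEY` and is exposed without its prefix, is an inference rather than a guarantee. The environment values are hypothetical.

```python
import os
from env_config import read_env_config, set_env

# Hypothetical environment: a shared key plus a completion-specific override.
os.environ["OPENAI_API_KEY"] = "sk-shared-0000"
os.environ["COMPLETION_OPENAI_API_KEY"] = "sk-completion-1111"

# Resolve the effective config for the completion client; _log_env logs each
# resolved pair, obfuscating values whose names contain KEY, SECRET, or TOKEN
# down to their last 4 characters.
env = read_env_config("COMPLETION")

# Temporarily apply the resolved variables, as client_utils.py does when
# constructing the OpenAI / Azure OpenAI clients.
with set_env(**env):
    print(os.environ.get("OPENAI_API_KEY"))  # expected to show the override if the prefix rule holds
```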