Feature/patch configurability #80

Merged
merged 6 commits on Nov 24, 2023
Changes from all commits
3 changes: 0 additions & 3 deletions examples/clean_language/main.py
@@ -8,9 +8,6 @@
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

config = {"workspace_id": 0}
monkey.configure(**config)

@monkey.patch
def clean_language(statement: str) -> str:
"""
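Note: this example previously configured the workspace globally via monkey.configure(**config); with this PR, configuration moves onto the decorator itself. A minimal sketch of the new per-function style, using the flag names from the patch signature further below (the docstring body here is hypothetical):

from monkey_patch.monkey import Monkey as monkey

@monkey.patch(environment_id=0)  # replaces the old global monkey.configure(...)
def clean_language(statement: str) -> str:
    """
    Replace impolite words in the statement with polite alternatives.
    """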
2 changes: 2 additions & 0 deletions examples/score_sentiment/main.py
@@ -1,5 +1,7 @@
from pydantic import Field
from typing import Annotated
from dotenv import load_dotenv
load_dotenv()
from monkey_patch.monkey import Monkey as monkey

@monkey.patch
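Note: load_dotenv() is deliberately called before the monkey_patch import here, presumably so that OPENAI_API_KEY is already set in the environment when the library initialises.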
47 changes: 31 additions & 16 deletions src/monkey_patch/function_modeler.py
@@ -13,14 +13,16 @@


class FunctionModeler(object):
def __init__(self, data_worker, workspace_id = 0, check_for_finetunes = True) -> None:
def __init__(self, data_worker, environment_id = 0) -> None:
self.function_configs = {}
self.data_worker = data_worker
self.distillation_token_limit = 3000 # the token limit for finetuning
self.align_buffer = {}
self._get_datasets()
self.workspace_id = workspace_id
self.check_for_finetunes = check_for_finetunes
self.environment_id = environment_id
self.check_finetune_blacklist = []
self.execute_finetune_blacklist = []
self.store_data_blacklist = []


def _get_dataset_info(self, dataset_type, func_hash, type = "length"):
@@ -38,6 +40,8 @@ def _get_datasets(self):
def save_align_statements(self, function_hash, args, kwargs, output):
"""
Save the align statements and add them to the align buffer.
If the function hash is in the store-data blacklist, the statements are not persisted; the datapoints are only added to the align buffer.
"""
# prepare output for saving and later parsing
# make a deepcopy of the output to avoid changing the original object
@@ -51,8 +55,11 @@ def save_align_statements(self, function_hash, args, kwargs, output):
parsed_kwargs = prepare_object_for_saving(copy_kwargs)

example = FunctionExample(parsed_args, parsed_kwargs, parsed_output)

successfully_saved, new_datapoint = self.data_worker.log_align(function_hash, example)
if function_hash not in self.store_data_blacklist:
successfully_saved, new_datapoint = self.data_worker.log_align(function_hash, example)
else:
successfully_saved = False
new_datapoint = True
if successfully_saved:
if function_hash in self.dataset_sizes["alignments"]:
self.dataset_sizes["alignments"][function_hash] += 1
@@ -126,8 +133,14 @@ def get_alignments(self, func_hash, max=20):
def load_align_statements(self, function_hash):
"""
Load all align statements.
First check the data-storage blacklist: if the func hash is blacklisted, set the dataset size to 0 and the align buffer to an empty bytearray.
"""
if function_hash not in self.align_buffer:
if function_hash in self.store_data_blacklist:
self.dataset_sizes["alignments"][function_hash] = 0
self.align_buffer[function_hash] = bytearray()

elif function_hash not in self.align_buffer:
dataset_size, align_dataset = self._get_dataset_info("alignments", function_hash, type = "both")
if align_dataset:
self.align_buffer[function_hash] = bytearray(align_dataset)
@@ -137,26 +150,28 @@ def postprocess_datapoint(self, func_hash, function_description, example, repaired=True):
def postprocess_datapoint(self, func_hash, function_description, example, repaired=True):
"""
Postprocess the datapoint.
First check whether the datapoint should be added to the training data, and add it if so.
Then check whether the function should be finetuned and, if so, execute finetuning.
"""
try:

added = self.save_datapoint(func_hash, example)
if added:
self._update_datapoint_config(repaired, func_hash)
if func_hash not in self.store_data_blacklist:
added = self.save_datapoint(func_hash, example)
if added:
self._update_datapoint_config(repaired, func_hash)
except Exception as e:
print(e)
print("Could not add datapoint to training data")
return None

self.check_for_finetuning(function_description, func_hash)
if func_hash not in self.execute_finetune_blacklist:
self.check_for_finetuning(function_description, func_hash)

def _load_function_config(self, func_hash, function_description):
"""
Load the config file for a function hash
"""

config, default = self.data_worker._load_function_config(func_hash)
if default and self.check_for_finetunes:
if default and func_hash not in self.check_finetune_blacklist:
finetuned, finetune_config = self._check_for_finetunes(function_description)
if finetuned:
config = finetune_config
@@ -168,7 +183,7 @@ def _check_for_finetunes(self, function_description):
# This should be discussed; what's the best way to do it?

# hash the function_hash into 16 characters
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(self.workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(self.environment_id)
# List recent fine-tuning jobs (up to 1000)
finetunes = openai.FineTuningJob.list(limit=1000)
# Check if the function_hash is in the fine-tuning jobs
@@ -367,7 +382,7 @@ def _execute_finetuning(self, function_description, func_hash):
# create the finetune hash
finetune_hash = function_description.__hash__(purpose = "finetune")
nr_of_training_runs = self.function_configs[func_hash]["nr_of_training_runs"]
finetune_hash += encode_int(self.workspace_id)
finetune_hash += encode_int(self.environment_id)
finetune_hash += encode_int(nr_of_training_runs)

# Use the stream as a file
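For reference, the finetune hash used in _check_for_finetunes and _execute_finetuning above is the function's finetune-purpose hash concatenated with the encoded environment id and, at training time, the number of training runs. A rough sketch of the construction, with the import path of encode_int assumed:

from monkey_patch.utils import encode_int  # import path assumed

def build_finetune_hash(function_description, environment_id: int, nr_of_training_runs: int) -> str:
    # Mirrors the construction in FunctionModeler._execute_finetuning above
    finetune_hash = function_description.__hash__(purpose="finetune")  # 16-character function hash
    finetune_hash += encode_int(environment_id)       # separates environments
    finetune_hash += encode_int(nr_of_training_runs)  # distinguishes successive training runs
    return finetune_hash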
140 changes: 85 additions & 55 deletions src/monkey_patch/monkey.py
@@ -206,61 +206,91 @@ def _get_args(func_args, kwarg_names, num_args):
return wrapper

@staticmethod
def patch(test_func):
Monkey._anonymous_usage(logger=Monkey.logger.name)
function_description = Register.load_function_description(test_func)
Monkey._load_alignments(function_description.__hash__())

@wraps(test_func)
def wrapper(*args, **kwargs):
function_description = Register.load_function_description(test_func)
output = Monkey.language_modeler.generate(args, kwargs, Monkey.function_modeler, function_description)
# start parsing the object, very hacky way for the time being
try:
# json load
choice_parsed = json.loads(output.generated_response)
except:
# if it fails, it's not a json object, try eval
def patch(patchable_func = None,
environment_id : int = 0,
ignore_finetune_fetching : bool = False,
ignore_finetuning : bool = False,
ignore_data_storage : bool = False
):
"""
The main decorator for patching a function.
args:
patchable_func: The function to be patched, should be always set to none. This is used here to allow for keyword arguments or no arguments to be passed to the decorator
environment_id (int): The environment id. Used for fetching correct finetuned models
ignore_finetune_fetching (bool): Whether to ignore fetching finetuned models.
If set to False, during the first call openai will not be queried for finetuned models, which reduces initial startup latency
ignore_finetuning (bool): Whether to ignore finetuning the models altogether. If set to True the teacher model will always be used.
The data is still saved however if in future would need to use finetuning
ignore_data_storage (bool): Whether to ignore storing the data.
If set to True, the data will not be stored in the finetune dataset and the align statements will not be saved
This improves latency as communications with data storage is minimised


"""
def wrap(test_func):
@wraps(test_func)
def wrapper(*args, **kwargs):
function_description = Register.load_function_description(test_func)
output = Monkey.language_modeler.generate(args, kwargs, Monkey.function_modeler, function_description)
# start parsing the object, very hacky way for the time being
try:
choice_parsed = eval(output.generated_response)
except:
choice_parsed = output.generated_response

validator = Validator()

valid = validator.check_type(choice_parsed, function_description.output_type_hint)

if not valid:
choice, choice_parsed, successful_repair = repair_output(args,
kwargs,
function_description,
output.generated_response,
validator,
Monkey.function_modeler,
Monkey.language_modeler)

if not successful_repair:
raise TypeError(f"Output type was not valid. Expected an object of type {function_description.output_type_hint}, got '{output.generated_response}'")
output.generated_response = choice
output.distilled_model = False


datapoint = FunctionExample(args, kwargs, output.generated_response)
if output.suitable_for_finetuning and not output.distilled_model:
Monkey.function_modeler.postprocess_datapoint(function_description.__hash__(), function_description, datapoint, repaired = not valid)

instantiated = validator.instantiate(choice_parsed, function_description.output_type_hint)

return instantiated # test_func(*args, **kwargs)
# json load
choice_parsed = json.loads(output.generated_response)
except:
# if it fails, it's not a json object, try eval
try:
choice_parsed = eval(output.generated_response)
except:
choice_parsed = output.generated_response

validator = Validator()

valid = validator.check_type(choice_parsed, function_description.output_type_hint)

if not valid:
choice, choice_parsed, successful_repair = repair_output(args,
kwargs,
function_description,
output.generated_response,
validator,
Monkey.function_modeler,
Monkey.language_modeler)

if not successful_repair:
raise TypeError(f"Output type was not valid. Expected an object of type {function_description.output_type_hint}, got '{output.generated_response}'")
output.generated_response = choice
output.distilled_model = False


datapoint = FunctionExample(args, kwargs, output.generated_response)
if output.suitable_for_finetuning and not output.distilled_model:
Monkey.function_modeler.postprocess_datapoint(function_description.__hash__(), function_description, datapoint, repaired = not valid)

instantiated = validator.instantiate(choice_parsed, function_description.output_type_hint)

return instantiated # test_func(*args, **kwargs)

Monkey._anonymous_usage(logger=Monkey.logger.name)
function_description = Register.load_function_description(test_func)
func_hash = function_description.__hash__()
Monkey.function_modeler.environment_id = environment_id
if ignore_finetuning:
Monkey.function_modeler.execute_finetune_blacklist.append(func_hash)
if ignore_finetune_fetching:
Monkey.function_modeler.check_finetune_blacklist.append(func_hash)
if ignore_data_storage:
Monkey.function_modeler.store_data_blacklist.append(func_hash)
Monkey._load_alignments(func_hash)

wrapper._is_alignable = True
Register.add_function(test_func, wrapper)
return wrapper

if callable(patchable_func):
func = patchable_func
return wrap(func)
if patchable_func is not None:
raise TypeError("The first argument to patch must not be specified. Please use keyword arguments or specify the first argument as None")
return wrap

wrapper._is_alignable = True
Register.add_function(test_func, wrapper)
return wrapper

@staticmethod
def configure(**kwargs):
if "workspace_id" in kwargs:
Monkey.function_modeler.workspace_id = kwargs["workspace_id"]
if "check_for_finetunes" in kwargs:
Monkey.function_modeler.check_for_finetunes = kwargs["check_for_finetunes"]

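With this change, Monkey.patch can be used bare or with keyword arguments, replacing the removed Monkey.configure; the new test file below exercises both forms. A minimal usage sketch (the functions and docstrings are hypothetical):

from monkey_patch.monkey import Monkey

@Monkey.patch  # bare form: environment_id defaults to 0, nothing is blacklisted
def summarise(text: str) -> str:
    """Summarise the text in one sentence."""

@Monkey.patch(environment_id=12, ignore_finetuning=True, ignore_data_storage=True)
def classify(text: str) -> str:
    """Classify the sentiment of the text as 'Good' or 'Bad'."""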
71 changes: 71 additions & 0 deletions tests/test_configure_MP.py
@@ -0,0 +1,71 @@
from typing import List
from monkey_patch.register import Register

import os
from typing import Optional, Literal, List
import openai
from dotenv import load_dotenv
from monkey_patch.monkey import Monkey

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


@Monkey.patch
def classify_sentiment_2(input: str, input_2: str) -> Optional[Literal['Good', 'Bad']]:
"""
Determine if the inputs are positive or negative sentiment, or None
"""


@Monkey.patch(environment_id = 12, ignore_finetune_fetching=True, ignore_finetuning=True, ignore_data_storage=True)
def classify_sentiment(input: str) -> Optional[Literal['Good', 'Bad']]:
"""
Determine if the input is positive or negative sentiment
"""

@Monkey.align
def align_classify_sentiment():
"""We can test the function as normal using Pytest or Unittest"""

i_love_you = "I love you"
assert classify_sentiment_2(i_love_you, "I love woo") == 'Good'
assert classify_sentiment_2("I hate you", "You're discusting") == 'Bad'
assert classify_sentiment_2("Today is wednesday", "The dogs are running outside") == None


assert classify_sentiment("I love you") == 'Good'
assert classify_sentiment("I hate you") == 'Bad'
assert classify_sentiment("Wednesdays are in the middle of the week") == None

def test_classify_sentiment():
align_classify_sentiment()
bad_input = "I find you awful"
good_input = "I really really like you"
good_input_2 = "I adore you"
assert classify_sentiment("I like you") == 'Good'
assert classify_sentiment(bad_input) == 'Bad'
assert classify_sentiment("I am neutral") == None

assert classify_sentiment_2(good_input, good_input_2) == 'Good'
assert classify_sentiment_2("I do not like you you", bad_input) == 'Bad'
assert classify_sentiment_2("I am neutral", "I am neutral too") == None

def test_configurability():
classify_sent_description = Register.load_function_description(classify_sentiment)
classify_sentiment_2_description = Register.load_function_description(classify_sentiment_2)
sent_func_hash = classify_sent_description.__hash__()
sent_func_2_hash = classify_sentiment_2_description.__hash__()

func_modeler = Monkey.function_modeler
assert func_modeler.environment_id == 12
assert sent_func_hash in func_modeler.check_finetune_blacklist
assert sent_func_2_hash not in func_modeler.check_finetune_blacklist
assert sent_func_hash in func_modeler.execute_finetune_blacklist
assert sent_func_2_hash not in func_modeler.execute_finetune_blacklist
assert sent_func_hash in func_modeler.store_data_blacklist
assert sent_func_2_hash not in func_modeler.store_data_blacklist




4 changes: 2 additions & 2 deletions tests/test_finetune_hash.py
@@ -37,8 +37,8 @@ def test_encode_decode_hash():
workspace_id = 12
function_description = Register.load_function_description(dummy_func)
logger = BufferedLogger("test")
func_modeler = FunctionModeler(logger, workspace_id=workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(func_modeler.workspace_id) + encode_int(nr_of_training_runs)
func_modeler = FunctionModeler(logger, environment_id=workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(func_modeler.environment_id) + encode_int(nr_of_training_runs)
finetune = {"fine_tuned_model": f"Test_model:__{finetune_hash}:asd[]asd",}
config = func_modeler._construct_config_from_finetune(finetune_hash[:-1], finetune)
assert config["distilled_model"] == f"Test_model:__{finetune_hash}:asd[]asd"