Feature/patch configurability #80

Merged
merged 6 commits on Nov 24, 2023
Changes from all commits
3 changes: 0 additions & 3 deletions examples/clean_language/main.py
@@ -8,9 +8,6 @@
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

config = {"workspace_id": 0}
monkey.configure(**config)

@monkey.patch
def clean_language(statement: str) -> str:
"""
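Note: this example previously configured the workspace globally via monkey.configure(**config); with this PR, configuration moves onto the decorator itself. A minimal sketch of the new per-function style, using the flag names from the patch signature further below (the docstring body here is hypothetical):

from monkey_patch.monkey import Monkey as monkey

@monkey.patch(environment_id=0)  # replaces the old global monkey.configure(...)
def clean_language(statement: str) -> str:
    """
    Replace impolite words in the statement with polite alternatives.
    """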
2 changes: 2 additions & 0 deletions examples/score_sentiment/main.py
@@ -1,5 +1,7 @@
from pydantic import Field
from typing import Annotated
from dotenv import load_dotenv
load_dotenv()
from monkey_patch.monkey import Monkey as monkey

@monkey.patch
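Note: load_dotenv() is deliberately called before the monkey_patch import here, presumably so that OPENAI_API_KEY is already set in the environment when the library initialises.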
47 changes: 31 additions & 16 deletions src/monkey_patch/function_modeler.py
@@ -13,14 +13,16 @@


class FunctionModeler(object):
def __init__(self, data_worker, workspace_id = 0, check_for_finetunes = True) -> None:
def __init__(self, data_worker, environment_id = 0) -> None:
self.function_configs = {}
self.data_worker = data_worker
self.distillation_token_limit = 3000 # the token limit for finetuning
self.align_buffer = {}
self._get_datasets()
self.workspace_id = workspace_id
self.check_for_finetunes = check_for_finetunes
self.environment_id = environment_id
self.check_finetune_blacklist = []
self.execute_finetune_blacklist = []
self.store_data_blacklist = []


def _get_dataset_info(self, dataset_type, func_hash, type = "length"):
@@ -38,6 +40,8 @@ def _get_datasets(self):
def save_align_statements(self, function_hash, args, kwargs, output):
"""
Save the align statements and add them to the align buffer.
If the function hash is in the store-data blacklist, the statements are not persisted; the datapoints are only added to the align buffer.
"""
# prepare output for saving and later parsing
# make a deepcopy of the output to avoid changing the original object
@@ -51,8 +55,11 @@ def save_align_statements(self, function_hash, args, kwargs, output):
parsed_kwargs = prepare_object_for_saving(copy_kwargs)

example = FunctionExample(parsed_args, parsed_kwargs, parsed_output)

successfully_saved, new_datapoint = self.data_worker.log_align(function_hash, example)
if function_hash not in self.store_data_blacklist:
successfully_saved, new_datapoint = self.data_worker.log_align(function_hash, example)
else:
successfully_saved = False
new_datapoint = True
if successfully_saved:
if function_hash in self.dataset_sizes["alignments"]:
self.dataset_sizes["alignments"][function_hash] += 1
@@ -126,8 +133,14 @@ def get_alignments(self, func_hash, max=20):
def load_align_statements(self, function_hash):
"""
Load all align statements.
First check the data-storage blacklist: if the func hash is blacklisted, set the dataset size to 0 and the align buffer to an empty bytearray.
"""
if function_hash not in self.align_buffer:
if function_hash in self.store_data_blacklist:
self.dataset_sizes["alignments"][function_hash] = 0
self.align_buffer[function_hash] = bytearray()

elif function_hash not in self.align_buffer:
dataset_size, align_dataset = self._get_dataset_info("alignments", function_hash, type = "both")
if align_dataset:
self.align_buffer[function_hash] = bytearray(align_dataset)
@@ -137,26 +150,28 @@ def postprocess_datapoint(self, func_hash, function_description, example, repaired=True):
def postprocess_datapoint(self, func_hash, function_description, example, repaired=True):
"""
Postprocess the datapoint.
First check whether the datapoint should be added to the training data, and add it if so.
Then check whether the function should be finetuned and, if so, execute finetuning.
"""
try:

added = self.save_datapoint(func_hash, example)
if added:
self._update_datapoint_config(repaired, func_hash)
if func_hash not in self.store_data_blacklist:
added = self.save_datapoint(func_hash, example)
if added:
self._update_datapoint_config(repaired, func_hash)
except Exception as e:
print(e)
print("Could not add datapoint to training data")
return None

self.check_for_finetuning(function_description, func_hash)
if func_hash not in self.execute_finetune_blacklist:
self.check_for_finetuning(function_description, func_hash)

def _load_function_config(self, func_hash, function_description):
"""
Load the config file for a function hash
"""

config, default = self.data_worker._load_function_config(func_hash)
if default and self.check_for_finetunes:
if default and func_hash not in self.check_finetune_blacklist:
finetuned, finetune_config = self._check_for_finetunes(function_description)
if finetuned:
config = finetune_config
@@ -168,7 +183,7 @@ def _check_for_finetunes(self, function_description):
# This should be discussed; what's the best way to do it?

# hash the function_hash into 16 characters
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(self.workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(self.environment_id)
# List recent fine-tuning jobs (up to 1000)
finetunes = openai.FineTuningJob.list(limit=1000)
# Check if the function_hash is in the fine-tuning jobs
@@ -367,7 +382,7 @@ def _execute_finetuning(self, function_description, func_hash):
# create the finetune hash
finetune_hash = function_description.__hash__(purpose = "finetune")
nr_of_training_runs = self.function_configs[func_hash]["nr_of_training_runs"]
finetune_hash += encode_int(self.workspace_id)
finetune_hash += encode_int(self.environment_id)
finetune_hash += encode_int(nr_of_training_runs)

# Use the stream as a file
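For reference, the finetune hash used in _check_for_finetunes and _execute_finetuning above is the function's finetune-purpose hash concatenated with the encoded environment id and, at training time, the number of training runs. A rough sketch of the construction, with the import path of encode_int assumed:

from monkey_patch.utils import encode_int  # import path assumed

def build_finetune_hash(function_description, environment_id: int, nr_of_training_runs: int) -> str:
    # Mirrors the construction in FunctionModeler._execute_finetuning above
    finetune_hash = function_description.__hash__(purpose="finetune")  # 16-character function hash
    finetune_hash += encode_int(environment_id)       # separates environments
    finetune_hash += encode_int(nr_of_training_runs)  # distinguishes successive training runs
    return finetune_hash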
140 changes: 85 additions & 55 deletions src/monkey_patch/monkey.py
@@ -206,61 +206,91 @@ def _get_args(func_args, kwarg_names, num_args):
return wrapper

@staticmethod
def patch(test_func):
Monkey._anonymous_usage(logger=Monkey.logger.name)
function_description = Register.load_function_description(test_func)
Monkey._load_alignments(function_description.__hash__())

@wraps(test_func)
def wrapper(*args, **kwargs):
function_description = Register.load_function_description(test_func)
output = Monkey.language_modeler.generate(args, kwargs, Monkey.function_modeler, function_description)
# start parsing the object, very hacky way for the time being
try:
# json load
choice_parsed = json.loads(output.generated_response)
except:
# if it fails, it's not a json object, try eval
def patch(patchable_func = None,
environment_id : int = 0,
ignore_finetune_fetching : bool = False,
ignore_finetuning : bool = False,
ignore_data_storage : bool = False
):
"""
The main decorator for patching a function.
args:
patchable_func: The function to be patched, should be always set to none. This is used here to allow for keyword arguments or no arguments to be passed to the decorator
environment_id (int): The environment id. Used for fetching correct finetuned models
ignore_finetune_fetching (bool): Whether to ignore fetching finetuned models.
If set to False, during the first call openai will not be queried for finetuned models, which reduces initial startup latency
ignore_finetuning (bool): Whether to ignore finetuning the models altogether. If set to True the teacher model will always be used.
The data is still saved however if in future would need to use finetuning
ignore_data_storage (bool): Whether to ignore storing the data.
If set to True, the data will not be stored in the finetune dataset and the align statements will not be saved
This improves latency as communications with data storage is minimised


"""
def wrap(test_func):
@wraps(test_func)
def wrapper(*args, **kwargs):
function_description = Register.load_function_description(test_func)
output = Monkey.language_modeler.generate(args, kwargs, Monkey.function_modeler, function_description)
# start parsing the object, very hacky way for the time being
try:
choice_parsed = eval(output.generated_response)
except:
choice_parsed = output.generated_response

validator = Validator()

valid = validator.check_type(choice_parsed, function_description.output_type_hint)

if not valid:
choice, choice_parsed, successful_repair = repair_output(args,
kwargs,
function_description,
output.generated_response,
validator,
Monkey.function_modeler,
Monkey.language_modeler)

if not successful_repair:
raise TypeError(f"Output type was not valid. Expected an object of type {function_description.output_type_hint}, got '{output.generated_response}'")
output.generated_response = choice
output.distilled_model = False


datapoint = FunctionExample(args, kwargs, output.generated_response)
if output.suitable_for_finetuning and not output.distilled_model:
Monkey.function_modeler.postprocess_datapoint(function_description.__hash__(), function_description, datapoint, repaired = not valid)

instantiated = validator.instantiate(choice_parsed, function_description.output_type_hint)

return instantiated # test_func(*args, **kwargs)
# json load
choice_parsed = json.loads(output.generated_response)
except:
# if it fails, it's not a json object, try eval
try:
choice_parsed = eval(output.generated_response)
except:
choice_parsed = output.generated_response

validator = Validator()

valid = validator.check_type(choice_parsed, function_description.output_type_hint)

if not valid:
choice, choice_parsed, successful_repair = repair_output(args,
kwargs,
function_description,
output.generated_response,
validator,
Monkey.function_modeler,
Monkey.language_modeler)

if not successful_repair:
raise TypeError(f"Output type was not valid. Expected an object of type {function_description.output_type_hint}, got '{output.generated_response}'")
output.generated_response = choice
output.distilled_model = False


datapoint = FunctionExample(args, kwargs, output.generated_response)
if output.suitable_for_finetuning and not output.distilled_model:
Monkey.function_modeler.postprocess_datapoint(function_description.__hash__(), function_description, datapoint, repaired = not valid)

instantiated = validator.instantiate(choice_parsed, function_description.output_type_hint)

return instantiated # test_func(*args, **kwargs)

Monkey._anonymous_usage(logger=Monkey.logger.name)
function_description = Register.load_function_description(test_func)
func_hash = function_description.__hash__()
Monkey.function_modeler.environment_id = environment_id
if ignore_finetuning:
Monkey.function_modeler.execute_finetune_blacklist.append(func_hash)
if ignore_finetune_fetching:
Monkey.function_modeler.check_finetune_blacklist.append(func_hash)
if ignore_data_storage:
Monkey.function_modeler.store_data_blacklist.append(func_hash)
Monkey._load_alignments(func_hash)

wrapper._is_alignable = True
Register.add_function(test_func, wrapper)
return wrapper

if callable(patchable_func):
func = patchable_func
return wrap(func)
if patchable_func is not None:
raise TypeError("The first argument to patch must not be specified. Please use keyword arguments or specify the first argument as None")
return wrap

wrapper._is_alignable = True
Register.add_function(test_func, wrapper)
return wrapper

@staticmethod
def configure(**kwargs):
if "workspace_id" in kwargs:
Monkey.function_modeler.workspace_id = kwargs["workspace_id"]
if "check_for_finetunes" in kwargs:
Monkey.function_modeler.check_for_finetunes = kwargs["check_for_finetunes"]

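With this change, Monkey.patch can be used bare or with keyword arguments, replacing the removed Monkey.configure; the new test file below exercises both forms. A minimal usage sketch (the functions and docstrings are hypothetical):

from monkey_patch.monkey import Monkey

@Monkey.patch  # bare form: environment_id defaults to 0, nothing is blacklisted
def summarise(text: str) -> str:
    """Summarise the text in one sentence."""

@Monkey.patch(environment_id=12, ignore_finetuning=True, ignore_data_storage=True)
def classify(text: str) -> str:
    """Classify the sentiment of the text as 'Good' or 'Bad'."""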
71 changes: 71 additions & 0 deletions tests/test_configure_MP.py
@@ -0,0 +1,71 @@
from typing import List
from monkey_patch.register import Register

import os
from typing import Optional, Literal, List
import openai
from dotenv import load_dotenv
from monkey_patch.monkey import Monkey

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")


@Monkey.patch
def classify_sentiment_2(input: str, input_2: str) -> Optional[Literal['Good', 'Bad']]:
"""
Determine if the inputs are positive or negative sentiment, or None
"""


@Monkey.patch(environment_id = 12, ignore_finetune_fetching=True, ignore_finetuning=True, ignore_data_storage=True)
def classify_sentiment(input: str) -> Optional[Literal['Good', 'Bad']]:
"""
Determine if the input is positive or negative sentiment
"""

@Monkey.align
def align_classify_sentiment():
"""We can test the function as normal using Pytest or Unittest"""

i_love_you = "I love you"
assert classify_sentiment_2(i_love_you, "I love woo") == 'Good'
assert classify_sentiment_2("I hate you", "You're discusting") == 'Bad'
assert classify_sentiment_2("Today is wednesday", "The dogs are running outside") == None


assert classify_sentiment("I love you") == 'Good'
assert classify_sentiment("I hate you") == 'Bad'
assert classify_sentiment("Wednesdays are in the middle of the week") == None

def test_classify_sentiment():
align_classify_sentiment()
bad_input = "I find you awful"
good_input = "I really really like you"
good_input_2 = "I adore you"
assert classify_sentiment("I like you") == 'Good'
assert classify_sentiment(bad_input) == 'Bad'
assert classify_sentiment("I am neutral") == None

assert classify_sentiment_2(good_input, good_input_2) == 'Good'
assert classify_sentiment_2("I do not like you you", bad_input) == 'Bad'
assert classify_sentiment_2("I am neutral", "I am neutral too") == None

def test_configurability():
classify_sent_description = Register.load_function_description(classify_sentiment)
classify_sentiment_2_description = Register.load_function_description(classify_sentiment_2)
sent_func_hash = classify_sent_description.__hash__()
sent_func_2_hash = classify_sentiment_2_description.__hash__()

func_modeler = Monkey.function_modeler
assert func_modeler.environment_id == 12
assert sent_func_hash in func_modeler.check_finetune_blacklist
assert sent_func_2_hash not in func_modeler.check_finetune_blacklist
assert sent_func_hash in func_modeler.execute_finetune_blacklist
assert sent_func_2_hash not in func_modeler.execute_finetune_blacklist
assert sent_func_hash in func_modeler.store_data_blacklist
assert sent_func_2_hash not in func_modeler.store_data_blacklist




4 changes: 2 additions & 2 deletions tests/test_finetune_hash.py
@@ -37,8 +37,8 @@ def test_encode_decode_hash():
workspace_id = 12
function_description = Register.load_function_description(dummy_func)
logger = BufferedLogger("test")
func_modeler = FunctionModeler(logger, workspace_id=workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(func_modeler.workspace_id) + encode_int(nr_of_training_runs)
func_modeler = FunctionModeler(logger, environment_id=workspace_id)
finetune_hash = function_description.__hash__(purpose = "finetune") + encode_int(func_modeler.environment_id) + encode_int(nr_of_training_runs)
finetune = {"fine_tuned_model": f"Test_model:__{finetune_hash}:asd[]asd",}
config = func_modeler._construct_config_from_finetune(finetune_hash[:-1], finetune)
assert config["distilled_model"] == f"Test_model:__{finetune_hash}:asd[]asd"