-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_utils.py
143 lines (121 loc) · 4.76 KB
/
test_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Author: Michael Drob
Contact: https://www.linkedin.com/in/michael-drob/
License: Apache 2.0
This program is free software: you can redistribute it and/or modify
it under the terms of the Apache License as published by
the Free Software Foundation, either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Apache License for more details.
You should have received a copy of the Apache License
along with this program. If not, see <http://www.apache.org/licenses/LICENSE-2.0>.
"""
import pytest
from utils import eval_llm_output, eval_llm_top_k
def test_eval_llm_output_success():
llm_output = """
[PYTHON]
def max_sum(my_list):
return max([sum(x) for x in my_list])
[/PYTHON]
"""
test_list = [
'assert max_sum([[1, 2, 3], [4, 5, 6], [10, 11, 12], [7, 8, 9]]) == 33',
'assert max_sum([[0, 1, 1], [1, 1, 2], [3, 2, 1]]) == 6',
'assert max_sum([[0, 1, 3], [1, 2, 1], [9, 8, 2], [0, 1, 0], [6, 4, 8]]) == 19',
'assert max_sum([[0, -1, -1], [-1, -1, -2], [-3, -2, -1]]) == -2'
]
assert eval_llm_output(llm_output, test_list) == True
def test_eval_llm_output_wrong_name_failure():
llm_output = """
[PYTHON]
def sum_max(my_list):
return max([sum(x) for x in my_list])
[/PYTHON]
"""
test_list = [
'assert max_sum([[1, 2, 3], [4, 5, 6], [10, 11, 12], [7, 8, 9]]) == 33',
'assert max_sum([[0, 1, 1], [1, 1, 2], [3, 2, 1]]) == 6',
'assert max_sum([[0, 1, 3], [1, 2, 1], [9, 8, 2], [0, 1, 0], [6, 4, 8]]) == 19',
'assert max_sum([[0, -1, -1], [-1, -1, -2], [-3, -2, -1]]) == -2'
]
assert eval_llm_output(llm_output, test_list) == False
def test_eval_llm_output_failure():
llm_output = """
[PYTHON]
def max_sum_wrong(my_list):
# Incorrect implementation on purpose
return sum([sum(x) for x in my_list])
[/PYTHON]
"""
test_list = [
'assert max_sum_wrong([[1, 2, 3], [4, 5, 6], [10, 11, 12], [7, 8, 9]]) == 33',
'assert max_sum_wrong([[0, 1, 1], [1, 1, 2], [3, 2, 1]]) == 6',
'assert max_sum_wrong([[0, 1, 3], [1, 2, 1], [9, 8, 2], [0, 1, 0], [6, 4, 8]]) == 19',
'assert max_sum_wrong([[0, -1, -1], [-1, -1, -2], [-3, -2, -1]]) == -2'
]
assert eval_llm_output(llm_output, test_list) == False
def test_eval_llm_output_sphere_volume_success():
llm_output = """
[PYTHON]
def get_sphere_volume(radius):
import math
return (4/3) * math.pi * radius**3
[/PYTHON]
"""
test_list = [
'assert get_sphere_volume(10) == 4188.790204786391',
'assert get_sphere_volume(25) == 65449.84694978735',
'assert get_sphere_volume(20) == 33510.32163829113'
]
assert eval_llm_output(llm_output, test_list) == True
def test_eval_llm_output_sphere_volume_failure():
llm_output = """
[PYTHON]
def get_sphere_volume_incorrect(radius):
# Incorrect implementation on purpose
return 2 * radius**3
[/PYTHON]
"""
test_list = [
'assert get_sphere_volume_incorrect(10) == 4188.790204786391',
'assert get_sphere_volume_incorrect(25) == 65449.84694978735',
'assert get_sphere_volume_incorrect(20) == 33510.32163829113'
]
assert eval_llm_output(llm_output, test_list) == False
@pytest.fixture
def mock_eval_function(monkeypatch):
def mock_eval_llm_output(llm_output, test_list):
return "success" in llm_output
monkeypatch.setattr("utils.eval_llm_output", mock_eval_llm_output)
def test_eval_llm_top_k_all_success(mock_eval_function):
llm_outputs = ["fail", "fail", "success"]
test_list = [] # Test list is irrelevant for this mock
k = [3, 4, 5]
expected = [True, True, True]
actual = eval_llm_top_k(llm_outputs, test_list, k)
assert actual == expected
def test_eval_llm_top_k_partial_success(mock_eval_function):
llm_outputs = ["fail"] * 5 + ["success"]
test_list = [] # Test list is irrelevant for this mock
k = [5, 6, 10]
expected = [False, True, True]
actual = eval_llm_top_k(llm_outputs, test_list, k)
assert actual == expected
def test_eval_llm_top_k_no_success(mock_eval_function):
llm_outputs = ["fail"] * 10
test_list = [] # Test list is irrelevant for this mock
k = [5, 8]
expected = [False, False]
actual = eval_llm_top_k(llm_outputs, test_list, k)
assert actual == expected
def test_eval_llm_top_k_empty_outputs(mock_eval_function):
llm_outputs = []
test_list = []
k = [1, 2]
expected = [False, False]
actual = eval_llm_top_k(llm_outputs, test_list, k)
assert actual == expected