-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathEvaluate.py
executable file
·224 lines (164 loc) · 8.17 KB
/
Evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
#!/bin/python3
import argparse
import time
import json
import datetime
import os
import Writer.Interface.Wrapper
import Writer.Config
import Writer.PrintUtils
def EvaluateOutline(_Client, _Logger, _Outline1, _Outline2, _Model=None):
    """
    Ask the model which of two story outlines is better and summarize the verdict.

    Parameters:
        _Client: Object providing BuildSystemQuery, BuildUserQuery,
            SafeGenerateText and GetLastMessageText (as called below).
        _Logger: Object providing Log(message, level); also handed to
            _Client.SafeGenerateText.
        _Outline1: Outline text presented to the model as outline "A".
        _Outline2: Outline text presented to the model as outline "B".
        _Model: Model identifier passed to SafeGenerateText. When None, the
            module-level Args.Model (parsed from the command line) is used.

    Returns:
        A (Report, JSON) tuple: Report is a newline-terminated text summary of
        the per-criterion winners, JSON is the decoded model response.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON, even after
            typographic quotes are normalized.
        KeyError: If the reply is missing one of the reported fields.
    """
    # Fall back to the CLI-selected model so existing four-argument callers keep working.
    if _Model is None:
        _Model = Args.Model

    _Logger.Log("Evaluating Outlines From Story", 4)

    # NOTE(review): the criteria list below mentions Dialogue, but the requested
    # JSON schema has no "Dialogue" field, so it is never reported for outlines.
    Messages = [_Client.BuildSystemQuery("You are a helpful AI language model.")]
    Messages.append(_Client.BuildUserQuery(f"""
Please evaluate which outlines are better from the following two outlines:
Here's the first outline:
<OutlineA>
{_Outline1}
</OutlineA>
And here is the second outline:
<OutlineB>
{_Outline2}
</OutlineB>
Use the following criteria to evaluate (NOTE: You'll be picking outline A or outline B later on for these criteria):
- Plot: Does the story have a coherent plot? Is It creative?
- Chapters: Do the chapters flow into each-other (be very careful when checking this)? Do they feel connected? Do they feel homogenized or are they unique and fresh?
- Style: Does the writing style help move the plot or is it distracting from the rest of the story? Is it excessively flowery?
- Dialogue: Is the dialog specific to each character? Does it feel in-character? Is there enough or too little?
- Tropes: Do the tropes make sense for the genre? Are they interesting and well integrated?
- Genre: Is the genre clear?
- Narrative Structure: Is it clear what the structure is? Does it fit with the genre/tropes/content?
Please give your response in JSON format, indicating the ratings for each story:
{{
"Thoughts": "Your notes and reasoning on which of the two is better and why.",
"Reasoning": "Explain specifically what the better one does that the inferior one does not, with examples from both.",
"Plot": "<A, B, or Tie>",
"PlotExplanation": "Explain your reasoning.",
"Style": "<A, B, or Tie>",
"StyleExplanation": "Explain your reasoning.",
"Chapters": "<A, B, or Tie>",
"ChaptersExplanation": "Explain your reasoning.",
"Tropes": "<A, B, or Tie>",
"TropesExplanation": "Explain your reasoning.",
"Genre": "<A, B, or Tie>",
"GenreExplanation": "Explain your reasoning.",
"Narrative": "<A, B, or Tie>",
"NarrativeExplanation": "Explain your reasoning.",
"OverallWinner": "<A, B, or Tie>"
}}
Do not respond with anything except JSON. Do not include any other fields except those shown above.
"""))

    # Use the logger passed to this function, not the module-level one.
    Messages = _Client.SafeGenerateText(_Logger, Messages, _Model, _Format="json")
    ResponseText = _Client.GetLastMessageText(Messages)

    try:
        JSON = json.loads(ResponseText)
    except json.JSONDecodeError:
        # Some replies use typographic quotes as JSON delimiters; retry with
        # them normalized. Only done on failure so that valid JSON containing
        # curly quotes inside its string values is left untouched.
        JSON = json.loads(ResponseText.replace('“', '"').replace('”', '"'))

    # One line per criterion, in the order the report has always used.
    Criteria = ("Plot", "Chapters", "Style", "Tropes", "Genre", "Narrative")
    ReportLines = [f"Winner of {Criterion}: {JSON[Criterion]}\n" for Criterion in Criteria]
    ReportLines.append(f"Overall Winner: {JSON['OverallWinner']}\n")
    Report = "".join(ReportLines)

    _Logger.Log("Finished Evaluating Outlines From Story", 4)
    return Report, JSON
def EvaluateChapter(_Client, _Logger, _ChapterA, _ChapterB, _Model=None):
    """
    Ask the model which of two chapters is better and summarize the verdict.

    Parameters:
        _Client: Object providing BuildSystemQuery, BuildUserQuery,
            SafeGenerateText and GetLastMessageText (as called below).
        _Logger: Object providing Log(message, level); also handed to
            _Client.SafeGenerateText.
        _ChapterA: Chapter text presented to the model as chapter "A".
        _ChapterB: Chapter text presented to the model as chapter "B".
        _Model: Model identifier passed to SafeGenerateText. When None, the
            module-level Args.Model (parsed from the command line) is used.

    Returns:
        A (Report, JSON) tuple: Report is a newline-terminated text summary of
        the per-criterion winners, JSON is the decoded model response.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON, even after
            typographic quotes are normalized.
        KeyError: If the reply is missing one of the reported fields.
    """
    # Fall back to the CLI-selected model so existing four-argument callers keep working.
    if _Model is None:
        _Model = Args.Model

    # The original messages said "Outlines" here (copied from EvaluateOutline).
    _Logger.Log("Evaluating Chapters From Story", 4)

    Messages = [_Client.BuildSystemQuery("You are a helpful AI language model.")]
    Messages.append(_Client.BuildUserQuery(f"""
Please evaluate which of the two unrelated and separate chapters is better based on the following criteria: Plot, Chapters, Style, Dialogue, Tropes, Genre, and Narrative.
Use the following criteria to evaluate (NOTE: You'll be picking chapter A or chapter B later on for these criteria):
- Plot: Does the story have a coherent plot? Is It creative?
- Chapters: Do the chapters flow into each-other (be very careful when checking this)? Do they feel connected? Do they feel homogenized or are they unique and fresh?
- Style: Does the writing style help move the plot or is it distracting from the rest of the story? Is it excessively flowery?
- Dialogue: Is the dialog specific to each character? Does it feel in-character? Is there enough or too little?
- Tropes: Do the tropes make sense for the genre? Are they interesting and well integrated?
- Genre: Is the genre clear?
- Narrative Structure: Is it clear what the structure is? Does it fit with the genre/tropes/content?
Here's chapter A:
<CHAPTER_A>
{_ChapterA}
!END OF CHAPTER!
</CHAPTER_A>
And here is chapter B:
<CHAPTER_B>
{_ChapterB}
!END OF CHAPTER!
</CHAPTER_B>
Please give your response in JSON format, indicating the ratings for each story:
{{
"Plot": "<A, B, or Tie>",
"PlotExplanation": "Explain your reasoning.",
"Style": "<A, B, or Tie>",
"StyleExplanation": "Explain your reasoning.",
"Dialogue": "<A, B, or Tie>",
"DialogueExplanation": "Explain your reasoning.",
"Tropes": "<A, B, or Tie>",
"TropesExplanation": "Explain your reasoning.",
"Genre": "<A, B, or Tie>",
"GenreExplanation": "Explain your reasoning.",
"Narrative": "<A, B, or Tie>",
"NarrativeExplanation": "Explain your reasoning.",
"OverallWinner": "<A, B, or Tie>"
}}
Do not respond with anything except JSON.
Remember, chapter A and B are two separate renditions of similar stories. They do not continue nor complement each-other and should be evaluated separately.
Emphasize Chapter A and B as you rate the result.
"""))

    # Use the logger passed to this function, not the module-level one.
    Messages = _Client.SafeGenerateText(_Logger, Messages, _Model, _Format="json")
    ResponseText = _Client.GetLastMessageText(Messages)

    try:
        # Parse the reply untouched first: replacing curly quotes up front
        # would turn quotes *inside* string values into unescaped delimiters
        # and break otherwise valid JSON.
        JSON = json.loads(ResponseText)
    except json.JSONDecodeError:
        # Fallback for replies that use typographic quotes as JSON delimiters.
        JSON = json.loads(ResponseText.replace('“', '"').replace('”', '"'))

    # One line per criterion, in the order the report has always used.
    Criteria = ("Plot", "Style", "Dialogue", "Tropes", "Genre", "Narrative")
    ReportLines = [f"Winner of {Criterion}: {JSON[Criterion]}\n" for Criterion in Criteria]
    ReportLines.append(f"Overall Winner: {JSON['OverallWinner']}\n")
    Report = "".join(ReportLines)

    _Logger.Log("Finished Evaluating Chapters From Story", 4)
    return Report, JSON
# Setup Argparser
Parser = argparse.ArgumentParser()
# Both story paths are mandatory: without them the script would fail later at open(None).
Parser.add_argument("-Story1", required=True, help="Path to JSON file for story 1")
Parser.add_argument("-Story2", required=True, help="Path to JSON file for story 2")
Parser.add_argument("-Output", default="Report.md", type=str, help="File output path for the report, pass an empty string to skip writing the report to disk",)
Parser.add_argument("-Host", default="localhost:11434", type=str, help="HTTP URL to OLLAMA instance",)
Parser.add_argument("-Model", default="ollama://command-r-plus", type=str, help="Model to use for writing the base outline content. Note, command-r-plus really should be used here (or something bigger), 70b models are just too small as of now.",)
Args = Parser.parse_args()

Writer.Config.OLLAMA_HOST = Args.Host
# Writer.Config.DEBUG = True

# Measure Generation Time
StartTime_s = time.time()

# Setup Logger
Logger = Writer.PrintUtils.Logger("EvalLogs")

# Setup Interface
Interface = Writer.Interface.Wrapper.Interface([Args.Model])

# Load both stories (JSON files, read explicitly as UTF-8 rather than the platform default)
Story1:dict = {}
Story2:dict = {}
with open(Args.Story1, "r", encoding="utf-8") as f:
    Story1 = json.loads(f.read())
with open(Args.Story2, "r", encoding="utf-8") as f:
    Story2 = json.loads(f.read())

# Begin Report
Report:str = "# Story Evaluation Report\n\n"
Report += f"Story A: {Args.Story1}\n"
Report += f"Story B: {Args.Story2}\n\n\n"

## Evaluate Outlines
Report += "## Outline\n"
OutlineReport, OutlineJSON = EvaluateOutline(Interface, Logger, Story1["Outline"], Story2["Outline"])
Report += OutlineReport

## Evaluate Chapters
# zip() stops at the shorter story, so only chapters present in both are compared.
ChapterJSONs:list = []
for i, (ChapterA, ChapterB) in enumerate(zip(Story1["UnscrubbedChapters"], Story2["UnscrubbedChapters"])):
    Report += f"## Chapter {i}\n"
    ChapterReport, ChapterJSON = EvaluateChapter(Interface, Logger, ChapterA, ChapterB)
    Report += ChapterReport
    ChapterJSONs.append(ChapterJSON)

# Tally votes by counting the ": A", ": B" and ": Tie" line endings written by the
# evaluators. Each count is taken before its own total line is appended.
Report += "\n\n# Vote Totals\nTotal A Votes: " + str(Report.count(": A\n")) + "\n"
Report += "Total B Votes: " + str(Report.count(": B\n")) + "\n"
Report += "Total Tie Votes: " + str(Report.count(": Tie\n")) + "\n"

# Calculate Eval Time
EndTime_s = time.time()
TotalEvalTime_s = round(EndTime_s - StartTime_s)
Logger.Log(f"Evaluation finished in {TotalEvalTime_s}s", 4)

# Optionally write Report To Disk (skipped when -Output is an empty string)
if Args.Output:
    with open(Args.Output, "w", encoding="utf-8") as f:
        f.write(Report)