[Automated] Merge release into main #214

Merged 31 commits from release into main on Oct 4, 2024

Commits
c7603eb
bump the version, test release to PyPi
ProKil May 28, 2024
ea4c128
Update README.md
ProKil May 28, 2024
57beef5
Update README.md
ProKil May 28, 2024
a2265f6
Update README.md
ProKil May 28, 2024
13a7721
Merge remote-tracking branch 'origin/main' into release
ProKil Jun 2, 2024
e798e8c
Merge remote-tracking branch 'origin/main' into release
ProKil Jun 14, 2024
db9ace3
bumpy version to 0.0.9
ProKil Jun 14, 2024
4d4ed5c
Update Sotopia presentation information in README.md
ProKil Jun 17, 2024
859405a
Merge branch 'main' into release
ProKil Jun 18, 2024
fa1a410
bump version to 0.0.10
ProKil Jun 18, 2024
d34115b
Merge remote-tracking branch 'origin/main' into release
ProKil Jun 27, 2024
ff58645
bump version
ProKil Jun 27, 2024
37ccb7b
add merge release back to main action
ProKil Jul 20, 2024
1c05438
change checkout v4->v3
ProKil Jul 20, 2024
e4f0a26
fix merge-back-to-main and pin mypy to <1.11.0
ProKil Jul 20, 2024
865e8b6
Merge branch 'main' into release
XuhuiZhou Aug 26, 2024
7bb0bce
Merge remote-tracking branch 'origin/main' into release
ProKil Sep 2, 2024
88d043d
merge bug fix
ProKil Sep 2, 2024
c9c411c
upgrade default model to handle bad-foratted outputs to gpt-4o-mini a…
yangalan123 Sep 5, 2024
fbe8410
update pull request -> pull request target
ProKil Sep 5, 2024
e3e5737
Merge branch 'release' of github.com:sotopia-lab/sotopia into release
ProKil Sep 5, 2024
78e8eb8
bump version
ProKil Sep 5, 2024
b8a6dbc
Add `bad_output_process_model` option and `use_fixed_model_version` o…
yangalan123 Sep 24, 2024
feef903
fix gpt-3.5
XuhuiZhou Sep 27, 2024
dc6db94
Merge branch 'main' into release
XuhuiZhou Sep 27, 2024
a73ff48
replace gpt3.5 turbo for tests
XuhuiZhou Sep 27, 2024
1923c18
update gpt-3.5-turbo to gpt-4o-mini
ProKil Oct 1, 2024
db8839e
bug fix for return fixed model version function
ProKil Oct 1, 2024
0fcfcb0
fix sampling error
XuhuiZhou Oct 4, 2024
cd31c72
fix rc.4
XuhuiZhou Oct 4, 2024
12e74a6
new tag
XuhuiZhou Oct 4, 2024
4 changes: 2 additions & 2 deletions README.md
@@ -74,8 +74,8 @@ asyncio.run(
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
4 changes: 2 additions & 2 deletions docs/pages/concepts/agents.md
@@ -11,7 +11,7 @@ class LLMAgent(BaseAgent[Observation, AgentAction]):
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
-    model_name: str = "gpt-3.5-turbo",
+    model_name: str = "gpt-4o-mini",
script_like: bool = False,
) -> None:
```
@@ -26,7 +26,7 @@ class ScriptWritingAgent(LLMAgent):
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
-    model_name: str = "gpt-3.5-turbo",
+    model_name: str = "gpt-4o-mini",
agent_names: list[str] = [],
background: ScriptBackground | None = None,
) -> None:
16 changes: 16 additions & 0 deletions docs/pages/concepts/generation.md
@@ -12,6 +12,8 @@ async def agenerate(
output_parser: BaseOutputParser[OutputType],
temperature: float = 0.7,
structured_output: bool = False,
+    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
+    use_fixed_model_version: bool = True
) -> OutputType:
input_variables = re.findall(r"(?<!{){([^{}]+)}(?!})", template)
```
@@ -23,6 +25,12 @@ The `agenerate` function is versatile by taking the output_parser as an argument
* `gpt-4o-mini-2024-07-18` and later
* `gpt-4o-2024-08-06` and later

The `bad_output_process_model` is used to reprocess badly formatted outputs. `DEFAULT_BAD_OUTPUT_PROCESS_MODEL` is set to `gpt-4o-mini`. (At the time Sotopia was published, we used `gpt-3.5-turbo-0613`; that model has since been retired by OpenAI.)

The `use_fixed_model_version` flag determines whether to pin the model version. If set to `True`, the model version is fixed to the one used in the Sotopia paper; if set to `False`, the latest available version is used.

Warning: some fixed model versions may no longer be available in the OpenAI API, so setting `use_fixed_model_version = True` can result in an error.

</Callout>

Here are a few examples of how to use the `agenerate` function:
@@ -37,6 +45,8 @@ async def agenerate_env_profile(
inspiration_prompt: str = "asking my boyfriend to stop being friends with his ex",
examples: str = "",
temperature: float = 0.7,
+    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
+    use_fixed_model_version: bool = True
) -> tuple[EnvironmentProfile, str]:
"""
Using langchain to generate the background
@@ -56,6 +66,8 @@ async def agenerate_env_profile(
),
output_parser=PydanticOutputParser(pydantic_object=EnvironmentProfile),
temperature=temperature,
+    bad_output_process_model=bad_output_process_model,
+    use_fixed_model_version=use_fixed_model_version
)
```
### Other generation functions
@@ -66,6 +78,8 @@ Similarly, there are other utility functions that build upon the `agenerate` function:
async def agenerate_relationship_profile(
model_name: str,
agents_profiles: list[str],
+    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
+    use_fixed_model_version: bool = True
) -> tuple[RelationshipProfile, str]
```

@@ -78,5 +92,7 @@ async def agenerate_script(
agent_name: str = "",
history: str = "",
single_step: bool = False,
+    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
+    use_fixed_model_version: bool = True
) -> tuple[ScriptInteractionReturnType, str]
```
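The `bad_output_process_model` and `use_fixed_model_version` options documented in this diff can be illustrated with a self-contained sketch. The parser, repair step, and version table below are hypothetical stand-ins for illustration, not Sotopia's actual implementation:

```python
import json

# Hypothetical default, mirroring the documented DEFAULT_BAD_OUTPUT_PROCESS_MODEL.
DEFAULT_BAD_OUTPUT_PROCESS_MODEL = "gpt-4o-mini"

# Hypothetical pin table: model family -> version used in the Sotopia paper.
FIXED_MODEL_VERSIONS = {
    "gpt-4o-mini": "gpt-4o-mini-2024-07-18",
}


def resolve_model(model_name: str, use_fixed_model_version: bool = True) -> str:
    """Pin the model to its paper version when requested, else pass through."""
    if use_fixed_model_version:
        return FIXED_MODEL_VERSIONS.get(model_name, model_name)
    return model_name


def parse_or_reprocess(
    raw_output: str,
    bad_output_process_model: str = DEFAULT_BAD_OUTPUT_PROCESS_MODEL,
) -> dict:
    """Parse JSON output; on failure, route the bad output to a repair model."""
    try:
        return json.loads(raw_output)
    except json.JSONDecodeError:
        # The real library would re-prompt the repair model to fix the
        # formatting; here we only record which model would handle it.
        return {"repaired_by": bad_output_process_model, "original": raw_output}


print(resolve_model("gpt-4o-mini"))                   # pinned paper version
print(parse_or_reprocess("not json")["repaired_by"])  # gpt-4o-mini
```

As the diff's warning notes, a pinned version may have been retired by OpenAI; that is exactly the case where `use_fixed_model_version=False` (pass-through to the latest version) is the safer choice.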
4 changes: 2 additions & 2 deletions docs/pages/index.mdx
@@ -206,8 +206,8 @@ asyncio.run(
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
4 changes: 2 additions & 2 deletions examples/benchmark_evaluator.py
@@ -15,8 +15,8 @@

target_model_patterns: list[list[str]] = [
["gpt-4", "gpt-4", "gpt-3.5-turbo"],
-    ["gpt-4", "gpt-3.5-turbo", "gpt-4"],
-    ["gpt-4", "gpt-3.5-turbo", "togethercomputer/llama-2-70b-chat"],
+    ["gpt-4", "gpt-4o-mini", "gpt-4"],
+    ["gpt-4", "gpt-4o-mini", "togethercomputer/llama-2-70b-chat"],
["gpt-4", "togethercomputer/llama-2-70b-chat", "gpt-3.5-turbo"],
]

4 changes: 2 additions & 2 deletions examples/experiment_eval.py
@@ -170,8 +170,8 @@ def run_async_server_in_batch(
batch_size: int = 1,
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
tag: str | None = None,
verbose: bool = False,
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes.py
@@ -252,8 +252,8 @@ def re_run_missing_episodes(
combo_with_models: dict[tuple[LLM_Name, LLM_Name], list[tuple[str, str, str]]],
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
batch_size: int = 5,
verbose: bool = False,
4 changes: 2 additions & 2 deletions examples/fix_missing_episodes_with_tag.py
@@ -350,8 +350,8 @@ def re_run_missing_episodes(
env_agent_ids: List[Tuple[str, str, str]] = [],
model_names: dict[str, LLM_Name] = {
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
batch_size: int = 5,
rerun_tag: str = "missing_episodes",
2 changes: 1 addition & 1 deletion examples/generate_script.py
@@ -175,7 +175,7 @@ def full_freeform(
def run_async_server_in_batch_script(
*,
batch_size: int = 10,
-    model: LLM_Name = "gpt-3.5-turbo",
+    model: LLM_Name = "gpt-4o-mini",
tag: str | None = None,
push_to_db: bool = True,
json_in_script: bool = False,
4 changes: 2 additions & 2 deletions examples/minimalist_demo.py
@@ -28,8 +28,8 @@
run_async_server(
model_dict={
"env": "gpt-4",
"agent1": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"agent1": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
},
sampler=UniformSampler(),
)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "sotopia"
-version = "0.1.0-rc.3"
+version = "0.1.0-rc.5"
yufansong (Collaborator) commented on Oct 4, 2024:

I wonder why it jumps directly from 3 to 5 and skips 4 🤔, typo? Seems irreversible but no severe impact.

A member replied:

a typo yeah, my bad

description = "A platform for simulating and evaluating social interaction."
authors = ["Hao Zhu <[email protected]>, Xuhui Zhou <[email protected]>"]
license = "MIT License"
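On the rc.3 → rc.5 question above: skipping rc.4 is harmless for version ordering, since prerelease numbers only need to increase, not be contiguous. A minimal sketch of comparing tags of the `X.Y.Z-rc.N` shape used in this repo (hand-rolled parse, avoiding any external dependency):

```python
def parse_version(tag: str) -> tuple:
    """Parse 'X.Y.Z' or 'X.Y.Z-rc.N' into a sortable tuple.

    A final release sorts after any of its release candidates,
    so it is modeled as rc = infinity.
    """
    core, _, pre = tag.partition("-rc.")
    major, minor, patch = (int(p) for p in core.split("."))
    rc = int(pre) if pre else float("inf")
    return (major, minor, patch, rc)


# rc.5 sorts after rc.3 even though rc.4 was skipped.
assert parse_version("0.1.0-rc.5") > parse_version("0.1.0-rc.3")
# The final release sorts after every release candidate.
assert parse_version("0.1.0") > parse_version("0.1.0-rc.5")
```

PEP 440 spells the canonical form `0.1.0rc5`; the dashed style here follows this repo's own tags, so the comparison logic rather than the exact string shape is the point.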
10 changes: 5 additions & 5 deletions sotopia/agents/generate_agent_background.py
@@ -20,13 +20,13 @@ async def generate_background(
else:
initial_profile = str(basic_info)
profile = await agenerate_init_profile(
-        model_name="gpt-3.5-turbo", basic_info=basic_info
+        model_name="gpt-4o-mini", basic_info=basic_info
)
first_narrative = convert_narratives(
-        model_name="gpt-3.5-turbo", narrative="first", text=profile
+        model_name="gpt-4o-mini", narrative="first", text=profile
)
second_narrative = convert_narratives(
-        model_name="gpt-3.5-turbo", narrative="second", text=profile
+        model_name="gpt-4o-mini", narrative="second", text=profile
)
previous_messages = []
return (
@@ -67,8 +67,8 @@ def generate_background_conversation(
json.dump(background_dict, f, indent=4)

model_names: dict[str, str] = {
"env": "gpt-3.5-turbo",
"agent2": "gpt-3.5-turbo",
"env": "gpt-4o-mini",
"agent2": "gpt-4o-mini",
"agent1": "gpt-4",
}

4 changes: 2 additions & 2 deletions sotopia/agents/llm_agent.py
@@ -27,7 +27,7 @@ def __init__(
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
-    model_name: str = "gpt-3.5-turbo",
+    model_name: str = "gpt-4o-mini",
script_like: bool = False,
) -> None:
super().__init__(
@@ -99,7 +99,7 @@ def __init__(
agent_name: str | None = None,
uuid_str: str | None = None,
agent_profile: AgentProfile | None = None,
-    model_name: str = "gpt-3.5-turbo",
+    model_name: str = "gpt-4o-mini",
agent_names: list[str] = [],
background: ScriptBackground | None = None,
) -> None:
2 changes: 1 addition & 1 deletion sotopia/envs/parallel.py
@@ -130,7 +130,7 @@ def __init__(
["none", "speak", "non-verbal communication", "action", "leave"]
),
action_order: Literal["simultaneous", "round-robin", "random"] = "simultaneous",
-    model_name: str = "gpt-3.5-turbo",
+    model_name: str = "gpt-4o-mini",
evaluators: list[Evaluator] = [],
terminal_evaluators: list[Evaluator] = [],
uuid_str: str | None = None,