refactor: Move generate_diff to utils and update DSExpGen logic
you-n-g committed Jan 16, 2025
1 parent 953a1d7 commit ad47360
Showing 7 changed files with 253 additions and 258 deletions.
1 change: 1 addition & 0 deletions rdagent/app/data_science/conf.py
@@ -12,6 +12,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

## proposal
exp_gen: str = "rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen"
# exp_gen_init_kwargs: dict = {"max_trace_hist": 3} # TODO: to be configurable

# the two below should be used in ExpGen
# hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
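The commented-out `exp_gen_init_kwargs` suggests that the dotted `exp_gen` class path will eventually be paired with constructor keyword arguments. A minimal sketch of that idea, assuming a hypothetical `load_exp_gen` helper rather than rdagent's actual class-loading code:

```python
# Illustrative only: rdagent's real loader and DSExpGen's constructor may differ
# (DSExpGen likely needs additional arguments such as the scenario object).
import importlib


def load_exp_gen(class_path: str, init_kwargs: dict | None = None):
    """Resolve a dotted path like
    'rdagent.scenarios.data_science.proposal.exp_gen.DSExpGen'
    and instantiate the class with optional keyword arguments."""
    module_name, _, class_name = class_path.rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    return cls(**(init_kwargs or {}))


# e.g. load_exp_gen(settings.exp_gen, {"max_trace_hist": 3})
```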
57 changes: 1 addition & 56 deletions rdagent/scenarios/data_science/dev/feedback.py
@@ -1,7 +1,4 @@
import difflib
import json
from pathlib import Path
from typing import List

from rdagent.components.knowledge_management.graph import UndirectedNode
from rdagent.core.experiment import Experiment
@@ -17,59 +14,7 @@
from rdagent.scenarios.data_science.proposal.exp_gen import DSTrace
from rdagent.utils import convert2bool, remove_path_info_from_str
from rdagent.utils.agent.tpl import T


# TODO: find a better place.
def generate_diff(dir1: str, dir2: str) -> List[str]:
"""
Generate a diff between two directories, considering only .py files.
It is mocking `diff -durN dir1 dir2` in linux.
Args:
dir1 (str): Path to the first directory.
dir2 (str): Path to the second directory.
Returns:
List[str]: A list of diffs for .py files that are different between the two directories.
"""

diff_files = []

dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()}
dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()}

all_files = dir1_files.union(dir2_files)

for file in all_files:
file1 = Path(dir1) / file
file2 = Path(dir2) / file

if file1.exists() and file2.exists():
with file1.open() as f1, file2.open() as f2:
diff = list(
difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2))
)
if diff:
diff_files.extend(diff)
else:
if file1.exists():
with file1.open() as f1:
diff = list(
difflib.unified_diff(
f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)"
)
)
diff_files.extend(diff)
elif file2.exists():
with file2.open() as f2:
diff = list(
difflib.unified_diff(
[], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2)
)
)
diff_files.extend(diff)

return diff_files
from rdagent.utils.repo.diff import generate_diff


class DSExperiment2Feedback(Experiment2Feedback):
1 change: 1 addition & 0 deletions rdagent/scenarios/data_science/experiment/experiment.py
@@ -1,4 +1,5 @@
import re
import pandas as pd
from typing import Literal

from rdagent.core.experiment import Experiment, FBWorkspace, Task
293 changes: 107 additions & 186 deletions rdagent/scenarios/data_science/proposal/exp_gen.py

Large diffs are not rendered by default.

99 changes: 85 additions & 14 deletions rdagent/scenarios/data_science/proposal/prompts.yaml
@@ -1,4 +1,4 @@
hypothesis_gen:
hypothesis_gen: # It is deprecated now, please refer to direct_exp_gen
system: |-
The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process.
The {{targets}} are used in the following scenario:
@@ -33,7 +33,7 @@ hypothesis_gen:
In addition, generate relevant reasoning and distilled knowledge keys.
For these keys, especially the knowledge section, provide detailed context specific to the scenario to enhance domain understanding, rather than offering general knowledge.
hypothesis_model:
hypothesis_model: # It is deprecated now, please refer to direct_exp_gen
system: |-
The user is working on generating new hypotheses for the {{targets}} in a data-driven research and development process.
The {{targets}} are used in the following scenario:
@@ -59,15 +59,7 @@ hypothesis_model:
Please generate the output using the following format and specifications:
{{ hypothesis_output_format }}
hypothesis_and_feedback: |-
{% for experiment, feedback in hist %}
Hypothesis {{ loop.index }}
Observation on the result with the hypothesis: {{ feedback.observations }}
Feedback on the original hypothesis: {{ feedback.hypothesis_evaluation }}
Did changing to this hypothesis work? (focus on the change): {{ feedback.decision }}
{% endfor %}
task_gen:
task_gen: # It is deprecated now, please refer to direct_exp_gen
system: |-
{% if hypothesis is not none %}
The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step.
@@ -111,7 +103,7 @@ task_gen:
Please generate the new {{targets}} task.
{% endif %}
task_gen_model:
task_gen_model: # It is deprecated now, please refer to direct_exp_gen
system: |-
{% if hypothesis is not none %}
The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step.
@@ -143,6 +135,85 @@ task_gen_model:
Please generate the new {{targets}} task.
{% endif %}
direct_exp_gen:
system: |-
You are a data scientist and a top Kaggle competitor. The user is working on creating a solution for a Kaggle competition. Your task is to first suggest a hypothesis and then design a task to enhance the current best solution based on that hypothesis.
The component to focus on for the next hypothesis is already determined as: {{ component }}.
It will be used in the following scenario:
{{scenario}}
# Hypothesis Proposal
The user has already proposed several hypotheses and conducted evaluations on them. This information will be provided to you later. Your task is to check if a similar hypothesis has already been generated. If one exists and you agree with it, you can use it. If you disagree, please create an improved version.
To assist you in formulating new hypotheses, the user has provided some additional information:
Hypotheses should avoid being too general or vague; they should be specific and actionable. For example, a hypothesis like 'tune a model' is too general, while a hypothesis like 'increasing the learning rate of the lightgbm model to 0.1 will improve performance' is specific and actionable.
Your hypothesis should be based on the current SOTA solution. The user will conduct experiments based on the SOTA solution (the current best experiment) to test whether your hypothesis holds for this specific competition.
Important: If the hypothesis_specification outlines the next steps you need to follow, ensure you adhere to those instructions.
[Partial Response Format 1] Your generated output should contain key-value pairs adhering to the following format and specifications:
{{ hypothesis_output_format }}
Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge.
# Task Design
The user is trying to generate new {{targets}} based on the hypothesis generated in the previous step.
The scope of the {{targets}} can be described by an interface specification as follows:
```Python
{{task_specification}}
```
The user will use the {{targets}} generated to do some experiments. The user will provide this information to you:
1. The target hypothesis you are generating {{targets}} for.
2. The hypothesis generated in the previous steps and their corresponding feedbacks.
3. Former proposed {{targets}} on similar hypothesis.
4. Some additional information to help you generate new {{targets}}.
[Partial Response Format 2] Your generated output should contain key-value pairs adhering to the following format and specifications:
{{ task_output_format }}
{% if extra_requirement %}
{{extra_requirement}}
{% endif %}
# Response Requirement
Your generated output should combine the key-value pairs based on [Partial Response Format 1] and [Partial Response Format 2]. Please output the key-values specified in [Partial Response Format 1] first.
user: |-
# The detailed description of current best experiments
{{sota_exp_desc}}
## The corresponding feedback for the best experiments
{{ exp_and_feedback_desc }}
{% if recent_trace_desc %}
# Several trials after the best experiments
The user has made several hypotheses in this scenario and evaluated each of them.
The former hypotheses and the corresponding feedback are as follows (focus on the last one and the new hypothesis and reasoning it provides, to see if you agree):
{{recent_trace_desc}}
# The difference from the best experiments to the last one
{{last_exp_diff}}
{% endif %}
extra_requirement:
model: |-
If there are sufficient models available, your task is to choose one of the existing models for further tuning or optimization, based on the model's information.
If the number of available models is insufficient, your task is to first decide whether to:
- Tune an existing model: Select one of the current models for further tuning and improvement.
- Add a new model: Introduce a new model to expand the hypothesis space.
The information about the model is described by the code in the workspace.
Make a decision and proceed accordingly:
- If you decide to tune an existing model, select the existing model file and generate a new hypothesis.
- If you decide to add a new model, specify the type of model you would add and generate a new hypothesis related to the new model.
component_gen:
system: |-
You are a Kaggle Grandmaster. You are going to provide a solution for a Kaggle competition.
@@ -155,8 +226,8 @@ component_gen:
# Here is the current best version of implementation.
{{sota_exp_desc}}
# Here is the latest version of implementation
{{current_exp_desc}}
# Here is the latest version of implementation different from the sota_exp_desc
{{last_exp_diff}}
You will be provided the feedback for the latest implementation.
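The new `direct_exp_gen` entries are Jinja2 templates. As a rough, hypothetical sketch (rdagent itself renders prompts through `rdagent.utils.agent.tpl.T`, not raw `yaml` plus `jinja2`), the system prompt could be filled in like this; all variable values below are placeholders:

```python
# Hypothetical rendering of the direct_exp_gen system prompt; placeholder values throughout.
import yaml
from jinja2 import Template

with open("rdagent/scenarios/data_science/proposal/prompts.yaml") as f:
    prompts = yaml.safe_load(f)

system_prompt = Template(prompts["direct_exp_gen"]["system"]).render(
    component="Model",                          # placeholder component name
    scenario="A tabular Kaggle competition",    # placeholder scenario description
    hypothesis_output_format="{...}",           # placeholder output schema
    targets="models",
    task_specification="class ModelTask: ...",  # placeholder interface spec
    task_output_format="{...}",
    extra_requirement=None,                     # or the model-specific extra requirement text
)
print(system_prompt[:500])
```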
4 changes: 2 additions & 2 deletions rdagent/scenarios/data_science/share.yaml
@@ -16,7 +16,7 @@ describe: # some template to describe some object
{% if exp.result is none %}
There are no according evaluation results
{% else %}
Evaluated results is:
Evaluated results on validation is:
{{ exp.result }}
{% endif %}
@@ -27,7 +27,7 @@ describe: # some template to describe some object
feedback: |-
{% if exp_and_feedback and exp_and_feedback|length > 1 %}
## {{heading | default('Previous trial and feedback')}}
Before current trial, previous recent trial is listed below.
Before current trial, a previous recent trial is listed below.
{% if exp_and_feedback[0].hypothesis %}
the experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
{% endif %}
56 changes: 56 additions & 0 deletions rdagent/utils/repo/diff.py
@@ -0,0 +1,56 @@
import difflib
from pathlib import Path
from typing import List


def generate_diff(dir1: str, dir2: str) -> List[str]:
"""
Generate a diff between two directories (from dir1 to dir2), considering only .py files.
It mimics `diff -durN dir1 dir2` on Linux.
Args:
dir1 (str): Path to the first directory.
dir2 (str): Path to the second directory.
Returns:
List[str]: A list of diffs for .py files that are different between the two directories.
"""

diff_files = []

dir1_files = {f.relative_to(dir1) for f in Path(dir1).rglob("*.py") if f.is_file()}
dir2_files = {f.relative_to(dir2) for f in Path(dir2).rglob("*.py") if f.is_file()}

all_files = dir1_files.union(dir2_files)

for file in all_files:
file1 = Path(dir1) / file
file2 = Path(dir2) / file

if file1.exists() and file2.exists():
with file1.open() as f1, file2.open() as f2:
diff = list(
difflib.unified_diff(f1.readlines(), f2.readlines(), fromfile=str(file1), tofile=str(file2))
)
if diff:
diff_files.extend(diff)
else:
if file1.exists():
with file1.open() as f1:
diff = list(
difflib.unified_diff(
f1.readlines(), [], fromfile=str(file1), tofile=str(file2) + " (empty file)"
)
)
diff_files.extend(diff)
elif file2.exists():
with file2.open() as f2:
diff = list(
difflib.unified_diff(
[], f2.readlines(), fromfile=str(file1) + " (empty file)", tofile=str(file2)
)
)
diff_files.extend(diff)

return diff_files
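
A quick, hypothetical usage sketch of the relocated helper (the directory names below are made up for illustration):

```python
# Hypothetical usage: compare two workspace snapshots and print the .py diff.
from rdagent.utils.repo.diff import generate_diff

diff_lines = generate_diff("workspace_sota", "workspace_latest")
if diff_lines:
    print("".join(diff_lines))  # unified_diff lines already end with newlines
else:
    print("No .py differences between the two directories.")
```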
