test: Test ds refactor ll (#523)

* fix bugs to former scenario
* fix a bug because coding in rdloop changed
* fix the bug when feedback gets no hypothesis
* fix trace structure
* change all trace hist when merging hypothesis to experiments
* ignore some error in ruff
* fix kaggle scenario bugs
* refine one line
* another bug
* another small bug
* fix ui bugs
* change kaggle train.py path

---------

Co-authored-by: Xu Yang <[email protected]>
microsoft · Jan 17, 2025 · ae0ec76 · ae0ec76
1 parent e572aa1
commit ae0ec76
Show file tree

Hide file tree

Showing 19 changed files with 134 additions and 102 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -81,7 +81,6 @@ src = ["rdagent"]
 [tool.ruff.lint]
 ignore = [
   # https://docs.astral.sh/ruff/rules/#pydocstyle-d
-  "ANN101",
   "ANN401",
   "D",
   "ERA001",
@@ -92,7 +91,7 @@ ignore = [
   "S101",
   "S301",
   "T20",
-  "TCH003",
+  "TC003",
   "TD",
 ]
 select = ["ALL"]

diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -7,7 +7,7 @@
 from rdagent.components.workflow.conf import BasePropSetting
 from rdagent.components.workflow.rd_loop import RDLoop
 from rdagent.core.developer import Developer
-from rdagent.core.exception import FactorEmptyError, ModelEmptyError
+from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError
 from rdagent.core.proposal import (
     Experiment2Feedback,
     Hypothesis2Experiment,
@@ -115,7 +115,7 @@ def running(self, prev_out: dict[str, Any]):
 
         return exp
 
-    skip_loop_error = (ModelEmptyError, FactorEmptyError)
+    skip_loop_error = (ModelEmptyError, FactorEmptyError, CoderError)
 
 
 def main(path=None, step_n=None, competition=None):

diff --git a/rdagent/app/qlib_rd_loop/factor_from_report.py b/rdagent/app/qlib_rd_loop/factor_from_report.py
@@ -97,6 +97,7 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib
 
     report_content = "\n".join(docs_dict.values())
     hypothesis = generate_hypothesis(factor_result, report_content)
+    exp.hypothesis = hypothesis
     return exp, hypothesis
 
 
@@ -128,7 +129,9 @@ def propose_hypo_exp(self, prev_out: dict[str, Any]):
                 if exp is None:
                     continue
                 self.valid_pdf_file_count += 1
-                exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in self.trace.hist if t[2]]
+                exp.based_experiments = [QlibFactorExperiment(sub_tasks=[], hypothesis=hypothesis)] + [
+                    t[0] for t in self.trace.hist if t[1]
+                ]
                 exp.sub_workspace_list = exp.sub_workspace_list[: FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp]
                 exp.sub_tasks = exp.sub_tasks[: FACTOR_FROM_REPORT_PROP_SETTING.max_factors_per_exp]
                 logger.log_object(hypothesis, tag="hypothesis generation")
@@ -143,6 +146,12 @@ def propose(self, prev_out: dict[str, Any]):
     def exp_gen(self, prev_out: dict[str, Any]):
         return self.current_loop_exp
 
+    def coding(self, prev_out: dict[str, Any]):
+        with logger.tag("d"):  # develop
+            exp = self.coder.develop(prev_out["exp_gen"])
+            logger.log_object(exp.sub_workspace_list, tag="coder result")
+        return exp
+
 
 def main(report_folder=None, path=None, step_n=None):
     """

diff --git a/rdagent/components/coder/factor_coder/eva_utils.py b/rdagent/components/coder/factor_coder/eva_utils.py
@@ -79,7 +79,7 @@ def evaluate(
         **kwargs,
     ):
         factor_information = target_task.get_task_information()
-        code = implementation.code
+        code = implementation.all_codes
 
         system_prompt = (
             Environment(undefined=StrictUndefined)

diff --git a/rdagent/components/workflow/rd_loop.py b/rdagent/components/workflow/rd_loop.py
@@ -11,6 +11,7 @@
     Experiment2Feedback,
     Hypothesis,
     Hypothesis2Experiment,
+    HypothesisFeedback,
     HypothesisGen,
     Trace,
 )
@@ -74,9 +75,18 @@ def running(self, prev_out: dict[str, Any]):
         return exp
 
     def feedback(self, prev_out: dict[str, Any]):
-        feedback = self.summarizer.generate_feedback(
-            prev_out["running"], prev_out["direct_exp_gen"]["propose"], self.trace
-        )
-        with logger.tag("ef"):  # evaluate and feedback
-            logger.log_object(feedback, tag="feedback")
-        self.trace.hist.append((prev_out["direct_exp_gen"]["propose"], prev_out["running"], feedback))
+        e = prev_out.get(self.EXCEPTION_KEY, None)
+        if e is not None:
+            feedback = HypothesisFeedback(
+                observations="Error occurred in loop, skip this loop",
+                hypothesis_evaluation="",
+                new_hypothesis="",
+                reason="",
+                decision=False,
+            )
+            self.trace.hist.append((prev_out["direct_exp_gen"]["exp_gen"], feedback))
+        else:
+            feedback = self.summarizer.generate_feedback(prev_out["running"], self.trace)
+            with logger.tag("ef"):  # evaluate and feedback
+                logger.log_object(feedback, tag="feedback")
+            self.trace.hist.append((prev_out["running"], feedback))
diff --git a/rdagent/core/evolving_agent.py b/rdagent/core/evolving_agent.py
@@ -59,50 +59,50 @@ def multistep_evolve(
         filter_final_evo: bool = False,
     ) -> EvolvableSubjects:
         for evo_loop_id in tqdm(range(self.max_loop), "Implementing"):
-            with logger.tag(f"evo_loop_{evo_loop_id}"):
-                # 1. knowledge self-evolving
-                if self.knowledge_self_gen and self.rag is not None:
-                    self.rag.generate_knowledge(self.evolving_trace)
-                # 2. RAG
-                queried_knowledge = None
-                if self.with_knowledge and self.rag is not None:
-                    # TODO: Putting the evolving trace in here doesn't actually work
-                    queried_knowledge = self.rag.query(evo, self.evolving_trace)
-
-                # 3. evolve
-                evo = self.evolving_strategy.evolve(
-                    evo=evo,
-                    evolving_trace=self.evolving_trace,
-                    queried_knowledge=queried_knowledge,
+            # with logger.tag(f"evo_loop_{evo_loop_id}"):
+            # 1. knowledge self-evolving
+            if self.knowledge_self_gen and self.rag is not None:
+                self.rag.generate_knowledge(self.evolving_trace)
+            # 2. RAG
+            queried_knowledge = None
+            if self.with_knowledge and self.rag is not None:
+                # TODO: Putting the evolving trace in here doesn't actually work
+                queried_knowledge = self.rag.query(evo, self.evolving_trace)
+
+            # 3. evolve
+            evo = self.evolving_strategy.evolve(
+                evo=evo,
+                evolving_trace=self.evolving_trace,
+                queried_knowledge=queried_knowledge,
+            )
+            # TODO: Due to design issues, we have chosen to ignore this mypy error.
+            logger.log_object(evo.sub_workspace_list, tag="evolving code")  # type: ignore[attr-defined]
+            for sw in evo.sub_workspace_list:  # type: ignore[attr-defined]
+                logger.info(f"evolving code workspace: {sw}")
+
+            # 4. Pack evolve results
+            es = EvoStep(evo, queried_knowledge)
+
+            # 5. Evaluation
+            if self.with_feedback:
+                es.feedback = (
+                    # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator,
+                    # it fails mypy's test here, so we'll ignore this error for now.
+                    eva
+                    if isinstance(eva, Feedback)
+                    else eva.evaluate(evo, queried_knowledge=queried_knowledge)  # type: ignore[arg-type, call-arg]
                 )
-                # TODO: Due to design issues, we have chosen to ignore this mypy error.
-                logger.log_object(evo.sub_workspace_list, tag="evolving code")  # type: ignore[attr-defined]
-                for sw in evo.sub_workspace_list:  # type: ignore[attr-defined]
-                    logger.info(f"evolving code workspace: {sw}")
-
-                # 4. Pack evolve results
-                es = EvoStep(evo, queried_knowledge)
-
-                # 5. Evaluation
-                if self.with_feedback:
-                    es.feedback = (
-                        # TODO: Due to the irregular design of rdagent.core.evaluation.Evaluator,
-                        # it fails mypy's test here, so we'll ignore this error for now.
-                        eva
-                        if isinstance(eva, Feedback)
-                        else eva.evaluate(evo, queried_knowledge=queried_knowledge)  # type: ignore[arg-type, call-arg]
-                    )
-                    logger.log_object(es.feedback, tag="evolving feedback")
-
-                # 6. update trace
-                self.evolving_trace.append(es)
-
-                # 7. check if all tasks are completed
-                if self.with_feedback:
-                    all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback
-                    if all_completed:
-                        logger.info("All tasks in evolving subject have been completed.")
-                        break
+                logger.log_object(es.feedback, tag="evolving feedback")
+
+            # 6. update trace
+            self.evolving_trace.append(es)
+
+            # 7. check if all tasks are completed
+            if self.with_feedback:
+                all_completed = all(es.feedback) if isinstance(es.feedback, list) else es.feedback
+                if all_completed:
+                    logger.info("All tasks in evolving subject have been completed.")
+                    break
 
         if self.with_feedback and filter_final_evo:
             evo = self.filter_evolvable_subjects_by_feedback(evo, self.evolving_trace[-1].feedback)

diff --git a/rdagent/log/storage.py b/rdagent/log/storage.py
@@ -100,6 +100,8 @@ def iter_msg(self, watch: bool = False) -> Generator[Message, None, None]:
                 msg_l.append(m)
 
         for file in self.path.glob("**/*.pkl"):
+            if file.name == "debug_llm.pkl":
+                continue
             tag = ".".join(file.relative_to(self.path).as_posix().replace("/", ".").split(".")[:-3])
             pid = file.parent.name
 

diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py
@@ -357,7 +357,7 @@ def hypothesis_hover_text(h: Hypothesis, d: bool = False):
     hover_texts = [
         hypothesis_hover_text(state.hypotheses[int(i[6:])], state.h_decisions[int(i[6:])])
         for i in df.index
-        if i != "alpha158"
+        if i != "alpha158" and i != "Baseline"
     ]
     if state.alpha158_metrics is not None:
         hover_texts = ["Baseline: alpha158"] + hover_texts

diff --git a/rdagent/scenarios/data_mining/proposal/model_proposal.py b/rdagent/scenarios/data_mining/proposal/model_proposal.py
@@ -80,7 +80,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
             else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
+        experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist]
 
         model_list = []
         for experiment in experiment_list:
@@ -117,5 +117,5 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace)
                 )
             )
         exp = DMModelExperiment(tasks, hypothesis=hypothesis)
-        exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+        exp.based_experiments = [t[0] for t in trace.hist if t[1]]
         return exp
diff --git a/rdagent/scenarios/data_science/proposal/prompts.yaml b/rdagent/scenarios/data_science/proposal/prompts.yaml
@@ -59,6 +59,15 @@ hypothesis_model: # It is deprecated now, please refer to direct_exp_gen
     Please generate the output using the following format and specifications:
     {{ hypothesis_output_format }}
 
+hypothesis_and_feedback: |-
+  {% for experiment, feedback in hist %}
+  Hypothesis {{ loop.index }}
+  The experiment is design driven by hypothesis : {{ experiment.hypothesis }}
+  Observation on the result with the hypothesis: {{ feedback.observations }}
+  Feedback on the original hypothesis:  {{ feedback.hypothesis_evaluation }}
+  Did changing to this hypothesis work? (focus on the change):  {{ feedback.decision }}
+  {% endfor %}
+
 task_gen: # It is deprecated now, please refer to direct_exp_gen
   system: |-
     {% if hypothesis is not none %}

diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py
@@ -111,15 +111,15 @@ def generate_feedback(self, exp: Experiment, trace: Trace) -> HypothesisFeedback
             ]
         else:
             current_sub_exps_to_code = {
-                sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list
+                sub_ws.target_task.get_task_information(): sub_ws.all_codes for sub_ws in exp.sub_workspace_list
             }
         current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2)
         current_result = exp.result
         current_sub_results = exp.sub_results
 
         last_hypothesis_and_feedback = None
         if trace.hist and len(trace.hist) > 0:
-            last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2])
+            last_hypothesis_and_feedback = (trace.hist[-1][0].hypothesis, trace.hist[-1][1])
 
         # Prepare render dictionary
         render_dict = {

diff --git a/rdagent/scenarios/kaggle/experiment/scenario.py b/rdagent/scenarios/kaggle/experiment/scenario.py
@@ -125,7 +125,7 @@ def background(self) -> str:
         background_template = prompt_dict["kg_background"]
 
         train_script = (
-            Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template" / "train.py"
+            Path(__file__).parent / "templates" / KAGGLE_IMPLEMENT_SETTING.competition / "train.py"
         ).read_text()
 
         background_prompt = (

diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml
@@ -25,8 +25,8 @@ KG_hypothesis_gen_RAG: |-
   {% endif %}
 
 hypothesis_and_feedback: |-
-  {% for hypothesis, experiment, feedback in trace.hist[-10:] %}
-  Hypothesis {{ loop.index }}: {{ hypothesis }}
+  {% for experiment, feedback in trace.hist[-10:] %}
+  Hypothesis {{ loop.index }}: {{ experiment.hypothesis }}
   Observation on the result with the hypothesis: {{ feedback.observations }}
   Feedback on the original hypothesis:  {{ feedback.hypothesis_evaluation }}
   Did changing to this hypothesis work? (focus on the change):  {{ feedback.decision }}

diff --git a/rdagent/scenarios/kaggle/proposal/proposal.py b/rdagent/scenarios/kaggle/proposal/proposal.py
@@ -276,11 +276,11 @@ def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
 
         hypothesis_specification = f"Hypothesis should avoid being too general and vague, and should be specific and actionable. For example, hypothesis like 'tune a model' is too general, while hypothesis like 'increase the learning rate to 0.1 of the lightgbm model will improve the performance' is specific and actionable."
         if len(trace.hist) > 0:
-            sota_features = str(trace.hist[-1][1].based_experiments[-1].experiment_workspace.data_description)
+            sota_features = str(trace.hist[-1][0].based_experiments[-1].experiment_workspace.data_description)
             sota_models = json.dumps(
-                trace.hist[-1][1].based_experiments[-1].experiment_workspace.model_description, indent=2
+                trace.hist[-1][0].based_experiments[-1].experiment_workspace.model_description, indent=2
             )
-            sota_result = trace.hist[-1][1].based_experiments[-1].result
+            sota_result = trace.hist[-1][0].based_experiments[-1].result
             hypothesis_specification += f"\nYour hypothesis should based on current SOTA solution. The user will conduct experiments based on the SOTA solution to test whether your hypothesis is right on this specific ecompetition. \n\nSOTA Features: {sota_features}\n\nSOTA Models: {sota_models}\n\nSOTA Result: {sota_result}"
         if self.scen.if_action_choosing_based_on_UCB:
             hypothesis_specification += (
@@ -340,7 +340,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
             else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
+        experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist]
 
         model_list = []
         for experiment in experiment_list:
@@ -384,7 +384,7 @@ def convert_feature_experiment(self, response: str, hypothesis: Hypothesis, trac
             sub_tasks=tasks,
             based_experiments=(
                 [KGFactorExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])]
-                + [t[1] for t in trace.hist if t[2]]
+                + [t[0] for t in trace.hist if t[1]]
             ),
             hypothesis=hypothesis,
         )
@@ -400,7 +400,7 @@ def convert_model_experiment(self, response: str, hypothesis: Hypothesis, trace:
             )
 
         based_experiments = [KGModelExperiment(sub_tasks=[], source_feature_size=trace.scen.input_shape[-1])] + [
-            t[1] for t in trace.hist if t[2]
+            t[0] for t in trace.hist if t[1]
         ]
         model_type = response_dict.get("model_type", "Model type not provided")
         if model_type in KG_MODEL_MAPPING:

diff --git a/rdagent/scenarios/qlib/prompts.yaml b/rdagent/scenarios/qlib/prompts.yaml
@@ -1,6 +1,6 @@
 hypothesis_and_feedback: |-
-  {% for hypothesis, experiment, feedback in trace.hist[-10:] %}
-  Hypothesis {{ loop.index }}: {{ hypothesis }}
+  {% for experiment, feedback in trace.hist[-10:] %}
+  Hypothesis {{ loop.index }}: {{ experiment.hypothesis }}
   Corresponding Code (that leads to the difference in performance): {{experiment.sub_workspace_list[0].file_dict.get("model.py")}}
   Observation on the result with the hypothesis: {{ feedback.observations }}
   Feedback on the original hypothesis:  {{ feedback.hypothesis_evaluation }}

diff --git a/rdagent/scenarios/qlib/proposal/factor_proposal.py b/rdagent/scenarios/qlib/proposal/factor_proposal.py
@@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict |
             else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        experiment_list: List[FactorExperiment] = [t[1] for t in trace.hist]
+        experiment_list: List[FactorExperiment] = [t[0] for t in trace.hist]
 
         factor_list = []
         for experiment in experiment_list:
@@ -98,7 +98,7 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace)
             )
 
         exp = QlibFactorExperiment(tasks, hypothesis=hypothesis)
-        exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[1] for t in trace.hist if t[2]]
+        exp.based_experiments = [QlibFactorExperiment(sub_tasks=[])] + [t[0] for t in trace.hist if t[1]]
 
         unique_tasks = []
 

diff --git a/rdagent/scenarios/qlib/proposal/model_proposal.py b/rdagent/scenarios/qlib/proposal/model_proposal.py
@@ -65,7 +65,7 @@ def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, b
             else "No previous hypothesis and feedback available since it's the first round."
         )
 
-        experiment_list: List[ModelExperiment] = [t[1] for t in trace.hist]
+        experiment_list: List[ModelExperiment] = [t[0] for t in trace.hist]
 
         model_list = []
         for experiment in experiment_list:
@@ -102,5 +102,5 @@ def convert_response(self, response: str, hypothesis: Hypothesis, trace: Trace)
                 )
             )
         exp = QlibModelExperiment(tasks, hypothesis=hypothesis)
-        exp.based_experiments = [t[1] for t in trace.hist if t[2]]
+        exp.based_experiments = [t[0] for t in trace.hist if t[1]]
         return exp