microsoft · jingyuanlm · Jul 25, 2025 · Jul 25, 2025 · Jul 30, 2025 · Jul 30, 2025
diff --git a/rdagent/app/data_science/conf.py b/rdagent/app/data_science/conf.py
@@ -41,6 +41,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     """The recommend time limit for running on full data"""
     full_timeout: int = 3600
     """The timeout limit for running on full data"""
+    ensemble_timeout: int = 3600*5
 
     ### specific feature
 
@@ -103,6 +104,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     sota_count_threshold: int = 1
     """The threshold for SOTA count"""
 
+    ensemble_with_merge: bool = False
+
+    ratio_merge_or_ensemble: int = 70 # 70% for merge/ensemble
+
     #### multi-trace: SOTA experiment selector
     sota_exp_selector_name: str = "rdagent.scenarios.data_science.proposal.exp_gen.select.submit.GlobalSOTASelector"
     """The name of the SOTA experiment selector to use"""
@@ -116,7 +121,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     exp_gen_version_list: str = "v3,v2"
 
     #### multi-trace: time for final multi-trace merge
-    merge_hours: int = 0
+    merge_hours: float = 0
     """The time for merge"""
 
     #### multi-trace: max SOTA-retrieved number, used in AutoSOTAexpSelector
@@ -136,10 +141,13 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     show_hard_limit: bool = True
 
     #### hypothesis critique and rewrite
-    enable_hypo_critique_rewrite: bool = True
+    enable_hypo_critique_rewrite: bool = False
     """Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
     enable_scale_check: bool = False
 
+    #### hypothesis selection method
+    llm_select_hypothesis: bool = True
+    """Whether to use LLM to select hypothesis. If True, use LLM selection; if False, use the existing ranking method."""
     #### enable runner code change summary
     runner_enable_code_change_summary: bool = True
 

diff --git a/rdagent/log/logger.py b/rdagent/log/logger.py
@@ -20,7 +20,7 @@
 
 from .base import Storage
 from .storage import FileStorage
-from .utils import get_caller_info
+from .utils import LogColors, get_caller_info
 
 
 class RDAgentLog(SingletonBaseClass):
@@ -127,10 +127,15 @@ def _log(self, level: str, msg: str, *, tag: str = "", raw: bool = False) -> Non
             logger.add(sys.stderr)
 
     def info(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
+        # Use default color for info messages
         self._log("info", msg, tag=tag, raw=raw)
 
     def warning(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
-        self._log("warning", msg, tag=tag, raw=raw)
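+        # Wrap the message in yellow. NOTE(review): raw=True is hard-coded below, so the caller's `raw` argument is ignored - confirm this is intended.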
+        colored_msg = f"{LogColors.YELLOW}{msg}{LogColors.END}"
+        self._log("warning", colored_msg, tag=tag, raw=True)
 
     def error(self, msg: str, *, tag: str = "", raw: bool = False) -> None:
-        self._log("error", msg, tag=tag, raw=raw)
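+        # Wrap the message in red. NOTE(review): raw=True is hard-coded below, so the caller's `raw` argument is ignored - confirm this is intended.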
+        colored_msg = f"{LogColors.RED}{msg}{LogColors.END}"
+        self._log("error", colored_msg, tag=tag, raw=True)
diff --git a/rdagent/scenarios/data_science/dev/prompts.yaml b/rdagent/scenarios/data_science/dev/prompts.yaml
@@ -31,9 +31,13 @@ exp_feedback:
       - If overfitting is detected, provide a detailed analysis explaining how and why it occurs, referencing scenario description, code implementation, and validation scores to support your findings.
     - If such discrepancies or risks are found:
       - Clearly document these issues in `Reasoning`, referencing both scenario description and code implementation—not just validation scores.
-      - Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
-      - Begin your `reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
-    - If evaluation alignment passes, set `"Evaluation Aligned With Task": "yes"`, and then proceed to Step 3.
+        - Severity-based handling:
+          - Severe risk — likely to invert or invalidate the performance trend between validation and test (e.g., strong overfitting, label leakage, test distribution shift):
+            - Set `"Evaluation Aligned With Task": "no"` and `"Replace Best Result": "no"`.
+            - Begin your `Reasoning` with `[Evaluation error]`, explicitly stating the evaluation alignment issues causing experiment failure.
+          - Mild/moderate risk — may cause slightly optimistic or biased validation scores but is unlikely to change the relative performance trend (e.g., scaling or PCA fit on full training data that is also applied consistently to test):
+            - Set `"Evaluation Aligned With Task": "yes"`, but note the potential bias in `Reasoning`.
+            - Proceed to Step 3 for result comparison.
 
     Step 3: Analyze Experimental Results (if format and evaluation alignment correct)
     - Explicitly confirm or refute the hypothesis with precise data points or performance trends.