Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 10 additions & 17 deletions pipelines/comprehensive-review/SKILL.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,16 @@
---
name: comprehensive-review
description: |
Unified 4-wave code review: Wave 0 auto-discovers packages/modules and
dispatches one language-specialist agent per package for deep per-package
analysis. Wave 1 dispatches 12 foundation reviewers in parallel (with Wave 0
context). Wave 2 dispatches 10 deep-dive reviewers that receive Wave 0+1
findings as context for targeted analysis. Wave 3 dispatches 4-5 adversarial
reviewers that challenge Wave 1+2 consensus — contrarian, skeptical senior,
user advocate, meta-process, and conditionally SAPCC structural. Aggregates
all findings by severity with wave-agreement labels (unanimous, majority,
contested), then auto-fixes ALL issues. Covers per-package deep review,
security, business logic, architecture, error handling, test coverage, type
design, code quality, comment analysis, language idioms, docs validation,
newcomer perspective, performance, concurrency, API contracts, dependencies,
error messages, dead code, naming, observability, config safety, migration
safety, and adversarial challenge.
Use for "comprehensive review", "full review", "review everything", "review
and fix", or "thorough code review".
Do NOT use for single-concern reviews (use individual agents instead).
Four-wave code review pipeline for large or high-risk changes. Wave 0
auto-discovers packages/modules and runs per-package specialist review. Wave
1 runs broad foundation reviewers in parallel. Wave 2 runs targeted deep-dive
reviewers using earlier findings as context. Wave 3 runs adversarial reviewers
that challenge consensus and surface missed risks. Aggregates findings by
severity and agreement level, deduplicates them, and can auto-fix issues
unless review-only mode is used. Use for "comprehensive review", "full
review", "review everything", "review and fix", or "thorough code review".
Do NOT use for narrow single-concern reviews; use smaller review skills
instead.
effort: high
version: 4.0.0
user-invocable: false
Expand Down
132 changes: 132 additions & 0 deletions scripts/tests/test_agent_comparison_optimize_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import importlib.util
import json
from pathlib import Path
import sys


# Repository root: this test file lives at <repo>/scripts/tests/, two levels down.
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a module from a repo-relative file path without needing sys.path.

    Lets tests load scripts that live outside any package (e.g.
    ``skills/*/scripts/*.py``) via importlib machinery.

    Args:
        name: Module name to register the loaded module under.
        relative_path: Path to the source file, relative to REPO_ROOT.

    Returns:
        The fully executed module object.
    """
    path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None for missing/unloadable files;
    # fail here with a clear message instead of a confusing TypeError later.
    assert spec is not None and spec.loader is not None, f"cannot load {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_assess_target_rejects_missing_frontmatter(tmp_path):
    """A SKILL.md with no YAML frontmatter must be unparsable and score zero."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    skill_file = tmp_path / "SKILL.md"
    skill_file.write_text("# no frontmatter\nbody\n")
    tasks = [{"query": "write tests", "should_trigger": True}]

    result = optimize_loop.assess_target(
        skill_file,
        tasks,
        "improve routing precision",
        dry_run=True,
    )

    assert result["parses"] is False
    assert optimize_loop.composite_score(result) == 0.0


def test_check_protected_sections_rejects_missing_blocks():
    """Removing a protected block entirely must fail the protected-section check."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    # Original content carries a DO NOT OPTIMIZE block; the variant drops it.
    before = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    after = "alpha\nomega\n"

    ok = optimize_loop.check_protected_sections(before, after)

    assert ok is False


def test_restore_protected_does_not_silently_reinsert_missing_blocks():
    """restore_protected must not re-add a protected block the variant deleted."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )

    # The variant dropped the protected block entirely.
    source = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    stripped_variant = "alpha\nomega\n"

    result = generate_variant.restore_protected(source, stripped_variant)

    # Deletion must surface to the caller rather than being silently patched over.
    assert result == stripped_variant


def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeypatch, capsys):
    """main() should read current content via --current-content-file and print a JSON result."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )

    # Minimal stand-in for an Anthropic content block: "thinking" blocks expose
    # .thinking, every other block type exposes .text.
    class FakeBlock:
        def __init__(self, block_type: str, text: str):
            self.type = block_type
            if block_type == "thinking":
                self.thinking = text
            else:
                self.text = text

    # Fake API response: one thinking block plus one text block whose payload
    # carries the <variant>/<summary>/<deletion_justification> markup main() parses.
    class FakeResponse:
        def __init__(self):
            self.content = [
                FakeBlock("thinking", "reasoning"),
                FakeBlock(
                    "text",
                    "<variant>---\ndescription: updated\n---</variant>"
                    "<summary>updated</summary><deletion_justification></deletion_justification>",
                ),
            ]
            # Token counts are summed by main() into tokens_used (1 + 2 = 3 below).
            self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})()

    # Fake client exposing client.messages.create(**kwargs) -> FakeResponse.
    class FakeClient:
        def __init__(self):
            self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})()

    # Fake 'anthropic' module: Anthropic() yields a FakeClient via __new__,
    # so no __init__ (and no API key lookup) ever runs.
    class FakeAnthropicModule:
        class Anthropic:
            def __new__(cls):
                return FakeClient()

    # The current content main() should read comes from a file, not argv.
    content_file = tmp_path / "current.md"
    content_file.write_text("---\ndescription: current\n---\n")

    monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule)
    monkeypatch.setattr(
        sys,
        "argv",
        [
            "generate_variant.py",
            "--target",
            "skills/example/SKILL.md",
            "--goal",
            "improve routing precision",
            "--current-content-file",
            str(content_file),
            "--model",
            "fake-model",
        ],
    )

    generate_variant.main()
    # main() prints its result as JSON on stdout; capture and parse it.
    output = json.loads(capsys.readouterr().out)

    assert output["variant"] == "---\ndescription: updated\n---"
    assert output["tokens_used"] == 3
33 changes: 33 additions & 0 deletions scripts/tests/test_eval_compare_optimization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import importlib.util
import json
from pathlib import Path


# Repository root: this test file lives at <repo>/scripts/tests/, two levels down.
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a module from a repo-relative file path without needing sys.path.

    Lets tests load scripts that live outside any package (e.g.
    ``skills/*/scripts/*.py``) via importlib machinery.

    Args:
        name: Module name to register the loaded module under.
        relative_path: Path to the source file, relative to REPO_ROOT.

    Returns:
        The fully executed module object.
    """
    path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None for missing/unloadable files;
    # fail here with a clear message instead of a confusing TypeError later.
    assert spec is not None and spec.loader is not None, f"cannot load {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_load_optimization_data_ignores_unrelated_results_json(tmp_path):
    """Only evals/iterations/results.json counts as optimization output."""
    eval_compare = load_module(
        "skill_creator_eval_compare",
        "skills/skill-creator/scripts/eval_compare.py",
    )

    # Decoy results.json at the top level that must be ignored.
    (tmp_path / "results.json").write_text(json.dumps({"status": "not-optimization"}))

    iterations_dir = tmp_path / "evals" / "iterations"
    iterations_dir.mkdir(parents=True)
    expected = {
        "target": "skills/example/SKILL.md",
        "baseline_score": {"train": 1.0, "test": 1.0},
        "iterations": [],
    }
    (iterations_dir / "results.json").write_text(json.dumps(expected))

    assert eval_compare.load_optimization_data(tmp_path) == expected
141 changes: 135 additions & 6 deletions skills/agent-comparison/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
name: agent-comparison
description: |
A/B test agent variants measuring quality and total session token cost
across simple and complex benchmarks. Use when creating compact agent
versions, validating agent changes, comparing internal vs external agents,
or deciding between variants for production. Use for "compare agents",
"A/B test", "benchmark agents", or "test agent efficiency". Route single-agent evaluation to agent-evaluation, testing skills, or optimizing prompts
without variant comparison.
version: 2.0.0
across simple and complex benchmarks. Also supports automated optimization
loops (autoresearch) for frontmatter description and routing-trigger quality
using train/test eval sets. Use when creating compact agent versions, validating
agent changes, comparing internal vs external agents, optimizing a skill description,
or deciding between variants for production. Use for "compare agents", "A/B test",
"benchmark agents", "test agent efficiency", "optimize description", "optimize skill",
or "run autoresearch". Route single-agent evaluation to agent-evaluation; do not
use this skill for skill testing or for simple prompt optimization without
variant comparison.
version: 2.2.0
user-invocable: false
allowed-tools:
- Read
Expand All @@ -22,6 +25,9 @@ routing:
- "compare agents"
- "A/B test agents"
- "benchmark agents"
- "optimize skill"
- "optimize description"
- "run autoresearch"
category: meta-tooling
---

Expand Down Expand Up @@ -254,6 +260,129 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep

**Gate**: Report generated with all metrics. Verdict stated with evidence. Report saved to benchmark directory.

### Phase 5: OPTIMIZE (optional — invoked explicitly)

**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements.

This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking.

**Step 1: Validate optimization target and goal**

Confirm the target file exists, has YAML frontmatter with a `description`, and the optimization goal is clear:

```bash
# Target must be a markdown file with frontmatter description
test -f skills/{target}/SKILL.md
rg -n '^description:' skills/{target}/SKILL.md

# Goal should be specific and measurable
# Good: "improve error handling instructions"
# Bad: "make it better"
```

**Step 2: Prepare trigger-rate eval tasks**

```bash
python3 skills/agent-comparison/scripts/optimize_loop.py \
--target skills/{target}/SKILL.md \
--goal "{optimization goal}" \
--benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
--train-split 0.6 \
--model claude-sonnet-4-20250514 \
--verbose
```

Supported task schemas:
- Flat `tasks` list with optional `"split": "train" | "test"` per task
- Top-level `train` and `test` arrays

Every task must include:
- `query`: the routing prompt to test
- `should_trigger`: whether the target should trigger for that prompt

If no split markers are present, the loop does a reproducible random split with seed `42`.

**Step 3: Run baseline evaluation**

The loop automatically evaluates the unmodified target against the train set before starting iteration. This establishes the score to beat, and records a held-out baseline if test tasks exist.

**Step 4: Enter optimization loop**

The `optimize_loop.py` script handles the full loop:
- Calls `generate_variant.py` to propose changes (Claude with extended thinking)
- Evaluates each variant against train tasks
- Keeps variants that improve score by more than `--min-gain` (default 0.02)
- Reverts variants that don't improve, break hard gates, or delete sections without justification
- Checks held-out test set every 5 iterations for Goodhart divergence
- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations

```bash
python3 skills/agent-comparison/scripts/optimize_loop.py \
--target skills/{target}/SKILL.md \
--goal "{optimization goal}" \
--benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
--max-iterations 20 \
--min-gain 0.02 \
--train-split 0.6 \
--model claude-sonnet-4-20250514 \
--report optimization-report.html \
--output-dir evals/iterations \
--verbose
```

The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and cherry-pick controls.

**Step 5: Present results in UI**

Open the generated `optimization-report.html` in a browser. The report shows:
- Progress dashboard (status, baseline vs best, kept/reverted counts)
- Convergence chart (train solid line, held-out dashed line, baseline dotted)
- Iteration table with verdict, composite score, delta, and change summary
- Expandable inline diffs per iteration (click any row)

**Step 6: User cherry-picks improvements**

Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews each kept iteration's diff and selects which to include:
- Check the "Pick" checkbox for desired iterations
- Click "Preview Combined" to see the merged diff
- Click "Export Selected" to download a JSON file with chosen diffs

**Step 7: Apply selected improvements to target file**

Apply the selected improvements to the original target file.

- If you want the best single kept variant, use `evals/iterations/best_variant.md`.
- If you exported selected diffs, treat that JSON as review material for a manual follow-up apply step. It is not auto-applied by the current tooling.

```bash
# Review the best kept variant before applying
cat evals/iterations/best_variant.md | head -20

# Replace the target with the best kept variant
cp evals/iterations/best_variant.md skills/{target}/SKILL.md
```

**Step 8: Run final evaluation on FULL task set (train + test)**

After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize:

```bash
# Re-run optimize_loop.py against the same task file so that both train and test
# tasks are evaluated, then inspect results.json and the generated report for the final scores
```

Compare final scores to the baseline to confirm net improvement.

**Step 9: Record in learning-db**

```bash
python3 scripts/learning-db.py learn \
--skill agent-comparison \
"autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \
Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}"
```

**Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded.

### Optional Extensions

These are off by default. Enable explicitly when needed:
Expand Down
Loading
Loading