Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 10 additions & 17 deletions pipelines/comprehensive-review/SKILL.md
Original file line number Diff line number Diff line change
@@ -1,23 +1,16 @@
---
name: comprehensive-review
description: |
Unified 4-wave code review: Wave 0 auto-discovers packages/modules and
dispatches one language-specialist agent per package for deep per-package
analysis. Wave 1 dispatches 12 foundation reviewers in parallel (with Wave 0
context). Wave 2 dispatches 10 deep-dive reviewers that receive Wave 0+1
findings as context for targeted analysis. Wave 3 dispatches 4-5 adversarial
reviewers that challenge Wave 1+2 consensus — contrarian, skeptical senior,
user advocate, meta-process, and conditionally SAPCC structural. Aggregates
all findings by severity with wave-agreement labels (unanimous, majority,
contested), then auto-fixes ALL issues. Covers per-package deep review,
security, business logic, architecture, error handling, test coverage, type
design, code quality, comment analysis, language idioms, docs validation,
newcomer perspective, performance, concurrency, API contracts, dependencies,
error messages, dead code, naming, observability, config safety, migration
safety, and adversarial challenge.
Use for "comprehensive review", "full review", "review everything", "review
and fix", or "thorough code review".
Do NOT use for single-concern reviews (use individual agents instead).
Four-wave code review pipeline for large or high-risk changes. Wave 0
auto-discovers packages/modules and runs per-package specialist review. Wave
1 runs broad foundation reviewers in parallel. Wave 2 runs targeted deep-dive
reviewers using earlier findings as context. Wave 3 runs adversarial reviewers
that challenge consensus and surface missed risks. Aggregates findings by
severity and agreement level, deduplicates them, and can auto-fix issues
unless review-only mode is used. Use for "comprehensive review", "full
review", "review everything", "review and fix", or "thorough code review".
Do NOT use for narrow single-concern reviews; use smaller review skills
instead.
effort: high
version: 4.0.0
user-invocable: false
Expand Down
132 changes: 132 additions & 0 deletions scripts/tests/test_agent_comparison_optimize_loop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
import importlib.util
import json
from pathlib import Path
import sys


# Repository root: this test file lives at <repo>/scripts/tests/, two levels down.
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a module from a repo-relative file path without needing sys.path.

    Lets tests load scripts that live outside any package (e.g.
    ``skills/*/scripts/*.py``) via importlib machinery.

    Args:
        name: Module name to register the loaded module under.
        relative_path: Path to the source file, relative to REPO_ROOT.

    Returns:
        The fully executed module object.
    """
    path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None for missing/unloadable files;
    # fail here with a clear message instead of a confusing TypeError later.
    assert spec is not None and spec.loader is not None, f"cannot load {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_assess_target_rejects_missing_frontmatter(tmp_path):
    """A SKILL.md with no YAML frontmatter must be unparsable and score zero."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    skill_file = tmp_path / "SKILL.md"
    skill_file.write_text("# no frontmatter\nbody\n")
    tasks = [{"query": "write tests", "should_trigger": True}]

    result = optimize_loop.assess_target(
        skill_file,
        tasks,
        "improve routing precision",
        dry_run=True,
    )

    assert result["parses"] is False
    assert optimize_loop.composite_score(result) == 0.0


def test_check_protected_sections_rejects_missing_blocks():
    """Removing a protected block entirely must fail the protected-section check."""
    optimize_loop = load_module(
        "agent_comparison_optimize_loop",
        "skills/agent-comparison/scripts/optimize_loop.py",
    )

    # Original content carries a DO NOT OPTIMIZE block; the variant drops it.
    before = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    after = "alpha\nomega\n"

    ok = optimize_loop.check_protected_sections(before, after)

    assert ok is False


def test_restore_protected_does_not_silently_reinsert_missing_blocks():
    """restore_protected must not re-add a protected block the variant deleted."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )

    # The variant dropped the protected block entirely.
    source = (
        "alpha\n"
        "<!-- DO NOT OPTIMIZE -->\n"
        "keep me\n"
        "<!-- END DO NOT OPTIMIZE -->\n"
        "omega\n"
    )
    stripped_variant = "alpha\nomega\n"

    result = generate_variant.restore_protected(source, stripped_variant)

    # Deletion must surface to the caller rather than being silently patched over.
    assert result == stripped_variant


def test_generate_variant_main_reads_current_content_from_file(tmp_path, monkeypatch, capsys):
    """main() should read current content via --current-content-file and print a JSON result."""
    generate_variant = load_module(
        "agent_comparison_generate_variant",
        "skills/agent-comparison/scripts/generate_variant.py",
    )

    # Minimal stand-in for an Anthropic content block: "thinking" blocks expose
    # .thinking, every other block type exposes .text.
    class FakeBlock:
        def __init__(self, block_type: str, text: str):
            self.type = block_type
            if block_type == "thinking":
                self.thinking = text
            else:
                self.text = text

    # Fake API response: one thinking block plus one text block whose payload
    # carries the <variant>/<summary>/<deletion_justification> markup main() parses.
    class FakeResponse:
        def __init__(self):
            self.content = [
                FakeBlock("thinking", "reasoning"),
                FakeBlock(
                    "text",
                    "<variant>---\ndescription: updated\n---</variant>"
                    "<summary>updated</summary><deletion_justification></deletion_justification>",
                ),
            ]
            # Token counts are summed by main() into tokens_used (1 + 2 = 3 below).
            self.usage = type("Usage", (), {"input_tokens": 1, "output_tokens": 2})()

    # Fake client exposing client.messages.create(**kwargs) -> FakeResponse.
    class FakeClient:
        def __init__(self):
            self.messages = type("Messages", (), {"create": lambda self, **kwargs: FakeResponse()})()

    # Fake 'anthropic' module: Anthropic() yields a FakeClient via __new__,
    # so no __init__ (and no API key lookup) ever runs.
    class FakeAnthropicModule:
        class Anthropic:
            def __new__(cls):
                return FakeClient()

    # The current content main() should read comes from a file, not argv.
    content_file = tmp_path / "current.md"
    content_file.write_text("---\ndescription: current\n---\n")

    monkeypatch.setattr(generate_variant, "anthropic", FakeAnthropicModule)
    monkeypatch.setattr(
        sys,
        "argv",
        [
            "generate_variant.py",
            "--target",
            "skills/example/SKILL.md",
            "--goal",
            "improve routing precision",
            "--current-content-file",
            str(content_file),
            "--model",
            "fake-model",
        ],
    )

    generate_variant.main()
    # main() prints its result as JSON on stdout; capture and parse it.
    output = json.loads(capsys.readouterr().out)

    assert output["variant"] == "---\ndescription: updated\n---"
    assert output["tokens_used"] == 3
33 changes: 33 additions & 0 deletions scripts/tests/test_eval_compare_optimization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import importlib.util
import json
from pathlib import Path


# Repository root: this test file lives at <repo>/scripts/tests/, two levels down.
REPO_ROOT = Path(__file__).resolve().parents[2]


def load_module(name: str, relative_path: str):
    """Import a module from a repo-relative file path without needing sys.path.

    Lets tests load scripts that live outside any package (e.g.
    ``skills/*/scripts/*.py``) via importlib machinery.

    Args:
        name: Module name to register the loaded module under.
        relative_path: Path to the source file, relative to REPO_ROOT.

    Returns:
        The fully executed module object.
    """
    path = REPO_ROOT / relative_path
    spec = importlib.util.spec_from_file_location(name, path)
    # spec_from_file_location returns None for missing/unloadable files;
    # fail here with a clear message instead of a confusing TypeError later.
    assert spec is not None and spec.loader is not None, f"cannot load {path}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def test_load_optimization_data_ignores_unrelated_results_json(tmp_path):
    """Only evals/iterations/results.json counts as optimization output."""
    eval_compare = load_module(
        "skill_creator_eval_compare",
        "skills/skill-creator/scripts/eval_compare.py",
    )

    # Decoy results.json at the top level that must be ignored.
    (tmp_path / "results.json").write_text(json.dumps({"status": "not-optimization"}))

    iterations_dir = tmp_path / "evals" / "iterations"
    iterations_dir.mkdir(parents=True)
    expected = {
        "target": "skills/example/SKILL.md",
        "baseline_score": {"train": 1.0, "test": 1.0},
        "iterations": [],
    }
    (iterations_dir / "results.json").write_text(json.dumps(expected))

    assert eval_compare.load_optimization_data(tmp_path) == expected
141 changes: 135 additions & 6 deletions skills/agent-comparison/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
name: agent-comparison
description: |
A/B test agent variants measuring quality and total session token cost
across simple and complex benchmarks. Use when creating compact agent
versions, validating agent changes, comparing internal vs external agents,
or deciding between variants for production. Use for "compare agents",
"A/B test", "benchmark agents", or "test agent efficiency". Route single-agent evaluation to agent-evaluation, testing skills, or optimizing prompts
without variant comparison.
version: 2.0.0
across simple and complex benchmarks. Also supports automated optimization
loops (autoresearch) for frontmatter description and routing-trigger quality
using train/test eval sets. Use when creating compact agent versions, validating
agent changes, comparing internal vs external agents, optimizing a skill description,
or deciding between variants for production. Use for "compare agents", "A/B test",
"benchmark agents", "test agent efficiency", "optimize description", "optimize skill",
or "run autoresearch". Route single-agent evaluation to agent-evaluation; do not
use this skill for skill testing or for simple prompt optimization without
variant comparison.
version: 2.2.0
user-invocable: false
allowed-tools:
- Read
Expand All @@ -22,6 +25,9 @@ routing:
- "compare agents"
- "A/B test agents"
- "benchmark agents"
- "optimize skill"
- "optimize description"
- "run autoresearch"
category: meta-tooling
---

Expand Down Expand Up @@ -254,6 +260,129 @@ Remove temporary benchmark files and debug outputs. Keep only the comparison rep

**Gate**: Report generated with all metrics. Verdict stated with evidence. Report saved to benchmark directory.

### Phase 5: OPTIMIZE (optional — invoked explicitly)

**Goal**: Run an automated optimization loop that iteratively improves a markdown target's frontmatter `description` using trigger-rate eval tasks, then keeps only measured improvements.

This phase is for routing/trigger optimization, not full code-generation benchmarking. Invoke it when the user says "optimize this skill", "optimize the description", or "run autoresearch". The existing manual A/B comparison (Phases 1-4) remains the path for full agent benchmarking.

**Step 1: Validate optimization target and goal**

Confirm the target file exists, has YAML frontmatter with a `description`, and the optimization goal is clear:

```bash
# Target must be a markdown file with frontmatter description
test -f skills/{target}/SKILL.md
rg -n '^description:' skills/{target}/SKILL.md

# Goal should be specific and measurable
# Good: "improve error handling instructions"
# Bad: "make it better"
```

**Step 2: Prepare trigger-rate eval tasks**

```bash
python3 skills/agent-comparison/scripts/optimize_loop.py \
--target skills/{target}/SKILL.md \
--goal "{optimization goal}" \
--benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
--train-split 0.6 \
--model claude-sonnet-4-20250514 \
--verbose
```

Supported task schemas:
- Flat `tasks` list with optional `"split": "train" | "test"` per task
- Top-level `train` and `test` arrays

Every task must include:
- `query`: the routing prompt to test
- `should_trigger`: whether the target should trigger for that prompt

If no split markers are present, the loop does a reproducible random split with seed `42`.

**Step 3: Run baseline evaluation**

The loop automatically evaluates the unmodified target against the train set before starting iteration. This establishes the score to beat, and records a held-out baseline if test tasks exist.

**Step 4: Enter optimization loop**

The `optimize_loop.py` script handles the full loop:
- Calls `generate_variant.py` to propose changes (Claude with extended thinking)
- Evaluates each variant against train tasks
- Keeps variants that improve score by more than `--min-gain` (default 0.02)
- Reverts variants that don't improve, break hard gates, or delete sections without justification
- Checks held-out test set every 5 iterations for Goodhart divergence
- Stops on convergence (5 consecutive reverts), Goodhart alarm, or max iterations

```bash
python3 skills/agent-comparison/scripts/optimize_loop.py \
--target skills/{target}/SKILL.md \
--goal "{optimization goal}" \
--benchmark-tasks skills/agent-comparison/references/optimization-tasks.example.json \
--max-iterations 20 \
--min-gain 0.02 \
--train-split 0.6 \
--model claude-sonnet-4-20250514 \
--report optimization-report.html \
--output-dir evals/iterations \
--verbose
```

The `--report` flag generates a live HTML dashboard that auto-refreshes every 10 seconds, showing a convergence chart, iteration table, and cherry-pick controls.

**Step 5: Present results in UI**

Open the generated `optimization-report.html` in a browser. The report shows:
- Progress dashboard (status, baseline vs best, kept/reverted counts)
- Convergence chart (train solid line, held-out dashed line, baseline dotted)
- Iteration table with verdict, composite score, delta, and change summary
- Expandable inline diffs per iteration (click any row)

**Step 6: User cherry-picks improvements**

Not all KEEP iterations are real improvements — some may be harness artifacts. The user reviews each kept iteration's diff and selects which to include:
- Check the "Pick" checkbox for desired iterations
- Click "Preview Combined" to see the merged diff
- Click "Export Selected" to download a JSON file with chosen diffs

**Step 7: Apply selected improvements to target file**

Apply the selected improvements to the original target file.

- If you want the best single kept variant, use `evals/iterations/best_variant.md`.
- If you exported selected diffs, treat that JSON as review material for a manual follow-up apply step. It is not auto-applied by the current tooling.

```bash
# Review the best kept variant before applying
cat evals/iterations/best_variant.md | head -20

# Replace the target with the best kept variant
cp evals/iterations/best_variant.md skills/{target}/SKILL.md
```

**Step 8: Run final evaluation on FULL task set (train + test)**

After applying improvements, run a final evaluation on ALL tasks (not just train) to verify the improvements generalize:

```bash
# Re-run optimize_loop.py against the same task file so that both train and test
# tasks are evaluated, then inspect results.json and the generated report for the final scores
```

Compare final scores to the baseline to confirm net improvement.

**Step 9: Record in learning-db**

```bash
python3 scripts/learning-db.py learn \
--skill agent-comparison \
"autoresearch: {target} improved {baseline}→{best} over {iterations} iterations. \
Kept: {kept}/{total}. Stop: {reason}. Changes: {summaries}"
```

**Gate**: Optimization complete. Results reviewed. Cherry-picked improvements applied and verified against full task set. Results recorded.

### Optional Extensions

These are off by default. Enable explicitly when needed:
Expand Down
Loading
Loading