From d63cf336ddc54331e4bf2c04cd7235b48aab536d Mon Sep 17 00:00:00 2001 From: Ankit Bhansali <16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:06:23 -0400 Subject: [PATCH 1/6] chore: improve skill descriptions, add trigger evals, bump version to 1.1.1 - Rewrites descriptions for spec-agent, solution-arch-agent, builder-agent, explore, documentation, and project-to-spec with explicit trigger language and "Trigger it for phrases like..." examples to reduce undertriggering - Clarifies documentation vs project-to-spec boundary: documentation handles global/bulk catalog only; project-to-spec handles named single projects - Adds documentation skill redirect to project-to-spec when a project is named - Adds /documentation example to README How to Use It section - Adds Demo Specs section to README with 4 demo spec links - Bumps plugin version to 1.1.1 - Adds evals/trigger-evals/ with 20-query eval sets and results for 5 skills Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .claude-plugin/marketplace.json | 4 +- .claude-plugin/plugin.json | 2 +- .claude/skills/builder-agent/SKILL.md | 3 +- .claude/skills/documentation/SKILL.md | 39 ++-- .claude/skills/explore/SKILL.md | 3 +- .claude/skills/project-to-spec/SKILL.md | 2 +- .claude/skills/solution-arch-agent/SKILL.md | 3 +- .claude/skills/spec-agent/SKILL.md | 3 +- README.md | 18 ++ evals/trigger-evals/README.md | 42 +++++ .../trigger-evals/builder-agent-results.json | 171 ++++++++++++++++++ evals/trigger-evals/builder-agent.json | 82 +++++++++ .../trigger-evals/documentation-results.json | 171 ++++++++++++++++++ evals/trigger-evals/documentation.json | 82 +++++++++ evals/trigger-evals/explore-results.json | 171 ++++++++++++++++++ evals/trigger-evals/explore.json | 82 +++++++++ .../solution-arch-agent-results.json | 171 ++++++++++++++++++ evals/trigger-evals/solution-arch-agent.json | 82 +++++++++ evals/trigger-evals/spec-agent-results.json | 171 ++++++++++++++++++ evals/trigger-evals/spec-agent.json 
| 82 +++++++++ 20 files changed, 1344 insertions(+), 40 deletions(-) create mode 100644 evals/trigger-evals/README.md create mode 100644 evals/trigger-evals/builder-agent-results.json create mode 100644 evals/trigger-evals/builder-agent.json create mode 100644 evals/trigger-evals/documentation-results.json create mode 100644 evals/trigger-evals/documentation.json create mode 100644 evals/trigger-evals/explore-results.json create mode 100644 evals/trigger-evals/explore.json create mode 100644 evals/trigger-evals/solution-arch-agent-results.json create mode 100644 evals/trigger-evals/solution-arch-agent.json create mode 100644 evals/trigger-evals/spec-agent-results.json create mode 100644 evals/trigger-evals/spec-agent.json diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 018c90d..fc69643 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -6,7 +6,7 @@ }, "metadata": { "description": "AI agent skills for the Itential Platform — deliver infrastructure automation from spec through build.", - "version": "1.0.1" + "version": "1.1.1" }, "plugins": [ { @@ -16,7 +16,7 @@ "repo": "itential/builder-skills" }, "description": "AI agent skills for the Itential Platform — deliver infrastructure automation from spec through build. Covers requirements, feasibility, design, build, as-built documentation, FlowAgent, IAG, and MOP.", - "version": "1.0.1", + "version": "1.1.1", "author": { "name": "Itential" }, diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 85914d4..14820a8 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "itential-builder", "description": "AI agent skills for the Itential Platform — deliver infrastructure automation from spec through build. 
Covers requirements, feasibility, design, build, as-built documentation, FlowAgent, IAG, and MOP.", - "version": "1.0.1", + "version": "1.1.1", "author": { "name": "Itential" }, diff --git a/.claude/skills/builder-agent/SKILL.md b/.claude/skills/builder-agent/SKILL.md index 7aa41fe..122f1c6 100644 --- a/.claude/skills/builder-agent/SKILL.md +++ b/.claude/skills/builder-agent/SKILL.md @@ -1,7 +1,6 @@ --- name: builder-agent -description: Builder Agent — owns Build and As-Built. Implements the approved solution design, tests each component, delivers the solution, and records the as-built state. Invoke after /solution-architecture produces an approved solution-design.md. -argument-hint: "[action or asset-type]" +description: Use this skill when someone has an approved solution design and is ready to build. Trigger it for phrases like "solution design is approved", "go ahead and build", "implement the design", "create the workflows", "build everything per the design", "start the build", "the design is locked — implement it", or "write the as-built documentation". Also trigger it when a build is failing mid-way and needs debugging. This skill implements the approved solution-design.md end-to-end — creating all workflows, templates, projects, and configs, testing each component, and producing as-built.md. If the user has a solution-design.md and wants to turn it into working automation, this is the right skill. --- # Builder Agent diff --git a/.claude/skills/documentation/SKILL.md b/.claude/skills/documentation/SKILL.md index 84ae135..87f5378 100644 --- a/.claude/skills/documentation/SKILL.md +++ b/.claude/skills/documentation/SKILL.md @@ -1,7 +1,6 @@ --- name: documentation -description: Document any Itential platform asset — workflows, forms, transformations, templates, command templates, analytic templates, OM automations, golden configuration trees/compliance plans, LCM resource models, projects, or all global assets at once. 
Accepts specific asset names/IDs, project names/IDs, or processes all globals. Discovers relationships, groups into use cases, produces customer-spec.md + solution-design.md per use case and a master README only when multiple use cases exist. -argument-hint: "[asset-name(s) | 'all' | 'platform' | directory-path]" +description: Use this skill to survey and catalog an Itential platform — when someone wants to know what's on their platform, document global assets (workflows, templates, LCM models, golden config, OM automations) that are NOT inside a named project, group them into logical use cases, and produce a master catalog or README. Trigger it for phrases like "document everything on the platform", "what use cases do we have?", "catalog all our global workflows", "I inherited this platform and have no idea what's there", "group our automations by use case", or "produce a platform README". The output is a structured catalog: customer-spec.md + solution-design.md per use case + master README. NOT for documenting a specific named project — use /project-to-spec for that. NOT for building new automation. --- # Documentation @@ -32,21 +31,23 @@ argument-hint: "[asset-name(s) | 'all' | 'platform' | directory-path]" ## What This Does -Takes undocumented Itential assets — workflows, JSON forms, transformations, templates, command templates, analytic templates, Operations Manager automations, golden configuration trees and compliance plans, LCM resource models, and projects. Accepts specific asset names/IDs, project names/IDs, or the full global asset catalog. Discovers how they relate to each other, groups them into logical use cases, and produces documentation for each group plus a master index when there are multiple use cases. 
+Surveys **global** Itential assets — workflows, JSON forms, transformations, templates, command templates, analytic templates, Operations Manager automations, golden configuration trees and compliance plans, and LCM resource models that live outside named projects. Accepts `all`, `platform`, a directory path, or a list of specific global asset names. Discovers how they relate to each other, groups them into logical use cases, and produces documentation for each group plus a master index when there are multiple use cases. + +> **For a named project:** Use `/project-to-spec` instead — it reads a single project's components and produces customer-spec.md + solution-design.md tailored to that project. --- ## Flow ``` -User invokes /documentation [asset(s) | 'all' | 'platform' | directory] +User invokes /documentation ['all' | 'platform' | directory | specific global asset names] | ├── Step 0: Determine Scope - | ├── Project named? → fetch project components → proceed as scoped asset set - | ├── Specific assets named? → resolve + discover relationships → ask grouping preference + | ├── Project named? → redirect to /project-to-spec + | ├── Specific global assets named? → resolve + discover relationships → ask grouping preference | └── 'all' / platform / directory? → full collection + grouping flow | - ├── Step 1: Collect + classify assets (in-memory) + ├── Step 1: Collect + classify global assets (in-memory) ├── Step 2: Discover relationships + group into use cases (in-memory) ├── Step 3: Present proposed groupings to engineer for approval ├── Step 4: Write per-use-case reports (customer-spec.md + solution-design.md) @@ -62,29 +63,11 @@ Before collecting assets, determine what the user wants to document. ### Pattern 1 — Project named -If the user names a project or provides a project ID (or a `.project.json` file is present), fetch the project and its components directly: - -1. 
**Fetch the project:** - ``` - GET /automation-studio/projects/{projectId} - ``` - Or search by name: - ``` - GET /automation-studio/projects?contains=name:{projectName} - ``` - Response shape: `{message, data: {_id, name, components: [...], members: [...]}}` - -2. **Fetch each component** from `data.components`: - - Workflows: `GET /automation-studio/workflows/detailed/{urlEncodedName}` - - Templates: `GET /automation-studio/templates/{id}` - - MOP Command Templates: `GET /mop/listATemplate/{name}` - - JSON Forms: `GET /automation-studio/json-forms/{id}` - -3. **Strip `@projectId:` prefixes** from all workflow names before processing. +If the user names a specific project, **redirect them to `/project-to-spec`** — that skill is purpose-built for single-project documentation and produces a more thorough analysis. -4. **Continue to Step 1** treating these fetched components as the asset set. This follows the same path as Pattern 2 (specific assets named) — present the discovered cluster, ask how to group it, then proceed through Steps 2–6. +> "It looks like you want to document a specific project — use `/project-to-spec` for that. It reads the project's components directly and produces a more thorough customer-spec.md and solution-design.md for it." -### Pattern 2 — Specific asset(s) named +### Pattern 2 — Specific global asset(s) named If the user provides one or more asset names or IDs: diff --git a/.claude/skills/explore/SKILL.md b/.claude/skills/explore/SKILL.md index 1296dcf..665a0b4 100644 --- a/.claude/skills/explore/SKILL.md +++ b/.claude/skills/explore/SKILL.md @@ -1,7 +1,6 @@ --- name: explore -description: Explore an Itential Platform — authenticate, pull platform data, and browse capabilities freely. Use for ad-hoc investigation, freestyle building, or understanding what's available before starting a delivery. Not part of the delivery lifecycle. 
-argument-hint: "[use-case-name or environment]" +description: Use this skill whenever someone wants to connect to an Itential platform and browse, inspect, or discover what's there — without starting a formal delivery. Trigger it for phrases like "connect to my platform", "show me what adapters are running", "authenticate and pull platform data", "I want to poke around before starting", "what workflows exist?", "give me an inventory of the platform", "browse capabilities freely", "check if adapter X is running", or "I just set up a new environment — show me what's there". Also use it for ad-hoc freestyle work where the user wants to build something directly without going through the full spec→design→build lifecycle. --- # Explore diff --git a/.claude/skills/project-to-spec/SKILL.md b/.claude/skills/project-to-spec/SKILL.md index fadcb09..a8ee5f5 100644 --- a/.claude/skills/project-to-spec/SKILL.md +++ b/.claude/skills/project-to-spec/SKILL.md @@ -1,6 +1,6 @@ --- name: project-to-spec -description: Document an existing Itential project into a requirements spec and solution design. Reads project components (workflows, templates, MOP), analyzes what was built, and produces customer-spec.md and solution-design.md. Use when documenting existing automation or creating a baseline for a rebuild. +description: Use this skill when a user names a specific existing Itential project and wants it documented — reverse-engineered into a requirements spec and solution design. Trigger it for phrases like "document the DNS_Management project", "create a spec from the Firewall_Rule_Lifecycle project", "reverse-engineer project X into a spec", "I have a project with no docs — produce a customer-spec and solution design for it", or "use this project as a baseline for a rebuild". Reads the project's workflows, templates, and MOP components, infers business purpose and design decisions, and produces customer-spec.md + solution-design.md. 
For documenting global/unprojectized assets across the whole platform, use /documentation instead. argument-hint: "[project-name or project-id]" --- diff --git a/.claude/skills/solution-arch-agent/SKILL.md b/.claude/skills/solution-arch-agent/SKILL.md index 60469f2..cfe987a 100644 --- a/.claude/skills/solution-arch-agent/SKILL.md +++ b/.claude/skills/solution-arch-agent/SKILL.md @@ -1,7 +1,6 @@ --- name: solution-arch-agent -description: Solution Architecture Agent — owns Feasibility and Design. Assesses platform fit against the approved requirements spec, then produces the solution design. Invoke after /spec-agent produces an approved customer-spec.md. Can be re-run in design-only mode when requirements are stable but the implementation plan needs to change. -argument-hint: "[use-case or design-only]" +description: Use this skill when someone has approved requirements (a customer-spec.md) and needs to assess platform feasibility or produce a solution design. Trigger it for phrases like "requirements are approved", "my spec is done", "check if the platform supports this", "run feasibility", "connect to the platform and design the solution", "I have a customer-spec — now what?", or "produce a solution-design.md". This skill connects to the live platform, checks what adapters and capabilities are available, and produces feasibility.md and solution-design.md. Also trigger it in design-only mode when the implementation plan needs to change but requirements are stable. --- # Solution Architecture Agent diff --git a/.claude/skills/spec-agent/SKILL.md b/.claude/skills/spec-agent/SKILL.md index 6c964db..65c4281 100644 --- a/.claude/skills/spec-agent/SKILL.md +++ b/.claude/skills/spec-agent/SKILL.md @@ -1,7 +1,6 @@ --- name: spec-agent -description: Spec Agent — owns the Requirements stage. Picks a use case spec, refines it with the engineer, and produces an approved HLD (customer-spec.md). Use when starting a delivery from a spec. 
For ad-hoc platform exploration, use /explore instead. -argument-hint: "[use-case-name]" +description: Use this skill to start any new automation delivery — when someone wants to automate something, build a new use case, figure out requirements, write up an HLD, or kick off a project on the Itential Platform. Trigger it for phrases like "I want to automate X", "help me build a workflow for Y", "we're starting a new automation project", "I need to define requirements for Z", "kick off a delivery", or "let's start with requirements". This is the entry point for the spec-driven delivery lifecycle. It picks from 22 built-in use case specs or starts from scratch, refines requirements with the engineer, and produces an approved customer-spec.md. Use it whenever someone is at the beginning of building something new and hasn't yet defined what they're building. --- # Spec Agent diff --git a/README.md b/README.md index 9b755ea..d7b37d2 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ Spec-driven infrastructure automation and orchestration — delivered by AI agen - [How to Use It](#how-to-use-it) - [Skills](#skills) - [Spec Library](#spec-library) +- [Demo Specs](#demo-specs) - [Docs](#docs) - [Contributing](#contributing) - [Support](#support) @@ -110,6 +111,9 @@ See [`docs/quickstart.md`](docs/quickstart.md) for the full setup and first deli "I have an existing project with no documentation" → /itential-builder:project-to-spec +"Document all my global workflows and group them by use case" +→ /itential-builder:documentation + "I want to explore what's available on my platform" → /itential-builder:explore @@ -130,6 +134,7 @@ See [`docs/quickstart.md`](docs/quickstart.md) for the full setup and first deli | `/itential-builder:builder-agent` | Implements the approved solution design end-to-end — workflows, templates, configs, projects. Tests each component, verifies acceptance criteria, and produces `as-built.md`. 
| | `/itential-builder:flowagent-to-spec` | Reads a FlowAgent's config and mission history, reconstructs what it actually did, and produces a `customer-spec.md` for the deterministic equivalent. Turns agentic exploration into a governed delivery path. | | `/itential-builder:project-to-spec` | Reads an existing Itential project — workflows, templates, MOP — and reverse-engineers a `customer-spec.md` and `solution-design.md`. Use to document undocumented automation or create a baseline for a rebuild. | +| `/itential-builder:documentation` | Surveys and documents global platform assets — those that live outside a named project — or the entire global asset catalog. Accepts specific global asset names, a directory path, or runs across all globals. Discovers relationships between assets, groups them into use cases, and produces `customer-spec.md` + `solution-design.md` per use case, plus a master README when there are multiple use cases. For a named project, use `/itential-builder:project-to-spec` instead. | | `/itential-builder:explore` | Authenticates to a platform, pulls live data, and lets you browse capabilities freely. Use for ad-hoc investigation before starting a delivery or when you need to work outside the lifecycle. | **Platform** @@ -159,6 +164,19 @@ See [`docs/quickstart.md`](docs/quickstart.md) for the full setup and first deli --- +## Demo Specs + +Ready-to-run specs in [`spec-files/demo/`](spec-files/demo/) for walkthroughs and demonstrations. 
+ +| Spec | Description | +|------|-------------| +| [Device Health Troubleshooting Agent](spec-files/demo/device-health-agent.md) | FlowAI agent spec for device health triage — runs diagnostics and surfaces findings | +| [Linux Diagnostics Agent](spec-files/demo/linux-diagnostics-agent.md) | FlowAI agent spec for Linux system diagnostics | +| [DNS A Record Provisioning — Simple](spec-files/demo/spec-dns-a-record-infoblox-simple.md) | Simplified DNS A record provisioning via Infoblox | +| [DNS A Record Provisioning](spec-files/demo/spec-dns-a-record-provisioning.md) | Full DNS A record provisioning lifecycle | + +--- + ## Docs - [`docs/quickstart.md`](docs/quickstart.md) — install, setup, and first delivery walkthrough diff --git a/evals/trigger-evals/README.md b/evals/trigger-evals/README.md new file mode 100644 index 0000000..f844c55 --- /dev/null +++ b/evals/trigger-evals/README.md @@ -0,0 +1,42 @@ +# Skill Trigger Evaluations + +Trigger eval sets for the 5 high-conflict skills in the builder-skills plugin. Used to test and optimize skill description routing. 
+ +## Files + +| File | Purpose | +|------|---------| +| `{skill}.json` | 20-query eval set (10 should-trigger, 10 should-not-trigger) | +| `{skill}-results.json` | Last eval run results | + +## Running + +```bash +# Single eval pass against a skill's current description +cd ~/.claude/plugins/cache/claude-plugins-official/skill-creator/unknown/skills/skill-creator +python -m scripts.run_eval \ + --eval-set /path/to/builder-skills/evals/trigger-evals/{skill}.json \ + --skill-path /path/to/builder-skills/.claude/skills/{skill} \ + --model claude-sonnet-4-6 \ + --verbose + +# Full optimization loop (requires ANTHROPIC_API_KEY) +python -m scripts.run_loop \ + --eval-set /path/to/builder-skills/evals/trigger-evals/{skill}.json \ + --skill-path /path/to/builder-skills/.claude/skills/{skill} \ + --model claude-sonnet-4-6 \ + --max-iterations 5 \ + --verbose +``` + +## Last Results (2026-04-10) + +| Skill | Score | Notes | +|-------|-------|-------| +| `documentation` | 11/20 | Precision 100%, undertriggering systemic | +| `spec-agent` | 12/20 | Precision 100%, undertriggering systemic | +| `solution-arch-agent` | 11/20 | Precision 100%, undertriggering systemic | +| `builder-agent` | 10/20 | No false positives (0 triggers, so precision is undefined), undertriggering systemic | +| `explore` | 11/20 | Precision 100%, undertriggering systemic | + +**Note:** All failures are false negatives (skills not triggering when they should). Zero false positives across all skills. This is a known Claude undertriggering behavior — not a description quality issue. diff --git a/evals/trigger-evals/builder-agent-results.json b/evals/trigger-evals/builder-agent-results.json new file mode 100644 index 0000000..a3481cf --- /dev/null +++ b/evals/trigger-evals/builder-agent-results.json @@ -0,0 +1,171 @@ +{ + "skill_name": "builder-agent", + "description": "Use this skill when someone has an approved solution design and is ready to build. 
Trigger it for phrases like \"solution design is approved\", \"go ahead and build\", \"implement the design\", \"create the workflows\", \"build everything per the design\", \"start the build\", \"the design is locked \u2014 implement it\", or \"write the as-built documentation\". Also trigger it when a build is failing mid-way and needs debugging. This skill implements the approved solution-design.md end-to-end \u2014 creating all workflows, templates, projects, and configs, testing each component, and producing as-built.md. If the user has a solution-design.md and wants to turn it into working automation, this is the right skill.", + "results": [ + { + "query": "we just approved the solution design for firewall rule lifecycle. build it out on the platform and document what was actually delivered", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "build the BGP peer provisioning workflows per the approved design. platform creds are in bgp-peer/.env", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i have an approved solution-design.md for VLAN provisioning. start the build \u2014 child workflows first, then the parent orchestrator", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "solution design is approved \u2014 go ahead and build everything. workflows, templates, the project, all of it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "the design is locked. 
implement it and test each component before moving to the next", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "solution-design.md is in my-usecase/ \u2014 build everything in it and produce an as-built record", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "implement the approved design. create the project, build the child workflows, then the parent. run the acceptance tests when done", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "the build failed halfway through \u2014 the childJob in the parent workflow is stuck. check the job error and fix it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "write up the as-built documentation for the VLAN provisioning delivery \u2014 what was actually built vs what was designed, any deviations", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "design approved. create all the jinja2 templates, command templates, and workflows and package them into a project", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i have a customer-spec \u2014 assess whether the platform can support it and design the solution", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "help me write the requirements for a port turn-up use case. 
haven't started the design yet", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "document all the workflows in my platform and produce a README", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "connect to my platform and tell me what workflows already exist that we could reuse", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "my flowagent has been doing software upgrades in prod \u2014 turn it into a deterministic workflow spec", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i have a golden config tree for my core routers but the compliance rules keep false-positiving \u2014 fix the rules", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i have an existing project DNS_Management \u2014 extract a spec and design from it", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "we haven't approved the design yet. 
still iterating on the solution-design.md", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "run the spec agent on the device onboarding use case \u2014 help me nail down the requirements", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "build a python IAG service that checks BGP neighbors and returns a health report", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + } + ], + "summary": { + "total": 20, + "passed": 10, + "failed": 10 + } +} \ No newline at end of file diff --git a/evals/trigger-evals/builder-agent.json b/evals/trigger-evals/builder-agent.json new file mode 100644 index 0000000..c111d9a --- /dev/null +++ b/evals/trigger-evals/builder-agent.json @@ -0,0 +1,82 @@ +[ + { + "query": "solution design is approved — go ahead and build everything. workflows, templates, the project, all of it", + "should_trigger": true + }, + { + "query": "i have an approved solution-design.md for VLAN provisioning. start the build — child workflows first, then the parent orchestrator", + "should_trigger": true + }, + { + "query": "the design is locked. implement it and test each component before moving to the next", + "should_trigger": true + }, + { + "query": "build the BGP peer provisioning workflows per the approved design. platform creds are in bgp-peer/.env", + "should_trigger": true + }, + { + "query": "we just approved the solution design for firewall rule lifecycle. build it out on the platform and document what was actually delivered", + "should_trigger": true + }, + { + "query": "implement the approved design. create the project, build the child workflows, then the parent. 
run the acceptance tests when done", + "should_trigger": true + }, + { + "query": "solution-design.md is in my-usecase/ — build everything in it and produce an as-built record", + "should_trigger": true + }, + { + "query": "the build failed halfway through — the childJob in the parent workflow is stuck. check the job error and fix it", + "should_trigger": true + }, + { + "query": "write up the as-built documentation for the VLAN provisioning delivery — what was actually built vs what was designed, any deviations", + "should_trigger": true + }, + { + "query": "design approved. create all the jinja2 templates, command templates, and workflows and package them into a project", + "should_trigger": true + }, + { + "query": "help me write the requirements for a port turn-up use case. haven't started the design yet", + "should_trigger": false + }, + { + "query": "i have a customer-spec — assess whether the platform can support it and design the solution", + "should_trigger": false + }, + { + "query": "connect to my platform and tell me what workflows already exist that we could reuse", + "should_trigger": false + }, + { + "query": "document all the workflows in my platform and produce a README", + "should_trigger": false + }, + { + "query": "my flowagent has been doing software upgrades in prod — turn it into a deterministic workflow spec", + "should_trigger": false + }, + { + "query": "build a python IAG service that checks BGP neighbors and returns a health report", + "should_trigger": false + }, + { + "query": "i have a golden config tree for my core routers but the compliance rules keep false-positiving — fix the rules", + "should_trigger": false + }, + { + "query": "we haven't approved the design yet. 
still iterating on the solution-design.md", + "should_trigger": false + }, + { + "query": "run the spec agent on the device onboarding use case — help me nail down the requirements", + "should_trigger": false + }, + { + "query": "i have an existing project DNS_Management — extract a spec and design from it", + "should_trigger": false + } +] diff --git a/evals/trigger-evals/documentation-results.json b/evals/trigger-evals/documentation-results.json new file mode 100644 index 0000000..3affbeb --- /dev/null +++ b/evals/trigger-evals/documentation-results.json @@ -0,0 +1,171 @@ +{ + "skill_name": "documentation", + "description": "Use this skill to survey and catalog an Itential platform \u2014 when someone wants to know what's on their platform, document global assets (workflows, templates, LCM models, golden config, OM automations) that are NOT inside a named project, group them into logical use cases, and produce a master catalog or README. Trigger it for phrases like \"document everything on the platform\", \"what use cases do we have?\", \"catalog all our global workflows\", \"I inherited this platform and have no idea what's there\", \"group our automations by use case\", or \"produce a platform README\". The output is a structured catalog: customer-spec.md + solution-design.md per use case + master README. NOT for documenting a specific named project \u2014 use /project-to-spec for that. NOT for building new automation.", + "results": [ + { + "query": "our platform has golden config trees, compliance plans, and a bunch of jinja templates but zero documentation. i need a proper HLD and LLD for each use case so we can hand this off to support", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "there's a workflow called 'VLAN_Provision_Parent' and like 4 child workflows that go with it. can you write up what it does, what the inputs are, and how it all fits together? 
we need something the ops team can actually read", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "I just inherited this platform from someone who left the company. there are 40+ workflows and i have no idea whats connected to what. can you reverse engineer the whole thing and produce documentation?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "I need to document everything on our platform \u2014 we have like 80+ workflows and nobody knows what half of them do anymore. Can you go through all the global assets and group them into use cases so we can actually understand what we have?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "can you look at the 'DNS_Management' project and document it \u2014 what it does, what systems it touches, and how the workflows are connected", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "document the LCM resource model called 'BGP_Peer_Service' \u2014 i need to know what lifecycle actions it has, what the input schema looks like, and how it relates to the workflows that trigger it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "we need to document our OM automations and the forms that feed into them \u2014 specifically the ones under the 'Network Changes' folder. 
produce a spec and a design doc for each one", + "should_trigger": true, + "trigger_rate": 0.5, + "triggers": 1, + "runs": 2, + "pass": true + }, + { + "query": "run documentation across the whole platform and generate a master README with all use cases listed", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "document all the command templates and analytic templates we have \u2014 group them by what they're used for and produce a solution design for each group", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i need to produce a customer-facing HLD for our device onboarding automation. the workflows and templates already exist on the platform, just need the docs", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i have an existing project called 'Firewall_Rule_Lifecycle' that does palo alto rule management. can you extract a spec and solution design from it so i can use it as a baseline for rebuilding?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "my flowagent has been running in production for 3 months doing BGP neighbor checks. i want to productionize it as a deterministic workflow \u2014 can you convert it?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "the 'Software_Upgrade_Parent' workflow is failing \u2014 it gets stuck after the pre-check stage and never moves to the upgrade step. job id is 68a3bc2f. what's wrong?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i want to automate VLAN provisioning on our platform. we have cisco switches, infoblox for ipam, and servicenow for tickets. 
can you help me build this out?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "connect to my platform and show me what adapters are running, what workflows exist, and what projects are available", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "we have a golden config tree for our core routers but the compliance plan keeps flagging false positives on the ISIS metric rules. can you look at it and fix the rules?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "we approved the solution design for the port turn-up use case. go ahead and build all the components \u2014 workflows, templates, and the project", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "build me an iag python service that queries netbox and returns a list of devices by site. needs to handle pagination and output clean json", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i need to write up the requirements for a new certificate rotation automation. 
we use infoblox, servicenow, and a bunch of f5 load balancers", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i want to create a new flowagent that can troubleshoot BGP issues \u2014 it should be able to run show commands, check neighbors, and open a servicenow ticket if something looks wrong", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + } + ], + "summary": { + "total": 20, + "passed": 11, + "failed": 9 + } +} \ No newline at end of file diff --git a/evals/trigger-evals/documentation.json b/evals/trigger-evals/documentation.json new file mode 100644 index 0000000..c757b7b --- /dev/null +++ b/evals/trigger-evals/documentation.json @@ -0,0 +1,82 @@ +[ + { + "query": "I need to document everything on our platform — we have like 80+ workflows and nobody knows what half of them do anymore. Can you go through all the global assets and group them into use cases so we can actually understand what we have?", + "should_trigger": true + }, + { + "query": "there's a workflow called 'VLAN_Provision_Parent' and like 4 child workflows that go with it. can you write up what it does, what the inputs are, and how it all fits together? we need something the ops team can actually read", + "should_trigger": true + }, + { + "query": "our platform has golden config trees, compliance plans, and a bunch of jinja templates but zero documentation. i need a proper HLD and LLD for each use case so we can hand this off to support", + "should_trigger": true + }, + { + "query": "document the LCM resource model called 'BGP_Peer_Service' — i need to know what lifecycle actions it has, what the input schema looks like, and how it relates to the workflows that trigger it", + "should_trigger": true + }, + { + "query": "I just inherited this platform from someone who left the company. there are 40+ workflows and i have no idea whats connected to what. 
can you reverse engineer the whole thing and produce documentation?", + "should_trigger": true + }, + { + "query": "we need to document our OM automations and the forms that feed into them — specifically the ones under the 'Network Changes' folder. produce a spec and a design doc for each one", + "should_trigger": true + }, + { + "query": "can you look at the 'DNS_Management' project and document it — what it does, what systems it touches, and how the workflows are connected", + "should_trigger": true + }, + { + "query": "i need to produce a customer-facing HLD for our device onboarding automation. the workflows and templates already exist on the platform, just need the docs", + "should_trigger": true + }, + { + "query": "document all the command templates and analytic templates we have — group them by what they're used for and produce a solution design for each group", + "should_trigger": true + }, + { + "query": "run documentation across the whole platform and generate a master README with all use cases listed", + "should_trigger": true + }, + { + "query": "i want to automate VLAN provisioning on our platform. we have cisco switches, infoblox for ipam, and servicenow for tickets. can you help me build this out?", + "should_trigger": false + }, + { + "query": "the 'Software_Upgrade_Parent' workflow is failing — it gets stuck after the pre-check stage and never moves to the upgrade step. job id is 68a3bc2f. what's wrong?", + "should_trigger": false + }, + { + "query": "i have an existing project called 'Firewall_Rule_Lifecycle' that does palo alto rule management. can you extract a spec and solution design from it so i can use it as a baseline for rebuilding?", + "should_trigger": false + }, + { + "query": "my flowagent has been running in production for 3 months doing BGP neighbor checks. 
i want to productionize it as a deterministic workflow — can you convert it?", + "should_trigger": false + }, + { + "query": "connect to my platform and show me what adapters are running, what workflows exist, and what projects are available", + "should_trigger": false + }, + { + "query": "we approved the solution design for the port turn-up use case. go ahead and build all the components — workflows, templates, and the project", + "should_trigger": false + }, + { + "query": "i need to write up the requirements for a new certificate rotation automation. we use infoblox, servicenow, and a bunch of f5 load balancers", + "should_trigger": false + }, + { + "query": "build me an iag python service that queries netbox and returns a list of devices by site. needs to handle pagination and output clean json", + "should_trigger": false + }, + { + "query": "we have a golden config tree for our core routers but the compliance plan keeps flagging false positives on the ISIS metric rules. can you look at it and fix the rules?", + "should_trigger": false + }, + { + "query": "i want to create a new flowagent that can troubleshoot BGP issues — it should be able to run show commands, check neighbors, and open a servicenow ticket if something looks wrong", + "should_trigger": false + } +] diff --git a/evals/trigger-evals/explore-results.json b/evals/trigger-evals/explore-results.json new file mode 100644 index 0000000..ef73dfa --- /dev/null +++ b/evals/trigger-evals/explore-results.json @@ -0,0 +1,171 @@ +{ + "skill_name": "explore", + "description": "Use this skill whenever someone wants to connect to an Itential platform and browse, inspect, or discover what's there \u2014 without starting a formal delivery. 
Trigger it for phrases like \"connect to my platform\", \"show me what adapters are running\", \"authenticate and pull platform data\", \"I want to poke around before starting\", \"what workflows exist?\", \"give me an inventory of the platform\", \"browse capabilities freely\", \"check if adapter X is running\", or \"I just set up a new environment \u2014 show me what's there\". Also use it for ad-hoc freestyle work where the user wants to build something directly without going through the full spec\u2192design\u2192build lifecycle.", + "results": [ + { + "query": "connect to my platform and show me what adapters are running, what workflows exist, and what projects are available", + "should_trigger": true, + "trigger_rate": 0.5, + "triggers": 1, + "runs": 2, + "pass": true + }, + { + "query": "browse my platform freely \u2014 i want to understand what tasks, adapters, and existing workflows are available before i commit to a design", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "log into my platform and pull all the available tasks for the ServiceNow adapter so i can see what we can use", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i want to poke around my platform before starting any delivery \u2014 authenticate and show me what's there", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "can you authenticate to https://myplatform.itential.io and give me a summary of what's installed and running", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "check if the infoblox adapter is running and what tasks it exposes", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i just spun up a new itential environment and want to know what i'm 
working with. connect and give me an inventory", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i want to try a quick automation without writing a full spec. connect to my platform and let's just build it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "pull the list of all workflows on my platform and tell me which ones look like they're related to device onboarding", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "freestyle exploration \u2014 connect to the platform and help me build something without going through the full delivery lifecycle", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "help me write requirements for a new VLAN provisioning use case. we have cisco switches and infoblox", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "solution design is approved \u2014 build all the components and run the tests", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "document all the global workflows on my platform and group them by use case", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i have a project called Firewall_Rule_Lifecycle with no docs \u2014 reverse engineer it into a spec", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "the customer-spec.md is done. connect to the platform and check feasibility", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "my flowagent has been doing software upgrades in prod. 
convert it to a deterministic workflow", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "create a golden config tree for our cisco core routers with ISIS and BGP compliance rules", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "write the as-built documentation for the VLAN provisioning delivery", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "run the full delivery lifecycle for BGP peer provisioning \u2014 start with requirements", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "build an IAG python service that checks device reachability and returns pass/fail", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + } + ], + "summary": { + "total": 20, + "passed": 11, + "failed": 9 + } +} \ No newline at end of file diff --git a/evals/trigger-evals/explore.json b/evals/trigger-evals/explore.json new file mode 100644 index 0000000..2c335bc --- /dev/null +++ b/evals/trigger-evals/explore.json @@ -0,0 +1,82 @@ +[ + { + "query": "connect to my platform and show me what adapters are running, what workflows exist, and what projects are available", + "should_trigger": true + }, + { + "query": "i want to poke around my platform before starting any delivery — authenticate and show me what's there", + "should_trigger": true + }, + { + "query": "browse my platform freely — i want to understand what tasks, adapters, and existing workflows are available before i commit to a design", + "should_trigger": true + }, + { + "query": "log into my platform and pull all the available tasks for the ServiceNow adapter so i can see what we can use", + "should_trigger": true + }, + { + "query": "can you authenticate to https://myplatform.itential.io and give me a summary of what's 
installed and running", + "should_trigger": true + }, + { + "query": "i just spun up a new itential environment and want to know what i'm working with. connect and give me an inventory", + "should_trigger": true + }, + { + "query": "freestyle exploration — connect to the platform and help me build something without going through the full delivery lifecycle", + "should_trigger": true + }, + { + "query": "check if the infoblox adapter is running and what tasks it exposes", + "should_trigger": true + }, + { + "query": "i want to try a quick automation without writing a full spec. connect to my platform and let's just build it", + "should_trigger": true + }, + { + "query": "pull the list of all workflows on my platform and tell me which ones look like they're related to device onboarding", + "should_trigger": true + }, + { + "query": "help me write requirements for a new VLAN provisioning use case. we have cisco switches and infoblox", + "should_trigger": false + }, + { + "query": "solution design is approved — build all the components and run the tests", + "should_trigger": false + }, + { + "query": "document all the global workflows on my platform and group them by use case", + "should_trigger": false + }, + { + "query": "the customer-spec.md is done. connect to the platform and check feasibility", + "should_trigger": false + }, + { + "query": "build an IAG python service that checks device reachability and returns pass/fail", + "should_trigger": false + }, + { + "query": "i have a project called Firewall_Rule_Lifecycle with no docs — reverse engineer it into a spec", + "should_trigger": false + }, + { + "query": "run the full delivery lifecycle for BGP peer provisioning — start with requirements", + "should_trigger": false + }, + { + "query": "my flowagent has been doing software upgrades in prod. 
convert it to a deterministic workflow", + "should_trigger": false + }, + { + "query": "create a golden config tree for our cisco core routers with ISIS and BGP compliance rules", + "should_trigger": false + }, + { + "query": "write the as-built documentation for the VLAN provisioning delivery", + "should_trigger": false + } +] diff --git a/evals/trigger-evals/solution-arch-agent-results.json b/evals/trigger-evals/solution-arch-agent-results.json new file mode 100644 index 0000000..3738b1b --- /dev/null +++ b/evals/trigger-evals/solution-arch-agent-results.json @@ -0,0 +1,171 @@ +{ + "skill_name": "solution-arch-agent", + "description": "Use this skill when someone has approved requirements (a customer-spec.md) and needs to assess platform feasibility or produce a solution design. Trigger it for phrases like \"requirements are approved\", \"my spec is done\", \"check if the platform supports this\", \"run feasibility\", \"connect to the platform and design the solution\", \"I have a customer-spec \u2014 now what?\", or \"produce a solution-design.md\". This skill connects to the live platform, checks what adapters and capabilities are available, and produces feasibility.md and solution-design.md. Also trigger it in design-only mode when the implementation plan needs to change but requirements are stable.", + "results": [ + { + "query": "the customer-spec.md for VLAN provisioning is approved. connect to my platform and tell me if it's actually feasible \u2014 we have infoblox, servicenow, and cisco IOS devices", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i have an approved spec. run feasibility \u2014 platform is at https://myplatform.itential.io, oauth creds are in my-usecase/.env", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "requirements are locked for the BGP peer provisioning use case. 
now i need to know what the platform can support and design the solution", + "should_trigger": true, + "trigger_rate": 0.5, + "triggers": 1, + "runs": 2, + "pass": true + }, + { + "query": "we need a solution design for the software upgrade use case. spec is approved, feasibility was done last week. just redo the design \u2014 platform hasn't changed", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "my customer-spec is done and the stakeholders signed off. what adapters do we have, what can we reuse, and how should we structure the build?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "the requirements for firewall rule lifecycle are approved. assess whether our platform can support it and produce a solution design", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "requirements done. now check if the platform supports this and design how we'll build it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "design-only mode \u2014 the requirements haven't changed but we need to redesign the implementation because we're switching from NSO to native IOS adapters", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i want to automate VLAN provisioning. we haven't written any requirements yet \u2014 help me figure out what we need to build", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "solution design is approved. go ahead and build all the components", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i have a customer-spec.md. 
produce a feasibility.md and solution-design.md for it", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "connect to my platform and browse around \u2014 show me what workflows and adapters are available", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "run the feasibility stage for port turn-up. we have infoblox, servicenow, and netbox on the platform", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "document all the workflows on the platform. group them by use case", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "the VLAN provisioning build is done. write up the as-built documentation", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i have a project called DNS_Management with no documentation. 
reverse engineer it into a spec and design", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "my flowagent has been doing certificate rotation for months \u2014 convert it to a deterministic workflow spec", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "help me write requirements for a config backup and compliance use case", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "check if adapter infoblox is running on my platform", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "build an IAG python service that queries netbox and returns device lists by site", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + } + ], + "summary": { + "total": 20, + "passed": 11, + "failed": 9 + } +} \ No newline at end of file diff --git a/evals/trigger-evals/solution-arch-agent.json b/evals/trigger-evals/solution-arch-agent.json new file mode 100644 index 0000000..5a813dd --- /dev/null +++ b/evals/trigger-evals/solution-arch-agent.json @@ -0,0 +1,82 @@ +[ + { + "query": "the customer-spec.md for VLAN provisioning is approved. connect to my platform and tell me if it's actually feasible — we have infoblox, servicenow, and cisco IOS devices", + "should_trigger": true + }, + { + "query": "requirements are locked for the BGP peer provisioning use case. now i need to know what the platform can support and design the solution", + "should_trigger": true + }, + { + "query": "i have an approved spec. run feasibility — platform is at https://myplatform.itential.io, oauth creds are in my-usecase/.env", + "should_trigger": true + }, + { + "query": "we need a solution design for the software upgrade use case. spec is approved, feasibility was done last week. 
just redo the design — platform hasn't changed", + "should_trigger": true + }, + { + "query": "my customer-spec is done and the stakeholders signed off. what adapters do we have, what can we reuse, and how should we structure the build?", + "should_trigger": true + }, + { + "query": "the requirements for firewall rule lifecycle are approved. assess whether our platform can support it and produce a solution design", + "should_trigger": true + }, + { + "query": "run the feasibility stage for port turn-up. we have infoblox, servicenow, and netbox on the platform", + "should_trigger": true + }, + { + "query": "requirements done. now check if the platform supports this and design how we'll build it", + "should_trigger": true + }, + { + "query": "design-only mode — the requirements haven't changed but we need to redesign the implementation because we're switching from NSO to native IOS adapters", + "should_trigger": true + }, + { + "query": "i have a customer-spec.md. produce a feasibility.md and solution-design.md for it", + "should_trigger": true + }, + { + "query": "i want to automate VLAN provisioning. we haven't written any requirements yet — help me figure out what we need to build", + "should_trigger": false + }, + { + "query": "solution design is approved. go ahead and build all the components", + "should_trigger": false + }, + { + "query": "connect to my platform and browse around — show me what workflows and adapters are available", + "should_trigger": false + }, + { + "query": "document all the workflows on the platform. group them by use case", + "should_trigger": false + }, + { + "query": "the VLAN provisioning build is done. write up the as-built documentation", + "should_trigger": false + }, + { + "query": "my flowagent has been doing certificate rotation for months — convert it to a deterministic workflow spec", + "should_trigger": false + }, + { + "query": "i have a project called DNS_Management with no documentation. 
reverse engineer it into a spec and design", + "should_trigger": false + }, + { + "query": "help me write requirements for a config backup and compliance use case", + "should_trigger": false + }, + { + "query": "build an IAG python service that queries netbox and returns device lists by site", + "should_trigger": false + }, + { + "query": "check if adapter infoblox is running on my platform", + "should_trigger": false + } +] diff --git a/evals/trigger-evals/spec-agent-results.json b/evals/trigger-evals/spec-agent-results.json new file mode 100644 index 0000000..cf53107 --- /dev/null +++ b/evals/trigger-evals/spec-agent-results.json @@ -0,0 +1,171 @@ +{ + "skill_name": "spec-agent", + "description": "Use this skill to start any new automation delivery \u2014 when someone wants to automate something, build a new use case, figure out requirements, write up an HLD, or kick off a project on the Itential Platform. Trigger it for phrases like \"I want to automate X\", \"help me build a workflow for Y\", \"we're starting a new automation project\", \"I need to define requirements for Z\", \"kick off a delivery\", or \"let's start with requirements\". This is the entry point for the spec-driven delivery lifecycle. It picks from 22 built-in use case specs or starts from scratch, refines requirements with the engineer, and produces an approved customer-spec.md. Use it whenever someone is at the beginning of building something new and hasn't yet defined what they're building.", + "results": [ + { + "query": "starting a new automation project for firewall rule lifecycle management. pan-os and servicenow involved. can you help me nail down the scope and acceptance criteria before we get into design?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "we have 22 use case specs \u2014 pick the DNS record management one and help me refine it for our environment. 
we use infoblox and our naming convention is ..corp", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "i want to automate VLAN provisioning on our network \u2014 cisco switches, infoblox for ipam, servicenow for change tickets. where do i start?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "need a proper HLD for a device onboarding automation. stakeholders want to sign off before any build work starts. can you run through the requirements with me?", + "should_trigger": true, + "trigger_rate": 0.5, + "triggers": 1, + "runs": 2, + "pass": true + }, + { + "query": "kick off a delivery for BGP peer provisioning \u2014 let's start with requirements", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "we're planning an EVPN provisioning project and i need to document what we're building before the team starts. help me structure the use case", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "the spec for our circuit provisioning use case needs updating \u2014 add requirements for rollback handling and a constraint around maintenance windows", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "start a new delivery \u2014 use case is config drift detection and remediation, target devices are juniper MX routers", + "should_trigger": true, + "trigger_rate": 0.5, + "triggers": 1, + "runs": 2, + "pass": true + }, + { + "query": "the customer-spec.md is done and the customer signed off. go ahead and start building the workflows", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "my feasibility.md is approved. 
now i need to design the solution \u2014 what components do i need and how should they be structured?", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i've got a rough idea for an incident auto-remediation workflow. nothing designed yet. help me turn this into a proper requirements doc we can take into design review", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "connect to my platform and show me what adapters are running and what workflows already exist", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "we have an existing project called DNS_Management \u2014 extract a spec from it so we can use it as a starting point", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "document all the workflows on the platform and group them by use case so we know what we have", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "we need to build a software upgrade workflow for our fleet. help me write up the requirements \u2014 what questions should we be asking before we design anything?", + "should_trigger": true, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": false + }, + { + "query": "my flowagent has been running the certificate rotation use case in prod for 2 months. convert it to a deterministic workflow", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "build the VLAN provisioning workflow per the approved solution design. 
start with the child workflows first", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "i want to explore what adapters and tasks are available on my platform before committing to a design", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "can you check what's in the helpers folder and explain what the workflow-task-adapter.json template is for", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + }, + { + "query": "the solution design for port turn-up is approved. run the build", + "should_trigger": false, + "trigger_rate": 0.0, + "triggers": 0, + "runs": 2, + "pass": true + } + ], + "summary": { + "total": 20, + "passed": 12, + "failed": 8 + } +} \ No newline at end of file diff --git a/evals/trigger-evals/spec-agent.json b/evals/trigger-evals/spec-agent.json new file mode 100644 index 0000000..a70aae0 --- /dev/null +++ b/evals/trigger-evals/spec-agent.json @@ -0,0 +1,82 @@ +[ + { + "query": "i want to automate VLAN provisioning on our network — cisco switches, infoblox for ipam, servicenow for change tickets. where do i start?", + "should_trigger": true + }, + { + "query": "we need to build a software upgrade workflow for our fleet. help me write up the requirements — what questions should we be asking before we design anything?", + "should_trigger": true + }, + { + "query": "starting a new automation project for firewall rule lifecycle management. pan-os and servicenow involved. can you help me nail down the scope and acceptance criteria before we get into design?", + "should_trigger": true + }, + { + "query": "we have 22 use case specs — pick the DNS record management one and help me refine it for our environment. we use infoblox and our naming convention is ..corp", + "should_trigger": true + }, + { + "query": "need a proper HLD for a device onboarding automation. 
stakeholders want to sign off before any build work starts. can you run through the requirements with me?", + "should_trigger": true + }, + { + "query": "kick off a delivery for BGP peer provisioning — let's start with requirements", + "should_trigger": true + }, + { + "query": "we're planning an EVPN provisioning project and i need to document what we're building before the team starts. help me structure the use case", + "should_trigger": true + }, + { + "query": "the spec for our circuit provisioning use case needs updating — add requirements for rollback handling and a constraint around maintenance windows", + "should_trigger": true + }, + { + "query": "i've got a rough idea for an incident auto-remediation workflow. nothing designed yet. help me turn this into a proper requirements doc we can take into design review", + "should_trigger": true + }, + { + "query": "start a new delivery — use case is config drift detection and remediation, target devices are juniper MX routers", + "should_trigger": true + }, + { + "query": "my feasibility.md is approved. now i need to design the solution — what components do i need and how should they be structured?", + "should_trigger": false + }, + { + "query": "the customer-spec.md is done and the customer signed off. go ahead and start building the workflows", + "should_trigger": false + }, + { + "query": "connect to my platform and show me what adapters are running and what workflows already exist", + "should_trigger": false + }, + { + "query": "document all the workflows on the platform and group them by use case so we know what we have", + "should_trigger": false + }, + { + "query": "we have an existing project called DNS_Management — extract a spec from it so we can use it as a starting point", + "should_trigger": false + }, + { + "query": "build the VLAN provisioning workflow per the approved solution design. 
start with the child workflows first", + "should_trigger": false + }, + { + "query": "my flowagent has been running the certificate rotation use case in prod for 2 months. convert it to a deterministic workflow", + "should_trigger": false + }, + { + "query": "i want to explore what adapters and tasks are available on my platform before committing to a design", + "should_trigger": false + }, + { + "query": "the solution design for port turn-up is approved. run the build", + "should_trigger": false + }, + { + "query": "can you check what's in the helpers folder and explain what the workflow-task-adapter.json template is for", + "should_trigger": false + } +] From 189934e80c163e547dcecda06252df88818ace94 Mon Sep 17 00:00:00 2001 From: Ankit Bhansali <16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:09:06 -0400 Subject: [PATCH 2/6] docs: expand silent re-auth procedure in AGENTS.md Replaces the one-liner "re-authenticate using .env" with a full step-by-step procedure covering 401/403 detection, credential lookup, OAuth vs local auth endpoint, .auth.json update, and request retry. Based on post-session feedback that the existing guidance was insufficient. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- AGENTS.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index f754952..75dee5c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -63,11 +63,24 @@ Requirements → Feasibility → Design → Build → As-Built **Auth happens when first needed** — in `/explore` (explore path) or in `/solution-arch-agent` during Feasibility. The token is saved to `{use-case}/.auth.json`. Every subsequent skill should: 1. Read `{use-case}/.auth.json` for `platform_url`, `auth_method`, and `token` 2. Use the token for all API calls (Bearer header for OAuth, query param for local) -3. On auth error (401/403): re-authenticate using `{use-case}/.env` and update `.auth.json` +3. 
On auth error (401/403): re-authenticate silently — see procedure below 4. **Never ask the user for credentials if `.env` exists** This means the user authenticates once and every subsequent skill just works. +**Token expiry — silent re-auth procedure:** + +When any API call returns 401 or 403, do not stop and do not ask the user. Re-authenticate silently: + +1. Read credentials from `{use-case}/.env` (or `${CLAUDE_PLUGIN_ROOT}/environments/*.env` if no use-case `.env`) +2. Call the appropriate auth endpoint: + - **OAuth:** `POST {PLATFORM_URL}/oauth/token` with `Content-Type: application/x-www-form-urlencoded` and body `grant_type=client_credentials&client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}` + - **Local/password:** `POST {PLATFORM_URL}/login` with `{"username": "...", "password": "..."}` +3. Write the new token back to `{use-case}/.auth.json` +4. Retry the failed request once with the new token; if it fails again, stop and report the error instead of re-authenticating in a loop + +If no `.env` file exists in either location and re-auth is needed, then and only then ask the user for credentials. + ### Key Rule: Look Up Before You Act — Don't Guess **Skills** teach patterns, workflows, and know-how (how to build a childJob, how to wire variables, how to test). From eb2d8e9fa6c8c86a9bfc7d124726a765a0f94d00 Mon Sep 17 00:00:00 2001 From: Ankit Bhansali <16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:10:46 -0400 Subject: [PATCH 3/6] docs: add project-scoped asset name gotcha to AGENTS.md (#24) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Assets added to a project get an @{projectId}: prefix — PUT calls without it return 400. Applies to any skill that reads or updates project-owned assets, not just builder-agent. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- AGENTS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/AGENTS.md b/AGENTS.md index 75dee5c..3e9ea72 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -200,6 +200,7 @@ Requirements → Feasibility → Design → Build → As-Built 21. 
**Duplicate transition keys to same target** — JSON doesn't allow two keys with the same name. If a task needs both `success` and `error` to reach `workflow_end`, create an error handler task (e.g., `newVariable` to set error status) and route error there, then route that task to `workflow_end`. 22. **Respect task schema data types** — When wiring task inputs, match the type from `task-schemas.json` exactly. If a field is typed as `array`, pass an array (e.g., `["joksan@example.com"]`), not a bare string. If typed as `number`, pass a number, not a string. Common offenders: `to`/`cc`/`bcc` in email tasks (arrays, not strings), `pageSize`/`page` in queries (numbers, not strings). Mismatched types cause silent failures or validation errors. 23. **Adapter `app` ≠ adapter instance name** — The `app` and `locationType` fields on adapter tasks must be the adapter **type name** from `apps.json` (e.g., `EmailOpensource`, `Servicenow`), NOT the adapter **instance name** from `adapters.json` (e.g., `email`, `servicenow-prod`). Using the instance name causes `"No config found for Adapter: "` at runtime. The `adapter_id` field is where the instance name goes. Triple-check: `app` = type, `adapter_id` = instance. +24. **Project-scoped asset names** — Once an asset is added to a project, its `name` is prefixed with `@{projectId}: `. When updating a project-owned asset via PUT (or looking it up by name), you MUST use the scoped name — PUT calls without the prefix return 400. Read the asset first to get its current name, or construct it as `@{projectId}: {displayName}`. Strip this prefix when displaying names to the user. 
## Helper JSON Templates From bb12d8648c63562c73d9d33afc6b888f76278503 Mon Sep 17 00:00:00 2001 From: Ankit Bhansali <16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:12:06 -0400 Subject: [PATCH 4/6] fix: add adapter_id to workflow-task-adapter.json helper template MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit adapter_id is required in incoming variables for all adapter tasks — missing it causes "No config found for Adapter" at runtime. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- helpers/workflow-task-adapter.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/helpers/workflow-task-adapter.json b/helpers/workflow-task-adapter.json index 844193d..dc81e8b 100644 --- a/helpers/workflow-task-adapter.json +++ b/helpers/workflow-task-adapter.json @@ -9,7 +9,9 @@ "type": "automatic", "displayName": "", "variables": { - "incoming": {}, + "incoming": { + "adapter_id": "" + }, "outgoing": {}, "error": "", "decorators": [] From b42325b559864d56acf7a2252e17c047c470cf0d Mon Sep 17 00:00:00 2001 From: Ankit Bhansali <16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:12:54 -0400 Subject: [PATCH 5/6] fix: add adapter_id as top-level field in workflow-task-adapter.json Co-Authored-By: Claude Sonnet 4.6 (1M context) --- helpers/workflow-task-adapter.json | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/helpers/workflow-task-adapter.json b/helpers/workflow-task-adapter.json index dc81e8b..8594255 100644 --- a/helpers/workflow-task-adapter.json +++ b/helpers/workflow-task-adapter.json @@ -6,12 +6,11 @@ "location": "Adapter", "locationType": "", "app": "", + "adapter_id": "", "type": "automatic", "displayName": "", "variables": { - "incoming": { - "adapter_id": "" - }, + "incoming": {}, "outgoing": {}, "error": "", "decorators": [] From c032c04df321b1bccd5e9b7504cc501ba9f4a86f Mon Sep 17 00:00:00 2001 From: Ankit Bhansali 
<16569456+keepithuman@users.noreply.github.com> Date: Fri, 10 Apr 2026 18:15:22 -0400 Subject: [PATCH 6/6] docs: restore invoke/handoff lines to delivery skill descriptions Adds lifecycle sequencing back to spec-agent, solution-arch-agent, and builder-agent descriptions so Claude knows the correct order and what each skill hands off to next. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- .claude/skills/builder-agent/SKILL.md | 2 +- .claude/skills/solution-arch-agent/SKILL.md | 2 +- .claude/skills/spec-agent/SKILL.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.claude/skills/builder-agent/SKILL.md b/.claude/skills/builder-agent/SKILL.md index 122f1c6..9d0d61f 100644 --- a/.claude/skills/builder-agent/SKILL.md +++ b/.claude/skills/builder-agent/SKILL.md @@ -1,6 +1,6 @@ --- name: builder-agent -description: Use this skill when someone has an approved solution design and is ready to build. Trigger it for phrases like "solution design is approved", "go ahead and build", "implement the design", "create the workflows", "build everything per the design", "start the build", "the design is locked — implement it", or "write the as-built documentation". Also trigger it when a build is failing mid-way and needs debugging. This skill implements the approved solution-design.md end-to-end — creating all workflows, templates, projects, and configs, testing each component, and producing as-built.md. If the user has a solution-design.md and wants to turn it into working automation, this is the right skill. +description: Use this skill when someone has an approved solution design and is ready to build. Trigger it for phrases like "solution design is approved", "go ahead and build", "implement the design", "create the workflows", "build everything per the design", "start the build", "the design is locked — implement it", or "write the as-built documentation". Also trigger it when a build is failing mid-way and needs debugging. 
This skill implements the approved solution-design.md end-to-end — creating all workflows, templates, projects, and configs, testing each component, and producing as-built.md. If the user has a solution-design.md and wants to turn it into working automation, this is the right skill. Invoke after /solution-arch-agent produces an approved solution-design.md. --- # Builder Agent diff --git a/.claude/skills/solution-arch-agent/SKILL.md b/.claude/skills/solution-arch-agent/SKILL.md index cfe987a..29badde 100644 --- a/.claude/skills/solution-arch-agent/SKILL.md +++ b/.claude/skills/solution-arch-agent/SKILL.md @@ -1,6 +1,6 @@ --- name: solution-arch-agent -description: Use this skill when someone has approved requirements (a customer-spec.md) and needs to assess platform feasibility or produce a solution design. Trigger it for phrases like "requirements are approved", "my spec is done", "check if the platform supports this", "run feasibility", "connect to the platform and design the solution", "I have a customer-spec — now what?", or "produce a solution-design.md". This skill connects to the live platform, checks what adapters and capabilities are available, and produces feasibility.md and solution-design.md. Also trigger it in design-only mode when the implementation plan needs to change but requirements are stable. +description: Use this skill when someone has approved requirements (a customer-spec.md) and needs to assess platform feasibility or produce a solution design. Trigger it for phrases like "requirements are approved", "my spec is done", "check if the platform supports this", "run feasibility", "connect to the platform and design the solution", "I have a customer-spec — now what?", or "produce a solution-design.md". This skill connects to the live platform, checks what adapters and capabilities are available, and produces feasibility.md and solution-design.md. 
Also trigger it in design-only mode when the implementation plan needs to change but requirements are stable. Invoke after /spec-agent produces an approved customer-spec.md. Hands off to /builder-agent after design approval. --- # Solution Architecture Agent diff --git a/.claude/skills/spec-agent/SKILL.md b/.claude/skills/spec-agent/SKILL.md index 65c4281..0c4d049 100644 --- a/.claude/skills/spec-agent/SKILL.md +++ b/.claude/skills/spec-agent/SKILL.md @@ -1,6 +1,6 @@ --- name: spec-agent -description: Use this skill to start any new automation delivery — when someone wants to automate something, build a new use case, figure out requirements, write up an HLD, or kick off a project on the Itential Platform. Trigger it for phrases like "I want to automate X", "help me build a workflow for Y", "we're starting a new automation project", "I need to define requirements for Z", "kick off a delivery", or "let's start with requirements". This is the entry point for the spec-driven delivery lifecycle. It picks from 22 built-in use case specs or starts from scratch, refines requirements with the engineer, and produces an approved customer-spec.md. Use it whenever someone is at the beginning of building something new and hasn't yet defined what they're building. +description: Use this skill to start any new automation delivery — when someone wants to automate something, build a new use case, figure out requirements, write up an HLD, or kick off a project on the Itential Platform. Trigger it for phrases like "I want to automate X", "help me build a workflow for Y", "we're starting a new automation project", "I need to define requirements for Z", "kick off a delivery", or "let's start with requirements". This is the entry point for the spec-driven delivery lifecycle. It picks from 22 built-in use case specs or starts from scratch, refines requirements with the engineer, and produces an approved customer-spec.md. 
Use it whenever someone is at the beginning of building something new and hasn't yet defined what they're building. For ad-hoc platform exploration, use /explore instead. Hands off to /solution-arch-agent after approval. --- # Spec Agent