3 changes: 3 additions & 0 deletions .gitignore
@@ -8,6 +8,9 @@ dist/
# Sample data and benchmark outputs
samples/

# Reference/legacy implementations
references/

# Generated context dumps
llms-full.txt

193 changes: 94 additions & 99 deletions README.md
@@ -1,47 +1,55 @@
# CodeAssure

AI-powered SAST finding verification. Takes SAST scanner results and a codebase, uses an LLM agent to verify each finding, and produces enriched results with verdicts, severity ratings, and visual explanations.

![CodeAssure System Architecture](./codeassure.png)

## Quick Start

```bash
# Install
uv sync
uv pip install -e .

# Run
codeassure \
--codebase ./my-project \
--findings results.json \
--output verified.json

# With benchmarking
codeassure \
--codebase ./my-project \
--findings results.json \
--output verified.json \
--verify ground_truth.json
```

## How It Works

CodeAssure runs a three-stage agent pipeline:

1. **Analyzer** (Generator) — tool-using agent reads the flagged code, gathers context via `read_file` and `grep_code`, produces a structured analysis
2. **Formatter** — extracts a JSON verdict from the analysis, with a repair loop for malformed output
3. **Evaluator** — reviews the verdict for internal consistency, assigns severity, and can reject for retry
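
The Formatter's repair loop can be sketched as follows; `call_llm` and the retry budget are hypothetical stand-ins for the actual agent plumbing:

```python
import json

def extract_verdict(analysis: str, call_llm, max_repairs: int = 2) -> dict:
    """Ask the model for a JSON verdict; re-prompt on malformed output."""
    prompt = f"Return a JSON verdict for this analysis:\n{analysis}"
    for _attempt in range(max_repairs + 1):
        raw = call_llm(prompt)
        try:
            verdict = json.loads(raw)
            if "verdict" in verdict:  # minimal schema check
                return verdict
        except json.JSONDecodeError:
            pass
        # Feed the broken output back so the model can repair it
        prompt = f"Previous output was not valid JSON:\n{raw}\nReturn only valid JSON."
    return {"verdict": "uncertain", "reason": "formatter failed"}
```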

The verdict answers two independent questions:
- **Did the scanner correctly detect the pattern?** (`verdict`: true_positive / false_positive / uncertain)
- **Is this a security vulnerability?** (`is_security_vulnerability`: true / false / null)
Related findings are **grouped** before analysis — co-located findings on the same code get analyzed together with coherence constraints, so verdicts don't contradict each other.

Known rule families get deterministic verdict policies to reduce false negatives on common patterns.
A **finding policy** config tells the model what counts as a true positive for each customer (security-only vs. detection-semantics).
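
How a finding policy might gate verdicts can be sketched as follows — the rule-class names and gating logic here are illustrative assumptions, not CodeAssure's implementation:

```python
from dataclasses import dataclass

@dataclass
class FindingPolicy:
    # Mirrors the codeassure.json "finding_policy" block
    best_practice_is_tp: bool = True
    informational_detection_is_tp: bool = True
    audit_rule_is_tp: bool = True

def effective_verdict(rule_class: str, detected: bool, policy: FindingPolicy) -> str:
    """Map a correct detection to a verdict under the customer policy."""
    if not detected:
        return "false_positive"
    gates = {
        "best_practice": policy.best_practice_is_tp,
        "informational": policy.informational_detection_is_tp,
        "audit": policy.audit_rule_is_tp,
    }
    # Unknown rule classes default to true_positive on a correct detection
    return "true_positive" if gates.get(rule_class, True) else "false_positive"
```

Under a security-only policy, a correctly detected best-practice finding still collapses to `false_positive`.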

## CLI

```
codeassure --codebase DIR --findings FILE --output FILE [OPTIONS]
```

| Option | Description |
|---|---|
| `--codebase DIR` | Root directory that finding paths are relative to |
| `--findings FILE` | SAST findings JSON (e.g., Semgrep results.json) |
| `--output, -o FILE` | Output path for verified findings |
| `--config, -c PATH` | Path to codeassure.json (default: `./codeassure.json`) |
| `--jobs, -j N` | Max concurrent LLM requests (overrides config) |
| `--no-grouping` | Disable finding grouping (analyze each finding independently) |
| `--verify FILE` | Compare output against ground-truth JSON and write a CSV report |

## Configuration

@@ -57,7 +65,12 @@
},
"concurrency": 4,
"stage_timeout": 120,
"finding_timeout": 300,
"finding_policy": {
"best_practice_is_tp": true,
"informational_detection_is_tp": true,
"audit_rule_is_tp": true
}
}
```

@@ -73,63 +86,12 @@

### `api_base` per provider

Always provide the root host. The SDK or CodeAssure appends the correct path automatically:

| Provider | You set `api_base` | Actual endpoint called |
|---|---|---|
| `openai` / `openai-compatible` | `http://localhost:5000` | `http://localhost:5000/v1/chat/completions` |
| `anthropic` | `https://your-proxy.example.com` | `https://your-proxy.example.com/v1/messages` |
| `google` / `gemini` | `https://your-proxy.example.com` | `https://your-proxy.example.com/v1beta/models/{model}:generateContent` |

### Provider examples

**Local vLLM / OpenAI-compatible:**
```json
{
"model": {
"provider": "openai-compatible",
"name": "qwen/qwen3.5-9b",
"api_base": "http://localhost:5000",
"tool_calling": false
}
}
```

**Anthropic-compatible proxy:**
```json
{
"model": {
"provider": "anthropic",
"name": "qwen/qwen3.5-9b",
"api_base": "https://your-proxy.example.com",
"api_key": "$ANTHROPIC_API_KEY",
"tool_calling": false
}
}
```

**Anthropic (direct):**
```json
{
"model": {
"provider": "anthropic",
"name": "claude-sonnet-4-6",
"api_key": "$ANTHROPIC_API_KEY"
}
}
```

**Google Gemini:**
```json
{
"model": {
"provider": "gemini",
"name": "gemini-2.0-flash",
"api_key": "$GEMINI_API_KEY"
}
}
```

### Other config fields

| Field | Default | Description |
|---|---|---|
| `stage_timeout` | `120` | Seconds per LLM stage (analyzer or formatter) |
| `finding_timeout` | `300` | Seconds for the entire finding (both stages + repair) |
| `request_limit` | `200` | Max requests per `agent.run()` call |
| `voting_rounds` | `1` | Run each finding N times and take the majority verdict |
| `max_tokens` | `4096` | Max completion tokens per LLM call |
| `thinking_map` | `null` | Severity → thinking effort (`full`/`low`/`off`); `null` = disabled |
| `finding_policy` | all true | What counts as `true_positive` for this customer |

## Brev Setup (Remote GPU Instance)

> Instance: `accuknox-nemotron-super-3`
> Local endpoint after port-forward: `http://localhost:5000`
> Model name: `qwen35-nvfp4`

```bash
brev login
brev list
brev port-forward accuknox-nemotron-super-3 --port 5000:5000
```

The vLLM endpoint is now available at `http://localhost:5000`. Set `api_base` to `http://localhost:5000` in `codeassure.json`.
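
With `voting_rounds` greater than 1, per-round verdicts are aggregated by majority; a minimal sketch (the tie-breaking rule here is an assumption, not CodeAssure's actual one):

```python
from collections import Counter

def majority_verdict(verdicts: list[str]) -> str:
    """Aggregate N voting rounds; ties fall back to 'uncertain'."""
    counts = Counter(verdicts).most_common()
    if len(counts) > 1 and counts[0][1] == counts[1][1]:
        return "uncertain"  # no clear winner across rounds
    return counts[0][0]
```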

## Output

Each finding gets a `verification` block:

```json
{
"verification": {
"verdict": "true_positive",
"is_security_vulnerability": true,
"confidence": "high",
"severity": "high",
"reason": "subprocess.run called with dynamic user input and shell=True.",
"evidence": [{"location": "app/utils.py:42"}],
"graph": {
"summary": "Taint flow: os.environ → subprocess.run",
"mermaid": "graph TD\n ...",
"nodes": [...],
"edges": [...]
}
}
}
```
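
Downstream tooling can consume this directly; a minimal sketch, assuming the top-level `results` key shown below in the `codebase_tree` example:

```python
import json

def confirmed_vulns(output_path: str) -> list[dict]:
    """Return findings verified as real security vulnerabilities."""
    with open(output_path) as f:
        data = json.load(f)
    return [
        finding for finding in data["results"]
        if finding["verification"]["verdict"] == "true_positive"
        and finding["verification"]["is_security_vulnerability"]
    ]
```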
| Field | Values | Description |
|---|---|---|
| `confidence` | `high`, `medium`, `low` | Confidence level |
| `severity` | `critical`, `high`, `medium`, `low` | Assessed severity for `true_positive`; always `low` for `false_positive`/`uncertain` |

The output also includes a `codebase_tree` for visualization:
```json
{
"results": [...],
"codebase_tree": [
{"path": "src/app.py", "type": "file", "size": 1234},
{"path": "src/utils", "type": "dir", "size": 0}
]
}
```

## Visualization UI

```bash
cd ui
pnpm install
pnpm dev --port 3333
```

Open http://localhost:3333 and drop in the output JSON file. The UI renders a D3 force graph with findings overlaid, severity shading, and a detail panel per finding.

## Checkpointing

If the run crashes mid-way, re-run the same command. CodeAssure saves progress to `<output>.checkpoint.json` every 5 findings and resumes from where it left off. The checkpoint is deleted on successful completion.
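
A sketch of how such resume logic works — the checkpoint file layout here is assumed, not CodeAssure's actual format:

```python
import json
import os

def load_checkpoint(output_path: str) -> dict:
    """Return finding-id -> verification for already-processed findings."""
    ckpt = output_path + ".checkpoint.json"
    if os.path.exists(ckpt):
        with open(ckpt) as f:
            return json.load(f)
    return {}

def save_checkpoint(output_path: str, done: dict) -> None:
    """Persist progress so a crashed run can skip completed findings."""
    with open(output_path + ".checkpoint.json", "w") as f:
        json.dump(done, f)
```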

## Benchmarking

```bash
codeassure \
--codebase samples/sample-9/k8s_jobs \
--findings samples/sample-9/results.json \
--output samples/sample-9/output.json \
--verify samples/sample-9/final_results.json
```

Prints a confusion matrix comparing the effective verdict against ground truth (`is_false_positive` field). The collapse rule: `verdict=TP + is_security_vulnerability=false → effective FP`.

## Project Structure

```
sast_verify/
cli.py # CLI entry point
config.py # Config model + FindingPolicy
pipeline.py # Orchestration, checkpointing, codebase tree walker
preprocess.py # Normalizes raw SAST JSON into Finding objects
retrieval.py # Anchors findings to source code evidence
schema.py # Pydantic models: Finding, Evidence, Verdict
grouping.py # Finding Relationship Graph: groups co-located findings
graph.py # Mermaid flow diagram generator per finding

agents/
analyzer.py # Builds analyzer, formatter, evaluator agents
runner.py # Async runner: generator/evaluator pipeline, group analysis
tools.py # read_file, grep_code (sandboxed to codebase)
deps.py # AnalyzerDeps (tool access scope)

prompts/
__init__.py # Message builders (single, group, evaluator)
analyzer.py # System prompts (analyzer, formatter, evaluator, group variants)
rule_policies.py # Deterministic verdict policies for known rule families

eval/
evaluate.py # Fingerprint-based evaluation
ui/ # Next.js visualization app
src/
app/page.tsx # Upload + force graph view
components/
ForceGraph.tsx # D3 force graph with finding overlay
FileUpload.tsx # JSON file upload
lib/
types.ts # TypeScript types matching output schema
theme.ts # AccuKnox brand colors
graph-builder.ts # Finding flow graph for detail view
```
14 changes: 7 additions & 7 deletions codeassure.json
@@ -1,11 +1,11 @@
{
"model": {
"provider": "openai",
"name": "gemma-4-31B-it-NVFP4",
"api_base": "http://100.92.159.5:47821/v1"
},
"concurrency": 48,
"stage_timeout": 600,
"finding_timeout": 900,
"max_tokens": 16384
}