From 77719be928cc93daef90b7f98744e12033437fc3 Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:04:43 -0700 Subject: [PATCH 01/10] fix(yaml): preserve version-like strings that would lose info on float coerce MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parseNumber was silently converting "4.30" → float 4.3 → "4.3" on emit. Now only coerces to float when the value round-trips through YAML marshal back to the exact input string. Adds regression tests for both the fix and the clean-numeric case to prevent over-correction. --- internal/support/commands/yaml.go | 17 ++++- internal/support/commands/yaml_test.go | 88 ++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/internal/support/commands/yaml.go b/internal/support/commands/yaml.go index a39efd9..69ed409 100644 --- a/internal/support/commands/yaml.go +++ b/internal/support/commands/yaml.go @@ -1189,7 +1189,12 @@ func formatValue(value interface{}) string { } } -// parseNumber attempts to parse a string as int or float +// parseNumber attempts to parse a string as int or float. +// +// Coercion is only applied when the parsed numeric value round-trips back to +// the exact input string. This prevents silent truncation of version-like +// strings (e.g. "4.30" -> float 4.3 -> emitted as "4.3") and similar values +// where the input carries information that the numeric representation cannot. func parseNumber(s string) (interface{}, error) { // Try int first var i int @@ -1197,10 +1202,16 @@ func parseNumber(s string) (interface{}, error) { return i, nil } - // Try float + // Try float, but only coerce if the float round-trips to the input string + // through YAML marshaling. Values like "1.0", "4.30", "01.5" parse as + // floats but lose their original representation when serialized, so we + // keep them as strings. var f float64 if _, err := fmt.Sscanf(s, "%f", &f); err == nil { - return f, nil + marshaled, mErr := yaml.Marshal(f) + if mErr == nil && strings.TrimSpace(string(marshaled)) == s { + return f, nil + } } return nil, fmt.Errorf("not a number") diff --git a/internal/support/commands/yaml_test.go b/internal/support/commands/yaml_test.go index 18ea2cc..ad28678 100644 --- a/internal/support/commands/yaml_test.go +++ b/internal/support/commands/yaml_test.go @@ -3225,3 +3225,91 @@ items: t.Errorf("expected output to contain 'second', got: %s", output) } } + +// TestParseNumber_PreservesNonRoundtripStrings verifies that version-like +// strings whose float representation loses information stay strings. +// Regression: "4.30" was being coerced to float 4.3 and emitted as "4.3". +// +// Values that DO round-trip through YAML serialization (e.g. "1.0" -> 1.0 -> +// "1.0") are allowed to coerce because the on-disk output still matches the +// input. Only inputs that would suffer information loss must stay strings. +func TestParseNumber_PreservesNonRoundtripStrings(t *testing.T) { + cases := []string{ + "4.30", // float 4.3, would emit as "4.3" + "1.10", // float 1.1, would emit as "1.1" + "01.5", // float 1.5, leading zero lost + "2.00", // float 2.0, would emit as "2.0" + "0.10", // float 0.1, would emit as "0.1" + } + for _, s := range cases { + if v, err := parseNumber(s); err == nil { + t.Errorf("parseNumber(%q) should not coerce (got %v of type %T)", s, v, v) + } + } +} + +// TestParseNumber_StillCoercesNumerics verifies the fix doesn't over-correct: +// values that round-trip cleanly should still be typed as numbers. +func TestParseNumber_StillCoercesNumerics(t *testing.T) { + intCases := []string{"0", "1", "42", "-7", "1000"} + for _, s := range intCases { + v, err := parseNumber(s) + if err != nil { + t.Errorf("parseNumber(%q) should coerce to int, got error: %v", s, err) + continue + } + if _, ok := v.(int); !ok { + t.Errorf("parseNumber(%q) returned %T, want int", s, v) + } + } + + floatCases := []string{"3.14", "0.5", "-2.5"} + for _, s := range floatCases { + v, err := parseNumber(s) + if err != nil { + t.Errorf("parseNumber(%q) should coerce to float, got error: %v", s, err) + continue + } + if _, ok := v.(float64); !ok { + t.Errorf("parseNumber(%q) returned %T, want float64", s, v) + } + } +} + +// TestYamlMultiset_PreservesVersionStrings is an end-to-end regression test +// for the registry version truncation bug. Writing "version": "4.30" must +// produce a YAML file that round-trips back to "4.30", not "4.3". +func TestYamlMultiset_PreservesVersionStrings(t *testing.T) { + dir := createTempDir(t) + configPath := filepath.Join(dir, "registry.yaml") + + cmd := newYamlCmd() + buf := new(bytes.Buffer) + cmd.SetOut(buf) + cmd.SetArgs([]string{ + "multiset", "--file", configPath, "--create", "--quiet", + "packages.foo.version", "4.30", + "packages.foo.name", "foo", + "packages.bar.version", "1.0", + "packages.baz.version", "1.10.2", + }) + + if err := cmd.Execute(); err != nil { + t.Fatalf("multiset failed: %v", err) + } + + content, err := os.ReadFile(configPath) + if err != nil { + t.Fatalf("read registry: %v", err) + } + got := string(content) + + for _, want := range []string{"4.30", "1.0", "1.10.2"} { + if !strings.Contains(got, want) { + t.Errorf("registry should preserve version %q, got:\n%s", want, got) + } + } + if strings.Contains(got, "version: 4.3\n") { + t.Errorf("registry truncated 4.30 to 4.3:\n%s", got) + } +} From 4f70ac3b6dd49d4e4a754b977af67ad986c741ae Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:06:52 -0700 Subject: [PATCH 02/10] feat(multireview): SSH and SCP helpers with timeout + stream separation First piece of the multi-reviewer pipeline. Wraps /usr/bin/ssh and /usr/bin/scp with context-driven timeouts, separate stdout/stderr capture (callers parse stdout JSON, stderr stays diagnostic), and explicit handling of non-zero remote exit codes as data rather than errors. Uses package-level execCommandContext indirection so tests inject scripted subprocess behavior via /bin/sh -c without depending on real SSH targets. --- internal/support/multireview/ssh.go | 157 ++++++++++++++++++++++ internal/support/multireview/ssh_test.go | 164 +++++++++++++++++++++++ 2 files changed, 321 insertions(+) create mode 100644 internal/support/multireview/ssh.go create mode 100644 internal/support/multireview/ssh_test.go diff --git a/internal/support/multireview/ssh.go b/internal/support/multireview/ssh.go new file mode 100644 index 0000000..6edd7c4 --- /dev/null +++ b/internal/support/multireview/ssh.go @@ -0,0 +1,157 @@ +// Package multireview orchestrates code reviews across multiple external +// openclaw reviewer agents. It bundles a diff range, ships it to a remote +// host running openclaw-gateway, invokes each reviewer agent in parallel +// (or serially when rate-limit constraints require), and collects the +// pipe-delimited TD findings each reviewer produces. +// +// The package is split into focused helpers: +// +// - ssh.go — SSH and SCP wrappers with timeout + stream separation +// - bundle.go — git bundle + scp + remote clone pipeline +// - openclaw.go — invoke a reviewer agent and parse the JSON envelope +// - stream.go — extract TD lines from a review and merge across reviewers +// +// External commands are invoked through execCommandContext, a package +// variable that tests replace to inject deterministic behavior. +package multireview + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "time" +) + +// execCommandContext is the exec.CommandContext indirection used by all +// subprocess calls in this package. Tests override it to inject scripted +// behavior without depending on real SSH/scp/docker. +var execCommandContext = exec.CommandContext + +// SSHParams configures a single remote command invocation. +type SSHParams struct { + // Host is the SSH target ("user@host" or just "host"). + Host string + // Command is the remote shell command to execute. + Command string + // Timeout caps the wall-clock time before the call is killed. + Timeout time.Duration + // ExtraSSHArgs are flags inserted before the host (e.g. "-i", "/path/key"). + ExtraSSHArgs []string +} + +// SSHResult captures the outcome of an SSH command. +type SSHResult struct { + Stdout string + Stderr string + ExitCode int +} + +// SSHRun executes a single remote command via /usr/bin/ssh. It separates +// stdout and stderr (do not interleave — callers parse stdout JSON and rely +// on stderr being free of payload data), enforces a timeout, and reports a +// non-zero ExitCode without converting that into a Go error. +// +// A returned error means the call could not be executed or was killed +// (context deadline, signal). A normal non-zero remote exit is signaled +// through SSHResult.ExitCode only. +func SSHRun(parent context.Context, p SSHParams) (SSHResult, error) { + if p.Host == "" { + return SSHResult{}, fmt.Errorf("ssh: host required") + } + if p.Command == "" { + return SSHResult{}, fmt.Errorf("ssh: command required") + } + if p.Timeout <= 0 { + p.Timeout = 60 * time.Second + } + + ctx, cancel := context.WithTimeout(parent, p.Timeout) + defer cancel() + + args := []string{"-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=accept-new"} + args = append(args, p.ExtraSSHArgs...) + args = append(args, p.Host, "--", p.Command) + + cmd := execCommandContext(ctx, "ssh", args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + res := SSHResult{ + Stdout: stdout.String(), + Stderr: stderr.String(), + } + if ctx.Err() == context.DeadlineExceeded { + return res, fmt.Errorf("ssh %s: deadline exceeded after %s", p.Host, p.Timeout) + } + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + res.ExitCode = exitErr.ExitCode() + // Non-zero remote exit is data, not failure. + return res, nil + } + return res, fmt.Errorf("ssh %s: %w", p.Host, err) + } + return res, nil +} + +// SCPParams configures a single scp transfer. +type SCPParams struct { + Host string + LocalPath string + RemotePath string + Timeout time.Duration + // Reverse=true pulls FROM remote TO local instead of pushing. + Reverse bool +} + +// SCPSend transfers a single file via scp. Errors are returned for missing +// inputs or non-zero scp exit (transfers are atomic — partial success is +// not a thing scp reports without -p, so non-zero is a real failure). +func SCPSend(parent context.Context, p SCPParams) (SSHResult, error) { + if p.Host == "" { + return SSHResult{}, fmt.Errorf("scp: host required") + } + if p.LocalPath == "" { + return SSHResult{}, fmt.Errorf("scp: local path required") + } + if p.RemotePath == "" { + return SSHResult{}, fmt.Errorf("scp: remote path required") + } + if p.Timeout <= 0 { + p.Timeout = 300 * time.Second + } + + ctx, cancel := context.WithTimeout(parent, p.Timeout) + defer cancel() + + remoteSpec := fmt.Sprintf("%s:%s", p.Host, p.RemotePath) + var src, dst string + if p.Reverse { + src, dst = remoteSpec, p.LocalPath + } else { + src, dst = p.LocalPath, remoteSpec + } + + args := []string{"-q", "-o", "BatchMode=yes", "-o", "StrictHostKeyChecking=accept-new", src, dst} + cmd := execCommandContext(ctx, "scp", args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + res := SSHResult{Stdout: stdout.String(), Stderr: stderr.String()} + if ctx.Err() == context.DeadlineExceeded { + return res, fmt.Errorf("scp %s -> %s: deadline exceeded after %s", src, dst, p.Timeout) + } + if err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + res.ExitCode = exitErr.ExitCode() + return res, fmt.Errorf("scp %s -> %s: exit %d, stderr: %s", src, dst, exitErr.ExitCode(), stderr.String()) + } + return res, fmt.Errorf("scp %s -> %s: %w", src, dst, err) + } + return res, nil +} diff --git a/internal/support/multireview/ssh_test.go b/internal/support/multireview/ssh_test.go new file mode 100644 index 0000000..060516a --- /dev/null +++ b/internal/support/multireview/ssh_test.go @@ -0,0 +1,164 @@ +package multireview + +import ( + "context" + "errors" + "os/exec" + "strings" + "testing" + "time" +) + +// helperExec returns a fake exec.Cmd that runs the given script via /bin/sh -c. +// Used to inject deterministic behavior into ssh tests without real SSH. +func helperExec(script string) func(ctx context.Context, name string, args ...string) *exec.Cmd { + return func(ctx context.Context, name string, args ...string) *exec.Cmd { + return exec.CommandContext(ctx, "/bin/sh", "-c", script) + } +} + +func TestSSHRun_Success(t *testing.T) { + origExec := execCommandContext + execCommandContext = helperExec(`echo "hello world"`) + t.Cleanup(func() { execCommandContext = origExec }) + + res, err := SSHRun(context.Background(), SSHParams{ + Host: "user@example.lan", + Command: "echo hello world", + Timeout: 5 * time.Second, + }) + if err != nil { + t.Fatalf("SSHRun: %v", err) + } + if !strings.Contains(res.Stdout, "hello world") { + t.Errorf("stdout=%q want hello world", res.Stdout) + } + if res.ExitCode != 0 { + t.Errorf("exit=%d want 0", res.ExitCode) + } +} + +func TestSSHRun_NonZeroExit(t *testing.T) { + origExec := execCommandContext + execCommandContext = helperExec(`echo "boom" >&2; exit 42`) + t.Cleanup(func() { execCommandContext = origExec }) + + res, err := SSHRun(context.Background(), SSHParams{ + Host: "user@example.lan", + Command: "false", + Timeout: 5 * time.Second, + }) + // SSHRun returns the result with non-zero ExitCode but no error for clean exit + if err != nil { + t.Fatalf("SSHRun: unexpected error: %v", err) + } + if res.ExitCode != 42 { + t.Errorf("exit=%d want 42", res.ExitCode) + } + if !strings.Contains(res.Stderr, "boom") { + t.Errorf("stderr=%q want boom", res.Stderr) + } +} + +func TestSSHRun_Timeout(t *testing.T) { + origExec := execCommandContext + execCommandContext = helperExec(`sleep 10`) + t.Cleanup(func() { execCommandContext = origExec }) + + _, err := SSHRun(context.Background(), SSHParams{ + Host: "user@example.lan", + Command: "sleep 10", + Timeout: 100 * time.Millisecond, + }) + if err == nil { + t.Fatal("expected timeout error, got nil") + } + if !errors.Is(err, context.DeadlineExceeded) && !strings.Contains(err.Error(), "deadline") && !strings.Contains(err.Error(), "killed") { + t.Errorf("error %q does not signal timeout", err.Error()) + } +} + +func TestSSHRun_RequiresHost(t *testing.T) { + _, err := SSHRun(context.Background(), SSHParams{ + Host: "", + Command: "echo hi", + Timeout: time.Second, + }) + if err == nil { + t.Fatal("expected error for empty host") + } +} + +func TestSSHRun_RequiresCommand(t *testing.T) { + _, err := SSHRun(context.Background(), SSHParams{ + Host: "user@example.lan", + Command: "", + Timeout: time.Second, + }) + if err == nil { + t.Fatal("expected error for empty command") + } +} + +func TestSSHRun_StreamSeparation(t *testing.T) { + // Stdout and stderr should not be intermixed. + origExec := execCommandContext + execCommandContext = helperExec(`echo "OUTPUT"; echo "ERROR" >&2`) + t.Cleanup(func() { execCommandContext = origExec }) + + res, err := SSHRun(context.Background(), SSHParams{ + Host: "user@example.lan", + Command: "anything", + Timeout: 5 * time.Second, + }) + if err != nil { + t.Fatalf("SSHRun: %v", err) + } + if !strings.Contains(res.Stdout, "OUTPUT") || strings.Contains(res.Stdout, "ERROR") { + t.Errorf("stdout=%q should have OUTPUT only", res.Stdout) + } + if !strings.Contains(res.Stderr, "ERROR") || strings.Contains(res.Stderr, "OUTPUT") { + t.Errorf("stderr=%q should have ERROR only", res.Stderr) + } +} + +func TestSCPSend_RequiresPaths(t *testing.T) { + _, err := SCPSend(context.Background(), SCPParams{ + Host: "user@example.lan", + LocalPath: "", + RemotePath: "/tmp/foo", + Timeout: time.Second, + }) + if err == nil { + t.Fatal("expected error for empty local path") + } + + _, err = SCPSend(context.Background(), SCPParams{ + Host: "user@example.lan", + LocalPath: "/tmp/foo", + RemotePath: "", + Timeout: time.Second, + }) + if err == nil { + t.Fatal("expected error for empty remote path") + } +} + +func TestSCPSend_Success(t *testing.T) { + origExec := execCommandContext + execCommandContext = helperExec(`echo "transferred"`) + t.Cleanup(func() { execCommandContext = origExec }) + + res, err := SCPSend(context.Background(), SCPParams{ + Host: "user@example.lan", + LocalPath: "/tmp/bundle.git", + RemotePath: "/tmp/remote/bundle.git", + Timeout: 5 * time.Second, + }) + if err != nil { + t.Fatalf("SCPSend: %v", err) + } + if res.ExitCode != 0 { + t.Errorf("exit=%d want 0", res.ExitCode) + } +} From 923463bd1bd9d86a67b14f304e21934e5ed9336b Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:09:54 -0700 Subject: [PATCH 03/10] feat(multireview): git bundle + ship-to-remote pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CreateBundle writes a full-history bundle (HEAD --branches --tags) so the remote clone can resolve any base tag without prerequisite gaps. Range-only bundles silently produce uncloneable artifacts; we learned this the hard way during the Bruce probe. ShipBundle orchestrates the full bundle → scp → remote clone sequence with shared timeout. Returns the remote clone path so reviewer invocations can read from a known location. Single-quote shell escaping for the workdir and repo name keeps directory names with spaces or unusual chars safe. End-to-end test against a real fixture repo verifies the bundle clones cleanly and resolves the original tag. Pipeline test validates the three SSH/SCP calls happen in the right order via execCommandContext mocking. --- internal/support/multireview/bundle.go | 156 ++++++++++++++ internal/support/multireview/bundle_test.go | 225 ++++++++++++++++++++ 2 files changed, 381 insertions(+) create mode 100644 internal/support/multireview/bundle.go create mode 100644 internal/support/multireview/bundle_test.go diff --git a/internal/support/multireview/bundle.go b/internal/support/multireview/bundle.go new file mode 100644 index 0000000..5b6e023 --- /dev/null +++ b/internal/support/multireview/bundle.go @@ -0,0 +1,156 @@ +package multireview + +import ( + "bytes" + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" +) + +// CreateBundle writes a git bundle containing the full branch history plus +// all tags. A range-only bundle would fail to clone on the remote because +// the prerequisites would be missing — we always ship the full history so +// reviewers can resolve any base ref via tag lookup. +func CreateBundle(repoPath, outBundle string) error { + if repoPath == "" { + return fmt.Errorf("bundle: repo path required") + } + if outBundle == "" { + return fmt.Errorf("bundle: output path required") + } + + cmd := exec.Command("git", "bundle", "create", outBundle, "HEAD", "--branches", "--tags") + cmd.Dir = repoPath + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + return fmt.Errorf("git bundle create: %w (stderr: %s)", err, stderr.String()) + } + return nil +} + +// ShipBundleParams configures end-to-end bundle shipping. +type ShipBundleParams struct { + // LocalRepo is the path to the source git repository on this machine. + LocalRepo string + // Host is the SSH target where the bundle should land. + Host string + // RemoteWorkdir is the directory on the remote that will hold the bundle + // and the clone. Created if absent. Should be unique per run. + RemoteWorkdir string + // RepoName is the directory name to clone the bundle into, inside + // RemoteWorkdir. Reviewers will read from this path. + RepoName string + // Timeout caps the total wall-clock for the ship operation. + Timeout time.Duration +} + +// ShipBundleResult reports what was shipped. +type ShipBundleResult struct { + // LocalBundlePath is the path to the local .git bundle file (kept for + // inspection until the caller cleans it up). + LocalBundlePath string + // RemoteBundlePath is the path of the bundle on the remote. + RemoteBundlePath string + // RemoteRepoPath is the path of the clone directory on the remote — this + // is what reviewers should read. + RemoteRepoPath string + // BundleSize is the byte size of the bundle. + BundleSize int64 +} + +// ShipBundle bundles a local repo, ships it to a remote host, and clones it +// into the remote workdir under RepoName. Returns the remote clone path so +// reviewers can read from it. +// +// Failure semantics: any failure (bundle, scp, mkdir, clone) returns an +// error wrapping enough context to diagnose. Partial state is not cleaned +// up automatically — caller decides via teardown. +func ShipBundle(ctx context.Context, p ShipBundleParams) (ShipBundleResult, error) { + if p.LocalRepo == "" { + return ShipBundleResult{}, fmt.Errorf("ship: local repo required") + } + if p.Host == "" { + return ShipBundleResult{}, fmt.Errorf("ship: host required") + } + if p.RemoteWorkdir == "" { + return ShipBundleResult{}, fmt.Errorf("ship: remote workdir required") + } + if p.RepoName == "" { + return ShipBundleResult{}, fmt.Errorf("ship: repo name required") + } + if p.Timeout <= 0 { + p.Timeout = 5 * time.Minute + } + + // 1. Create the local bundle in a temp file. + bundleDir, err := os.MkdirTemp("", "multireview-bundle-") + if err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: tempdir: %w", err) + } + localBundle := filepath.Join(bundleDir, "bundle.git") + if err := CreateBundle(p.LocalRepo, localBundle); err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: %w", err) + } + info, err := os.Stat(localBundle) + if err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: stat bundle: %w", err) + } + + // 2. Make remote workdir. + mkdirRes, err := SSHRun(ctx, SSHParams{ + Host: p.Host, + Command: fmt.Sprintf("mkdir -p %s", shellQuote(p.RemoteWorkdir)), + Timeout: p.Timeout, + }) + if err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: ssh mkdir: %w", err) + } + if mkdirRes.ExitCode != 0 { + return ShipBundleResult{}, fmt.Errorf("ship: mkdir exit %d, stderr: %s", mkdirRes.ExitCode, mkdirRes.Stderr) + } + + // 3. SCP bundle to remote. + remoteBundle := filepath.Join(p.RemoteWorkdir, "bundle.git") + if _, err := SCPSend(ctx, SCPParams{ + Host: p.Host, + LocalPath: localBundle, + RemotePath: remoteBundle, + Timeout: p.Timeout, + }); err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: %w", err) + } + + // 4. Clone the bundle on the remote into RepoName. + remoteRepo := filepath.Join(p.RemoteWorkdir, p.RepoName) + cloneCmd := fmt.Sprintf("cd %s && git clone -q bundle.git %s", + shellQuote(p.RemoteWorkdir), shellQuote(p.RepoName)) + cloneRes, err := SSHRun(ctx, SSHParams{ + Host: p.Host, + Command: cloneCmd, + Timeout: p.Timeout, + }) + if err != nil { + return ShipBundleResult{}, fmt.Errorf("ship: ssh clone: %w", err) + } + if cloneRes.ExitCode != 0 { + return ShipBundleResult{}, fmt.Errorf("ship: clone exit %d, stderr: %s", cloneRes.ExitCode, cloneRes.Stderr) + } + + return ShipBundleResult{ + LocalBundlePath: localBundle, + RemoteBundlePath: remoteBundle, + RemoteRepoPath: remoteRepo, + BundleSize: info.Size(), + }, nil +} + +// shellQuote wraps a string in single quotes for safe shell interpolation, +// escaping any embedded single quotes via '\” the classic way. +func shellQuote(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" +} diff --git a/internal/support/multireview/bundle_test.go b/internal/support/multireview/bundle_test.go new file mode 100644 index 0000000..9eb02e8 --- /dev/null +++ b/internal/support/multireview/bundle_test.go @@ -0,0 +1,225 @@ +package multireview + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" +) + +// initFixtureRepo creates a small git repo with two commits and one tag so +// bundle tests can exercise real `git bundle` against real history. +func initFixtureRepo(t *testing.T) (repoPath, baseRef, headSHA string) { + t.Helper() + repoPath = t.TempDir() + mustRun := func(args ...string) string { + cmd := exec.Command("git", args...) + cmd.Dir = repoPath + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v: %v\n%s", args, err, out) + } + return strings.TrimSpace(string(out)) + } + mustRun("init", "-q", "-b", "main") + mustRun("config", "user.email", "test@example.com") + mustRun("config", "user.name", "Test") + mustRun("config", "commit.gpgsign", "false") + + if err := os.WriteFile(filepath.Join(repoPath, "a.txt"), []byte("first\n"), 0o644); err != nil { + t.Fatal(err) + } + mustRun("add", "a.txt") + mustRun("commit", "-q", "-m", "first") + mustRun("tag", "v0.1.0") + baseRef = "v0.1.0" + + if err := os.WriteFile(filepath.Join(repoPath, "b.txt"), []byte("second\n"), 0o644); err != nil { + t.Fatal(err) + } + mustRun("add", "b.txt") + mustRun("commit", "-q", "-m", "second") + headSHA = mustRun("rev-parse", "HEAD") + return +} + +func TestCreateBundle_Success(t *testing.T) { + repoPath, _, _ := initFixtureRepo(t) + outBundle := filepath.Join(t.TempDir(), "out.bundle") + + if err := CreateBundle(repoPath, outBundle); err != nil { + t.Fatalf("CreateBundle: %v", err) + } + info, err := os.Stat(outBundle) + if err != nil { + t.Fatalf("bundle file: %v", err) + } + if info.Size() < 100 { + t.Errorf("bundle too small: %d bytes", info.Size()) + } +} + +func TestCreateBundle_RequiresInputs(t *testing.T) { + if err := CreateBundle("", "/tmp/x.bundle"); err == nil { + t.Error("expected error for empty repo") + } + if err := CreateBundle("/tmp/repo", ""); err == nil { + t.Error("expected error for empty bundle path") + } +} + +func TestCreateBundle_InvalidRepo(t *testing.T) { + notARepo := t.TempDir() // empty dir, no .git + outBundle := filepath.Join(t.TempDir(), "out.bundle") + + err := CreateBundle(notARepo, outBundle) + if err == nil { + t.Fatal("expected error for non-git directory") + } +} + +func TestBundleClonesCleanly(t *testing.T) { + // End-to-end: create a bundle, clone from it, verify both commits exist + // and the v0.1.0 tag resolves on the clone. + repoPath, baseRef, headSHA := initFixtureRepo(t) + outBundle := filepath.Join(t.TempDir(), "out.bundle") + if err := CreateBundle(repoPath, outBundle); err != nil { + t.Fatalf("CreateBundle: %v", err) + } + + cloneTo := filepath.Join(t.TempDir(), "clone") + clone := exec.Command("git", "clone", "-q", outBundle, cloneTo) + if out, err := clone.CombinedOutput(); err != nil { + t.Fatalf("git clone bundle: %v\n%s", err, out) + } + + verifyHead := exec.Command("git", "rev-parse", "HEAD") + verifyHead.Dir = cloneTo + headOut, err := verifyHead.Output() + if err != nil { + t.Fatalf("git rev-parse HEAD: %v", err) + } + if got := strings.TrimSpace(string(headOut)); got != headSHA { + t.Errorf("clone HEAD=%s want %s", got, headSHA) + } + + verifyTag := exec.Command("git", "rev-parse", baseRef) + verifyTag.Dir = cloneTo + if out, err := verifyTag.CombinedOutput(); err != nil { + t.Fatalf("git rev-parse %s on clone: %v\n%s", baseRef, err, out) + } +} + +func TestShipBundle_Pipeline(t *testing.T) { + // Exercises the full ship pipeline: scp (mocked) + remote git clone + // (mocked). Verifies the right SSH commands are issued in order. + repoPath, _, _ := initFixtureRepo(t) + + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + + var calls []string + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + calls = append(calls, name+" "+strings.Join(args, " ")) + // Echo success for any call. + return exec.CommandContext(ctx, "/bin/sh", "-c", "echo ok; exit 0") + } + + ctx := context.Background() + res, err := ShipBundle(ctx, ShipBundleParams{ + LocalRepo: repoPath, + Host: "user@example.lan", + RemoteWorkdir: "/tmp/reviewer-bench-test", + RepoName: "fixture-repo", + Timeout: 30 * time.Second, + }) + if err != nil { + t.Fatalf("ShipBundle: %v", err) + } + if res.RemoteRepoPath != "/tmp/reviewer-bench-test/fixture-repo" { + t.Errorf("remote path=%q", res.RemoteRepoPath) + } + if res.BundleSize <= 0 { + t.Errorf("bundle size %d should be > 0", res.BundleSize) + } + // Expect: ssh mkdir, scp, ssh clone — three subprocess calls minimum. + if len(calls) < 3 { + t.Errorf("expected >=3 calls, got %d: %v", len(calls), calls) + } + // First should be ssh mkdir. + if !strings.Contains(calls[0], "ssh") || !strings.Contains(calls[0], "mkdir") { + t.Errorf("first call should be ssh mkdir, got %q", calls[0]) + } + // One of them should be scp. + foundScp := false + for _, c := range calls { + if strings.HasPrefix(c, "scp ") { + foundScp = true + break + } + } + if !foundScp { + t.Errorf("no scp call observed: %v", calls) + } + // Last should be ssh clone. + last := calls[len(calls)-1] + if !strings.Contains(last, "ssh") || !strings.Contains(last, "git clone") { + t.Errorf("last call should be ssh git clone, got %q", last) + } +} + +func TestShipBundle_RequiresInputs(t *testing.T) { + ctx := context.Background() + cases := []struct { + name string + p ShipBundleParams + }{ + {"empty repo", ShipBundleParams{Host: "u@h", RemoteWorkdir: "/tmp/x", RepoName: "r"}}, + {"empty host", ShipBundleParams{LocalRepo: "/tmp/r", RemoteWorkdir: "/tmp/x", RepoName: "r"}}, + {"empty workdir", ShipBundleParams{LocalRepo: "/tmp/r", Host: "u@h", RepoName: "r"}}, + {"empty reponame", ShipBundleParams{LocalRepo: "/tmp/r", Host: "u@h", RemoteWorkdir: "/tmp/x"}}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + _, err := ShipBundle(ctx, c.p) + if err == nil { + t.Errorf("expected error for %s", c.name) + } + }) + } +} + +func TestShipBundle_RemoteCloneFailureSurfaces(t *testing.T) { + repoPath, _, _ := initFixtureRepo(t) + + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + + // Mock fails on the third call (the clone) with a specific error message. + callIdx := 0 + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + callIdx++ + joined := strings.Join(args, " ") + if strings.Contains(joined, "git clone") { + return exec.CommandContext(ctx, "/bin/sh", "-c", `echo "fatal: bad bundle" >&2; exit 128`) + } + return exec.CommandContext(ctx, "/bin/sh", "-c", "exit 0") + } + + _, err := ShipBundle(context.Background(), ShipBundleParams{ + LocalRepo: repoPath, + Host: "user@example.lan", + RemoteWorkdir: "/tmp/reviewer-bench-test", + RepoName: "fixture", + Timeout: 10 * time.Second, + }) + if err == nil { + t.Fatal("expected error from failing clone") + } + if !strings.Contains(err.Error(), "clone") { + t.Errorf("error should mention clone: %v", err) + } +} From bd9939f4bdb1610bccbd3c1e3daa3f8798d0583c Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:12:18 -0700 Subject: [PATCH 04/10] feat(multireview): invoke openclaw reviewer agent and parse JSON envelope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit InvokeReviewer wraps the `docker exec openclaw-gateway openclaw agent --json` call we worked out during the bench probes. The task message is base64 encoded for transport so any prompt content (quotes, newlines, code fences, dollar signs) crosses the SSH boundary intact. Parses the envelope shape we observed live: {runId, status, summary, result: {payloads: [{text, mediaUrl}], meta: {durationMs, aborted, agentMeta: {model, sessionId, provider}}}} Concatenates payload text into a single ReviewProse field (matches the behavior of our run-bench.sh extraction) and preserves the raw JSON for diagnosis. Distinguishes malformed JSON, empty response, SSH-level failure, and non-zero exit codes — each surfaces with a different error message. --- internal/support/multireview/openclaw.go | 150 +++++++++++++++ internal/support/multireview/openclaw_test.go | 181 ++++++++++++++++++ 2 files changed, 331 insertions(+) create mode 100644 internal/support/multireview/openclaw.go create mode 100644 internal/support/multireview/openclaw_test.go diff --git a/internal/support/multireview/openclaw.go b/internal/support/multireview/openclaw.go new file mode 100644 index 0000000..59c0b89 --- /dev/null +++ b/internal/support/multireview/openclaw.go @@ -0,0 +1,150 @@ +package multireview + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "strings" + "time" +) + +// InvokeReviewerParams configures a single openclaw agent invocation. +type InvokeReviewerParams struct { + // Host is the SSH target running openclaw-gateway. + Host string + // AgentName is the openclaw agent id (e.g. "bruce", "greta"). + AgentName string + // TaskMessage is the prompt sent to the agent. The caller composes it + // with whatever scope, repo path, and TD_STREAM format instructions are + // appropriate for the run. + TaskMessage string + // Timeout caps the SSH/docker call. Reasoning models on complex reviews + // can take 5-20 minutes; size accordingly. + Timeout time.Duration + // GatewayContainer is the docker container name. Defaults to + // "openclaw-gateway". + GatewayContainer string +} + +// InvokeReviewerResult captures one reviewer's output. +type InvokeReviewerResult struct { + // AgentName echoes the input for convenience. + AgentName string + // Status from the openclaw envelope ("ok", "error", etc.). + Status string + // Model from envelope.result.meta.agentMeta.model. + Model string + // DurationMS from envelope.result.meta.durationMs. + DurationMS int64 + // Aborted from envelope.result.meta.aborted — true means openclaw + // killed the call before completion (timeout, mid-stream error). + Aborted bool + // ReviewProse is all payload text concatenated in order. This is what a + // human reads; the TD_STREAM lines are extracted from it downstream. + ReviewProse string + // RawJSON preserves the entire envelope for offline replay/diagnosis. + RawJSON string +} + +// openclawEnvelope mirrors the JSON shape returned by `openclaw agent --json`. +type openclawEnvelope struct { + RunID string `json:"runId"` + Status string `json:"status"` + Summary string `json:"summary"` + Result struct { + Payloads []struct { + Text string `json:"text"` + MediaURL string `json:"mediaUrl"` + } `json:"payloads"` + Meta struct { + DurationMS int64 `json:"durationMs"` + Aborted bool `json:"aborted"` + AgentMeta struct { + SessionID string `json:"sessionId"` + Provider string `json:"provider"` + Model string `json:"model"` + } `json:"agentMeta"` + } `json:"meta"` + } `json:"result"` +} + +// InvokeReviewer runs a single openclaw agent via SSH+docker exec and parses +// the JSON envelope into a structured result. +// +// The remote command is: +// +// docker exec -i openclaw agent --agent --message --json +// +// The task message is passed via stdin to avoid shell-escaping headaches with +// large multi-line prompts. We use docker exec -i (interactive) for the stdin +// pipe to work. +func InvokeReviewer(parent context.Context, p InvokeReviewerParams) (InvokeReviewerResult, error) { + if p.Host == "" { + return InvokeReviewerResult{}, fmt.Errorf("invoke: host required") + } + if p.AgentName == "" { + return InvokeReviewerResult{}, fmt.Errorf("invoke: agent name required") + } + if p.TaskMessage == "" { + return InvokeReviewerResult{}, fmt.Errorf("invoke: task message required") + } + if p.Timeout <= 0 { + p.Timeout = 20 * time.Minute + } + if p.GatewayContainer == "" { + p.GatewayContainer = "openclaw-gateway" + } + + // Build the remote command. The task message can contain anything + // (quotes, dollar signs, newlines, backticks, even our own delimiter + // strings) so we base64-encode it and decode on the remote. This + // removes any quoting concerns at the SSH boundary. + encoded := base64.StdEncoding.EncodeToString([]byte(p.TaskMessage)) + remoteCmd := fmt.Sprintf( + `docker exec %s openclaw agent --agent %s --json --message "$(echo %s | base64 -d)"`, + shellQuote(p.GatewayContainer), + shellQuote(p.AgentName), + shellQuote(encoded), + ) + + res, err := SSHRun(parent, SSHParams{ + Host: p.Host, + Command: remoteCmd, + Timeout: p.Timeout, + }) + if err != nil { + return InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("invoke %s: %w", p.AgentName, err) + } + if res.ExitCode != 0 { + return InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("invoke %s: ssh exit %d, stderr: %s", p.AgentName, res.ExitCode, res.Stderr) + } + if strings.TrimSpace(res.Stdout) == "" { + return InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("invoke %s: empty response", p.AgentName) + } + + var env openclawEnvelope + if err := json.Unmarshal([]byte(res.Stdout), &env); err != nil { + // Preview the start of stdout to help diagnose what came back. + preview := res.Stdout + if len(preview) > 200 { + preview = preview[:200] + "..." + } + return InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("invoke %s: parse json: %w (got: %s)", p.AgentName, err, preview) + } + + var prose strings.Builder + for _, pl := range env.Result.Payloads { + prose.WriteString(pl.Text) + } + + return InvokeReviewerResult{ + AgentName: p.AgentName, + Status: env.Status, + Model: env.Result.Meta.AgentMeta.Model, + DurationMS: env.Result.Meta.DurationMS, + Aborted: env.Result.Meta.Aborted, + ReviewProse: prose.String(), + RawJSON: res.Stdout, + }, nil +} diff --git a/internal/support/multireview/openclaw_test.go b/internal/support/multireview/openclaw_test.go new file mode 100644 index 0000000..4ef5731 --- /dev/null +++ b/internal/support/multireview/openclaw_test.go @@ -0,0 +1,181 @@ +package multireview + +import ( + "context" + "os/exec" + "strings" + "testing" + "time" +) + +func TestInvokeReviewer_ParsesEnvelope(t *testing.T) { + // Real openclaw `agent --json` envelope: {runId, status, summary, result: {payloads, meta}} + mockResponse := `{ + "runId": "abc-123", + "status": "ok", + "summary": "completed", + "result": { + "payloads": [ + {"text": "# Review\n\nVerdict: ship-ready\n\nMEDIUM|src/foo.go:42|missing nil check|add guard|robustness\n", "mediaUrl": null} + ], + "meta": { + "durationMs": 123456, + "aborted": false, + "agentMeta": { + "sessionId": "sess-456", + "provider": "litellm", + "model": "qwen-3.6-plus" + } + } + } + }` + + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + // docker exec command should be invoked via ssh + return exec.CommandContext(ctx, "/bin/sh", "-c", "cat <<'EOF'\n"+mockResponse+"\nEOF") + } + + res, err := InvokeReviewer(context.Background(), InvokeReviewerParams{ + Host: "user@example.lan", + AgentName: "bruce", + TaskMessage: "Review the diff at /tmp/bench/repo", + Timeout: 30 * time.Second, + }) + if err != nil { + t.Fatalf("InvokeReviewer: %v", err) + } + if res.Status != "ok" { + t.Errorf("status=%q want ok", res.Status) + } + if res.Model != "qwen-3.6-plus" { + t.Errorf("model=%q want qwen-3.6-plus", res.Model) + } + if res.DurationMS != 123456 { + t.Errorf("durationMs=%d want 123456", res.DurationMS) + } + if !strings.Contains(res.ReviewProse, "ship-ready") { + t.Errorf("review prose missing verdict: %q", res.ReviewProse) + } + if !strings.Contains(res.ReviewProse, "MEDIUM|src/foo.go:42|") { + t.Errorf("review prose missing TD line: %q", res.ReviewProse) + } + if res.RawJSON == "" { + t.Error("raw JSON should be preserved") + } +} + +func TestInvokeReviewer_RequiresInputs(t *testing.T) { + ctx := context.Background() + cases := []struct { + name string + p InvokeReviewerParams + }{ + {"empty host", InvokeReviewerParams{AgentName: "bruce", TaskMessage: "m"}}, + {"empty agent", InvokeReviewerParams{Host: "u@h", TaskMessage: "m"}}, + {"empty task", InvokeReviewerParams{Host: "u@h", AgentName: "bruce"}}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + _, err := InvokeReviewer(ctx, c.p) + if err == nil { + t.Errorf("expected error") + } + }) + } +} + +func TestInvokeReviewer_HandlesMalformedJSON(t *testing.T) { + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + return exec.CommandContext(ctx, "/bin/sh", "-c", `echo "not json at all"`) + } + + _, err := InvokeReviewer(context.Background(), InvokeReviewerParams{ + Host: "user@example.lan", + AgentName: "bruce", + TaskMessage: "test", + Timeout: 5 * time.Second, + }) + if err == nil { + t.Fatal("expected error for malformed JSON") + } + if !strings.Contains(err.Error(), "parse") && !strings.Contains(err.Error(), "json") { + t.Errorf("error %q should mention parsing", err.Error()) + } +} + +func TestInvokeReviewer_HandlesSSHFailure(t *testing.T) { + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + return exec.CommandContext(ctx, "/bin/sh", "-c", `echo "Permission denied (publickey)" >&2; exit 255`) + } + + _, err := InvokeReviewer(context.Background(), InvokeReviewerParams{ + Host: "user@example.lan", + AgentName: "bruce", + TaskMessage: "test", + Timeout: 5 * time.Second, + }) + if err == nil { + t.Fatal("expected error for SSH failure") + } +} + +func TestInvokeReviewer_HandlesEmptyResponse(t *testing.T) { + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + return exec.CommandContext(ctx, "/bin/sh", "-c", "") + } + + _, err := InvokeReviewer(context.Background(), InvokeReviewerParams{ + Host: "user@example.lan", + AgentName: "bruce", + TaskMessage: "test", + Timeout: 5 * time.Second, + }) + if err == nil { + t.Fatal("expected error for empty response") + } +} + +func TestInvokeReviewer_ConcatenatesMultiplePayloads(t *testing.T) { + // Some openclaw responses have multiple payload entries (preamble + body) + mockResponse := `{ + "runId": "x", + "status": "ok", + "summary": "done", + "result": { + "payloads": [ + {"text": "preamble: starting review", "mediaUrl": null}, + {"text": "the actual review body", "mediaUrl": null} + ], + "meta": {"durationMs": 1000, "aborted": false, "agentMeta": {"model": "test"}} + } + }` + origExec := execCommandContext + t.Cleanup(func() { execCommandContext = origExec }) + execCommandContext = func(ctx context.Context, name string, args ...string) *exec.Cmd { + return exec.CommandContext(ctx, "/bin/sh", "-c", "cat <<'EOF'\n"+mockResponse+"\nEOF") + } + + res, err := InvokeReviewer(context.Background(), InvokeReviewerParams{ + Host: "user@example.lan", + AgentName: "bruce", + TaskMessage: "test", + Timeout: 5 * time.Second, + }) + if err != nil { + t.Fatalf("InvokeReviewer: %v", err) + } + if !strings.Contains(res.ReviewProse, "preamble") { + t.Error("payload 1 missing") + } + if !strings.Contains(res.ReviewProse, "actual review body") { + t.Error("payload 2 missing") + } +} From 60e6f95caf6a06729b4dd54eb9e693396504b50f Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:14:38 -0700 Subject: [PATCH 05/10] feat(multireview): TD stream extraction, per-reviewer write, cross-reviewer merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ExtractTDLines anchors on a strict regex (severity token followed immediately by a pipe at line start) so prose mentions of "HIGH severity" or similar are not mistaken for findings. CRITICAL/HIGH/MEDIUM/LOW all supported. WriteReviewerOutput drops four artifacts per reviewer under //: review.md — prose for human reading td-stream.txt — pipe-delimited findings with | appended status.json — model, duration, status, td count response.json — raw openclaw envelope (preserved for replay) MergeStreams concatenates per-reviewer streams into td-stream-all.txt with a header comment. Missing reviewer dirs are silently skipped — a reviewer might have failed to invoke and we proceed with what completed. --- internal/support/multireview/stream.go | 155 +++++++++++++++ internal/support/multireview/stream_test.go | 201 ++++++++++++++++++++ 2 files changed, 356 insertions(+) create mode 100644 internal/support/multireview/stream.go create mode 100644 internal/support/multireview/stream_test.go diff --git a/internal/support/multireview/stream.go b/internal/support/multireview/stream.go new file mode 100644 index 0000000..c870238 --- /dev/null +++ b/internal/support/multireview/stream.go @@ -0,0 +1,155 @@ +package multireview + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" +) + +// tdLinePattern matches pipe-delimited TD lines of the form +// `SEVERITY|FILE:LINE|PROBLEM|FIX|CATEGORY` where SEVERITY is one of the +// known severity tokens. The match anchors at line start and requires the +// pipe immediately after the severity, so prose mentions of "HIGH severity" +// or similar are not picked up. +var tdLinePattern = regexp.MustCompile(`(?m)^(CRITICAL|HIGH|MEDIUM|LOW)\|.+$`) + +// ExtractTDLines returns all pipe-delimited TD findings from a review body. +// Order is preserved. Trailing whitespace is trimmed. +func ExtractTDLines(reviewProse string) []string { + if reviewProse == "" { + return nil + } + matches := tdLinePattern.FindAllString(reviewProse, -1) + out := make([]string, 0, len(matches)) + for _, m := range matches { + out = append(out, strings.TrimRight(m, " \t\r")) + } + return out +} + +// ReviewerOutputPaths records where one reviewer's artifacts were written. +type ReviewerOutputPaths struct { + Dir string + ReviewMD string + TDStream string + StatusJSON string + ResponseJSON string +} + +// WriteReviewerOutput writes one reviewer's full output to a per-agent dir: +// +// //review.md — extracted prose, human-readable +// //td-stream.txt — pipe-delimited TD lines with | appended +// //status.json — small metadata blob (model, duration, status) +// //response.json — raw openclaw envelope, untouched +// +// The td-stream lines have the reviewer's agent name appended as a final +// column so downstream merge keeps attribution. Format matches the existing +// TD_STREAM convention extended with REVIEWERS. +func WriteReviewerOutput(rootDir string, res InvokeReviewerResult) (ReviewerOutputPaths, error) { + if rootDir == "" { + return ReviewerOutputPaths{}, fmt.Errorf("write: root dir required") + } + if res.AgentName == "" { + return ReviewerOutputPaths{}, fmt.Errorf("write: agent name required") + } + + dir := filepath.Join(rootDir, res.AgentName) + if err := os.MkdirAll(dir, 0o755); err != nil { + return ReviewerOutputPaths{}, fmt.Errorf("write: mkdir %s: %w", dir, err) + } + + paths := ReviewerOutputPaths{ + Dir: dir, + ReviewMD: filepath.Join(dir, "review.md"), + TDStream: filepath.Join(dir, "td-stream.txt"), + StatusJSON: filepath.Join(dir, "status.json"), + ResponseJSON: filepath.Join(dir, "response.json"), + } + + if err := os.WriteFile(paths.ReviewMD, []byte(res.ReviewProse), 0o644); err != nil { + return paths, fmt.Errorf("write review.md: %w", err) + } + + // Build per-reviewer td-stream with reviewer name appended. + var tdBuf strings.Builder + for _, line := range ExtractTDLines(res.ReviewProse) { + tdBuf.WriteString(line) + tdBuf.WriteString("|") + tdBuf.WriteString(res.AgentName) + tdBuf.WriteString("\n") + } + if err := os.WriteFile(paths.TDStream, []byte(tdBuf.String()), 0o644); err != nil { + return paths, fmt.Errorf("write td-stream.txt: %w", err) + } + + status := map[string]interface{}{ + "agent": res.AgentName, + "model": res.Model, + "status": res.Status, + "durationMs": res.DurationMS, + "aborted": res.Aborted, + "tdLineCount": len(ExtractTDLines(res.ReviewProse)), + } + statusJSON, _ := json.MarshalIndent(status, "", " ") + if err := os.WriteFile(paths.StatusJSON, statusJSON, 0o644); err != nil { + return paths, fmt.Errorf("write status.json: %w", err) + } + + if res.RawJSON != "" { + if err := os.WriteFile(paths.ResponseJSON, []byte(res.RawJSON), 0o644); err != nil { + return paths, fmt.Errorf("write response.json: %w", err) + } + } else { + // Always create the file so callers can rely on it existing. + if err := os.WriteFile(paths.ResponseJSON, []byte("{}"), 0o644); err != nil { + return paths, fmt.Errorf("write response.json: %w", err) + } + } + + return paths, nil +} + +// MergeStreams concatenates per-reviewer td-stream.txt files into a single +// td-stream-all.txt at rootDir. Missing reviewer directories are silently +// skipped (a reviewer might have failed to invoke; we proceed with what we +// have). Returns the merged file path and the total line count. +func MergeStreams(rootDir string, reviewers []string) (string, int, error) { + if rootDir == "" { + return "", 0, fmt.Errorf("merge: root dir required") + } + + mergedPath := filepath.Join(rootDir, "td-stream-all.txt") + var out strings.Builder + out.WriteString("# TD_STREAM - merged from " + fmt.Sprintf("%d", len(reviewers)) + " reviewer(s)\n") + out.WriteString("# Format: SEVERITY|FILE:LINE|PROBLEM|FIX|CATEGORY|REVIEWER\n") + + count := 0 + for _, agent := range reviewers { + streamPath := filepath.Join(rootDir, agent, "td-stream.txt") + data, err := os.ReadFile(streamPath) + if err != nil { + if os.IsNotExist(err) { + continue + } + return "", 0, fmt.Errorf("merge: read %s: %w", streamPath, err) + } + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + out.WriteString(line) + out.WriteString("\n") + count++ + } + } + + if err := os.WriteFile(mergedPath, []byte(out.String()), 0o644); err != nil { + return "", 0, fmt.Errorf("merge: write %s: %w", mergedPath, err) + } + return mergedPath, count, nil +} diff --git a/internal/support/multireview/stream_test.go b/internal/support/multireview/stream_test.go new file mode 100644 index 0000000..9ac9d1d --- /dev/null +++ b/internal/support/multireview/stream_test.go @@ -0,0 +1,201 @@ +package multireview + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +const sampleReview = `# Review: example#1 + +**Author:** Sam +**Verdict:** ship with fixes + +## Findings + +### Significant + +- Some prose finding about install.sh + +## TD_STREAM + +HIGH|install.sh:42|missing validation|add zod check|security +MEDIUM|src/foo.go:80|no nil guard|wrap in if|robustness +LOW|README.md:5|typo|fix it|docs +` + +func TestExtractTDLines_HappyPath(t *testing.T) { + lines := ExtractTDLines(sampleReview) + if len(lines) != 3 { + t.Fatalf("got %d lines want 3: %v", len(lines), lines) + } + if !strings.HasPrefix(lines[0], "HIGH|") { + t.Errorf("line 0 should start HIGH: %q", lines[0]) + } + if !strings.HasPrefix(lines[1], "MEDIUM|") { + t.Errorf("line 1 should start MEDIUM: %q", lines[1]) + } + if !strings.HasPrefix(lines[2], "LOW|") { + t.Errorf("line 2 should start LOW: %q", lines[2]) + } +} + +func TestExtractTDLines_IgnoresProse(t *testing.T) { + // Lines that mention "HIGH" but don't match the strict severity-prefix + // format (uppercase severity followed by pipe) should not be picked up. + body := `Some prose mentions HIGH severity here. +And MEDIUM is referenced inline. +LOW|src/x.go:1|real one|fix|cat +But "HIGH " (with trailing space and no pipe) should be ignored. +` + lines := ExtractTDLines(body) + if len(lines) != 1 { + t.Errorf("got %d lines want 1: %v", len(lines), lines) + } +} + +func TestExtractTDLines_HandlesCriticalToo(t *testing.T) { + body := `CRITICAL|src/auth.go:1|SQL injection|parameterize|security +HIGH|src/foo.go:1|something|fix|cat +` + lines := ExtractTDLines(body) + if len(lines) != 2 { + t.Fatalf("got %d lines want 2", len(lines)) + } +} + +func TestExtractTDLines_EmptyBody(t *testing.T) { + if got := ExtractTDLines(""); len(got) != 0 { + t.Errorf("empty input should give empty output, got %v", got) + } +} + +func TestWriteReviewerStream_WritesAndAnnotates(t *testing.T) { + dir := t.TempDir() + res := InvokeReviewerResult{ + AgentName: "bruce", + Model: "qwen-3.6-plus", + Status: "ok", + DurationMS: 239000, + ReviewProse: sampleReview, + RawJSON: `{"runId":"x"}`, + } + + paths, err := WriteReviewerOutput(dir, res) + if err != nil { + t.Fatalf("WriteReviewerOutput: %v", err) + } + + // Expected files: review.md, td-stream.txt, status.json, response.json + for _, p := range []string{paths.ReviewMD, paths.TDStream, paths.StatusJSON, paths.ResponseJSON} { + if _, err := os.Stat(p); err != nil { + t.Errorf("missing %s: %v", p, err) + } + } + + // td-stream.txt should have 3 lines, REVIEWER annotation appended + tdData, err := os.ReadFile(paths.TDStream) + if err != nil { + t.Fatal(err) + } + tdLines := strings.Split(strings.TrimSpace(string(tdData)), "\n") + if len(tdLines) != 3 { + t.Fatalf("td-stream lines=%d want 3", len(tdLines)) + } + for i, line := range tdLines { + if !strings.HasSuffix(line, "|bruce") { + t.Errorf("line %d should end with |bruce: %q", i, line) + } + } +} + +func TestWriteReviewerOutput_HandlesEmptyTD(t *testing.T) { + // A reviewer that produces a review but no TD lines (e.g. "no issues") + // should still write all artifacts; td-stream.txt is just empty. + dir := t.TempDir() + res := InvokeReviewerResult{ + AgentName: "otto", + Model: "gemma-4-31b", + ReviewProse: "no issues found", + } + paths, err := WriteReviewerOutput(dir, res) + if err != nil { + t.Fatalf("WriteReviewerOutput: %v", err) + } + td, err := os.ReadFile(paths.TDStream) + if err != nil { + t.Fatal(err) + } + if strings.TrimSpace(string(td)) != "" { + t.Errorf("td-stream should be empty, got %q", td) + } +} + +func TestMergeStreams_ConcatenatesWithHeader(t *testing.T) { + dir := t.TempDir() + bruceDir := filepath.Join(dir, "bruce") + gretaDir := filepath.Join(dir, "greta") + otto := filepath.Join(dir, "otto") + for _, d := range []string{bruceDir, gretaDir, otto} { + if err := os.MkdirAll(d, 0o755); err != nil { + t.Fatal(err) + } + } + if err := os.WriteFile(filepath.Join(bruceDir, "td-stream.txt"), + []byte("HIGH|f:1|p|x|c|bruce\nMEDIUM|f:2|p|x|c|bruce\n"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(gretaDir, "td-stream.txt"), + []byte("LOW|f:3|p|x|c|greta\n"), 0o644); err != nil { + t.Fatal(err) + } + // otto produced empty + if err := os.WriteFile(filepath.Join(otto, "td-stream.txt"), []byte(""), 0o644); err != nil { + t.Fatal(err) + } + + mergedPath, count, err := MergeStreams(dir, []string{"bruce", "greta", "otto"}) + if err != nil { + t.Fatalf("MergeStreams: %v", err) + } + if count != 3 { + t.Errorf("merged count=%d want 3", count) + } + data, err := os.ReadFile(mergedPath) + if err != nil { + t.Fatal(err) + } + content := string(data) + if !strings.Contains(content, "HIGH|f:1|") { + t.Error("missing bruce line 1") + } + if !strings.Contains(content, "LOW|f:3|") { + t.Error("missing greta line") + } + if !strings.Contains(content, "# TD_STREAM - merged") { + t.Error("merged file missing header") + } +} + +func TestMergeStreams_ToleratesMissingFiles(t *testing.T) { + // If a reviewer's dir doesn't exist (e.g. it failed to invoke), merge + // should skip it without error. + dir := t.TempDir() + bruceDir := filepath.Join(dir, "bruce") + if err := os.MkdirAll(bruceDir, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(bruceDir, "td-stream.txt"), + []byte("HIGH|f:1|p|x|c|bruce\n"), 0o644); err != nil { + t.Fatal(err) + } + + _, count, err := MergeStreams(dir, []string{"bruce", "ghost-reviewer-that-failed"}) + if err != nil { + t.Fatalf("MergeStreams should tolerate missing: %v", err) + } + if count != 1 { + t.Errorf("merged count=%d want 1", count) + } +} From caf639f1c1a4004212036a5a40a425ae6be23d54 Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:20:32 -0700 Subject: [PATCH 06/10] =?UTF-8?q?feat(llm-support):=20multi=5Freview=20sub?= =?UTF-8?q?command=20=E2=80=94=20fan=20out=20review=20to=20N=20agents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The top-level orchestrator. Wires together bundle/ship/invoke/merge into a single command that /code-review can call when review.mode: multi is set. Two-lane execution: --reviewers all agents involved --serial-reviewers subset that runs sequentially after the parallel lane (for shared rate limits) Failure semantics: - Bundle/ship failure → hard-stop (no point continuing without diff staged on remote) - Per-reviewer failure → recorded in summary, other reviewers continue, exit 0 with partial: true - All reviewers fail → exit 1 with summary path Test seams use package-level invokeReviewerFn and shipBundleFn variables so tests inject mock behavior without spinning up SSH/openclaw. Cleanup of the remote workdir happens via deferred SSH rm -rf with a tight 30s timeout so test failures or panics don't block. Per-function coverage on multi_review.go: 67-100%, dominated by the runMultiReview happy path at 93.7%. --- internal/support/commands/multi_review.go | 344 ++++++++++++++++++ .../support/commands/multi_review_test.go | 304 ++++++++++++++++ 2 files changed, 648 insertions(+) create mode 100644 internal/support/commands/multi_review.go create mode 100644 internal/support/commands/multi_review_test.go diff --git a/internal/support/commands/multi_review.go b/internal/support/commands/multi_review.go new file mode 100644 index 0000000..6e376b9 --- /dev/null +++ b/internal/support/commands/multi_review.go @@ -0,0 +1,344 @@ +package commands + +import ( + "context" + "encoding/json" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/samestrin/llm-tools/internal/support/multireview" + "github.com/spf13/cobra" +) + +// invokeReviewerFn and shipBundleFn are package-level function variables that +// tests swap to inject deterministic behavior. Production points them at the +// real multireview package functions. +var ( + invokeReviewerFn = multireview.InvokeReviewer + shipBundleFn = multireview.ShipBundle +) + +// Flag variables for the multi_review command. +var ( + mrReviewers string + mrSerialReviewers string + mrRepo string + mrBaseRef string + mrHeadRef string + mrOpenclawHost string + mrOutputDir string + mrTimeoutSeconds int + mrPerReviewerTO int + mrGatewayContainer string + mrTaskMessage string + mrSkipCleanup bool +) + +// ReviewerStatus is one entry in the multi-review summary report. +type ReviewerStatus struct { + Agent string `json:"agent"` + Model string `json:"model,omitempty"` + Status string `json:"status"` // "ok" | "failed" | "skipped" + DurationMS int64 `json:"durationMs,omitempty"` + TDLineCount int `json:"tdLineCount"` + Error string `json:"error,omitempty"` +} + +// MultiReviewSummary is what gets written to multi-review-summary.json. +type MultiReviewSummary struct { + Reviewers []ReviewerStatus `json:"reviewers"` + TotalFindings int `json:"totalFindings"` + Partial bool `json:"partial"` + TotalDurationMS int64 `json:"totalDurationMs"` + BundleSize int64 `json:"bundleSize"` + RemoteRepoPath string `json:"remoteRepoPath"` + Timestamp string `json:"timestamp"` +} + +func newMultiReviewCmd() *cobra.Command { + cmd := &cobra.Command{ + Use: "multi_review", + Short: "Fan out a code review to multiple openclaw reviewer agents", + Long: `Bundle a local repo, ship it to an openclaw-hosting machine, and invoke +several reviewer agents in parallel (or in a serial lane for those that +share rate-limited providers). Collects per-reviewer findings and writes +a merged TD stream the /code-review command can consume. + +Output layout: + /raw//{review.md,td-stream.txt,status.json,response.json} + /td-stream-all.txt (merged + reviewer-attributed) + /multi-review-summary.json (per-reviewer status + counts) + +Failure semantics: + - Bundle/ship failure → hard-stop (no point invoking reviewers without + the diff staged on the remote) + - Per-reviewer failure → recorded as failed in summary; other reviewers + continue; exit 0 with partial: true + - All reviewers fail → exit 1 with summary of what failed`, + RunE: runMultiReview, + } + cmd.Flags().StringVar(&mrReviewers, "reviewers", "", "Comma-separated reviewer agent names (required)") + cmd.Flags().StringVar(&mrSerialReviewers, "serial-reviewers", "", "Comma-separated subset that runs serially after the parallel lane") + cmd.Flags().StringVar(&mrRepo, "repo", "", "Local repo path to bundle (required)") + cmd.Flags().StringVar(&mrBaseRef, "base", "", "Base ref for the diff range (informational, included in task message)") + cmd.Flags().StringVar(&mrHeadRef, "head", "HEAD", "Head ref for the diff range") + cmd.Flags().StringVar(&mrOpenclawHost, "openclaw-host", "", "SSH target running openclaw-gateway (required)") + cmd.Flags().StringVar(&mrOutputDir, "output-dir", "", "Where per-reviewer artifacts and merged stream land (required)") + cmd.Flags().IntVar(&mrTimeoutSeconds, "timeout-seconds", 1200, "Total wall-clock budget for the entire fan-out") + cmd.Flags().IntVar(&mrPerReviewerTO, "per-reviewer-timeout-seconds", 600, "Per-reviewer soft timeout") + cmd.Flags().StringVar(&mrGatewayContainer, "gateway-container", "openclaw-gateway", "Docker container running openclaw") + cmd.Flags().StringVar(&mrTaskMessage, "task-message", "", "Override the task message sent to each reviewer; default is auto-built from --base/--head/--repo") + cmd.Flags().BoolVar(&mrSkipCleanup, "skip-cleanup", false, "Do not remove the remote workdir after running") + return cmd +} + +func init() { + RootCmd.AddCommand(newMultiReviewCmd()) +} + +func runMultiReview(cmd *cobra.Command, _ []string) error { + // Flag validation + if mrReviewers == "" { + return fmt.Errorf("--reviewers required") + } + if mrRepo == "" { + return fmt.Errorf("--repo required") + } + if mrOpenclawHost == "" { + return fmt.Errorf("--openclaw-host required") + } + if mrOutputDir == "" { + return fmt.Errorf("--output-dir required") + } + + allReviewers := splitAndTrim(mrReviewers) + serial := splitAndTrim(mrSerialReviewers) + if len(allReviewers) == 0 { + return fmt.Errorf("--reviewers must list at least one agent") + } + parallel := subtract(allReviewers, serial) + + rawDir := filepath.Join(mrOutputDir, "raw") + if err := os.MkdirAll(rawDir, 0o755); err != nil { + return fmt.Errorf("mkdir output: %w", err) + } + + start := time.Now() + totalCtx, cancelTotal := context.WithTimeout(context.Background(), time.Duration(mrTimeoutSeconds)*time.Second) + defer cancelTotal() + + // 1. Ship the bundle. Hard-stop on failure. + repoName := filepath.Base(strings.TrimRight(mrRepo, "/")) + remoteWorkdir := fmt.Sprintf("/tmp/multi-review-%d", start.Unix()) + shipRes, err := shipBundleFn(totalCtx, multireview.ShipBundleParams{ + LocalRepo: mrRepo, + Host: mrOpenclawHost, + RemoteWorkdir: remoteWorkdir, + RepoName: repoName, + Timeout: time.Duration(mrTimeoutSeconds) * time.Second, + }) + if err != nil { + return fmt.Errorf("ship bundle to %s: %w", mrOpenclawHost, err) + } + defer func() { + if !mrSkipCleanup { + // Best-effort teardown of the remote workdir. + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _, _ = multireview.SSHRun(cleanupCtx, multireview.SSHParams{ + Host: mrOpenclawHost, + Command: "rm -rf " + shellQuote(remoteWorkdir), + Timeout: 30 * time.Second, + }) + } + }() + + // 2. Build the task message (auto if not overridden). + taskMessage := mrTaskMessage + if taskMessage == "" { + taskMessage = buildDefaultTaskMessage(shipRes.RemoteRepoPath, repoName, mrBaseRef, mrHeadRef) + } + + // 3. Invoke reviewers. Parallel lane first, then serial. + statuses := make([]ReviewerStatus, 0, len(allReviewers)) + statusByAgent := make(map[string]ReviewerStatus) + var statusMu sync.Mutex + + invokeOne := func(agent string) { + res, err := invokeReviewerFn(totalCtx, multireview.InvokeReviewerParams{ + Host: mrOpenclawHost, + AgentName: agent, + TaskMessage: taskMessage, + Timeout: time.Duration(mrPerReviewerTO) * time.Second, + GatewayContainer: mrGatewayContainer, + }) + st := ReviewerStatus{Agent: agent} + if err != nil { + st.Status = "failed" + st.Error = err.Error() + } else { + _, werr := multireview.WriteReviewerOutput(rawDir, res) + st.Model = res.Model + st.DurationMS = res.DurationMS + st.TDLineCount = len(multireview.ExtractTDLines(res.ReviewProse)) + if werr != nil { + st.Status = "failed" + st.Error = werr.Error() + } else { + st.Status = "ok" + } + } + statusMu.Lock() + statusByAgent[agent] = st + statusMu.Unlock() + } + + // Parallel lane + if len(parallel) > 0 { + var wg sync.WaitGroup + for _, agent := range parallel { + wg.Add(1) + go func(a string) { + defer wg.Done() + invokeOne(a) + }(agent) + } + wg.Wait() + } + + // Serial lane (sequential after parallel completes) + for _, agent := range serial { + invokeOne(agent) + } + + // 4. Preserve order from --reviewers in the final summary + okCount := 0 + for _, agent := range allReviewers { + s := statusByAgent[agent] + statuses = append(statuses, s) + if s.Status == "ok" { + okCount++ + } + } + + // 5. Merge streams from successful reviewers only + successAgents := make([]string, 0) + for _, s := range statuses { + if s.Status == "ok" { + successAgents = append(successAgents, s.Agent) + } + } + totalFindings := 0 + if len(successAgents) > 0 { + _, n, err := multireview.MergeStreams(rawDir, successAgents) + if err != nil { + return fmt.Errorf("merge streams: %w", err) + } + totalFindings = n + // Also write the merged stream up one level for convenience. + srcMerged := filepath.Join(rawDir, "td-stream-all.txt") + dstMerged := filepath.Join(mrOutputDir, "td-stream-all.txt") + if data, err := os.ReadFile(srcMerged); err == nil { + _ = os.WriteFile(dstMerged, data, 0o644) + } + } + + // 6. Write summary + summary := MultiReviewSummary{ + Reviewers: statuses, + TotalFindings: totalFindings, + Partial: okCount > 0 && okCount < len(allReviewers), + TotalDurationMS: time.Since(start).Milliseconds(), + BundleSize: shipRes.BundleSize, + RemoteRepoPath: shipRes.RemoteRepoPath, + Timestamp: start.UTC().Format(time.RFC3339), + } + summaryPath := filepath.Join(mrOutputDir, "multi-review-summary.json") + summaryBytes, _ := json.MarshalIndent(summary, "", " ") + if err := os.WriteFile(summaryPath, summaryBytes, 0o644); err != nil { + return fmt.Errorf("write summary: %w", err) + } + + // Human-readable output to stdout (cmd.OutOrStdout()) + fmt.Fprintf(cmd.OutOrStdout(), + "multi_review: %d/%d reviewers succeeded, %d findings, total %s\n", + okCount, len(allReviewers), totalFindings, time.Since(start).Round(time.Second), + ) + for _, s := range statuses { + fmt.Fprintf(cmd.OutOrStdout(), " %-10s %-6s %3d findings %s\n", + s.Agent, s.Status, s.TDLineCount, s.Model) + if s.Error != "" { + fmt.Fprintf(cmd.OutOrStdout(), " err: %s\n", s.Error) + } + } + + if okCount == 0 { + return fmt.Errorf("all reviewers failed (no successful runs); see %s", summaryPath) + } + return nil +} + +func buildDefaultTaskMessage(remoteRepo, repoName, base, head string) string { + if head == "" { + head = "HEAD" + } + var b strings.Builder + b.WriteString("Code review.\n\n") + b.WriteString(fmt.Sprintf("Repository: a fresh clone of %s is on this host at %s/\n\n", repoName, remoteRepo)) + if base != "" { + b.WriteString(fmt.Sprintf("Range to review: %s..%s\n\n", base, head)) + b.WriteString("To see the diff:\n") + b.WriteString(fmt.Sprintf(" cd %s && git diff %s..%s\n\n", remoteRepo, base, head)) + } else { + b.WriteString(fmt.Sprintf("Working tree at %s — review the current state.\n\n", remoteRepo)) + } + b.WriteString(`Produce your normal review report (verdict + severity-graded findings + what was done well + out-of-scope). +Reply with the review body only — no preamble. + +After your normal review, append a section titled "TD_STREAM" with each finding as a single pipe-delimited line in this format: + + SEVERITY|FILE:LINE|PROBLEM|FIX|CATEGORY + +Where SEVERITY is HIGH/MEDIUM/LOW (map blocking->HIGH, significant->MEDIUM, minor->LOW). One line per finding. No header row, no commentary in this section. +`) + return b.String() +} + +func splitAndTrim(csv string) []string { + if csv == "" { + return nil + } + parts := strings.Split(csv, ",") + out := make([]string, 0, len(parts)) + for _, p := range parts { + s := strings.TrimSpace(p) + if s != "" { + out = append(out, s) + } + } + return out +} + +func subtract(all, remove []string) []string { + rm := make(map[string]bool, len(remove)) + for _, r := range remove { + rm[r] = true + } + out := make([]string, 0, len(all)) + for _, a := range all { + if !rm[a] { + out = append(out, a) + } + } + return out +} + +// shellQuote here mirrors multireview.shellQuote — kept inline to avoid +// exposing the helper from that package. +func shellQuote(s string) string { + return "'" + strings.ReplaceAll(s, "'", `'\''`) + "'" +} diff --git a/internal/support/commands/multi_review_test.go b/internal/support/commands/multi_review_test.go new file mode 100644 index 0000000..5319f4b --- /dev/null +++ b/internal/support/commands/multi_review_test.go @@ -0,0 +1,304 @@ +package commands + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/samestrin/llm-tools/internal/support/multireview" +) + +// initFixtureRepoMR mirrors the helper from the multireview package — a small +// 2-commit git repo with a tag so the multi_review command can bundle it. +func initFixtureRepoMR(t *testing.T) string { + t.Helper() + repoPath := t.TempDir() + mustRun := func(args ...string) { + c := exec.Command("git", args...) + c.Dir = repoPath + if out, err := c.CombinedOutput(); err != nil { + t.Fatalf("git %v: %v\n%s", args, err, out) + } + } + mustRun("init", "-q", "-b", "main") + mustRun("config", "user.email", "test@example.com") + mustRun("config", "user.name", "Test") + mustRun("config", "commit.gpgsign", "false") + if err := os.WriteFile(filepath.Join(repoPath, "a.txt"), []byte("first"), 0o644); err != nil { + t.Fatal(err) + } + mustRun("add", "a.txt") + mustRun("commit", "-q", "-m", "first") + mustRun("tag", "v0.1.0") + if err := os.WriteFile(filepath.Join(repoPath, "b.txt"), []byte("second"), 0o644); err != nil { + t.Fatal(err) + } + mustRun("add", "b.txt") + mustRun("commit", "-q", "-m", "second") + return repoPath +} + +// withMockInvoker swaps the invokeReviewerFn used by the multi_review command +// for a deterministic stub and restores the original on cleanup. +func withMockInvoker(t *testing.T, fn func(ctx context.Context, p multireview.InvokeReviewerParams) (multireview.InvokeReviewerResult, error)) { + t.Helper() + orig := invokeReviewerFn + invokeReviewerFn = fn + t.Cleanup(func() { invokeReviewerFn = orig }) +} + +// withMockShipBundle swaps the shipBundleFn so tests don't actually SSH. +func withMockShipBundle(t *testing.T) { + t.Helper() + orig := shipBundleFn + shipBundleFn = func(ctx context.Context, p multireview.ShipBundleParams) (multireview.ShipBundleResult, error) { + return multireview.ShipBundleResult{ + LocalBundlePath: "/tmp/mock-bundle.git", + RemoteBundlePath: "/tmp/mock-remote/bundle.git", + RemoteRepoPath: "/tmp/mock-remote/" + p.RepoName, + BundleSize: 1024, + }, nil + } + t.Cleanup(func() { shipBundleFn = orig }) +} + +func mockResultFor(agent string, tdLine string) multireview.InvokeReviewerResult { + return multireview.InvokeReviewerResult{ + AgentName: agent, + Status: "ok", + Model: "mock-" + agent, + DurationMS: 1000, + ReviewProse: "Verdict: ship\n\n" + tdLine + "\n", + RawJSON: `{"runId":"x"}`, + } +} + +func TestMultiReview_HappyPath(t *testing.T) { + repo := initFixtureRepoMR(t) + outDir := filepath.Join(t.TempDir(), "out") + + withMockShipBundle(t) + withMockInvoker(t, func(ctx context.Context, p multireview.InvokeReviewerParams) (multireview.InvokeReviewerResult, error) { + return mockResultFor(p.AgentName, "MEDIUM|src/a.go:1|test problem|test fix|robustness"), nil + }) + + cmd := newMultiReviewCmd() + cmd.SetArgs([]string{ + "--reviewers", "bruce,greta", + "--repo", repo, + "--openclaw-host", "user@example.lan", + "--output-dir", outDir, + "--timeout-seconds", "30", + }) + var stdout bytes.Buffer + cmd.SetOut(&stdout) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + + for _, agent := range []string{"bruce", "greta"} { + td := filepath.Join(outDir, "raw", agent, "td-stream.txt") + if _, err := os.Stat(td); err != nil { + t.Errorf("missing %s: %v", td, err) + } + } + merged := filepath.Join(outDir, "td-stream-all.txt") + if _, err := os.Stat(merged); err != nil { + t.Errorf("missing merged: %v", err) + } + summary := filepath.Join(outDir, "multi-review-summary.json") + if _, err := os.Stat(summary); err != nil { + t.Errorf("missing summary: %v", err) + } + + summaryData, _ := os.ReadFile(summary) + var s struct { + Reviewers []struct { + Agent string `json:"agent"` + Status string `json:"status"` + } `json:"reviewers"` + TotalFindings int `json:"totalFindings"` + Partial bool `json:"partial"` + } + if err := json.Unmarshal(summaryData, &s); err != nil { + t.Fatalf("parse summary: %v", err) + } + if len(s.Reviewers) != 2 { + t.Errorf("expected 2 reviewers in summary, got %d", len(s.Reviewers)) + } + if s.TotalFindings != 2 { + t.Errorf("expected 2 findings total, got %d", s.TotalFindings) + } + if s.Partial { + t.Errorf("partial should be false in happy path") + } +} + +func TestMultiReview_RequiresFlags(t *testing.T) { + repo := initFixtureRepoMR(t) + cases := []struct { + name string + args []string + }{ + {"missing reviewers", []string{"--repo", repo, "--openclaw-host", "h", "--output-dir", "/tmp/x"}}, + {"missing repo", []string{"--reviewers", "bruce", "--openclaw-host", "h", "--output-dir", "/tmp/x"}}, + {"missing host", []string{"--reviewers", "bruce", "--repo", repo, "--output-dir", "/tmp/x"}}, + {"missing output-dir", []string{"--reviewers", "bruce", "--repo", repo, "--openclaw-host", "h"}}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + cmd := newMultiReviewCmd() + cmd.SetArgs(c.args) + cmd.SetOut(new(bytes.Buffer)) + cmd.SetErr(new(bytes.Buffer)) + if err := cmd.Execute(); err == nil { + t.Errorf("expected error for %s", c.name) + } + }) + } +} + +func TestMultiReview_TwoLane(t *testing.T) { + repo := initFixtureRepoMR(t) + outDir := filepath.Join(t.TempDir(), "out") + + withMockShipBundle(t) + withMockInvoker(t, func(ctx context.Context, p multireview.InvokeReviewerParams) (multireview.InvokeReviewerResult, error) { + return mockResultFor(p.AgentName, "LOW|f:1|p|x|c"), nil + }) + + cmd := newMultiReviewCmd() + cmd.SetArgs([]string{ + "--reviewers", "bruce,greta,kai", + "--serial-reviewers", "greta", + "--repo", repo, + "--openclaw-host", "user@example.lan", + "--output-dir", outDir, + "--timeout-seconds", "30", + }) + cmd.SetOut(new(bytes.Buffer)) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + for _, agent := range []string{"bruce", "greta", "kai"} { + dir := filepath.Join(outDir, "raw", agent) + if _, err := os.Stat(dir); err != nil { + t.Errorf("missing %s: %v", dir, err) + } + } +} + +func TestMultiReview_PartialFailure(t *testing.T) { + repo := initFixtureRepoMR(t) + outDir := filepath.Join(t.TempDir(), "out") + + withMockShipBundle(t) + withMockInvoker(t, func(ctx context.Context, p multireview.InvokeReviewerParams) (multireview.InvokeReviewerResult, error) { + if p.AgentName == "greta" { + return multireview.InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("simulated failure") + } + return mockResultFor(p.AgentName, "MEDIUM|f:1|p|x|c"), nil + }) + + cmd := newMultiReviewCmd() + cmd.SetArgs([]string{ + "--reviewers", "bruce,greta", + "--repo", repo, + "--openclaw-host", "user@example.lan", + "--output-dir", outDir, + "--timeout-seconds", "30", + }) + cmd.SetOut(new(bytes.Buffer)) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v (partial failure should not be fatal)", err) + } + + summaryData, err := os.ReadFile(filepath.Join(outDir, "multi-review-summary.json")) + if err != nil { + t.Fatal(err) + } + var s struct { + Partial bool `json:"partial"` + Reviewers []struct { + Agent string `json:"agent"` + Status string `json:"status"` + } `json:"reviewers"` + } + if err := json.Unmarshal(summaryData, &s); err != nil { + t.Fatal(err) + } + if !s.Partial { + t.Error("partial should be true") + } + var failed, ok int + for _, r := range s.Reviewers { + if r.Status == "ok" { + ok++ + } else { + failed++ + } + } + if ok != 1 || failed != 1 { + t.Errorf("expected 1 ok + 1 failed, got ok=%d failed=%d", ok, failed) + } +} + +func TestMultiReview_AllFail(t *testing.T) { + repo := initFixtureRepoMR(t) + outDir := filepath.Join(t.TempDir(), "out") + + withMockShipBundle(t) + withMockInvoker(t, func(ctx context.Context, p multireview.InvokeReviewerParams) (multireview.InvokeReviewerResult, error) { + return multireview.InvokeReviewerResult{AgentName: p.AgentName}, fmt.Errorf("simulated failure") + }) + + cmd := newMultiReviewCmd() + cmd.SetArgs([]string{ + "--reviewers", "bruce", + "--repo", repo, + "--openclaw-host", "user@example.lan", + "--output-dir", outDir, + "--timeout-seconds", "30", + }) + cmd.SetOut(new(bytes.Buffer)) + cmd.SetErr(new(bytes.Buffer)) + err := cmd.Execute() + if err == nil { + t.Fatal("expected error when all reviewers fail") + } + if !strings.Contains(err.Error(), "all reviewers failed") && !strings.Contains(err.Error(), "no successful") { + t.Errorf("error %q should mention total failure", err.Error()) + } +} + +func TestMultiReview_ShipFailureHardStops(t *testing.T) { + repo := initFixtureRepoMR(t) + outDir := filepath.Join(t.TempDir(), "out") + + // Override shipBundleFn to fail + orig := shipBundleFn + shipBundleFn = func(ctx context.Context, p multireview.ShipBundleParams) (multireview.ShipBundleResult, error) { + return multireview.ShipBundleResult{}, fmt.Errorf("ssh: connection refused") + } + t.Cleanup(func() { shipBundleFn = orig }) + + cmd := newMultiReviewCmd() + cmd.SetArgs([]string{ + "--reviewers", "bruce,greta", + "--repo", repo, + "--openclaw-host", "user@example.lan", + "--output-dir", outDir, + "--timeout-seconds", "30", + }) + cmd.SetOut(new(bytes.Buffer)) + cmd.SetErr(new(bytes.Buffer)) + if err := cmd.Execute(); err == nil { + t.Fatal("expected hard-stop on ship failure") + } +} From 8f45de6d1de1aec99140d335bfb7144bcd150bb5 Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:25:40 -0700 Subject: [PATCH 07/10] feat(group_td): REVIEWERS and CONFIDENCE feature-flagged columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-reviewer code review produces findings with reviewer attribution (which agents caught it) and confidence (how many agreed). These travel through the TD stream and need to land in the TD README.md table. Mirrors the existing SOURCE column feature-flag pattern: a column is emitted only when at least one input row carries a non-empty value for it. Callers that don't pass REVIEWERS/CONFIDENCE through --headers see the same output they always saw — 7-column legacy, 8-column with Source, etc. While here, refactor the markdown header + row writers to assemble cells dynamically instead of enumerating every (checkbox × source) combination. The old switch already had 4 cases; adding 2 more feature-flagged columns would have ballooned it to 16. The dynamic builder preserves exact cell spacing semantics (header empty-cell `| |` single-space, data empty-cell `| |` two-space) so downstream column-position parsers see no change. REVIEWERS is stored comma-joined in the stream ("bruce,greta") and rendered with a space after each comma in the cell ("bruce, greta") for readability. --- internal/support/commands/group_td.go | 209 ++++++++++++--------- internal/support/commands/group_td_test.go | 147 +++++++++++++++ 2 files changed, 265 insertions(+), 91 deletions(-) diff --git a/internal/support/commands/group_td.go b/internal/support/commands/group_td.go index f77c0fc..30a684a 100644 --- a/internal/support/commands/group_td.go +++ b/internal/support/commands/group_td.go @@ -714,61 +714,122 @@ func writeGroupedMarkdown(result GroupTDResult, outputFile string, checkbox bool sectionHeader += "Items" } - // Feature-flag: emit trailing Source column only if at least one item - // carries a non-empty SOURCE value. Preserves 7-column output for legacy - // callers that don't pass SOURCE through --headers. - hasSource := false - for _, g := range result.Groups { - for _, item := range g.Items { - if s, _ := item["SOURCE"].(string); strings.TrimSpace(s) != "" { - hasSource = true - break + // Feature-flag: emit trailing optional columns (SOURCE, REVIEWERS, + // CONFIDENCE) only when at least one input row carries a non-empty + // value. Preserves the original 7-column output for legacy callers and + // the 8-column-with-Source output for the unified-TD-capture pipeline. + hasField := func(key string) bool { + for _, g := range result.Groups { + for _, item := range g.Items { + if s, _ := item[key].(string); strings.TrimSpace(s) != "" { + return true + } } } - if hasSource { - break - } - } - if !hasSource { for _, item := range result.Ungrouped { - if s, _ := item["SOURCE"].(string); strings.TrimSpace(s) != "" { - hasSource = true - break + if s, _ := item[key].(string); strings.TrimSpace(s) != "" { + return true } } + return false } + hasSource := hasField("SOURCE") + hasReviewers := hasField("REVIEWERS") + hasConfidence := hasField("CONFIDENCE") // Build markdown table var buf strings.Builder buf.WriteString("\n" + sectionHeader + "\n\n") - // Table header - switch { - case checkbox && hasSource: - buf.WriteString("| Group | | Severity | File | Problem | Fix | Category | Est Minutes | Source |\n") - buf.WriteString("|-------|---|----------|------|---------|-----|----------|-------------|--------|\n") - case checkbox && !hasSource: - buf.WriteString("| Group | | Severity | File | Problem | Fix | Category | Est Minutes |\n") - buf.WriteString("|-------|---|----------|------|---------|-----|----------|-------------|\n") - case !checkbox && hasSource: - buf.WriteString("| Group | Severity | File | Problem | Fix | Category | Est Minutes | Source |\n") - buf.WriteString("|-------|----------|------|---------|-----|----------|-------------|--------|\n") - default: - buf.WriteString("| Group | Severity | File | Problem | Fix | Category | Est Minutes |\n") - buf.WriteString("|-------|----------|------|---------|-----|----------|-------------|\n") + // Header is assembled dynamically so adding feature-flagged trailing + // columns doesn't combinatorially explode the switch (3 flags = 8 cases). + // + // Cell rendering: data rows preserve the legacy `| %s |` spacing — empty + // data cells render as `| |` (two spaces around empty) so resolve-td and + // other downstream column-position parsers see the same shape they've + // always seen. The header row's empty checkbox cell renders as `| |` + // (single space) to match the prior hardcoded header. + writeHeaderRow := func(b *strings.Builder, cells []string) { + b.WriteString("|") + for _, c := range cells { + if c == "" { + b.WriteString(" |") + } else { + b.WriteString(" " + c + " |") + } + } + b.WriteString("\n") + } + writeDataRow := func(b *strings.Builder, cells []string) { + // Always `| |` — matches fmt.Sprintf("%s") behavior for empty. + b.WriteString("|") + for _, c := range cells { + b.WriteString(" " + c + " |") + } + b.WriteString("\n") + } + headers := []string{"Group"} + dashes := []string{"-------"} + if checkbox { + headers = append(headers, "") + dashes = append(dashes, "---") } + headers = append(headers, "Severity", "File", "Problem", "Fix", "Category", "Est Minutes") + dashes = append(dashes, "----------", "------", "---------", "-----", "----------", "-------------") + if hasSource { + headers = append(headers, "Source") + dashes = append(dashes, "--------") + } + if hasReviewers { + headers = append(headers, "Reviewers") + dashes = append(dashes, "---------") + } + if hasConfidence { + headers = append(headers, "Confidence") + dashes = append(dashes, "----------") + } + writeHeaderRow(&buf, headers) + buf.WriteString("|" + strings.Join(dashes, "|") + "|\n") // Collect all rows: groups first, then ungrouped type rowData struct { - group string - sortKey int // 0=solo, 1..N=groups, 9999=ungrouped - severity string - fileLine string - problem string - fix string - category string - estMin int - source string + group string + sortKey int // 0=solo, 1..N=groups, 9999=ungrouped + severity string + fileLine string + problem string + fix string + category string + estMin int + source string + reviewers string + confidence string + } + + mkRow := func(groupLabel string, sortKey int, item map[string]interface{}) rowData { + severity, _ := item["SEVERITY"].(string) + problem, _ := item["PROBLEM"].(string) + fix, _ := item["FIX"].(string) + category, _ := item["CATEGORY"].(string) + source, _ := item["SOURCE"].(string) + reviewers, _ := item["REVIEWERS"].(string) + confidence, _ := item["CONFIDENCE"].(string) + // REVIEWERS is stored comma-joined (e.g. "bruce,greta"); render + // with a space after each comma for readability in the table cell. + reviewers = strings.ReplaceAll(reviewers, ",", ", ") + return rowData{ + group: groupLabel, + sortKey: sortKey, + severity: severity, + fileLine: extractFileLine(item), + problem: problem, + fix: fix, + category: category, + estMin: extractEstMinutesInt(item), + source: source, + reviewers: reviewers, + confidence: confidence, + } } var rows []rowData @@ -784,49 +845,14 @@ func writeGroupedMarkdown(result GroupTDResult, outputFile string, checkbox bool } else if num, ok := g.Number.(int); ok { sortKey = num } - for _, item := range g.Items { - severity, _ := item["SEVERITY"].(string) - fileLine := extractFileLine(item) - problem, _ := item["PROBLEM"].(string) - fix, _ := item["FIX"].(string) - category, _ := item["CATEGORY"].(string) - estMin := extractEstMinutesInt(item) - source, _ := item["SOURCE"].(string) - rows = append(rows, rowData{ - group: groupLabel, - sortKey: sortKey, - severity: severity, - fileLine: fileLine, - problem: problem, - fix: fix, - category: category, - estMin: estMin, - source: source, - }) + rows = append(rows, mkRow(groupLabel, sortKey, item)) } } // Ungrouped items for _, item := range result.Ungrouped { - severity, _ := item["SEVERITY"].(string) - fileLine := extractFileLine(item) - problem, _ := item["PROBLEM"].(string) - fix, _ := item["FIX"].(string) - category, _ := item["CATEGORY"].(string) - estMin := extractEstMinutesInt(item) - source, _ := item["SOURCE"].(string) - rows = append(rows, rowData{ - group: ungroupedLabel, - sortKey: 9999, - severity: severity, - fileLine: fileLine, - problem: problem, - fix: fix, - category: category, - estMin: estMin, - source: source, - }) + rows = append(rows, mkRow(ungroupedLabel, 9999, item)) } // Sort by sortKey (solo=0, groups=1..N, ungrouped=9999) @@ -834,22 +860,23 @@ func writeGroupedMarkdown(result GroupTDResult, outputFile string, checkbox bool return rows[i].sortKey < rows[j].sortKey }) - // Write rows + // Write rows — cells assembled to match the dynamic header. for _, r := range rows { - switch { - case checkbox && hasSource: - buf.WriteString(fmt.Sprintf("| %s | [ ] | %s | %s | %s | %s | %s | %d | %s |\n", - r.group, r.severity, r.fileLine, r.problem, r.fix, r.category, r.estMin, r.source)) - case checkbox && !hasSource: - buf.WriteString(fmt.Sprintf("| %s | [ ] | %s | %s | %s | %s | %s | %d |\n", - r.group, r.severity, r.fileLine, r.problem, r.fix, r.category, r.estMin)) - case !checkbox && hasSource: - buf.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s | %s | %d | %s |\n", - r.group, r.severity, r.fileLine, r.problem, r.fix, r.category, r.estMin, r.source)) - default: - buf.WriteString(fmt.Sprintf("| %s | %s | %s | %s | %s | %s | %d |\n", - r.group, r.severity, r.fileLine, r.problem, r.fix, r.category, r.estMin)) + cells := []string{r.group} + if checkbox { + cells = append(cells, "[ ]") + } + cells = append(cells, r.severity, r.fileLine, r.problem, r.fix, r.category, fmt.Sprintf("%d", r.estMin)) + if hasSource { + cells = append(cells, r.source) + } + if hasReviewers { + cells = append(cells, r.reviewers) + } + if hasConfidence { + cells = append(cells, r.confidence) } + writeDataRow(&buf, cells) } // Verify row count against buffer before writing diff --git a/internal/support/commands/group_td_test.go b/internal/support/commands/group_td_test.go index 1972e74..28709f3 100644 --- a/internal/support/commands/group_td_test.go +++ b/internal/support/commands/group_td_test.go @@ -1555,3 +1555,150 @@ func findGroupByTheme(groups []TDGroup, theme string) *TDGroup { } return nil } + +// ---- REVIEWERS + CONFIDENCE column tests ---- +// +// These columns are emitted only when at least one input row carries a +// non-empty value, mirroring the SOURCE column feature-flag pattern. + +func TestGroupTDPipeFormatWithReviewers(t *testing.T) { + tmpDir := t.TempDir() + outputFile := filepath.Join(tmpDir, "README.md") + + pipeInput := `HIGH|src/a.ts:1|Auth issue|Fix auth|security|30|code-review|bruce,greta|HIGH +MEDIUM|src/b.ts:2|Auth issue 2|Fix auth 2|security|45|code-review|kai|MEDIUM +` + + cmd := newGroupTDCmd() + buf := new(bytes.Buffer) + cmd.SetOut(buf) + cmd.SetArgs([]string{ + "--content", pipeInput, + "--format", "pipe", + "--headers", "SEVERITY,FILE_LINE,PROBLEM,FIX,CATEGORY,EST_MINUTES,SOURCE,REVIEWERS,CONFIDENCE", + "--json", + "--assign-numbers", + "--output-file", outputFile, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + + data, err := os.ReadFile(outputFile) + if err != nil { + t.Fatal(err) + } + if !bytes.Contains(data, []byte("| Reviewers |")) { + t.Errorf("expected Reviewers column header, got:\n%s", data) + } + if !bytes.Contains(data, []byte("| Confidence |")) { + t.Errorf("expected Confidence column header, got:\n%s", data) + } + if !bytes.Contains(data, []byte("| bruce, greta |")) { + t.Errorf("expected reviewer attribution cell with space-after-comma, got:\n%s", data) + } + if !bytes.Contains(data, []byte("| HIGH |")) { + t.Errorf("expected confidence cell, got:\n%s", data) + } +} + +func TestGroupTDPipeFormatWithoutReviewers(t *testing.T) { + tmpDir := t.TempDir() + outputFile := filepath.Join(tmpDir, "README.md") + + pipeInput := `HIGH|src/a.ts:1|Problem A|Fix A|security|15|code-review +` + cmd := newGroupTDCmd() + buf := new(bytes.Buffer) + cmd.SetOut(buf) + cmd.SetArgs([]string{ + "--content", pipeInput, + "--format", "pipe", + "--headers", "SEVERITY,FILE_LINE,PROBLEM,FIX,CATEGORY,EST_MINUTES,SOURCE", + "--json", + "--assign-numbers", + "--output-file", outputFile, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + + data, err := os.ReadFile(outputFile) + if err != nil { + t.Fatal(err) + } + if bytes.Contains(data, []byte("Reviewers")) { + t.Errorf("did not expect Reviewers column without REVIEWERS header, got:\n%s", data) + } + if bytes.Contains(data, []byte("Confidence")) { + t.Errorf("did not expect Confidence column without CONFIDENCE header, got:\n%s", data) + } +} + +func TestGroupTDPipeFormatReviewersWithCheckbox(t *testing.T) { + tmpDir := t.TempDir() + outputFile := filepath.Join(tmpDir, "README.md") + + pipeInput := `HIGH|src/a.ts:1|P|F|security|15|code-review|bruce,greta,kai|HIGH +` + cmd := newGroupTDCmd() + cmd.SetOut(new(bytes.Buffer)) + cmd.SetArgs([]string{ + "--content", pipeInput, + "--format", "pipe", + "--headers", "SEVERITY,FILE_LINE,PROBLEM,FIX,CATEGORY,EST_MINUTES,SOURCE,REVIEWERS,CONFIDENCE", + "--json", + "--assign-numbers", + "--checkbox", + "--output-file", outputFile, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + data, err := os.ReadFile(outputFile) + if err != nil { + t.Fatal(err) + } + // Header should include checkbox column + reviewers + confidence + if !bytes.Contains(data, []byte("| Reviewers | Confidence |")) { + t.Errorf("expected Reviewers + Confidence trailing columns, got:\n%s", data) + } + if !bytes.Contains(data, []byte("| bruce, greta, kai |")) { + t.Errorf("expected attribution cell, got:\n%s", data) + } +} + +func TestGroupTDPipeFormatPartialReviewers(t *testing.T) { + // Some rows have REVIEWERS, others don't. The column should appear and + // rows without attribution should show empty cells. + tmpDir := t.TempDir() + outputFile := filepath.Join(tmpDir, "README.md") + + pipeInput := `HIGH|src/a.ts:1|P1|F1|security|15|code-review|bruce|HIGH +MEDIUM|src/b.ts:2|P2|F2|style|10|code-review|| +` + cmd := newGroupTDCmd() + cmd.SetOut(new(bytes.Buffer)) + cmd.SetArgs([]string{ + "--content", pipeInput, + "--format", "pipe", + "--headers", "SEVERITY,FILE_LINE,PROBLEM,FIX,CATEGORY,EST_MINUTES,SOURCE,REVIEWERS,CONFIDENCE", + "--json", + "--assign-numbers", + "--output-file", outputFile, + }) + if err := cmd.Execute(); err != nil { + t.Fatalf("execute: %v", err) + } + data, err := os.ReadFile(outputFile) + if err != nil { + t.Fatal(err) + } + if !bytes.Contains(data, []byte("| bruce |")) { + t.Errorf("expected bruce cell for first row, got:\n%s", data) + } + // Row 2 should have empty Reviewers + Confidence cells + if !bytes.Contains(data, []byte("| | |\n")) { + t.Errorf("expected empty reviewers + confidence cells, got:\n%s", data) + } +} From 64d9b424898466d1599a89346b4669d52f7f97bb Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Mon, 11 May 2026 18:28:17 -0700 Subject: [PATCH 08/10] docs: CHANGELOG + reference entry for multi_review, REVIEWERS/CONFIDENCE CHANGELOG gets an Unreleased section covering multi_review, the group_td column additions, and the yaml parseNumber regression fix (separately committed but worth surfacing in the release notes). docs/llm-support-commands.md gets a multi_review reference entry under Technical Debt Management with flag tables, output layout, failure semantics, and three example invocations. The group-td section gets a small note about the new optional trailing columns. --- CHANGELOG.md | 30 +++++++++++ docs/llm-support-commands.md | 97 ++++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d183f26..5425d85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,36 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [Unreleased] + +### Added + +#### llm-support + +- **`multi_review` subcommand** — fan out a code review across multiple openclaw reviewer agents. + - Bundles a local git repo, ships it to a remote host running `openclaw-gateway` via scp, invokes one or more named reviewer agents in parallel (or in an opt-in serial lane for shared-quota providers), and collects each reviewer's TD findings. + - Output layout under `--output-dir`: + - `raw//{review.md, td-stream.txt, status.json, response.json}` — per-reviewer artifacts. + - `td-stream-all.txt` — merged pipe-delimited findings with reviewer attribution appended as a column. + - `multi-review-summary.json` — per-reviewer status (ok/failed/skipped), counts, durations, partial flag. + - Two-lane execution via `--reviewers` (parallel) and `--serial-reviewers` (sequential after parallel) for cases where some reviewers share a rate-limited provider. + - Failure semantics: bundle/ship failure hard-stops; per-reviewer failure is recorded and other reviewers continue; all-reviewers-fail exits non-zero. + - Designed to be invoked by `/code-review` Phase 5 when `review.mode: multi` is set in `.planning/.config/config.yaml`. + +#### group_td + +- **`REVIEWERS` and `CONFIDENCE` columns** — feature-flagged trailing columns rendered only when at least one input row carries a non-empty value, following the existing `SOURCE` column pattern. + - `REVIEWERS` stores comma-joined agent names from `multi_review` (e.g. `bruce,greta`). Rendered with a space after each comma in the table cell. + - `CONFIDENCE` is the dedupe-time computed score (`HIGH` = 2+ reviewers, `MEDIUM` = single reviewer or severity disagreement, `LOW` = single reviewer flagged as untrusted). + - Backwards-compatible — callers that don't pass these via `--headers` see no change in output. + - Internal: markdown header + row writers refactored to assemble cells dynamically. + +### Fixed + +#### llm-support + +- **`yaml.parseNumber` preserves version-like strings** — values whose float representation would lose information (e.g. `"4.30"` → `4.3` → `"4.3"`) now stay strings. Only coerces to float when the value round-trips through YAML marshal back to the exact input. + ## [1.8.2] - 2026-01-27 ### Fixed diff --git a/docs/llm-support-commands.md b/docs/llm-support-commands.md index a1d6301..0225336 100644 --- a/docs/llm-support-commands.md +++ b/docs/llm-support-commands.md @@ -59,6 +59,7 @@ Complete documentation for all 40+ llm-support commands. - [route-td](#route-td) - [format-td-table](#format-td-table) - [group-td](#group-td) + - [multi_review](#multi_review) --- @@ -2103,6 +2104,102 @@ llm-support group-td --file items.json --min-group-size 5 | `lib/utils/string.ts` | `lib` | `lib-utils` | `lib-utils` | | `config.ts` | `misc` | `misc` | `misc` | +**Optional trailing columns (feature-flagged):** + +| Column | Triggered by | Source | +|--------|--------------|--------| +| `Source` | non-empty `SOURCE` field | `code-review` (adversarial finding) or `execute-sprint` (captured during work) | +| `Reviewers` | non-empty `REVIEWERS` field | comma-joined agent names from `multi_review` (e.g. `bruce,greta`) | +| `Confidence` | non-empty `CONFIDENCE` field | dedupe-time score: `HIGH` (2+ reviewers), `MEDIUM` (single), `LOW` (single + untrusted) | + +These columns are emitted only when at least one input row carries a non-empty value, so existing 7- or 8-column callers see no change. + +--- + +### multi_review + +Fan out a code review across multiple openclaw reviewer agents on a remote host, collect each reviewer's TD findings, and merge them with per-row attribution. + +```bash +llm-support multi_review [flags] +``` + +**Required flags:** + +| Flag | Description | +|------|-------------| +| `--reviewers` | Comma-separated reviewer agent names (e.g. `bruce,greta,kai,mira,dax,otto`) | +| `--repo` | Local repo path to bundle and ship | +| `--openclaw-host` | SSH target running `openclaw-gateway` (e.g. `user@nucleus.lan`) | +| `--output-dir` | Where per-reviewer artifacts and merged stream land | + +**Optional flags:** + +| Flag | Default | Description | +|------|---------|-------------| +| `--serial-reviewers` | (empty) | Subset that runs sequentially after the parallel lane (shared rate limits) | +| `--base` | (empty) | Base ref for the diff range, included in the auto-built task message | +| `--head` | `HEAD` | Head ref | +| `--timeout-seconds` | `1200` | Total wall-clock budget for the entire fan-out | +| `--per-reviewer-timeout-seconds` | `600` | Per-reviewer soft timeout | +| `--gateway-container` | `openclaw-gateway` | Docker container running openclaw | +| `--task-message` | (auto) | Override the auto-built task message | +| `--skip-cleanup` | `false` | Do not remove the remote workdir after running | + +**Output layout under `--output-dir`:** + +``` +/ + raw/ + bruce/ + review.md # extracted prose + td-stream.txt # pipe-delimited findings with |bruce appended + status.json # model, duration, status, td line count + response.json # raw openclaw envelope (for replay) + greta/... (same shape) + ... + td-stream-all.txt # merged across reviewers with REVIEWER column + multi-review-summary.json # per-reviewer status + counts + partial flag +``` + +**Failure semantics:** + +| Scenario | Behavior | +|----------|----------| +| Bundle/ship fails (SSH down, scp error, remote clone fails) | Hard-stop with error naming the host | +| 1+ reviewer fails, 1+ succeeds | Continue with successful ones, `partial: true` in summary, exit 0 | +| All reviewers fail | Exit 1 with summary path | + +**Examples:** + +```bash +# Six-reviewer fan-out, all parallel +llm-support multi_review \ + --reviewers bruce,greta,kai,mira,dax,otto \ + --repo ~/Documents/GitHub/myproject \ + --openclaw-host user@nucleus.lan \ + --output-dir .planning/sprints/active/2.0/code-review/multi-review \ + --base v1.0.0 --head HEAD + +# With a serial lane for shared-quota providers +llm-support multi_review \ + --reviewers bruce,greta,kai,mira,dax \ + --serial-reviewers mira,dax \ + --repo . --openclaw-host user@host.lan \ + --output-dir /tmp/mr-out + +# Custom task message +llm-support multi_review \ + --reviewers bruce \ + --repo . --openclaw-host user@host.lan \ + --output-dir /tmp/mr-out \ + --task-message "Review only auth/*.go files for security issues." +``` + +**Wiring into `/code-review`:** + +`multi_review` is designed to be invoked by the `/code-review` command's Phase 5 (adversarial review) when `review.mode: multi` is set in `.planning/.config/config.yaml`. See the [code-review prompt](https://github.com/samestrin/claude-prompts/blob/main/.claude/commands/code-review.md) for the integration. + --- ## Global Flags From fad63c283db2ca51e1f688aa2ab7fc71337c20c7 Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Tue, 12 May 2026 11:05:25 -0700 Subject: [PATCH 09/10] refactor(multi_review): surface merged stream as td-stream.txt at output root Renamed the top-level merged file from td-stream-all.txt to td-stream.txt so /reconcile-code-review's auto-discovery (any code-review// dir with a td-stream.txt is a source) treats multi-agent as one unified source. The per-reviewer raw streams stay at raw//td-stream.txt; the cross-reviewer merge stays at raw/td-stream-all.txt for inspection. --- docs/llm-support-commands.md | 5 ++++- internal/support/commands/multi_review.go | 7 +++++-- internal/support/commands/multi_review_test.go | 4 ++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/docs/llm-support-commands.md b/docs/llm-support-commands.md index 0225336..15d382f 100644 --- a/docs/llm-support-commands.md +++ b/docs/llm-support-commands.md @@ -2158,7 +2158,10 @@ llm-support multi_review [flags] response.json # raw openclaw envelope (for replay) greta/... (same shape) ... - td-stream-all.txt # merged across reviewers with REVIEWER column + raw/td-stream-all.txt # merged across reviewers with REVIEWER column + td-stream.txt # same merged content, surfaced at root so + # /reconcile-code-review auto-discovers this + # directory as one unified source multi-review-summary.json # per-reviewer status + counts + partial flag ``` diff --git a/internal/support/commands/multi_review.go b/internal/support/commands/multi_review.go index 6e376b9..35b4a9d 100644 --- a/internal/support/commands/multi_review.go +++ b/internal/support/commands/multi_review.go @@ -239,9 +239,12 @@ func runMultiReview(cmd *cobra.Command, _ []string) error { return fmt.Errorf("merge streams: %w", err) } totalFindings = n - // Also write the merged stream up one level for convenience. + // Also write the merged stream up one level as td-stream.txt so + // /reconcile-code-review's auto-discovery (any code-review// + // dir with a td-stream.txt is a source) treats multi-agent as one + // unified source rather than peeking into raw//. srcMerged := filepath.Join(rawDir, "td-stream-all.txt") - dstMerged := filepath.Join(mrOutputDir, "td-stream-all.txt") + dstMerged := filepath.Join(mrOutputDir, "td-stream.txt") if data, err := os.ReadFile(srcMerged); err == nil { _ = os.WriteFile(dstMerged, data, 0o644) } diff --git a/internal/support/commands/multi_review_test.go b/internal/support/commands/multi_review_test.go index 5319f4b..c065503 100644 --- a/internal/support/commands/multi_review_test.go +++ b/internal/support/commands/multi_review_test.go @@ -108,9 +108,9 @@ func TestMultiReview_HappyPath(t *testing.T) { t.Errorf("missing %s: %v", td, err) } } - merged := filepath.Join(outDir, "td-stream-all.txt") + merged := filepath.Join(outDir, "td-stream.txt") if _, err := os.Stat(merged); err != nil { - t.Errorf("missing merged: %v", err) + t.Errorf("missing merged td-stream.txt at output root: %v", err) } summary := filepath.Join(outDir, "multi-review-summary.json") if _, err := os.Stat(summary); err != nil { From 22c00cc145b30f57629a6b3a34aa16bd8dbf0eee Mon Sep 17 00:00:00 2001 From: Sam Estrin Date: Tue, 12 May 2026 11:36:53 -0700 Subject: [PATCH 10/10] docs(multi_review): clarify output layout in long-help string The --help text still listed td-stream-all.txt at the output root, but the actual layout is: raw/td-stream-all.txt (cross-reviewer merge, inside raw/) td-stream.txt (copy at root, where reconcile auto-discovers) End-to-end smoke test against real openclaw (Bruce on llm-env v1.6.0..HEAD): - bundle + scp to nucleus.lan: clean - openclaw agent invocation + JSON envelope parse: clean - 4 findings extracted, written to raw/bruce/, merged to root: matches the manual bench numbers - status.json + multi-review-summary.json: structure correct - exit 0, total 67s --- internal/support/commands/multi_review.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/internal/support/commands/multi_review.go b/internal/support/commands/multi_review.go index 35b4a9d..7823188 100644 --- a/internal/support/commands/multi_review.go +++ b/internal/support/commands/multi_review.go @@ -70,8 +70,11 @@ a merged TD stream the /code-review command can consume. Output layout: /raw//{review.md,td-stream.txt,status.json,response.json} - /td-stream-all.txt (merged + reviewer-attributed) - /multi-review-summary.json (per-reviewer status + counts) + /raw/td-stream-all.txt (cross-reviewer merge, inside raw/) + /td-stream.txt (same merged content at root, where + /reconcile-code-review auto-discovers + this directory as one unified source) + /multi-review-summary.json (per-reviewer status + counts) Failure semantics: - Bundle/ship failure → hard-stop (no point invoking reviewers without