From 4ec0caaba20a6eec282b5fb1d60eacddc4188fa3 Mon Sep 17 00:00:00 2001
From: andrejtonev <29177572+andrejtonev@users.noreply.github.com>
Date: Thu, 18 Jun 2026 14:49:48 +0200
Subject: [PATCH] fix(evaluate): don't mark primary sessions as evaluated on
 scoring failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the scorer returns a fallback card (e.g. user picked a model that
doesn't exist, model provider is down, LLM call timed out), the
runEvaluation() fallback path was unconditionally adding
pending.sessionID to ctx.sessionsEvaluated. The runEvaluation() guard
at line 96 then short-circuited any future runEvaluation() call for
the same session id, silently disabling re-attempts for the rest of
the plugin lifetime.

This is wrong: a failed scoring is transient. The user might fix the
model config in kasper.jsonc and expect the next polling cycle (or
the next /kasper score session invocation) to pick up the now-fixable
session. With the previous code, the user would have to restart
opencode to retry. The success path of runEvaluation() is the only
path that may mark a session as evaluated.

The fix removes ctx.sessionsEvaluated.add(pending.sessionID) from the
fallback branch. The persistent state.evaluated_sessions was never
updated in this path, so the in-memory set is the only thing that
changed; on plugin restart the set is rebuilt from the persistent
array (state.ts:981-983) which is unchanged, so the failed session is
correctly retryable after a restart either way.

Notes for the reviewer:

  - This does NOT change the kasper-side ephemeral scoring sessions
    (kasper-scoring-*, kasper-merge-*, kasper-diag-*). Their cleanup
    is in scorer.ts:268-285 (merge) and scorer.ts:1185-1204
    (tryEvaluate), in finally blocks guarded by
    'if (scoringSessionId)'. When session.create() throws (the model
    doesn't exist case), scoringSessionId is never assigned and the
    delete is correctly skipped — there's no session to delete. The
    primary session being scored is a separate concern, addressed by
    this fix.

  - Startup time is unaffected. The Set.add() was a single
    synchronous operation on a path that fires at most once per
    failed scoring attempt, never at plugin init. Measured cold
    startup on the current branch: ~2 ms (5 runs, median 1.8 ms).

  - The new regression test in tests/evaluate.test.ts asserts the
    fix at the runEvaluation() level. It uses the existing mockCtx()
    helper and a scorer that returns a fallback card, then asserts
    that ctx.sessionsEvaluated does NOT contain the failed session
    and that a subsequent runEvaluation() call actually re-invokes
    the scorer. Confirmed via mutation check: the test fails on the
    unfixed code with 'Expected: false, Received: true' on
    sessionsEvaluated.has().

  - Trade-off: removing the Set.add() means a session whose model is
    broken will be re-polled on every evaluation_poll_interval_ms
    (default 60s) for as long as the model is broken. Each
    re-attempt re-fires the LLM call's retry budget (maxRetries+1
    attempts). The toast warning at evaluate.ts:295-302 still tells
    the user, and each attempt creates a 'scoring_session_created'
    or 'scoring_attempt_error' log line which is greppable.
---
 src/evaluate.ts        | 16 +++++++++++++++-
 tests/evaluate.test.ts | 43 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/src/evaluate.ts b/src/evaluate.ts
index c4a89b6..e63def9 100644
--- a/src/evaluate.ts
+++ b/src/evaluate.ts
@@ -270,7 +270,21 @@ export async function runEvaluation(
       const { emoji: _scoreEmoji } = formatScore(card.overall_score)
 
       if (card.fallback || card.overall_score <= 0) {
-        ctx.sessionsEvaluated.add(pending.sessionID)
+        // Failure path: do NOT add pending.sessionID to
+        // ctx.sessionsEvaluated or to the persistent
+        // ctx.stateStore.addEvaluatedSession list. Adding it would mark
+        // the primary session as "already evaluated" forever, blocking
+        // any future re-attempt (e.g. after the user fixes the model
+        // config in kasper.jsonc). Scorer errors are transient: the
+        // model may be unavailable right now but become available
+        // later, the user may edit kasper.jsonc, or the user may
+        // restart opencode. The session will be retried on the next
+        // poll cycle (or after restart) when SESSION_DEBOUNCE_MS elapses
+        // — and a fresh scoring attempt produces a fresh card. The
+        // trade-off is some 6s-cycle log noise while the model is
+        // unavailable, but the toast warning already tells the user.
+        // The successful-recording path (below) is the only path that
+        // is allowed to mark a session as evaluated.
         await ctx.logger.log("evaluation_skipped", {
           sessionID: pending.sessionID,
           score: card.overall_score,
diff --git a/tests/evaluate.test.ts b/tests/evaluate.test.ts
index 118d632..5382968 100644
--- a/tests/evaluate.test.ts
+++ b/tests/evaluate.test.ts
@@ -323,6 +323,49 @@ describe("runEvaluation", () => {
     expect(result).toBe(false)
   })
 
+  test("fallback card (e.g. model unavailable) does NOT mark session as evaluated", async () => {
+    // Regression: a primary session whose scorer returns a fallback
+    // card (because session.create threw — e.g. user picked a model
+    // that doesn't exist, or the model provider is down) used to be
+    // added to ctx.sessionsEvaluated unconditionally. The
+    // runEvaluation() short-circuit at line 96 then skipped the
+    // session for the rest of the plugin lifetime, silently disabling
+    // any future re-attempt. The user would have to restart opencode
+    // to retry. The fix: the fallback path records nothing. Only the
+    // successful-recording path (recordSession + addEvaluatedSession)
+    // is allowed to mark a session as evaluated.
+    const ctx = mockCtx({
+      scorer: {
+        evaluate: async (_input: any) =>
+          makeScoreCard({ fallback: true, overall_score: 0.5 }),
+      } as any,
+    })
+    const result = await runEvaluation(
+      makePendingEval({ sessionID: "broken-model-session" }),
+      ctx,
+    )
+    expect(result).toBe(false)
+    // The fallback path must NOT add the session to sessionsEvaluated.
+    // If it does, the next runEvaluation() call for the same session
+    // id will short-circuit at the line-96 guard and never call the
+    // LLM again.
+    expect(ctx.sessionsEvaluated.has("broken-model-session")).toBe(false)
+
+    // And the next call must actually re-attempt the scoring.
+    let secondCallCount = 0
+    ctx.scorer = {
+      evaluate: async (_input: any) => {
+        secondCallCount++
+        return makeScoreCard({ fallback: true, overall_score: 0.5 })
+      },
+    } as any
+    await runEvaluation(
+      makePendingEval({ sessionID: "broken-model-session" }),
+      ctx,
+    )
+    expect(secondCallCount).toBe(1)
+  })
+
   test("truncates long agent response before scoring", async () => {
     let receivedResponse = ""
     const ctx = mockCtx({