andrejtonev · andrejtonev · Jun 18, 2026 · Jun 18, 2026 · qodo-code-review · Jun 18, 2026
diff --git a/src/evaluate.ts b/src/evaluate.ts
@@ -270,7 +270,21 @@ export async function runEvaluation(
       const { emoji: _scoreEmoji } = formatScore(card.overall_score)
 
       if (card.fallback || card.overall_score <= 0) {
-        ctx.sessionsEvaluated.add(pending.sessionID)
+        // Failure path: do NOT add pending.sessionID to
+        // ctx.sessionsEvaluated or to the persistent
+        // ctx.stateStore.addEvaluatedSession list. Adding it would mark
+        // the primary session as "already evaluated" forever, blocking
+        // any future re-attempt (e.g. after the user fixes the model
+        // config in kasper.jsonc). Scorer errors are transient: the
+        // model may be unavailable right now but become available
+        // later, the user may edit kasper.jsonc, or the user may
+        // restart opencode. The session will be retried on the next
+        // poll cycle (or after restart) when SESSION_DEBOUNCE_MS elapses
+        // — and a fresh scoring attempt produces a fresh card. The
+        // trade-off is some 6s-cycle log noise while the model is
+        // unavailable, but the toast warning already tells the user.
+        // The successful-recording path (below) is the only path that
+        // is allowed to mark a session as evaluated.
         await ctx.logger.log("evaluation_skipped", {
           sessionID: pending.sessionID,
           score: card.overall_score,

diff --git a/tests/evaluate.test.ts b/tests/evaluate.test.ts
@@ -323,6 +323,49 @@ describe("runEvaluation", () => {
     expect(result).toBe(false)
   })
 
+  test("fallback card (e.g. model unavailable) does NOT mark session as evaluated", async () => {
+    // Regression: a primary session whose scorer returns a fallback
+    // card (because session.create threw — e.g. user picked a model
+    // that doesn't exist, or the model provider is down) used to be
+    // added to ctx.sessionsEvaluated unconditionally. The
+    // runEvaluation() short-circuit at line 96 then skipped the
+    // session for the rest of the plugin lifetime, silently disabling
+    // any future re-attempt. The user would have to restart opencode
+    // to retry. The fix: the fallback path records nothing. Only the
+    // successful-recording path (recordSession + addEvaluatedSession)
+    // is allowed to mark a session as evaluated.
+    const ctx = mockCtx({
+      scorer: {
+        evaluate: async (_input: any) =>
+          makeScoreCard({ fallback: true, overall_score: 0.5 }),
+      } as any,
+    })
+    const result = await runEvaluation(
+      makePendingEval({ sessionID: "broken-model-session" }),
+      ctx,
+    )
+    expect(result).toBe(false)
+    // The fallback path must NOT add the session to sessionsEvaluated.
+    // If it does, the next runEvaluation() call for the same session
+    // id will short-circuit at the line-96 guard and never call the
+    // LLM again.
+    expect(ctx.sessionsEvaluated.has("broken-model-session")).toBe(false)
+
+    // And the next call must actually re-attempt the scoring.
+    let secondCallCount = 0
+    ctx.scorer = {
+      evaluate: async (_input: any) => {
+        secondCallCount++
+        return makeScoreCard({ fallback: true, overall_score: 0.5 })
+      },
+    } as any
+    await runEvaluation(
+      makePendingEval({ sessionID: "broken-model-session" }),
+      ctx,
+    )
+    expect(secondCallCount).toBe(1)
+  })
+
   test("truncates long agent response before scoring", async () => {
     let receivedResponse = ""
     const ctx = mockCtx({