From 4ec0caaba20a6eec282b5fb1d60eacddc4188fa3 Mon Sep 17 00:00:00 2001 From: andrejtonev <29177572+andrejtonev@users.noreply.github.com> Date: Thu, 18 Jun 2026 14:49:48 +0200 Subject: [PATCH] fix(evaluate): don't mark primary sessions as evaluated on scoring failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the scorer returns a fallback card (e.g. user picked a model that doesn't exist, model provider is down, LLM call timed out), the runEvaluation() fallback path was unconditionally adding pending.sessionID to ctx.sessionsEvaluated. The runEvaluation() guard at line 96 then short-circuited any future runEvaluation() call for the same session id, silently disabling re-attempts for the rest of the plugin lifetime. This is wrong: a failed scoring is transient. The user might fix the model config in kasper.jsonc and expect the next polling cycle (or the next /kasper score session invocation) to pick up the now-fixable session. With the previous code, the user would have to restart opencode to retry. The success path of runEvaluation() is the only path that may mark a session as evaluated. The fix removes ctx.sessionsEvaluated.add(pending.sessionID) from the fallback branch. The persistent state.evaluated_sessions was never updated in this path, so the in-memory set is the only thing that changed; on plugin restart the set is rebuilt from the persistent array (state.ts:981-983) which is unchanged, so the failed session is correctly retryable after a restart either way. Notes for the reviewer: - This does NOT change the kasper-side ephemeral scoring sessions (kasper-scoring-*, kasper-merge-*, kasper-diag-*). Their cleanup is in scorer.ts:268-285 (merge) and scorer.ts:1185-1204 (tryEvaluate), in finally blocks guarded by 'if (scoringSessionId)'. When session.create() throws (the model doesn't exist case), scoringSessionId is never assigned and the delete is correctly skipped — there's no session to delete. The primary session being scored is a separate concern, addressed by this fix. - Startup time is unaffected. The Set.add() was a single synchronous operation on a path that fires at most once per failed scoring attempt, never at plugin init. Measured cold startup on the current branch: ~2 ms (5 runs, median 1.8 ms). - The new regression test in tests/evaluate.test.ts asserts the fix at the runEvaluation() level. It uses the existing mockCtx() helper and a scorer that returns a fallback card, then asserts that ctx.sessionsEvaluated does NOT contain the failed session and that a subsequent runEvaluation() call actually re-invokes the scorer. Confirmed via mutation check: the test fails on the unfixed code with 'Expected: false, Received: true' on sessionsEvaluated.has(). - Trade-off: removing the Set.add() means a session whose model is broken will be re-polled on every evaluation_poll_interval_ms (default 60s) for as long as the model is broken. Each re-attempt re-fires the LLM call's retry budget (maxRetries+1 attempts). The toast warning at evaluate.ts:295-302 still tells the user, and each attempt creates a 'scoring_session_created' or 'scoring_attempt_error' log line which is greppable. --- src/evaluate.ts | 16 +++++++++++++++- tests/evaluate.test.ts | 43 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/evaluate.ts b/src/evaluate.ts index c4a89b6..e63def9 100644 --- a/src/evaluate.ts +++ b/src/evaluate.ts @@ -270,7 +270,21 @@ export async function runEvaluation( const { emoji: _scoreEmoji } = formatScore(card.overall_score) if (card.fallback || card.overall_score <= 0) { - ctx.sessionsEvaluated.add(pending.sessionID) + // Failure path: do NOT add pending.sessionID to + // ctx.sessionsEvaluated or to the persistent + // ctx.stateStore.addEvaluatedSession list. Adding it would mark + // the primary session as "already evaluated" forever, blocking + // any future re-attempt (e.g. after the user fixes the model + // config in kasper.jsonc). Scorer errors are transient: the + // model may be unavailable right now but become available + // later, the user may edit kasper.jsonc, or the user may + // restart opencode. The session will be retried on the next + // poll cycle (or after restart) when SESSION_DEBOUNCE_MS elapses + // — and a fresh scoring attempt produces a fresh card. The + // trade-off is some 6s-cycle log noise while the model is + // unavailable, but the toast warning already tells the user. + // The successful-recording path (below) is the only path that + // is allowed to mark a session as evaluated. await ctx.logger.log("evaluation_skipped", { sessionID: pending.sessionID, score: card.overall_score, diff --git a/tests/evaluate.test.ts b/tests/evaluate.test.ts index 118d632..5382968 100644 --- a/tests/evaluate.test.ts +++ b/tests/evaluate.test.ts @@ -323,6 +323,49 @@ describe("runEvaluation", () => { expect(result).toBe(false) }) + test("fallback card (e.g. model unavailable) does NOT mark session as evaluated", async () => { + // Regression: a primary session whose scorer returns a fallback + // card (because session.create threw — e.g. user picked a model + // that doesn't exist, or the model provider is down) used to be + // added to ctx.sessionsEvaluated unconditionally. The + // runEvaluation() short-circuit at line 96 then skipped the + // session for the rest of the plugin lifetime, silently disabling + // any future re-attempt. The user would have to restart opencode + // to retry. The fix: the fallback path records nothing. Only the + // successful-recording path (recordSession + addEvaluatedSession) + // is allowed to mark a session as evaluated. + const ctx = mockCtx({ + scorer: { + evaluate: async (_input: any) => + makeScoreCard({ fallback: true, overall_score: 0.5 }), + } as any, + }) + const result = await runEvaluation( + makePendingEval({ sessionID: "broken-model-session" }), + ctx, + ) + expect(result).toBe(false) + // The fallback path must NOT add the session to sessionsEvaluated. + // If it does, the next runEvaluation() call for the same session + // id will short-circuit at the line-96 guard and never call the + // LLM again. + expect(ctx.sessionsEvaluated.has("broken-model-session")).toBe(false) + + // And the next call must actually re-attempt the scoring. + let secondCallCount = 0 + ctx.scorer = { + evaluate: async (_input: any) => { + secondCallCount++ + return makeScoreCard({ fallback: true, overall_score: 0.5 }) + }, + } as any + await runEvaluation( + makePendingEval({ sessionID: "broken-model-session" }), + ctx, + ) + expect(secondCallCount).toBe(1) + }) + test("truncates long agent response before scoring", async () => { let receivedResponse = "" const ctx = mockCtx({