diff --git a/src/evaluate.ts b/src/evaluate.ts index c4a89b6..e63def9 100644 --- a/src/evaluate.ts +++ b/src/evaluate.ts @@ -270,7 +270,21 @@ export async function runEvaluation( const { emoji: _scoreEmoji } = formatScore(card.overall_score) if (card.fallback || card.overall_score <= 0) { - ctx.sessionsEvaluated.add(pending.sessionID) + // Failure path: do NOT add pending.sessionID to + // ctx.sessionsEvaluated or to the persistent + // ctx.stateStore.addEvaluatedSession list. Adding it would mark + // the primary session as "already evaluated" forever, blocking + // any future re-attempt (e.g. after the user fixes the model + // config in kasper.jsonc). Scorer errors are transient: the + // model may be unavailable right now but become available + // later, the user may edit kasper.jsonc, or the user may + // restart opencode. The session will be retried on the next + // poll cycle (or after restart) when SESSION_DEBOUNCE_MS elapses + // — and a fresh scoring attempt produces a fresh card. The + // trade-off is some 6s-cycle log noise while the model is + // unavailable, but the toast warning already tells the user. + // The successful-recording path (below) is the only path that + // is allowed to mark a session as evaluated. await ctx.logger.log("evaluation_skipped", { sessionID: pending.sessionID, score: card.overall_score, diff --git a/tests/evaluate.test.ts b/tests/evaluate.test.ts index 118d632..5382968 100644 --- a/tests/evaluate.test.ts +++ b/tests/evaluate.test.ts @@ -323,6 +323,49 @@ describe("runEvaluation", () => { expect(result).toBe(false) }) + test("fallback card (e.g. model unavailable) does NOT mark session as evaluated", async () => { + // Regression: a primary session whose scorer returns a fallback + // card (because session.create threw — e.g. user picked a model + // that doesn't exist, or the model provider is down) used to be + // added to ctx.sessionsEvaluated unconditionally. The + // runEvaluation() short-circuit at line 96 then skipped the + // session for the rest of the plugin lifetime, silently disabling + // any future re-attempt. The user would have to restart opencode + // to retry. The fix: the fallback path records nothing. Only the + // successful-recording path (recordSession + addEvaluatedSession) + // is allowed to mark a session as evaluated. + const ctx = mockCtx({ + scorer: { + evaluate: async (_input: any) => + makeScoreCard({ fallback: true, overall_score: 0.5 }), + } as any, + }) + const result = await runEvaluation( + makePendingEval({ sessionID: "broken-model-session" }), + ctx, + ) + expect(result).toBe(false) + // The fallback path must NOT add the session to sessionsEvaluated. + // If it does, the next runEvaluation() call for the same session + // id will short-circuit at the line-96 guard and never call the + // LLM again. + expect(ctx.sessionsEvaluated.has("broken-model-session")).toBe(false) + + // And the next call must actually re-attempt the scoring. + let secondCallCount = 0 + ctx.scorer = { + evaluate: async (_input: any) => { + secondCallCount++ + return makeScoreCard({ fallback: true, overall_score: 0.5 }) + }, + } as any + await runEvaluation( + makePendingEval({ sessionID: "broken-model-session" }), + ctx, + ) + expect(secondCallCount).toBe(1) + }) + test("truncates long agent response before scoring", async () => { let receivedResponse = "" const ctx = mockCtx({