feat(ai-builder): CSV input for evals (no-changelog) (#21150)

burivuhster · web-flow · commit 2b53cc0f5c96 · 2025-10-24T14:15:36.000+02:00
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/cli/runner.ts
@@ -21,16 +21,19 @@ import {
 import { formatHeader, saveEvaluationResults } from '../utils/evaluation-helpers.js';
 import { generateMarkdownReport } from '../utils/evaluation-reporter.js';
 
+type CliEvaluationOptions = { // Options bag accepted by runCliEvaluation; every field is optional
+	testCaseFilter?: string; // Optional test case ID to run only a specific test
+	testCases?: TestCase[]; // Optional array of test cases to run (if not provided, uses defaults and generation)
+	repetitions?: number; // Number of times to run each test (e.g. for cache warming analysis)
+};
+
 /**
  * Main CLI evaluation runner that executes all test cases in parallel
  * Supports concurrency control via EVALUATION_CONCURRENCY environment variable
- * @param testCaseFilter - Optional test case ID to run only a specific test
- * @param repetitions - Number of times to run each test (for cache warming analysis)
  */
-export async function runCliEvaluation(
-	testCaseFilter?: string,
-	repetitions: number = 1,
-): Promise<void> {
+export async function runCliEvaluation(options: CliEvaluationOptions = {}): Promise<void> {
+	const { repetitions = 1, testCaseFilter } = options;
+
 	console.log(formatHeader('AI Workflow Builder Full Evaluation', 70));
 	if (repetitions > 1) {
 		console.log(pc.yellow(`➔ Each test will be run ${repetitions} times for cache analysis`));
@@ -41,7 +44,14 @@ export async function runCliEvaluation(
 		const { parsedNodeTypes, llm, tracer } = await setupTestEnvironment();
 
 		// Determine test cases to run
-		let testCases: TestCase[] = basicTestCases;
+		const providedTestCases =
+			options.testCases && options.testCases.length > 0 ? options.testCases : undefined;
+
+		let testCases: TestCase[] = providedTestCases ?? basicTestCases;
+
+		if (providedTestCases) {
+			console.log(pc.blue(`➔ Loaded ${providedTestCases.length} test cases from CSV`));
+		}
 
 		// Filter to single test case if specified
 		if (testCaseFilter) {
@@ -56,7 +66,7 @@ export async function runCliEvaluation(
 			}
 		} else {
 			// Optionally generate additional test cases
-			if (shouldGenerateTestCases()) {
+			if (!providedTestCases && shouldGenerateTestCases()) {
 				console.log(pc.blue('➔ Generating additional test cases...'));
 				const generatedCases = await generateTestCases(llm, howManyTestCasesToGenerate());
 				testCases = [...testCases, ...generatedCases];
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/index.ts
@@ -1,5 +1,6 @@
 import { runCliEvaluation } from './cli/runner.js';
 import { runLangsmithEvaluation } from './langsmith/runner.js';
+import { loadTestCasesFromCsv } from './utils/csv-prompt-loader.js';
 
 // Re-export for external use if needed
 export { runCliEvaluation } from './cli/runner.js';
@@ -19,6 +20,13 @@ async function main(): Promise<void> {
 		? process.argv[process.argv.indexOf('--test-case') + 1]
 		: undefined;
 
+	// Parse command line argument for CSV prompts file path
+	const promptsCsvPath = getFlagValue('--prompts-csv') ?? process.env.PROMPTS_CSV_FILE;
+
+	if (promptsCsvPath && useLangsmith) {
+		console.warn('CSV-driven evaluations are only supported in CLI mode. Ignoring --prompts-csv.');
+	}
+
 	// Parse command line arguments for a number of repetitions (applies to both modes)
 	const repetitionsArg = process.argv.includes('--repetitions')
 		? parseInt(process.argv[process.argv.indexOf('--repetitions') + 1], 10)
@@ -28,10 +36,33 @@ async function main(): Promise<void> {
 	if (useLangsmith) {
 		await runLangsmithEvaluation(repetitions);
 	} else {
-		await runCliEvaluation(testCaseId, repetitions);
+		const csvTestCases = promptsCsvPath ? loadTestCasesFromCsv(promptsCsvPath) : undefined;
+		await runCliEvaluation({ testCases: csvTestCases, testCaseFilter: testCaseId, repetitions });
 	}
 }
 
+function getFlagValue(flag: string): string | undefined { // Reads a flag's value from process.argv as `<flag> <value>` or `<flag>=<value>`; undefined when the flag is absent
+	const exactMatchIndex = process.argv.findIndex((arg) => arg === flag); // The space-separated form is checked first, so it wins if both forms are present
+	if (exactMatchIndex !== -1) {
+		const value = process.argv[exactMatchIndex + 1];
+		if (!value || value.startsWith('--')) { // No following argument, or the following argument looks like another flag
+			throw new Error(`Flag ${flag} requires a value`);
+		}
+		return value;
+	}
+
+	const withValue = process.argv.find((arg) => arg.startsWith(`${flag}=`)); // Fall back to the `<flag>=<value>` form
+	if (withValue) {
+		const value = withValue.slice(flag.length + 1); // Skip the flag name plus the '=' separator
+		if (!value) { // `<flag>=` with nothing after the '='
+			throw new Error(`Flag ${flag} requires a value`);
+		}
+		return value;
+	}
+
+	return undefined; // Flag was not passed in either form
+}
+
 // Run if called directly
 if (require.main === module) {
 	main().catch(console.error);
diff --git a/packages/@n8n/ai-workflow-builder.ee/evaluations/utils/csv-prompt-loader.ts b/packages/@n8n/ai-workflow-builder.ee/evaluations/utils/csv-prompt-loader.ts
@@ -0,0 +1,108 @@
+import { parse } from 'csv-parse/sync';
+import { existsSync, readFileSync } from 'node:fs';
+import path from 'node:path';
+
+import type { TestCase } from '../types/evaluation.js';
+
+type ParsedCsvRow = string[];
+
+function isHeaderRow(row: ParsedCsvRow) { // True when any cell, after trimming and lowercasing, is exactly 'prompt'
+	return row.some((cell) => cell.trim().toLowerCase() === 'prompt');
+}
+
+function detectColumnIndex(header: ParsedCsvRow, name: string) { // Index of the first header cell matching `name` case-insensitively, or undefined if none matches
+	const normalized = name.toLowerCase(); // `name` is lowercased but not trimmed
+	const index = header.findIndex((cell) => cell.trim().toLowerCase() === normalized); // Header cells are trimmed and lowercased before comparing
+	return index >= 0 ? index : undefined; // findIndex yields -1 on a miss; convert to undefined so callers can chain `??`
+}
+
+function sanitizeValue(value: string | undefined) { // Trims a cell value; an undefined cell becomes the empty string
+	return value?.trim() ?? '';
+}
+
+function generateNameFromPrompt(prompt: string, index: number) { // Derives a display name from the prompt text, capped at 60 characters
+	const normalized = prompt.replace(/\s+/g, ' ').trim(); // Collapse every whitespace run (including newlines) to a single space
+	if (!normalized) {
+		return `CSV Prompt ${index + 1}`; // Whitespace-only prompt: fall back to a 1-based placeholder name
+	}
+
+	const maxLength = 60;
+	if (normalized.length <= maxLength) {
+		return normalized; // Short enough to use verbatim
+	}
+
+	return `${normalized.slice(0, maxLength - 3)}...`; // Keep 57 characters and append '...' so the result is exactly maxLength long
+}
+
+function parseCsv(content: string): ParsedCsvRow[] { // Parses raw CSV text into rows of string cells; any failure is rethrown with a 'Failed to parse CSV file' prefix
+	try {
+		const rows = parse(content.replace(/^\ufeff/, ''), { // Strip a leading UTF-8 byte order mark (U+FEFF) before parsing
+			columns: false, // Rows are consumed as positional arrays (see the cast below); header detection is done by the caller
+			skip_empty_lines: true,
+			trim: true,
+			relax_column_count: true, // NOTE(review): presumably lets rows have differing cell counts instead of erroring; confirm against csv-parse docs
+		}) as ParsedCsvRow[];
+
+		return rows.map((row) => row.map((cell) => cell ?? '')); // Replace any null/undefined cell with '' so later code can treat every cell as a string
+	} catch (error) { // Also covers errors thrown by the mapping above, not only by parse()
+		const message = error instanceof Error ? error.message : 'Unknown parsing error';
+		throw new Error(`Failed to parse CSV file: ${message}`);
+	}
+}
+
+export function loadTestCasesFromCsv(csvPath: string): TestCase[] { // Reads a CSV file and converts each row with a non-empty prompt into a TestCase; throws Error if the file is missing, empty, or has no usable prompts
+	const resolvedPath = path.isAbsolute(csvPath) ? csvPath : path.resolve(process.cwd(), csvPath); // Relative paths are resolved against the current working directory
+
+	if (!existsSync(resolvedPath)) {
+		throw new Error(`CSV file not found at ${resolvedPath}`);
+	}
+
+	const fileContents = readFileSync(resolvedPath, 'utf8'); // Synchronous read of the whole file as UTF-8 text
+	const rows = parseCsv(fileContents);
+
+	if (rows.length === 0) {
+		throw new Error('The provided CSV file is empty');
+	}
+
+	let header: ParsedCsvRow | undefined;
+	let dataRows = rows; // Without a detected header, every row is treated as data
+
+	if (isHeaderRow(rows[0])) { // The first row counts as a header only if one of its cells is 'prompt'
+		header = rows[0]!;
+		dataRows = rows.slice(1);
+	}
+
+	if (dataRows.length === 0) { // File contained a header row and nothing else
+		throw new Error('No prompt rows found in the provided CSV file');
+	}
+
+	const promptIndex = header ? (detectColumnIndex(header, 'prompt') ?? 0) : 0; // Headerless files use column 0; with a header the `?? 0` looks unreachable because isHeaderRow already matched a 'prompt' cell the same way
+	const idIndex = header ? detectColumnIndex(header, 'id') : undefined; // Optional 'id' column, only recognised when a header exists
+	const nameIndex = header
+		? (detectColumnIndex(header, 'name') ?? detectColumnIndex(header, 'title')) // Optional name column; 'title' is accepted when there is no 'name' column
+		: undefined;
+
+	const testCases = dataRows
+		.map<TestCase | undefined>((row, index) => { // `index` counts data rows only (header excluded) and is taken before filtering, so skipped rows still consume a number
+			const prompt = sanitizeValue(row[promptIndex]);
+			if (!prompt) {
+				return undefined; // Rows with a missing or blank prompt are dropped by the filter below
+			}
+
+			const idSource = sanitizeValue(idIndex !== undefined ? row[idIndex] : undefined);
+			const nameSource = sanitizeValue(nameIndex !== undefined ? row[nameIndex] : undefined);
+
+			return {
+				id: idSource || `csv-case-${index + 1}`, // Blank or absent id: generate a 1-based positional id
+				name: nameSource || generateNameFromPrompt(prompt, index), // Blank or absent name: derive one from the prompt text
+				prompt,
+			};
+		})
+		.filter((testCase): testCase is TestCase => testCase !== undefined); // Type guard narrows away the undefined entries for skipped rows
+
+	if (testCases.length === 0) { // Rows existed but none had a non-empty prompt
+		throw new Error('No valid prompts found in the provided CSV file');
+	}
+
+	return testCases;
+}
diff --git a/packages/@n8n/ai-workflow-builder.ee/package.json b/packages/@n8n/ai-workflow-builder.ee/package.json
@@ -21,6 +21,7 @@
     "deps:orphans": "madge src/index.ts --orphans",
     "deps:all": "pnpm run deps:graph && pnpm run deps:graph:service && pnpm run deps:graph:tools && pnpm run deps:circular && pnpm run deps:report",
     "eval": "tsx evaluations",
+    "eval:csv": "tsx evaluations --prompts-csv",
     "eval:langsmith": "USE_LANGSMITH_EVAL=true tsx evaluations",
     "eval:generate": "GENERATE_TEST_CASES=true tsx evaluations"
   },
@@ -46,6 +47,7 @@
     "@n8n/config": "workspace:*",
     "@n8n/di": "workspace:*",
     "@n8n_io/ai-assistant-sdk": "catalog:",
+    "csv-parse": "5.5.0",
     "langsmith": "^0.3.45",
     "lodash": "catalog:",
     "n8n-workflow": "workspace:*",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml