# Prompt Evaluations #63 — workflow file for this run

name: Prompt Evaluations

# Triggers: automatically for pull requests that target main/master and touch
# the listed paths, or manually with a choice of which evaluation to run.
on:
  # Run on PRs to validate prompt quality
  pull_request:
    branches:
      - main
      - master
    # Only PRs that change one of these paths start a run.
    paths:
      - 'reflection-3.ts'
      - 'evals/**'
  # Manual trigger for full evaluation
  workflow_dispatch:
    inputs:
      eval_type:
        description: 'Which evaluation to run'
        required: true
        default: 'all'
        type: choice
        options:
          - all
          - judge
          - stuck
          - compression

# Token scopes for the run: read access to repository contents, and write
# access to pull requests (the job posts a results comment on the PR).
permissions:
  contents: read
  pull-requests: write
jobs:
  # Single job: installs dependencies, runs the selected evaluation npm
  # scripts, then publishes the JSON results as an artifact, a step summary
  # and (for pull requests) a PR comment.
  evaluate:
    name: Run Prompt Evaluations
    runs-on: ubuntu-latest
    env:
      # Shared by every step. NOTE(review): meanings are inferred from the
      # variable names (config dir inside the workspace, SQLite WAL mode off,
      # telemetry off) — confirm against the promptfoo documentation.
      PROMPTFOO_CONFIG_DIR: ${{ github.workspace }}/.promptfoo
      PROMPTFOO_DISABLE_WAL_MODE: "1"
      PROMPTFOO_DISABLE_TELEMETRY: "1"
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: '20'
          cache: 'npm'

      # Install without running package lifecycle scripts; the one native
      # dependency that needs a build is rebuilt explicitly in the next step.
      - name: Install dependencies
        run: npm ci --ignore-scripts

      - name: Build native deps (better-sqlite3)
        run: npm rebuild better-sqlite3

      # The evaluation steps write their JSON output here.
      - name: Create results directory
        run: mkdir -p evals/results

      # Runs for every pull_request event, and for manual runs whose
      # eval_type is 'all' or 'judge'.
      - name: Run Judge Evaluation
        if: ${{ github.event.inputs.eval_type == 'all' || github.event.inputs.eval_type == 'judge' || github.event_name == 'pull_request' }}
        env:
          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
          # Must be the base host, e.g. https://vibebrowser-dev.openai.azure.com
          # Both variable names below are filled from the same secret.
          AZURE_OPENAI_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
          AZURE_OPENAI_API_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
        run: npm run eval:judge -- --no-progress-bar -o evals/results/judge-results.json

      # Manual runs only: on pull_request events the eval_type input is empty,
      # so this condition is false.
      - name: Run Stuck Detection Evaluation
        if: ${{ github.event.inputs.eval_type == 'all' || github.event.inputs.eval_type == 'stuck' }}
        env:
          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
          AZURE_OPENAI_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
          AZURE_OPENAI_API_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
        run: npm run eval:stuck -- --no-progress-bar -o evals/results/stuck-results.json

      # Manual runs only (eval_type 'all' or 'compression').
      - name: Run Post-Compression Evaluation
        if: ${{ github.event.inputs.eval_type == 'all' || github.event.inputs.eval_type == 'compression' }}
        env:
          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
          AZURE_OPENAI_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
          AZURE_OPENAI_API_BASE_URL: ${{ secrets.AZURE_OPENAI_BASE_URL }}
        run: npm run eval:compression -- --no-progress-bar -o evals/results/compression-results.json

      # always(): keep whatever results exist even when an evaluation failed.
      - name: Upload Evaluation Results
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: eval-results
          path: evals/results/*.json
          retention-days: 30

      # Appends a per-file pass/fail breakdown to the run's step summary.
      - name: Generate Summary
        if: ${{ always() }}
        run: |
          echo "## Prompt Evaluation Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          for file in evals/results/*.json; do
            # If the glob matches nothing it stays literal; -f filters it out.
            if [ -f "$file" ]; then
              name=$(basename "$file" .json)
              echo "### $name" >> "$GITHUB_STEP_SUMMARY"
              # Extract pass/fail counts using node. The path is handed over
              # through the environment instead of being spliced into the JS
              # source, so unusual characters in a file name cannot break it.
              RESULT_FILE="$file" node -e '
                const fs = require("fs");
                const data = JSON.parse(fs.readFileSync(process.env.RESULT_FILE, "utf-8"));
                // Accept a top-level results array as well as the nested
                // { results: { results: [...] } } layout.
                const raw = data.results;
                const results = Array.isArray(raw)
                  ? raw
                  : Array.isArray(raw?.results) ? raw.results : [];
                const passed = results.filter(r => r.success).length;
                const failed = results.filter(r => !r.success).length;
                const total = results.length;
                const passRate = total > 0 ? ((passed / total) * 100).toFixed(1) : 0;
                console.log("- Total tests: " + total);
                console.log("- Passed: " + passed);
                console.log("- Failed: " + failed);
                console.log("- Pass rate: " + passRate + "%");
              ' >> "$GITHUB_STEP_SUMMARY" 2>/dev/null \
                || echo "- Could not parse results" >> "$GITHUB_STEP_SUMMARY"
              echo "" >> "$GITHUB_STEP_SUMMARY"
            fi
          done

      # Posts the same breakdown as a PR comment. always() makes the comment
      # appear even when an evaluation step failed, matching the upload and
      # summary steps; continue-on-error keeps a failed comment from failing
      # the job.
      - name: Comment on PR
        if: ${{ always() && github.event_name == 'pull_request' }}
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');
            const resultsDir = 'evals/results';
            if (!fs.existsSync(resultsDir)) {
              console.log('No results directory found, skipping comment');
              return;
            }
            const files = fs.readdirSync(resultsDir).filter(f => f.endsWith('.json'));
            if (files.length === 0) {
              console.log('No result files found, skipping comment');
              return;
            }
            let summary = '## Prompt Evaluation Results\n\n';
            for (const file of files) {
              try {
                const data = JSON.parse(fs.readFileSync(path.join(resultsDir, file), 'utf-8'));
                // Accept a top-level results array as well as the nested
                // { results: { results: [...] } } layout.
                const raw = data.results;
                const results = Array.isArray(raw)
                  ? raw
                  : Array.isArray(raw?.results) ? raw.results : [];
                const passed = results.filter(r => r.success).length;
                const total = results.length;
                // Text shown in the comment; the icon thresholds below use
                // its numeric value rather than relying on string coercion.
                const passRate = total > 0 ? ((passed / total) * 100).toFixed(1) : 0;
                const rate = Number(passRate);
                const icon = rate >= 80 ? '✅' : rate >= 50 ? '⚠️' : '❌';
                summary += `### ${icon} ${file.replace('.json', '')}\n`;
                summary += `- Pass rate: **${passRate}%** (${passed}/${total})\n\n`;
              } catch (e) {
                // One unreadable file must not prevent reporting the others.
                summary += `### ❓ ${file}\n- Could not parse results\n\n`;
              }
            }
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: summary
            });