-
Notifications
You must be signed in to change notification settings - Fork 30.6k
Expand file tree
/
Copy pathrun-evals.js
More file actions
193 lines (176 loc) · 6.21 KB
/
run-evals.js
File metadata and controls
193 lines (176 loc) · 6.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env node
// @ts-check
/**
* Pack the locally-built `next` package and run agent evals against it.
*
* pnpm eval <eval-name> run one eval, both variants (baseline + AGENTS.md)
* pnpm eval <eval-name> --dry preview without executing
* pnpm eval --all run every eval (slow — normally only CI does this)
* NEXT_SKIP_PACK=1 pnpm eval ... reuse tarball from last run
*
* Mirrors run-tests.js: pack once, hand paths to child via env, forward args.
*
* We only pack `next`, not the whole workspace. The sandbox is remote Linux:
* - @next/swc: local darwin binary wouldn't run there; the sandbox downloads
* the right one at runtime (packages/next/src/build/swc/index.ts).
* - @next/env etc: resolved from npm at the pinned canary version.
*
* The experiments/ dir is generated fresh on every run and gitignored. This
* keeps the two variants (baseline vs. AGENTS.md) in one place instead of
* maintaining N committed experiment files that only differ by one line.
*/
const path = require('path')
const fs = require('fs')
const { execFileSync, spawnSync } = require('child_process')
// Directory containing this script; every other path below hangs off it.
const ROOT = __dirname
// Sandbox + agent API keys. agent-eval looks for .env.local in its own cwd
// (evals/), but `vc env pull` writes to the repo root. Load it here so the
// vars are already in process.env when we spawn the child.
try {
  process.loadEnvFile(path.join(ROOT, '.env.local'))
} catch (err) {
  // Loading is best-effort and must never abort the run. A missing file is
  // not worth mentioning (the variables may already be set in the
  // environment), but any other failure is surfaced so that absent API keys
  // are not a mystery later on.
  const loadError = /** @type {NodeJS.ErrnoException} */ (err)
  if (loadError.code !== 'ENOENT') {
    console.warn(`> Could not load .env.local: ${loadError.message}`)
  }
}
// Working directory for the agent-eval child process (see spawnSync in main).
const EVALS_DIR = path.join(ROOT, 'evals')
// One sub-directory per eval; listEvals() enumerates it and main() validates
// the requested eval name against it.
const FIXTURES_DIR = path.join(EVALS_DIR, 'evals')
// Wiped and regenerated by writeExperiments() on every run.
const EXPERIMENTS_DIR = path.join(EVALS_DIR, 'experiments')
// Destination directory handed to `pnpm pack`.
const TARBALL_DIR = path.join(EVALS_DIR, '.tarballs')
// Stable name the packed tarball is renamed to; passed to the child process
// as NEXT_EVAL_TARBALL.
const TARBALL = path.join(TARBALL_DIR, 'next.tgz')
// Every eval is run under both of these setups so the results can be compared.
// `suffix` names the generated experiment file; `imports` and `setup` are
// spliced verbatim into that file by writeExperiments().
// Baseline stays first on purpose: the output then reads as "does the agent
// fail without docs?" followed by "does it pass with docs?".
const VARIANTS = [
  {
    suffix: 'baseline',
    imports: "import { installNextJs } from '../lib/setup.js'",
    setup: 'await installNextJs(sandbox)',
  },
  {
    suffix: 'agents-md',
    imports: "import { installNextJs, writeAgentsMd } from '../lib/setup.js'",
    setup: ['await installNextJs(sandbox)', ' await writeAgentsMd(sandbox)'].join(
      '\n'
    ),
  },
]
/**
 * Pack `packages/next` with `pnpm pack` and move the result to TARBALL.
 *
 * The last line of pnpm's stdout is taken as the produced tarball; it may be
 * an absolute path or a file name relative to the pack destination.
 *
 * @throws {Error} if `pnpm pack` fails, prints no tarball path, or the
 *   produced file cannot be renamed.
 */
function pack() {
  fs.mkdirSync(TARBALL_DIR, { recursive: true })
  const stdout = execFileSync(
    'pnpm',
    ['pack', '--pack-destination', TARBALL_DIR],
    { cwd: path.join(ROOT, 'packages/next'), encoding: 'utf8' }
  )
  const produced = stdout.trim().split('\n').pop()
  // Without this guard an empty result would resolve to TARBALL_DIR itself
  // and fail further down with a confusing "rename directory" error.
  if (!produced) {
    throw new Error(
      '`pnpm pack` printed no tarball path; cannot locate the packed `next` tarball.'
    )
  }
  const src = path.isAbsolute(produced)
    ? produced
    : path.join(TARBALL_DIR, produced)
  // Give the tarball a version-independent name so later steps (and
  // NEXT_SKIP_PACK reuse) can find it.
  fs.renameSync(src, TARBALL)
}
/**
 * Regenerate the experiments directory with one config file per variant.
 *
 * The directory is removed and recreated on every call, so files from an
 * earlier run never survive. Each entry of VARIANTS becomes `<suffix>.ts`;
 * the files differ only in their import line and sandbox setup body.
 *
 * @param {string | null} evalName eval to restrict the run to; null means all evals
 */
function writeExperiments(evalName) {
  fs.rmSync(EXPERIMENTS_DIR, { recursive: true, force: true })
  fs.mkdirSync(EXPERIMENTS_DIR, { recursive: true })
  // JSON.stringify produces a correctly escaped string literal, so a fixture
  // name containing a quote or backslash cannot break the generated source.
  const evalsField = evalName
    ? `\n evals: ${JSON.stringify(evalName)},`
    : ''
  for (const v of VARIANTS) {
    const body = `import type { ExperimentConfig } from '@vercel/agent-eval'
${v.imports}
const config: ExperimentConfig = {
agent: 'claude-code',
model: 'claude-opus-4-6',${evalsField}
scripts: ['build'],
runs: 1,
earlyExit: true,
timeout: 720,
sandbox: 'auto',
setup: async (sandbox) => {
${v.setup}
},
}
export default config
`
    fs.writeFileSync(path.join(EXPERIMENTS_DIR, `${v.suffix}.ts`), body)
  }
}
/**
 * List the available evals: the name of every directory directly inside
 * FIXTURES_DIR. Plain files are ignored.
 *
 * @returns {string[]} directory names, in the order the filesystem reports them
 */
function listEvals() {
  /** @type {string[]} */
  const names = []
  for (const entry of fs.readdirSync(FIXTURES_DIR, { withFileTypes: true })) {
    if (entry.isDirectory()) {
      names.push(entry.name)
    }
  }
  return names
}
/**
 * Command-line entry point.
 *
 * Parses and validates the arguments, makes sure a `next` tarball is
 * available (packing one unless NEXT_SKIP_PACK is set and a tarball already
 * exists), regenerates the experiment configs, then runs agent-eval from the
 * evals directory and exits with its status.
 */
function main() {
  const args = require('yargs/yargs')(process.argv.slice(2))
    .command(
      '$0 [eval-name]',
      'Run an eval (baseline + agents-md variants)',
      (cmd) =>
        cmd.positional('eval-name', {
          type: 'string',
          describe: 'Fixture directory name',
        })
    )
    .boolean('all')
    .describe('all', 'Run every eval (slow — normally only CI does this)')
    .boolean('dry')
    .describe('dry', 'Preview without executing')
    .conflicts('all', 'eval-name')
    .check((parsed) => {
      const requested = parsed.evalName
      // Either --all or an eval name is required; list the choices if not.
      if (!parsed.all && !requested) {
        const available = listEvals()
          .map((n) => ` ${n}`)
          .join('\n')
        throw new Error(
          `Missing <eval-name>.\n\nAvailable evals:\n${available}`
        )
      }
      // A named eval must exist under the fixtures directory.
      if (requested) {
        const fixturePath = path.join(FIXTURES_DIR, requested)
        if (!fs.existsSync(fixturePath)) {
          throw new Error(
            `Unknown eval: ${requested}\n(looked in ${FIXTURES_DIR})`
          )
        }
      }
      return true
    })
    .strict()
    .help().argv

  /** @type {string | null} */
  let evalName = null
  if (!args.all) {
    evalName = /** @type {string} */ (args.evalName)
  }

  // `--dry` is the only flag passed through to agent-eval; `.strict()` above
  // means flags this script does not declare are not accepted.
  /** @type {string[]} */
  const forward = []
  if (args.dry) {
    forward.push('--dry')
  }

  const distDir = path.join(ROOT, 'packages/next/dist')
  if (!fs.existsSync(distDir)) {
    console.error(
      'packages/next/dist not found. Run `pnpm --filter=next build` first.'
    )
    process.exit(1)
  }

  // Reuse is opt-in and only possible when a tarball is actually present;
  // otherwise fall through to a fresh pack.
  const reuseTarball =
    Boolean(process.env.NEXT_SKIP_PACK) && fs.existsSync(TARBALL)
  if (!reuseTarball) {
    console.log('> Packing next...')
    pack()
    const bytes = fs.statSync(TARBALL).size
    const mb = (bytes / 1024 / 1024).toFixed(1)
    console.log(` ${TARBALL} (${mb} MB)`)
  } else {
    console.log('> Reusing existing tarball (NEXT_SKIP_PACK=1)')
  }

  writeExperiments(evalName)

  if (evalName) {
    console.log(`> Running ${evalName} (baseline + agents-md)`)
  } else {
    console.log('> Running all evals (baseline + agents-md)')
  }

  // Same handoff pattern as run-tests.js with NEXT_TEST_PKG_PATHS. The bin is
  // invoked directly instead of through `pnpm exec`: pnpm resets cwd to the
  // workspace root, while agent-eval resolves experiments/ from process.cwd().
  const bin = path.join(ROOT, 'node_modules/.bin/agent-eval')
  const childEnv = { ...process.env, NEXT_EVAL_TARBALL: TARBALL }
  const { error, status } = spawnSync(bin, ['run-all', '--force', ...forward], {
    cwd: EVALS_DIR,
    stdio: 'inherit',
    env: childEnv,
  })

  if (error) {
    // ENOENT (missing bin), EACCES, etc. — spawnSync reports these with
    // status: null and prints nothing itself, so report them here.
    console.error(`Failed to run ${bin}: ${error.message}`)
    if (/** @type {NodeJS.ErrnoException} */ (error).code === 'ENOENT') {
      console.error('Did you run `pnpm install`?')
    }
    process.exit(1)
  }

  // A null status (no exit code available) is treated as failure.
  process.exit(status ?? 1)
}
// Script entry point: run the CLI.
main()