From 1110f07a8e0579a940c8dd27148212fbeb63dc7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Tue, 16 Jun 2026 14:07:54 +0200 Subject: [PATCH 1/5] feat: schema-validated router labels via standard-schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce per-label typing of `request.userData` for the router, in two layers: - A `label -> userData` map can be passed as the router's `Routes` type argument, typing `request.userData` per label and rejecting unknown labels at compile time. Backwards compatible (default is an open map). - A per-label Standard Schema map (Zod, Valibot, ArkType, …) passed to `Router.create`/`createXRouter` both infers the `userData` types and validates them at runtime before the handler runs, replacing `request.userData` with the parsed value. Invalid requests throw a new non-retryable `RequestValidationError`. Adds the types-only `@standard-schema/spec` dependency to `@crawlee/core`. Relates to #3082 --- .../src/internals/basic-crawler.ts | 12 +- .../src/internals/cheerio-crawler.ts | 13 +- packages/core/package.json | 1 + packages/core/src/errors.ts | 26 +++ packages/core/src/router.ts | 208 ++++++++++++++++-- .../src/internals/file-download.ts | 21 +- .../src/internals/http-crawler.ts | 13 +- .../src/internals/jsdom-crawler.ts | 13 +- .../src/internals/linkedom-crawler.ts | 13 +- .../internals/adaptive-playwright-crawler.ts | 13 +- .../src/internals/playwright-crawler.ts | 13 +- .../src/internals/puppeteer-crawler.ts | 13 +- .../src/internals/stagehand-crawler.ts | 13 +- pnpm-lock.yaml | 3 + test/core/router.test.ts | 72 +++++- 15 files changed, 407 insertions(+), 40 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index eeb4d4adfea4..9b5f10013c14 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -21,6 +21,8 @@ import type { RequestTransform, RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, Source, StatisticsOptions, @@ -2348,6 +2350,12 @@ export interface CrawlerRunOptions extends CrawlerAddRequestsOptions { export function createBasicRouter< Context extends BasicCrawlingContext = BasicCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createBasicRouter< + Context extends BasicCrawlingContext = BasicCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createBasicRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index bb3b0da76ebf..8884963edd5b 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -8,7 +8,10 @@ import type { InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -364,6 +367,12 @@ export async function cheerioCrawlerEnqueueLinks( export function createCheerioRouter< Context extends CheerioCrawlingContext = CheerioCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createCheerioRouter< + Context extends CheerioCrawlingContext = CheerioCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createCheerioRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/core/package.json b/packages/core/package.json index 0bef02294a6a..07bd847a272e 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -57,6 +57,7 @@ "@crawlee/types": "workspace:*", "@crawlee/utils": "workspace:*", "@sapphire/async-queue": "^1.5.5", + "@standard-schema/spec": "^1.0.0", "@vladfrangu/async_event_emitter": "^2.4.6", "csv-stringify": "^6.5.2", "json5": "^2.2.3", diff --git a/packages/core/src/errors.ts b/packages/core/src/errors.ts index 3d4301b305db..e9f27c372451 100644 --- a/packages/core/src/errors.ts +++ b/packages/core/src/errors.ts @@ -16,6 +16,32 @@ export class CriticalError extends NonRetryableError {} */ export class MissingRouteError extends CriticalError {} +/** + * Thrown when a request's `userData` does not match the {@apilink RouteMap} schema registered for its label. + * + * As the `userData` does not change between attempts, this error is non-retryable. + */ +export class RequestValidationError extends NonRetryableError { + constructor( + readonly label: string | symbol, + readonly issues: readonly { + readonly message: string; + readonly path?: readonly (PropertyKey | { key: PropertyKey })[]; + }[], + ) { + const details = issues + .map((issue) => { + const path = (issue.path ?? []) + .map((segment) => (typeof segment === 'object' ? segment.key : segment)) + .join('.'); + return `- ${path ? `${path}: ` : ''}${issue.message}`; + }) + .join('\n'); + + super(`Request userData for label '${String(label)}' failed schema validation:\n${details}`); + } +} + /** * Errors of `RetryRequestError` type will always be retried by the crawler. * diff --git a/packages/core/src/router.ts b/packages/core/src/router.ts index 4d0897389746..db863b109d55 100644 --- a/packages/core/src/router.ts +++ b/packages/core/src/router.ts @@ -1,22 +1,73 @@ import type { Dictionary } from '@crawlee/types'; +import type { StandardSchemaV1 } from '@standard-schema/spec'; import type { CrawlingContext, LoadedRequest, RestrictedCrawlingContext } from './crawlers/crawler_commons.js'; -import { MissingRouteError } from './errors.js'; +import { MissingRouteError, RequestValidationError } from './errors.js'; import type { Request } from './request.js'; import type { Awaitable } from './typedefs.js'; const defaultRoute = Symbol('default-route'); +/** + * A map of request labels to the shape of `request.userData` expected for that label. Pass it as the + * `Routes` type argument of {@apilink Router} (or a `createXRouter` factory) to get per-label typing of + * `request.userData` and autocomplete/validation of labels in {@apilink Router.addHandler}. + * + * ```ts + * interface MyRoutes { + * PRODUCT: { sku: string; price: number }; + * CATEGORY: { categoryId: string }; + * } + * ``` + */ +export type RouteMap = Record; + +/** + * A map of request labels to a [Standard Schema](https://standardschema.dev) (Zod, Valibot, ArkType, …) + * validating that label's `request.userData`. Pass it to {@apilink Router.create} or a `createXRouter` + * factory to derive the per-label `request.userData` types *and* validate them at runtime before the + * matching handler runs. + */ +export type RouteSchemas = Record; + +/** + * Derives a {@apilink RouteMap} (label → `userData` type) from a {@apilink RouteSchemas} map by inferring + * each schema's output type. Outputs that are not object-shaped fall back to a plain {@apilink Dictionary}. + */ +export type RoutesFromSchemas = { + [Label in keyof Schemas]: StandardSchemaV1.InferOutput extends Dictionary + ? StandardSchemaV1.InferOutput + : Dictionary; +}; + +/** + * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`. + */ +export type RouterHandlerContext = Omit & { + request: LoadedRequest>; +}; + +/** + * The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete + * {@apilink RouteMap} (e.g. `{ PRODUCT: ...; CATEGORY: ... }`), only those labels (plus symbols) are + * allowed — unknown labels become a compile-time error. When the map is left open (the default + * `Record`), any string or symbol label is accepted, preserving the original behaviour. + */ +export type RouterLabel> = string extends keyof Routes + ? string | symbol + : (keyof Routes & string) | symbol; + export interface RouterHandler< Context extends Omit = CrawlingContext, -> extends Router { + Routes extends Record = Record>, +> extends Router { (ctx: Context): Awaitable; } export type GetUserDataFromRequest = T extends Request ? Y : never; -export type RouterRoutes = { - [label in string | symbol]: (ctx: Omit & { request: Request }) => Awaitable; +export type RouterRoutes> = { + [Label in keyof Routes]: (ctx: Omit & { request: Request }) => Awaitable; }; /** @@ -83,9 +134,57 @@ export type RouterRoutes = { * ctx.log.info('...'); * }); * ``` + * + * ## Typed labels + * + * To get `request.userData` typed per label, declare a {@apilink RouteMap} and pass it as the second + * type argument. The label passed to {@apilink Router.addHandler} then drives the type of + * `request.userData`, and unknown labels are rejected at compile time: + * + * ```ts + * import { createCheerioRouter, CheerioCrawlingContext } from 'crawlee'; + * + * interface Routes { + * PRODUCT: { sku: string; price: number }; + * CATEGORY: { categoryId: string }; + * } + * + * const router = createCheerioRouter(); + * + * router.addHandler('PRODUCT', async ({ request }) => { + * request.userData.sku; // string + * request.userData.price; // number + * }); + * + * router.addHandler('TYPO', async () => {}); // compile error: not a known label + * ``` + * + * ## Schema-validated labels + * + * Passing a [Standard Schema](https://standardschema.dev) per label both infers the `request.userData` + * types *and* validates them at runtime before the handler runs (replacing `request.userData` with the + * parsed value). A failing request throws a {@apilink RequestValidationError}. + * + * ```ts + * import { z } from 'zod'; + * import { createCheerioRouter } from 'crawlee'; + * + * const router = createCheerioRouter({ + * PRODUCT: z.object({ sku: z.string(), price: z.number() }), + * CATEGORY: z.object({ categoryId: z.string() }), + * }); + * + * router.addHandler('PRODUCT', async ({ request }) => { + * request.userData.price; // number, inferred from the schema and validated at runtime + * }); + * ``` */ -export class Router> { +export class Router< + Context extends Omit, + Routes extends Record = Record>, +> { private readonly routes: Map Awaitable> = new Map(); + private readonly schemas: Map = new Map(); private readonly middlewares: ((ctx: Context) => Awaitable)[] = []; /** @@ -95,26 +194,52 @@ export class Router( + label: Label, + handler: (ctx: RouterHandlerContext) => Awaitable, + ): void; + + /** + * Registers new route handler for given label, with an explicit `request.userData` type. Use this + * overload to type a handler whose label is not part of the router's {@apilink RouteMap}. */ addHandler>( - label: string | symbol, - handler: (ctx: Omit & { request: LoadedRequest> }) => Awaitable, - ) { + label: RouterLabel, + handler: (ctx: RouterHandlerContext) => Awaitable, + ): void; + + addHandler(label: string | symbol, handler: (ctx: any) => Awaitable): void { this.validate(label); this.routes.set(label, handler); } /** - * Registers default route handler. + * Registers default route handler. By default `request.userData` is typed as the union of all + * `userData` shapes declared in the router's {@apilink RouteMap}. */ - addDefaultHandler>( - handler: (ctx: Omit & { request: LoadedRequest> }) => Awaitable, + addDefaultHandler( + handler: (ctx: RouterHandlerContext) => Awaitable, ) { this.validate(defaultRoute); this.routes.set(defaultRoute, handler); } + /** + * Registers {@apilink RouteSchemas|Standard Schema} validators for the given labels. Before a matching + * route handler runs, `request.userData` is validated against the label's schema and replaced with the + * parsed value; a failing request throws a {@apilink RequestValidationError}. + */ + addSchemas(schemas: Partial>) { + for (const [label, schema] of Object.entries(schemas)) { + if (schema) { + this.schemas.set(label, schema as StandardSchemaV1); + } + } + } + /** * Registers a middleware that will be fired before the matching route handler. * Multiple middlewares can be registered, they will be fired in the same order. @@ -142,6 +267,27 @@ export class Router = CrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, - >(routes?: RouterRoutes): RouterHandler { - const router = new Router(); + Routes extends Record = Record, + >(routes?: RouterRoutes): RouterHandler; + + static create< + Context extends Omit = CrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, + >(schemas: Schemas): RouterHandler>; + + static create = CrawlingContext>( + routesOrSchemas?: Record Awaitable) | StandardSchemaV1>, + ): RouterHandler { + const router = new Router(); const obj = Object.create(Function.prototype); obj.addHandler = router.addHandler.bind(router); obj.addDefaultHandler = router.addDefaultHandler.bind(router); + obj.addSchemas = router.addSchemas.bind(router); obj.getHandler = router.getHandler.bind(router); obj.use = router.use.bind(router); - for (const [label, handler] of Object.entries(routes ?? {})) { - router.addHandler(label, handler); + for (const [label, value] of Object.entries(routesOrSchemas ?? {})) { + if (typeof value === 'function') { + router.addHandler(label as keyof Context & string, value as (ctx: any) => Awaitable); + } else { + router.schemas.set(label, value); + } } const func = async function (context: Context) { const { url, loadedUrl, label } = context.request; context.log.debug('Page opened.', { label, url: loadedUrl ?? url }); + await router.validateRequest(context); + for (const middleware of router.middlewares) { await middleware(context); } @@ -204,6 +378,6 @@ export class Router; + return func as unknown as RouterHandler; } } diff --git a/packages/http-crawler/src/internals/file-download.ts b/packages/http-crawler/src/internals/file-download.ts index 81dd217d0618..2f7b485fd2a7 100644 --- a/packages/http-crawler/src/internals/file-download.ts +++ b/packages/http-crawler/src/internals/file-download.ts @@ -6,7 +6,16 @@ import type { CrawlingContext, LoadedRequest, Request } from '@crawlee/core'; import { ResponseWithUrl } from '@crawlee/http-client'; import type { Dictionary } from '@crawlee/types'; -import type { ErrorHandler, GetUserDataFromRequest, InternalHttpHook, RequestHandler, RouterRoutes } from '../index.js'; +import type { + ErrorHandler, + GetUserDataFromRequest, + InternalHttpHook, + RequestHandler, + RouterHandler, + RouterRoutes, + RouteSchemas, + RoutesFromSchemas, +} from '../index.js'; import { Router } from '../index.js'; import { parseContentTypeFromResponse } from './utils.js'; @@ -254,6 +263,12 @@ function trackBodyConsumption(response: Response): { response: ResponseWithUrl; export function createFileRouter< Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createFileRouter< + Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createFileRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 288a83fc1855..2d4d3f535221 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -11,7 +11,10 @@ import type { Request as CrawleeRequest, RequestHandler, RequireContextPipeline, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/basic'; import { BasicCrawler, @@ -841,6 +844,12 @@ interface RequestFunctionOptions { export function createHttpRouter< Context extends HttpCrawlingContext = HttpCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createHttpRouter< + Context extends HttpCrawlingContext = HttpCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createHttpRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts index fc8debd5c9e7..f46f97e5a1e0 100644 --- a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts +++ b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts @@ -8,7 +8,10 @@ import type { InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -495,6 +498,12 @@ function extractUrlsFromWindow(window: DOMWindow, selector: string, baseUrl: str export function createJSDOMRouter< Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createJSDOMRouter< + Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createJSDOMRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts index 1d4065949d8c..0afda51dfa43 100644 --- a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts +++ b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts @@ -8,7 +8,10 @@ import type { InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -385,6 +388,12 @@ function extractUrlsFromWindow(window: Window, selector: string, baseUrl: string export function createLinkeDOMRouter< Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createLinkeDOMRouter< + Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createLinkeDOMRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 2c359e51b752..3de967842d26 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -13,7 +13,10 @@ import type { GetUserDataFromRequest, RequestQueue, RestrictedCrawlingContext, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, StatisticPersistedState, StatisticsOptions, StatisticState, @@ -782,6 +785,12 @@ export class AdaptivePlaywrightCrawler< export function createAdaptivePlaywrightRouter< Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createAdaptivePlaywrightRouter< + Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createAdaptivePlaywrightRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 42431bbbc446..8836889db323 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -4,7 +4,10 @@ import type { BrowserHook, GetUserDataFromRequest, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, RequestState, Router, serviceLocator } from '@crawlee/browser'; import type { BrowserPoolOptions, PlaywrightPlugin } from '@crawlee/browser-pool'; @@ -347,6 +350,12 @@ export function handleCloudflareChallengeHook(options?: HandleCloudflareChalleng export function createPlaywrightRouter< Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createPlaywrightRouter< + Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createPlaywrightRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index cd6680bbab00..81f4931ab147 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -3,7 +3,10 @@ import type { BrowserCrawlingContext, BrowserHook, GetUserDataFromRequest, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, RequestState, Router } from '@crawlee/browser'; import type { BrowserPoolOptions, PuppeteerPlugin } from '@crawlee/browser-pool'; @@ -310,6 +313,12 @@ export class PuppeteerCrawler< export function createPuppeteerRouter< Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createPuppeteerRouter< + Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createPuppeteerRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts index 75fe0ca79dce..e5bdfdf4b0cc 100644 --- a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts +++ b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts @@ -20,7 +20,10 @@ import type { GetUserDataFromRequest, LoadedContext, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, Router } from '@crawlee/browser'; import type { BrowserPoolOptions } from '@crawlee/browser-pool'; @@ -508,6 +511,12 @@ export class StagehandCrawler< export function createStagehandRouter< Context extends StagehandCrawlingContext = StagehandCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); + Routes extends Record = Record, +>(routes?: RouterRoutes): RouterHandler; +export function createStagehandRouter< + Context extends StagehandCrawlingContext = StagehandCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createStagehandRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2c9cf535b126..7c709dd0ce8f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -439,6 +439,9 @@ importers: '@sapphire/async-queue': specifier: ^1.5.5 version: 1.5.5 + '@standard-schema/spec': + specifier: ^1.0.0 + version: 1.1.0 '@vladfrangu/async_event_emitter': specifier: ^2.4.6 version: 2.4.7 diff --git a/test/core/router.test.ts b/test/core/router.test.ts index 49aa0036a505..5f55af34326b 100644 --- a/test/core/router.test.ts +++ b/test/core/router.test.ts @@ -1,7 +1,8 @@ import { BasicCrawler } from '@crawlee/basic'; import type { CrawlingContext } from '@crawlee/core'; -import { MissingRouteError, Router } from '@crawlee/core'; -import { createPlaywrightRouter, type PlaywrightCrawlingContext } from 'crawlee'; +import { MissingRouteError, RequestValidationError, Router } from '@crawlee/core'; +import { createCheerioRouter, createPlaywrightRouter, type PlaywrightCrawlingContext } from 'crawlee'; +import { z } from 'zod'; describe('Router', () => { test('should be callable and route based on the label', async () => { @@ -173,4 +174,71 @@ describe('Router', () => { testType<'bar'>(ctx.request.userData.foo); }); }); + + test('addHandler infers userData from a declared route map', async () => { + const testType = (t: T): void => {}; + + interface Routes { + PRODUCT: { sku: string; price: number }; + CATEGORY: { categoryId: string }; + } + + const router: Router = { + addHandler: () => {}, + addDefaultHandler: () => {}, + } as any; + + router.addHandler('PRODUCT', (ctx) => { + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + }); + + router.addHandler('CATEGORY', (ctx) => { + testType(ctx.request.userData.categoryId); + }); + + // @ts-expect-error unknown labels are rejected when a route map is declared + router.addHandler('UNKNOWN', () => {}); + + router.addDefaultHandler((ctx) => { + testType<{ sku: string; price: number } | { categoryId: string }>(ctx.request.userData); + }); + }); + + test('schema map infers userData types and validates at runtime', async () => { + const testType = (t: T): void => {}; + + const logs: string[] = []; + const router = createCheerioRouter({ + PRODUCT: z.object({ sku: z.string(), price: z.coerce.number() }), + CATEGORY: z.object({ categoryId: z.string() }), + }); + + router.addHandler('PRODUCT', async (ctx) => { + // inferred from the schema (note: price is coerced to a number) + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + logs.push(`product ${ctx.request.userData.sku} @ ${ctx.request.userData.price}`); + }); + + const log = { info: vitest.fn(), warn: vitest.fn(), debug: vitest.fn() }; + + // valid userData passes and is replaced with the parsed (coerced) value + const validRequest = { + loadedUrl: 'https://example.com/p', + label: 'PRODUCT', + userData: { sku: 'A1', price: '42' }, + }; + await router({ request: validRequest, log } as any); + expect(logs).toEqual(['product A1 @ 42']); + expect(validRequest.userData.price).toBe(42); + + // invalid userData throws a RequestValidationError before the handler runs + await expect( + router({ + request: { loadedUrl: 'https://example.com/p', label: 'PRODUCT', userData: { sku: 123 } }, + log, + } as any), + ).rejects.toThrow(RequestValidationError); + }); }); From 4b70a481fee755f86dc0028e09c9d63b094ee95b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 18 Jun 2026 19:06:01 +0200 Subject: [PATCH 2/5] fix(core): align typed router with v3 PR review polish - default handler keeps `request.userData` loosely typed (it is a fallback for any request, including labels not in the route map) - split factory/`Router.create` into explicit overloads (route map vs legacy flat userData) for backwards compatibility, keeping the schema overload - drop the exported `RouteMap` alias (referenced as prose in docs instead) --- .../src/internals/basic-crawler.ts | 7 +- .../src/internals/cheerio-crawler.ts | 7 +- packages/core/src/errors.ts | 2 +- packages/core/src/router.ts | 81 +++++++++---------- .../src/internals/file-download.ts | 7 +- .../src/internals/http-crawler.ts | 7 +- .../src/internals/jsdom-crawler.ts | 7 +- .../src/internals/linkedom-crawler.ts | 7 +- .../internals/adaptive-playwright-crawler.ts | 7 +- .../src/internals/playwright-crawler.ts | 7 +- .../src/internals/puppeteer-crawler.ts | 7 +- .../src/internals/stagehand-crawler.ts | 7 +- test/core/router.test.ts | 50 +++++++++++- 13 files changed, 138 insertions(+), 65 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 9b5f10013c14..c53dd13bae96 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -2349,9 +2349,12 @@ export interface CrawlerRunOptions extends CrawlerAddRequestsOptions { */ export function createBasicRouter< Context extends BasicCrawlingContext = BasicCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createBasicRouter< + Context extends BasicCrawlingContext = BasicCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createBasicRouter< Context extends BasicCrawlingContext = BasicCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index 8884963edd5b..010f14fcfb6f 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -366,9 +366,12 @@ export async function cheerioCrawlerEnqueueLinks( */ export function createCheerioRouter< Context extends CheerioCrawlingContext = CheerioCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createCheerioRouter< + Context extends CheerioCrawlingContext = CheerioCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createCheerioRouter< Context extends CheerioCrawlingContext = CheerioCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/core/src/errors.ts b/packages/core/src/errors.ts index e9f27c372451..92a30dbe68c9 100644 --- a/packages/core/src/errors.ts +++ b/packages/core/src/errors.ts @@ -17,7 +17,7 @@ export class CriticalError extends NonRetryableError {} export class MissingRouteError extends CriticalError {} /** - * Thrown when a request's `userData` does not match the {@apilink RouteMap} schema registered for its label. + * Thrown when a request's `userData` does not match the {@apilink RouteSchemas|Standard Schema} registered for its label. * * As the `userData` does not change between attempts, this error is non-retryable. */ diff --git a/packages/core/src/router.ts b/packages/core/src/router.ts index db863b109d55..97ada02abcb7 100644 --- a/packages/core/src/router.ts +++ b/packages/core/src/router.ts @@ -9,18 +9,21 @@ import type { Awaitable } from './typedefs.js'; const defaultRoute = Symbol('default-route'); /** - * A map of request labels to the shape of `request.userData` expected for that label. Pass it as the - * `Routes` type argument of {@apilink Router} (or a `createXRouter` factory) to get per-label typing of - * `request.userData` and autocomplete/validation of labels in {@apilink Router.addHandler}. - * - * ```ts - * interface MyRoutes { - * PRODUCT: { sku: string; price: number }; - * CATEGORY: { categoryId: string }; - * } - * ``` + * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`. + */ +export type RouterHandlerContext = Omit & { + request: LoadedRequest>; +}; + +/** + * The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete + * route map (e.g. `{ PRODUCT: ...; CATEGORY: ... }`), only those labels (plus symbols) are + * allowed — unknown labels become a compile-time error. When the map is left open (the default + * `Record`), any string or symbol label is accepted, preserving the original behaviour. */ -export type RouteMap = Record; +export type RouterLabel> = string extends keyof Routes + ? string | symbol + : (keyof Routes & string) | symbol; /** * A map of request labels to a [Standard Schema](https://standardschema.dev) (Zod, Valibot, ArkType, …) @@ -31,8 +34,8 @@ export type RouteMap = Record; export type RouteSchemas = Record; /** - * Derives a {@apilink RouteMap} (label → `userData` type) from a {@apilink RouteSchemas} map by inferring - * each schema's output type. Outputs that are not object-shaped fall back to a plain {@apilink Dictionary}. + * Derives a route map (label → `userData` type) from a {@apilink RouteSchemas} map by inferring each + * schema's output type. Outputs that are not object-shaped fall back to a plain {@apilink Dictionary}. */ export type RoutesFromSchemas = { [Label in keyof Schemas]: StandardSchemaV1.InferOutput extends Dictionary @@ -40,23 +43,6 @@ export type RoutesFromSchemas = { : Dictionary; }; -/** - * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`. - */ -export type RouterHandlerContext = Omit & { - request: LoadedRequest>; -}; - -/** - * The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete - * {@apilink RouteMap} (e.g. `{ PRODUCT: ...; CATEGORY: ... }`), only those labels (plus symbols) are - * allowed — unknown labels become a compile-time error. When the map is left open (the default - * `Record`), any string or symbol label is accepted, preserving the original behaviour. - */ -export type RouterLabel> = string extends keyof Routes - ? string | symbol - : (keyof Routes & string) | symbol; - export interface RouterHandler< Context extends Omit = CrawlingContext, Routes extends Record = Record>, @@ -137,9 +123,9 @@ export type RouterRoutes( @@ -203,8 +189,9 @@ export class Router< ): void; /** - * Registers new route handler for given label, with an explicit `request.userData` type. Use this - * overload to type a handler whose label is not part of the router's {@apilink RouteMap}. + * Registers new route handler for given label, explicitly typing `request.userData` via the + * `UserData` type argument. Useful when the router has no declared route map (the open default) + * and you want to type a single handler, or to register a handler under a `symbol` label. */ addHandler>( label: RouterLabel, @@ -217,10 +204,11 @@ export class Router< } /** - * Registers default route handler. By default `request.userData` is typed as the union of all - * `userData` shapes declared in the router's {@apilink RouteMap}. + * Registers default route handler. As a fallback it can receive any request (including labels not + * declared in the route map), so `request.userData` defaults to the context's `userData` type + * (loosely typed by default). Pass an explicit `UserData` type argument to narrow it. */ - addDefaultHandler( + addDefaultHandler>( handler: (ctx: RouterHandlerContext) => Awaitable, ) { this.validate(defaultRoute); @@ -332,12 +320,21 @@ export class Router< * }); * ``` */ + // The handler overloads keep the second type argument backwards compatible. When it is a route map + // (every value is a `Dictionary`) the first overload applies and labels are typed per route. Otherwise + // it fails the `Record` constraint and falls through to the second overload, + // where it is treated as the legacy flat `userData` shape shared by all handlers. The third overload + // accepts a Standard Schema per label, inferring the route map and validating `userData` at runtime. static create< Context extends Omit = CrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; + static create< + Context extends Omit = CrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, + >(routes?: RouterRoutes>): RouterHandler>; + static create< Context extends Omit = CrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, @@ -357,7 +354,7 @@ export class Router< for (const [label, value] of Object.entries(routesOrSchemas ?? {})) { if (typeof value === 'function') { - router.addHandler(label as keyof Context & string, value as (ctx: any) => Awaitable); + router.addHandler(label, value as (ctx: any) => Awaitable); } else { router.schemas.set(label, value); } diff --git a/packages/http-crawler/src/internals/file-download.ts b/packages/http-crawler/src/internals/file-download.ts index 2f7b485fd2a7..08f6b4ae9f11 100644 --- a/packages/http-crawler/src/internals/file-download.ts +++ b/packages/http-crawler/src/internals/file-download.ts @@ -262,9 +262,12 @@ function trackBodyConsumption(response: Response): { response: ResponseWithUrl; */ export function createFileRouter< Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createFileRouter< + Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createFileRouter< Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 2d4d3f535221..bea23a631429 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -843,9 +843,12 @@ interface RequestFunctionOptions { */ export function createHttpRouter< Context extends HttpCrawlingContext = HttpCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createHttpRouter< + Context extends HttpCrawlingContext = HttpCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createHttpRouter< Context extends HttpCrawlingContext = HttpCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts index f46f97e5a1e0..14795b20ceb3 100644 --- a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts +++ b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts @@ -497,9 +497,12 @@ function extractUrlsFromWindow(window: DOMWindow, selector: string, baseUrl: str */ export function createJSDOMRouter< Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createJSDOMRouter< + Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createJSDOMRouter< Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts index 0afda51dfa43..a9fe842f82ea 100644 --- a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts +++ b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts @@ -387,9 +387,12 @@ function extractUrlsFromWindow(window: Window, selector: string, baseUrl: string */ export function createLinkeDOMRouter< Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createLinkeDOMRouter< + Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createLinkeDOMRouter< Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 3de967842d26..045961cfcf5d 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -784,9 +784,12 @@ export class AdaptivePlaywrightCrawler< export function createAdaptivePlaywrightRouter< Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createAdaptivePlaywrightRouter< + Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createAdaptivePlaywrightRouter< Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 8836889db323..182f5b730133 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -349,9 +349,12 @@ export function handleCloudflareChallengeHook(options?: HandleCloudflareChalleng */ export function createPlaywrightRouter< Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createPlaywrightRouter< + Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createPlaywrightRouter< Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index 81f4931ab147..4cc6ee25cf93 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -312,9 +312,12 @@ export class PuppeteerCrawler< */ export function createPuppeteerRouter< Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createPuppeteerRouter< + Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createPuppeteerRouter< Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts index e5bdfdf4b0cc..8b3907d2d51d 100644 --- a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts +++ b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts @@ -510,9 +510,12 @@ export class StagehandCrawler< */ export function createStagehandRouter< Context extends StagehandCrawlingContext = StagehandCrawlingContext, - UserData extends Dictionary = GetUserDataFromRequest, - Routes extends Record = Record, + Routes extends Record = Record>, >(routes?: RouterRoutes): RouterHandler; +export function createStagehandRouter< + Context extends StagehandCrawlingContext = StagehandCrawlingContext, + UserData extends Dictionary = GetUserDataFromRequest, +>(routes?: RouterRoutes>): RouterHandler>; export function createStagehandRouter< Context extends StagehandCrawlingContext = StagehandCrawlingContext, const Schemas extends RouteSchemas = RouteSchemas, diff --git a/test/core/router.test.ts b/test/core/router.test.ts index 5f55af34326b..9649115696a5 100644 --- a/test/core/router.test.ts +++ b/test/core/router.test.ts @@ -1,7 +1,12 @@ import { BasicCrawler } from '@crawlee/basic'; import type { CrawlingContext } from '@crawlee/core'; import { MissingRouteError, RequestValidationError, Router } from '@crawlee/core'; -import { createCheerioRouter, createPlaywrightRouter, type PlaywrightCrawlingContext } from 'crawlee'; +import { + type CheerioCrawlingContext, + createCheerioRouter, + createPlaywrightRouter, + type PlaywrightCrawlingContext, +} from 'crawlee'; import { z } from 'zod'; describe('Router', () => { @@ -201,7 +206,48 @@ describe('Router', () => { router.addHandler('UNKNOWN', () => {}); router.addDefaultHandler((ctx) => { - testType<{ sku: string; price: number } | { categoryId: string }>(ctx.request.userData); + // the default handler is a fallback for any request, so userData stays loosely typed + testType>(ctx.request.userData); + }); + }); + + test('factory infers userData from a route map passed as the second type argument', async () => { + const testType = (t: T): void => {}; + + interface Routes { + PRODUCT: { sku: string; price: number }; + CATEGORY: { categoryId: string }; + } + + // the documented two-argument form: `Routes` is the second type argument of the factory + const router = createCheerioRouter(); + + router.addHandler('PRODUCT', (ctx) => { + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + }); + + router.addHandler('CATEGORY', (ctx) => { + testType(ctx.request.userData.categoryId); + }); + + // @ts-expect-error unknown labels are rejected when a route map is declared + router.addHandler('UNKNOWN', () => {}); + }); + + test('factory keeps the legacy flat-userData generic working (backwards compatibility)', async () => { + const testType = (t: T): void => {}; + + // a flat `userData` shape (with a scalar field) resolves to the legacy open-map router, + // so any label is accepted and `userData` is typed as the passed shape + const router = createCheerioRouter(); + + router.addHandler('anyLabel', (ctx) => { + testType(ctx.request.userData.token); + }); + + router.addHandler('anotherLabel', (ctx) => { + testType(ctx.request.userData.token); }); }); From 7f3cc4e6b1711f39229baa924b0c29110c6d6a0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Thu, 18 Jun 2026 20:00:20 +0200 Subject: [PATCH 3/5] feat: propagate router route map to crawler & context request methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a typed router is used, the route map now also types the request inputs: - handler context: `ctx.addRequests` and `ctx.enqueueLinks` require the `userData` shape matching the request's `label` (and reject unknown labels); this is driven by the router, so it works for every crawler type. - crawler instance: `Routes` is inferred from the `requestHandler` option and used to type `crawler.addRequests`/`crawler.run` for the HTTP-based crawlers (Basic/Http/Cheerio/JSDOM/LinkeDOM). Unlabeled requests keep loose `userData` (they hit the default handler). All typing is backwards compatible via the open-map default. Note: crawler-instance `addRequests` typing is not yet wired for the browser crawlers (Playwright/Puppeteer/Stagehand) — their `requestHandler` is redefined in BrowserCrawlerOptions which breaks generic inference through the hierarchy; their handler-context methods are still fully typed via the router. --- .../src/internals/basic-crawler.ts | 20 ++++-- .../src/internals/cheerio-crawler.ts | 11 ++- packages/core/src/crawlers/crawler_commons.ts | 69 ++++++++++++++++++- packages/core/src/router.ts | 29 ++++++-- .../src/internals/http-crawler.ts | 6 +- .../src/internals/jsdom-crawler.ts | 11 ++- .../src/internals/linkedom-crawler.ts | 11 ++- test/core/router.test.ts | 60 ++++++++++++++++ 8 files changed, 191 insertions(+), 26 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index c53dd13bae96..2f49cb743be8 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -28,6 +28,7 @@ import type { StatisticsOptions, StatisticState, StorageIdentifier, + TypedRequestsLike, } from '@crawlee/core'; import { AutoscaledPool, @@ -112,7 +113,7 @@ export type ErrorHandler< export interface StatusMessageCallbackParams< Context extends CrawlingContext = BasicCrawlingContext, - Crawler extends BasicCrawler = BasicCrawler, + Crawler extends BasicCrawler = BasicCrawler, > { state: StatisticState; crawler: Crawler; @@ -122,7 +123,7 @@ export interface StatusMessageCallbackParams< export type StatusMessageCallback< Context extends CrawlingContext = BasicCrawlingContext, - Crawler extends BasicCrawler = BasicCrawler, + Crawler extends BasicCrawler = BasicCrawler, > = (params: StatusMessageCallbackParams) => Awaitable; export type RequireContextPipeline< @@ -136,6 +137,7 @@ export interface BasicCrawlerOptions< Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, + Routes extends Record = Record>, > { /** * User-provided function that performs the logic of the crawler. It is called for each URL to crawl. @@ -154,7 +156,7 @@ export interface BasicCrawlerOptions< * The exceptions are logged to the request using the * {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function. */ - requestHandler?: RequestHandler; + requestHandler?: RouterHandler | RequestHandler; /** * Allows the user to extend the crawling context passed to the request handler with custom functionality. @@ -514,6 +516,7 @@ export class BasicCrawler< Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, + Routes extends Record = Record>, > { protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'; @@ -582,7 +585,10 @@ export class BasicCrawler< * Default {@apilink Router} instance that will be used if we don't specify any {@apilink BasicCrawlerOptions.requestHandler|`requestHandler`}. * See {@apilink Router.addHandler|`router.addHandler()`} and {@apilink Router.addDefaultHandler|`router.addDefaultHandler()`}. */ - readonly router: RouterHandler = Router.create(); + readonly router: RouterHandler = Router.create() as unknown as RouterHandler< + Context, + Routes + >; private _basicContextPipeline?: ContextPipeline<{ request: Request }, CrawlingContext>; @@ -707,7 +713,7 @@ export class BasicCrawler< * All `BasicCrawler` parameters are passed via an options object. */ constructor( - options: BasicCrawlerOptions & + options: BasicCrawlerOptions & RequireContextPipeline = {} as any, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX ) { ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape)); @@ -1265,7 +1271,7 @@ export class BasicCrawler< * @param [requests] The requests to add. * @param [options] Options for the request queue. */ - async run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise { + async run(requests?: TypedRequestsLike, options?: CrawlerRunOptions): Promise { if (this.running) { throw new Error( 'This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.', @@ -1536,7 +1542,7 @@ export class BasicCrawler< * @param options Options for the request queue */ async addRequests( - requests: ReadonlyDeep, + requests: ReadonlyDeep>, options: CrawlerAddRequestsOptions = {}, ): Promise { await this.getRequestManager(); diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index 010f14fcfb6f..46a5c5ec6365 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -37,7 +37,8 @@ export interface CheerioCrawlerOptions< ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension, UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler -> extends HttpCrawlerOptions, ContextExtension, ExtendedContext> {} + Routes extends Record = Record, +> extends HttpCrawlerOptions, ContextExtension, ExtendedContext, Routes> {} export type CheerioHook< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler @@ -182,11 +183,15 @@ export type CheerioRequestHandler< export class CheerioCrawler< ContextExtension = Dictionary, ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension, -> extends HttpCrawler { + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, +> extends HttpCrawler { /** * All `CheerioCrawler` parameters are passed via an options object. */ - constructor(options?: CheerioCrawlerOptions) { + constructor(options?: CheerioCrawlerOptions) { const { contextPipelineBuilder, ...rest } = options ?? {}; super({ diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts index 7f0c921e42e9..9071bbfc4c72 100644 --- a/packages/core/src/crawlers/crawler_commons.ts +++ b/packages/core/src/crawlers/crawler_commons.ts @@ -4,7 +4,7 @@ import type { ReadonlyDeep, SetRequired } from 'type-fest'; import type { Configuration } from '../configuration.js'; import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links.js'; import type { CrawleeLogger } from '../log.js'; -import type { Request, Source } from '../request.js'; +import type { Request, RequestOptions, Source } from '../request.js'; import type { Dataset } from '../storages/dataset.js'; import { KeyValueStore, type RecordOptions } from '../storages/key_value_store.js'; import type { RequestQueueOperationOptions } from '../storages/request_queue.js'; @@ -13,6 +13,73 @@ import type { StorageIdentifier } from '../storages/storage_instance_manager.js' /** @internal */ export type IsAny = 0 extends 1 & T ? true : false; +/** + * A request input (URL string, request-options object, or {@apilink Request}) whose `userData` is typed + * according to its `label`, based on a router's route map. + * + * When the route map is open (the default `Record`), this is just the regular loose + * {@apilink Source} input. When the map declares concrete labels, providing a `label` requires the matching + * `userData` shape and rejects labels not present in the map; unlabeled requests keep loose `userData`. + */ +export type LabeledSource> = string extends keyof Routes + ? string | Source + : + | string + | Request + | ({ requestsFromUrl?: string; regex?: RegExp } & ( + | { + [Label in keyof Routes & string]: Omit>, 'label'> & { + label: Label; + }; + }[keyof Routes & string] + | (Omit, 'label'> & { label?: undefined }) + )); + +/** + * The iterable/array of {@apilink LabeledSource} inputs accepted by the label-aware `addRequests`/`run` + * methods of a crawler bound to a typed router. + */ +export type TypedRequestsLike> = + | AsyncIterable> + | Iterable> + | LabeledSource[]; + +/** + * The label-aware `addRequests` method signature exposed on a request handler's context when the crawler is + * bound to a typed router. Mirrors {@apilink RestrictedCrawlingContext.addRequests} with typed sources. + */ +export type TypedContextAddRequests> = ( + requestsLike: ReadonlyDeep[]>, + options?: ReadonlyDeep, +) => Promise; + +/** + * An `enqueueLinks`-options object with its `label`/`userData` retyped according to a router's route map: a + * declared `label` requires the matching `userData` shape (unknown labels are rejected), while unlabeled + * calls keep loose `userData`. Returns the options unchanged when the route map is open (the default). + */ +type TypedEnqueueLinksOptions> = string extends keyof Routes + ? Options + : Omit & + ( + | { [Label in keyof Routes & string]: { label: Label; userData?: Routes[Label] } }[keyof Routes & string] + | { label?: undefined; userData?: Dictionary } + ); + +/** + * Transforms a context's existing `enqueueLinks` method so that the `label`/`userData` in its options follow + * the router's route map, while preserving everything else about the signature (argument optionality and + * return type, which differ between crawler types). + */ +export type TypedContextEnqueueLinks< + EnqueueLinks, + Routes extends Record, +> = EnqueueLinks extends (options?: infer Options) => infer Result + ? (options?: TypedEnqueueLinksOptions) => Result + : EnqueueLinks extends (options: infer Options) => infer Result + ? (options: TypedEnqueueLinksOptions) => Result + : EnqueueLinks; + /** @internal */ export type WithRequired = T & { [P in K]-?: T[P] }; diff --git a/packages/core/src/router.ts b/packages/core/src/router.ts index 97ada02abcb7..012a6ca916b7 100644 --- a/packages/core/src/router.ts +++ b/packages/core/src/router.ts @@ -1,7 +1,13 @@ import type { Dictionary } from '@crawlee/types'; import type { StandardSchemaV1 } from '@standard-schema/spec'; -import type { CrawlingContext, LoadedRequest, RestrictedCrawlingContext } from './crawlers/crawler_commons.js'; +import type { + CrawlingContext, + LoadedRequest, + RestrictedCrawlingContext, + TypedContextAddRequests, + TypedContextEnqueueLinks, +} from './crawlers/crawler_commons.js'; import { MissingRouteError, RequestValidationError } from './errors.js'; import type { Request } from './request.js'; import type { Awaitable } from './typedefs.js'; @@ -9,11 +15,20 @@ import type { Awaitable } from './typedefs.js'; const defaultRoute = Symbol('default-route'); /** - * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`. + * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`, and + * `addRequests`/`enqueueLinks` typed according to the router's route map (`Routes`) so that enqueuing a + * request under a declared label requires the matching `userData` shape. */ -export type RouterHandlerContext = Omit & { +export type RouterHandlerContext< + Context, + UserData extends Dictionary, + Routes extends Record, +> = Omit & { request: LoadedRequest>; -}; + addRequests: TypedContextAddRequests; +} & (Context extends { enqueueLinks: infer EnqueueLinks } + ? { enqueueLinks: TypedContextEnqueueLinks } + : {}); /** * The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete @@ -185,7 +200,7 @@ export class Router< */ addHandler