From aa4da8611af23f0771cb571a86bb481ef7e3d22a Mon Sep 17 00:00:00 2001 From: Igor Loskutov Date: Fri, 20 Mar 2026 19:19:24 -0400 Subject: [PATCH] fix: prevent duplicate request processing in delayRequest --- .../src/internals/basic-crawler.ts | 10 ---- test/core/crawlers/basic_crawler.test.ts | 50 ++++++++++++++++++- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 1dc5efea4dbd..161a1151c401 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -1511,11 +1511,6 @@ export class BasicCrawler { this.log.debug(`Adding request ${request.url} (${request.id}) back to the queue`); - if (source instanceof RequestQueueV1) { - // eslint-disable-next-line dot-notation - source['inProgress'].add(request.id!); - } - await source.reclaimRequest(request, { forefront: request.userData?.__crawlee?.forefront }); }, delay); diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index cd160f0d02c9..8d9befa3cd9b 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -23,7 +23,7 @@ import { RequestList, RequestQueue, } from '@crawlee/basic'; -import { RequestState } from '@crawlee/core'; +import { RequestState, RequestQueueV1 } from '@crawlee/core'; import type { Dictionary } from '@crawlee/utils'; import { RobotsTxtFile, sleep } from '@crawlee/utils'; import express from 'express'; @@ -2090,4 +2090,52 @@ describe('BasicCrawler', () => { expect(crawlerB.requestQueue?.config).toBe(configB); }); }); + + describe('sameDomainDelaySecs race condition', () => { + test('delayRequest should keep request in inProgress during delay', async () => { + const requestQueue = await RequestQueueV1.open(); + await requestQueue.addRequest({ url: 'http://example.com/a', uniqueKey: 'a' }); + + const request = await requestQueue.fetchNextRequest(); + expect(request).not.toBeNull(); + expect(requestQueue.inProgressCount()).toBe(1); + + const crawler = new BasicCrawler({ + requestQueue, + sameDomainDelaySecs: 10, + requestHandler: async () => {}, + }); + + // Set domain access time to force delay trigger + (crawler as any).domainAccessedTime.set('example.com', Date.now()); + + const delayed = (crawler as any).delayRequest(request, requestQueue); + expect(delayed).toBe(true); + + // Request must remain in inProgress to prevent duplicate fetching + expect(requestQueue.inProgressCount()).toBe(1); + }); + + test('second fetchNextRequest should not return the same request after delayRequest', async () => { + const requestQueue = await RequestQueueV1.open(); + await requestQueue.addRequest({ url: 'http://example.com/a', uniqueKey: 'a' }); + + const r1 = await requestQueue.fetchNextRequest(); + expect(r1).not.toBeNull(); + + const crawler = new BasicCrawler({ + requestQueue, + sameDomainDelaySecs: 10, + requestHandler: async () => {}, + }); + (crawler as any).domainAccessedTime.set('example.com', Date.now()); + + const delayed = (crawler as any).delayRequest(r1, requestQueue); + expect(delayed).toBe(true); + + // Another worker must not get the same request during the delay window + const r1Again = await requestQueue.fetchNextRequest(); + expect(r1Again).toBeNull(); + }); + }); });