Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import {
mergeCookies,
NonRetryableError,
purgeDefaultStorages,
RateLimitError,
RequestListAdapter,
RequestManagerTandem,
RequestProvider,
Expand Down Expand Up @@ -1834,6 +1835,13 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

// Two error types get special treatment before the retry decision below:
// a SessionError goes through `_rotateSession`, a RateLimitError records a cooldown for the request's domain.
if (error instanceof SessionError) {
    await this._rotateSession(crawlingContext);
} else if (error instanceof RateLimitError) {
    const rateLimitedDomain = getDomain(request.url);
    // Fall back to a 1 minute cooldown when the error carries no `retryAfterMs`.
    const cooldownMillis = error.retryAfterMs ?? 60000;
    if (rateLimitedDomain) {
        // The stored value is "now + cooldown - sameDomainDelayMillis".
        // NOTE(review): presumably the same-domain delay logic adds `sameDomainDelayMillis` back on top of
        // this timestamp, so the domain becomes available again exactly after the cooldown - confirm
        // against the code that reads `domainAccessedTime`.
        const shiftedAccessTime = Date.now() + cooldownMillis - this.sameDomainDelayMillis;
        this.domainAccessedTime.set(rateLimitedDomain, shiftedAccessTime);
    }
}

if (!request.noRetry) {
Expand Down Expand Up @@ -1932,7 +1940,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

// User requested retry (we ignore retry count here as it's explicitly told by the user to retry)
if (error instanceof RetryRequestError) {
// Rate limit errors are also explicitly retried (with cooldown).
if (error instanceof RetryRequestError || error instanceof RateLimitError) {
return true;
}

Expand Down
12 changes: 12 additions & 0 deletions packages/core/src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,15 @@ export class SessionError extends RetryRequestError {
super(`Detected a session error, rotating session... ${message ? `\n${message}` : ''}`);
}
}

/**
 * Errors of `RateLimitError` type will trigger a cooldown before the request is retried.
 *
 * The optional `retryAfterMs` carries the requested cooldown in milliseconds.
 * When it is omitted, the property stays `undefined` and the code handling
 * the error is expected to pick its own default.
 */
export class RateLimitError extends Error {
    /** Requested cooldown in milliseconds, or `undefined` when none was given. */
    public readonly retryAfterMs?: number;

    /**
     * @param message Error message; defaults to `'Rate limit exceeded'` when `undefined` (or `null`).
     * @param retryAfterMs Cooldown in milliseconds to wait before retrying.
     */
    constructor(message?: string, retryAfterMs?: number) {
        super(message ?? 'Rate limit exceeded');
        this.retryAfterMs = retryAfterMs;
    }
}
2 changes: 1 addition & 1 deletion packages/core/src/session_pool/consts.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
export const BLOCKED_STATUS_CODES = [401, 403, 429];
export const BLOCKED_STATUS_CODES = [401, 403];
export const PERSIST_STATE_KEY = 'SDK_SESSION_POOL_STATE';
export const MAX_POOL_SIZE = 1000;
8 changes: 4 additions & 4 deletions packages/core/src/session_pool/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -296,10 +296,10 @@ export class Session {
}

/**
* With certain status codes: `401`, `403` or `429` we can be certain
* that the target website is blocking us. This function helps to do this conveniently
* by retiring the session when such code is received. Optionally the default status
* codes can be extended in the second parameter.
* Retires session based on status code.
* With certain status codes: `401` or `403` we can be certain
* that the IP is blocked.
*
* @param statusCode HTTP status code.
* @returns Whether the session was retired.
*/
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/session_pool/session_pool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ export interface SessionPoolOptions {
/**
* Specifies which response status codes are considered as blocked.
* Session connected to such request will be marked as retired.
* @default [401, 403, 429]
* @default [401, 403]
*/
blockedStatusCodes?: number[];

Expand Down
19 changes: 19 additions & 0 deletions packages/http-crawler/src/internals/http-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import {
processHttpRequestOptions,
RequestState,
Router,
NonRetryableError,
RateLimitError,
SessionError,
validators,
} from '@crawlee/basic';
Expand Down Expand Up @@ -517,6 +519,23 @@ export class HttpCrawler<
await this._handleNavigation(crawlingContext);
tryCancel();

// HTTP 429: translate the optional `retry-after` header into milliseconds and
// abort processing of this response by throwing a RateLimitError.
if (crawlingContext.response?.statusCode === 429) {
    const rawRetryAfter = crawlingContext.response.headers['retry-after'];
    // Stays undefined when the header is missing or cannot be interpreted.
    let retryAfterMs: number | undefined;
    if (rawRetryAfter) {
        const headerText = String(rawRetryAfter);
        const isDelaySeconds = /^\d+$/.test(headerText);
        if (isDelaySeconds) {
            // Digits only: the value is a delay in seconds.
            retryAfterMs = parseInt(headerText, 10) * 1000;
        } else {
            // Otherwise try to read it as a date; a date in the past yields a zero delay.
            const retryAt = Date.parse(headerText);
            if (!Number.isNaN(retryAt)) {
                retryAfterMs = Math.max(0, retryAt - Date.now());
            }
        }
    }
    throw new RateLimitError(`Rate limit exceeded (HTTP 429)`, retryAfterMs);
}

const parsed = await this._parseResponse(request, crawlingContext.response!, crawlingContext);
const response = parsed.response!;
const contentType = parsed.contentType!;
Expand Down
33 changes: 33 additions & 0 deletions test/core/crawlers/http_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ router.set('/403-with-octet-stream', (req, res) => {
res.end();
});

// Counts how many times the `/429` route has answered with a rate-limit response.
let rateLimitHit = 0;

// Test route: while the counter is zero it replies with 429 and `retry-after: 1`
// (and bumps the counter); afterwards it replies with 200 and the body 'ok'.
router.set('/429', (req, res) => {
    res.setHeader('content-type', 'text/html');

    if (rateLimitHit !== 0) {
        res.statusCode = 200;
        res.end('ok');
        return;
    }

    rateLimitHit++;
    res.setHeader('retry-after', '1');
    res.statusCode = 429;
    res.end();
});

let server: http.Server;
let url: string;

Expand Down Expand Up @@ -89,6 +104,7 @@ afterAll(async () => {
const localStorageEmulator = new MemoryStorageEmulator();

beforeEach(async () => {
rateLimitHit = 0;
await localStorageEmulator.init();
});

Expand Down Expand Up @@ -477,4 +493,21 @@ describe.each(
expect(results[0].includes('Schmexample Domain')).toBeTruthy();
expect(results[1].includes('Hello')).toBeTruthy();
});
test('should respect 429 RateLimitError and retry', async () => {
    // Bodies of all responses that made it to the request handler.
    const handledBodies: string[] = [];

    const crawler = new HttpCrawler({
        httpClient,
        maxRequestRetries: 1,
        requestHandler: async ({ body }) => {
            handledBodies.push(body.toString());
        },
    });

    // Measure the wall-clock time of the whole run, including the cooldown.
    const startedAt = Date.now();
    await crawler.run([`${url}/429`]);
    const elapsedMillis = Date.now() - startedAt;

    // Only the successful (second) response should have been handled.
    expect(handledBodies).toStrictEqual(['ok']);
    // The route sends retry-after: 1, so the run should take about a second; 800 ms leaves some tolerance.
    expect(elapsedMillis).toBeGreaterThanOrEqual(800);
});
});