feat: add waitForSelector context helper + parseWithCheerio in adaptive crawler (#2522)

This introduces a new `waitForSelector` context helper in all crawler types. In browser-based crawlers it waits for the element to appear; in HTTP crawlers it checks whether the selector can be found in the HTML and throws otherwise.

The `parseWithCheerio` helper is now available in the adaptive crawler too, and it accepts a new parameter that allows waiting for a selector via `waitForSelector`.
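
A minimal sketch of how the new helpers could be used, based on the description above (a `CheerioCrawler` is assumed; the target URL and selector are placeholders):

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ request, waitForSelector, parseWithCheerio, log }) {
        // In an HTTP-based crawler this throws right away when the selector
        // is not present in the HTML; a browser-based crawler would instead
        // wait for the element to appear.
        await waitForSelector('article h1');

        // Alternatively, pass the selector to parseWithCheerio directly,
        // which waits for (or checks) it via waitForSelector first.
        const $ = await parseWithCheerio('article h1');
        log.info(`${request.url}: ${$('article h1').text()}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```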
B4nan authored Jun 6, 2024
1 parent 31083aa commit 6f88e73
Showing 13 changed files with 326 additions and 55 deletions.
2 changes: 1 addition & 1 deletion biome.json
@@ -13,7 +13,7 @@
"formatter": {
"quoteStyle": "single",
"semicolons": "always",
"trailingComma": "all",
"trailingCommas": "all",
"lineWidth": 120,
"indentStyle": "space",
"indentWidth": 4,
35 changes: 31 additions & 4 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
@@ -14,7 +14,7 @@ import type {
} from '@crawlee/http';
import { HttpCrawler, enqueueLinks, Router, resolveBaseUrlForEnqueueLinksFiltering } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { extractUrlsFromCheerio } from '@crawlee/utils';
import { type CheerioRoot, extractUrlsFromCheerio } from '@crawlee/utils';
import type { CheerioOptions } from 'cheerio';
import * as cheerio from 'cheerio';
import { DomHandler } from 'htmlparser2';
@@ -45,20 +45,35 @@ export interface CheerioCrawlingContext<
*/
$: cheerio.CheerioAPI;

/**
* Wait for an element matching the selector to appear. Timeout is ignored.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;

/**
* Returns Cheerio handle, this is here to unify the crawler API, so they all have this handy method.
* It has the same return type as the `$` context property, use it only if you are abstracting your workflow to
* support different context types in one handler.
* When provided with the `selector` argument, it will throw if it's not available.
*
* **Example usage:**
* ```javascript
* ```ts
* async requestHandler({ parseWithCheerio }) {
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
parseWithCheerio(): Promise<cheerio.CheerioAPI>;
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}

export type CheerioRequestHandler<
@@ -204,7 +219,19 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
}

protected override async _runRequestHandler(context: CheerioCrawlingContext) {
context.parseWithCheerio = async () => Promise.resolve(context.$);
context.waitForSelector = async (selector?: string, _timeoutMs?: number) => {
if (context.$(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}
};
context.parseWithCheerio = async (selector?: string, timeoutMs?: number) => {
if (selector) {
await context.waitForSelector(selector, timeoutMs);
}

return context.$;
};

await super._runRequestHandler(context);
}
}
18 changes: 9 additions & 9 deletions packages/core/src/crawlers/crawler_commons.ts
@@ -15,6 +15,15 @@ import { KeyValueStore } from '../storages';
export interface RestrictedCrawlingContext<UserData extends Dictionary = Dictionary>
// we need `Record<string & {}, unknown>` here, otherwise `Omit<Context>` is resolved badly
extends Record<string & {}, unknown> {
id: string;
session?: Session;

/**
* An object with information about currently used proxy by the crawler
* and configured by the {@apilink ProxyConfiguration} class.
*/
proxyInfo?: ProxyInfo;

/**
* The original {@apilink Request} object.
*/
@@ -86,15 +95,6 @@ export interface RestrictedCrawlingContext<UserData extends Dictionary = Diction

export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary = Dictionary>
extends RestrictedCrawlingContext<UserData> {
id: string;
session?: Session;

/**
* An object with information about currently used proxy by the crawler
* and configured by the {@apilink ProxyConfiguration} class.
*/
proxyInfo?: ProxyInfo;

crawler: Crawler;

/**
47 changes: 44 additions & 3 deletions packages/http-crawler/src/internals/http-crawler.ts
@@ -28,7 +28,7 @@ import {
SessionError,
} from '@crawlee/basic';
import type { Awaitable, Dictionary } from '@crawlee/types';
import { RETRY_CSS_SELECTORS, gotScraping } from '@crawlee/utils';
import { RETRY_CSS_SELECTORS, gotScraping, type CheerioRoot } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { RequestLike, ResponseLike } from 'content-type';
import contentTypeParser from 'content-type';
@@ -215,7 +215,33 @@ export interface InternalHttpCrawlingContext<
contentType: { type: string; encoding: BufferEncoding };
response: PlainResponse;

parseWithCheerio(): Promise<cheerio.CheerioAPI>;
/**
* Wait for an element matching the selector to appear. Timeout is ignored.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;

/**
* Returns Cheerio handle for `page.content()`, allowing to work with the data same way as with {@apilink CheerioCrawler}.
* When provided with the `selector` argument, it will throw if it's not available.
*
* **Example usage:**
* ```ts
* async requestHandler({ parseWithCheerio }) {
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}

export interface HttpCrawlingContext<UserData extends Dictionary = any, JSONData extends JsonValue = any>
@@ -488,7 +514,22 @@ export class HttpCrawler<
tryCancel();

// `??=` because descendant classes may already set optimized version
crawlingContext.parseWithCheerio ??= async () => cheerio.load(parsed.body!.toString());
crawlingContext.waitForSelector ??= async (selector?: string, _timeoutMs?: number) => {
const $ = cheerio.load(parsed.body!.toString());

if ($(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}
};
crawlingContext.parseWithCheerio ??= async (selector?: string, timeoutMs?: number) => {
const $ = cheerio.load(parsed.body!.toString());

if (selector) {
await crawlingContext.waitForSelector(selector, timeoutMs);
}

return $;
};

if (this.useSessionPool) {
this._throwOnBlockedRequest(crawlingContext.session!, response.statusCode!);
42 changes: 40 additions & 2 deletions packages/jsdom-crawler/src/internals/jsdom-crawler.ts
@@ -22,6 +22,7 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import { type CheerioRoot, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
import type { DOMWindow } from 'jsdom';
import { JSDOM, ResourceLoader, VirtualConsole } from 'jsdom';
@@ -58,8 +59,24 @@ export interface JSDOMCrawlingContext<
window: DOMWindow;
document: Document;

/**
* Wait for an element matching the selector to appear. Timeout defaults to 5s.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;

/**
* Returns Cheerio handle, allowing to work with the data same way as with {@apilink CheerioCrawler}.
* When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
*
* **Example usage:**
* ```javascript
Expand All @@ -69,7 +86,7 @@ export interface JSDOMCrawlingContext<
* });
* ```
*/
parseWithCheerio(): Promise<cheerio.CheerioAPI>;
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}

export type JSDOMRequestHandler<
@@ -294,7 +311,28 @@ export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
}

override async _runRequestHandler(context: JSDOMCrawlingContext) {
context.parseWithCheerio = async () => Promise.resolve(cheerio.load(context.body));
context.waitForSelector = async (selector: string, timeoutMs = 5_000) => {
const $ = cheerio.load(context.body);

if ($(selector).get().length === 0) {
if (timeoutMs) {
await sleep(50);
return context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
}

throw new Error(`Selector '${selector}' not found.`);
}
};
context.parseWithCheerio = async (selector?: string, _timeoutMs = 5_000) => {
const $ = cheerio.load(context.body);

if (selector && $(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}

return $;
};

await super._runRequestHandler(context);
}
}
47 changes: 45 additions & 2 deletions packages/linkedom-crawler/src/internals/linkedom-crawler.ts
@@ -20,7 +20,8 @@ import {
tryAbsoluteURL,
} from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
import type * as cheerio from 'cheerio';
import { type CheerioRoot, sleep } from '@crawlee/utils';
import * as cheerio from 'cheerio';
// @ts-expect-error This throws a compilation error due to TypeScript not inferring the module has CJS versions too
import { DOMParser } from 'linkedom/cached';

@@ -53,8 +54,24 @@ export interface LinkeDOMCrawlingContext<
// even though it's not technically 100% correct
document: Document;

/**
* Wait for an element matching the selector to appear. Timeout defaults to 5s.
*
* **Example usage:**
* ```ts
* async requestHandler({ waitForSelector, parseWithCheerio }) {
* await waitForSelector('article h1');
* const $ = await parseWithCheerio();
* const title = $('title').text();
* });
* ```
*/
waitForSelector(selector: string, timeoutMs?: number): Promise<void>;

/**
* Returns Cheerio handle, allowing to work with the data same way as with {@apilink CheerioCrawler}.
* When provided with the `selector` argument, it will first look for the selector with a 5s timeout.
*
* **Example usage:**
* ```javascript
Expand All @@ -64,7 +81,7 @@ export interface LinkeDOMCrawlingContext<
* });
* ```
*/
parseWithCheerio(): Promise<cheerio.CheerioAPI>;
parseWithCheerio(selector?: string, timeoutMs?: number): Promise<CheerioRoot>;
}

export type LinkeDOMRequestHandler<
@@ -175,6 +192,32 @@ export class LinkeDOMCrawler extends HttpCrawler<LinkeDOMCrawlingContext> {
},
};
}

override async _runRequestHandler(context: LinkeDOMCrawlingContext) {
context.waitForSelector = async (selector: string, timeoutMs = 5_000) => {
const $ = cheerio.load(context.body);

if ($(selector).get().length === 0) {
if (timeoutMs) {
await sleep(50);
return context.waitForSelector(selector, Math.max(timeoutMs - 50, 0));
}

throw new Error(`Selector '${selector}' not found.`);
}
};
context.parseWithCheerio = async (selector?: string, _timeoutMs = 5_000) => {
const $ = cheerio.load(context.body);

if (selector && $(selector).get().length === 0) {
throw new Error(`Selector '${selector}' not found.`);
}

return $;
};

await super._runRequestHandler(context);
}
}

interface EnqueueLinksInternalOptions {
