Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions packages/basic-crawler/src/internals/send-request.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import {
type Request,
type Session,
} from '@crawlee/core';
import type { Method, Response as GotResponse } from 'got-scraping';

/**
* Prepares a function to be used as the `sendRequest` context helper.
Expand All @@ -22,10 +21,7 @@ export function createSendRequest(
session: Session | undefined,
getProxyUrl: () => string | undefined,
) {
return async <Response = string>(
// TODO the type information here (and in crawler_commons) is outright wrong... for BC - replace this with generic HttpResponse in v4
overrideOptions: Partial<HttpRequestOptions> = {},
): Promise<GotResponse<Response>> => {
return async (overrideOptions: Partial<HttpRequestOptions> = {}): Promise<Response> => {
const cookieJar = session
? {
getCookieString: async (url: string) => session.getCookieString(url),
Expand All @@ -36,7 +32,7 @@ export function createSendRequest(

const requestOptions = processHttpRequestOptions({
url: originRequest.url,
method: originRequest.method as Method, // Narrow type to omit CONNECT
method: originRequest.method,
headers: originRequest.headers,
proxyUrl: getProxyUrl(),
sessionToken: session,
Expand All @@ -48,6 +44,6 @@ export function createSendRequest(
// Fill in body as the last step - `processHttpRequestOptions` may use either `body`, `json` or `form` so we cannot override it beforehand
requestOptions.body ??= originRequest.payload;

return httpClient.sendRequest<any>(requestOptions) as unknown as GotResponse<Response>;
return httpClient.sendRequest(requestOptions);
};
}
11 changes: 2 additions & 9 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import type { IncomingMessage } from 'node:http';
import { text as readStreamToString } from 'node:stream/consumers';

import type {
Configuration,
EnqueueLinksOptions,
Expand Down Expand Up @@ -168,12 +165,8 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
super(options, config);
}

protected override async _parseHTML(
response: IncomingMessage,
isXml: boolean,
crawlingContext: CheerioCrawlingContext,
) {
const body = await readStreamToString(response);
protected override async _parseHTML(response: Response, isXml: boolean, crawlingContext: CheerioCrawlingContext) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will cause a merge conflict with the ContextPipeline PR. I would like to go first if that's possible (the vessel that's harder to steer has the right of way).

const body = await response.text();
const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
const $ = cheerio.load(dom, {
xml: { decodeEntities: true, xmlMode: isXml },
Expand Down
12 changes: 5 additions & 7 deletions packages/core/src/cookie_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,14 @@ export interface ResponseLike {
/**
* @internal
*/
export function getCookiesFromResponse(response: ResponseLike): Cookie[] {
const headers = typeof response.headers === 'function' ? response.headers() : response.headers;
const cookieHeader = headers?.['set-cookie'] || '';
export function getCookiesFromResponse(response: Response): Cookie[] {
const headers = response.headers;
const cookieHeaders = headers.getSetCookie();

try {
return Array.isArray(cookieHeader)
? cookieHeader.map((cookie) => Cookie.parse(cookie)!)
: [Cookie.parse(cookieHeader)!];
return cookieHeaders.map((cookie) => Cookie.parse(cookie)!);
} catch (e) {
throw new CookieParseError(cookieHeader);
throw new CookieParseError(cookieHeaders);
}
}

Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/crawlers/crawler_commons.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
import type { OptionsInit, Response as GotResponse } from 'got-scraping';
import type { OptionsInit } from 'got-scraping';
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would love to see this go as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to do that as part of a separate PR. Removing got-scraping (and all the type TODOs) is no small feat, and doing it all in one PR would make it hard to review.

import type { ReadonlyDeep } from 'type-fest';

import type { Configuration } from '../configuration.js';
Expand Down Expand Up @@ -163,7 +163,7 @@ export interface CrawlingContext<Crawler = unknown, UserData extends Dictionary
* },
* ```
*/
sendRequest<Response = string>(overrideOptions?: Partial<OptionsInit>): Promise<GotResponse<Response>>;
sendRequest(overrideOptions?: Partial<OptionsInit>): Promise<Response>;
}

/**
Expand Down
35 changes: 13 additions & 22 deletions packages/core/src/http_clients/base-http-client.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import type { Readable } from 'node:stream';

import type { AllowedHttpMethods } from '@crawlee/types';
import { applySearchParams, type SearchParams } from '@crawlee/utils';

import type { FormDataLike } from './form-data-like.js';
Expand All @@ -15,24 +16,6 @@ type Timeout =
}
| { request: number };

/** Upper-case spellings of the HTTP method names covered by `Method`. */
type UppercaseMethod = 'GET' | 'POST' | 'PUT' | 'PATCH' | 'HEAD' | 'DELETE' | 'OPTIONS' | 'TRACE';

/**
 * HTTP method name in either all-upper-case or all-lower-case spelling.
 * Mixed-case spellings and `CONNECT` are not part of the union.
 */
type Method = UppercaseMethod | Lowercase<UppercaseMethod>;
/**
* Maps permitted values of the `responseType` option on {@apilink HttpRequest} to the types that they produce.
*/
Expand Down Expand Up @@ -79,7 +62,7 @@ export interface HttpRequest<TResponseType extends keyof ResponseTypes = 'text'>
[k: string]: unknown; // TODO BC with got - remove in 4.0

url: string | URL;
method?: Method;
method?: AllowedHttpMethods;
headers?: SimpleHeaders;
body?: string | Buffer | Readable | Generator | AsyncGenerator | FormDataLike;

Expand Down Expand Up @@ -146,6 +129,14 @@ interface HttpResponseWithoutBody<TResponseType extends keyof ResponseTypes = ke
request: HttpRequest<TResponseType>;
}

/**
 * A `Response` that reports the URL passed in `init.url`.
 *
 * The standard `Response` constructor offers no way to set `url`, so this subclass
 * shadows the inherited accessor with an own property.
 */
export class ResponseWithUrl extends Response {
    override url!: string;

    /**
     * @param body Response body, forwarded unchanged to the `Response` constructor.
     * @param init Standard `ResponseInit` plus an optional `url`; when `url` is missing, `''` is used.
     */
    constructor(body: BodyInit | null, init: ResponseInit & { url?: string }) {
        super(body, init);

        // `url` is a getter-only accessor on `Response.prototype`. A plain `this.url = …` assignment
        // throws a TypeError unless the class field is emitted with define semantics, so the own
        // property is defined explicitly - this works under either emit mode.
        Object.defineProperty(this, 'url', {
            value: init.url ?? '',
            writable: true,
            enumerable: true,
            configurable: true,
        });
    }
}

/**
* HTTP response data as returned by the {@apilink BaseHttpClient.sendRequest} method.
*/
Expand All @@ -169,7 +160,7 @@ export interface StreamingHttpResponse extends HttpResponseWithoutBody {
* Type of a function called when an HTTP redirect takes place. It is allowed to mutate the `updatedRequest` argument.
*/
export type RedirectHandler = (
redirectResponse: BaseHttpResponseData,
redirectResponse: Response,
updatedRequest: { url?: string | URL; headers: SimpleHeaders },
) => void;

Expand All @@ -182,12 +173,12 @@ export interface BaseHttpClient {
*/
sendRequest<TResponseType extends keyof ResponseTypes = 'text'>(
request: HttpRequest<TResponseType>,
): Promise<HttpResponse<TResponseType>>;
): Promise<Response>;

/**
* Perform an HTTP Request and return after the response headers are received. The body may be read from a stream contained in the response.
*/
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<StreamingHttpResponse>;
stream(request: HttpRequest, onRedirect?: RedirectHandler): Promise<Response>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't the `stream` method obsolete? The web `Response` class can be streamed using `response.body` when the caller chooses to do so.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, it actually is 👍 I'd prefer to do this in a separate PR, for the same reasons as the total got-scraping phase-out.

}

/**
Expand Down
109 changes: 62 additions & 47 deletions packages/core/src/http_clients/got-scraping-http-client.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,40 @@
import { Readable } from 'node:stream';

import type { Options, PlainResponse } from 'got-scraping';
import { gotScraping } from 'got-scraping';

import type {
BaseHttpClient,
HttpRequest,
HttpResponse,
RedirectHandler,
ResponseTypes,
StreamingHttpResponse,
import {
type BaseHttpClient,
type HttpRequest,
type RedirectHandler,
type ResponseTypes,
ResponseWithUrl,
} from './base-http-client.js';

/**
* A HTTP client implementation based on the `got-scraping` library.
*/
export class GotScrapingHttpClient implements BaseHttpClient {
/**
 * Type guard reporting whether the request uses an HTTP method other than `CONNECT` / `connect`.
 * @param request - The HTTP request whose `method` is checked
 * @returns `false` only when `request.method` is exactly `'CONNECT'` or `'connect'`
 */
private validateRequest<TResponseType extends keyof ResponseTypes, T extends HttpRequest<TResponseType>>(
    request: T,
): request is T & { method: Exclude<T['method'], 'CONNECT' | 'connect'> } {
    const rejectedMethods: readonly (string | undefined)[] = ['CONNECT', 'connect'];
    const isRejected = rejectedMethods.includes(request.method);
    return !isRejected;
}

/**
* @inheritDoc
*/
async sendRequest<TResponseType extends keyof ResponseTypes>(
request: HttpRequest<TResponseType>,
): Promise<HttpResponse<TResponseType>> {
): Promise<Response> {
if (!this.validateRequest(request)) {
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
}

const gotResult = await gotScraping({
...request,
retry: {
Expand All @@ -28,23 +43,45 @@ export class GotScrapingHttpClient implements BaseHttpClient {
},
});

return {
...gotResult,
body: gotResult.body as ResponseTypes[TResponseType],
request: { url: request.url, ...gotResult.request },
};
const parsedHeaders = Object.entries(gotResult.headers)
.map(([key, value]) => {
if (value === undefined) return [];

if (Array.isArray(value)) {
return value.map((v) => [key, v]);
}

return [[key, value]];
})
.flat() as [string, string][];

return new ResponseWithUrl(new Uint8Array(gotResult.rawBody), {
headers: new Headers(parsedHeaders),
status: gotResult.statusCode,
statusText: gotResult.statusMessage ?? '',
url: gotResult.url,
});
}

/**
* @inheritDoc
*/
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<StreamingHttpResponse> {
async stream(request: HttpRequest, handleRedirect?: RedirectHandler): Promise<Response> {
if (!this.validateRequest(request)) {
throw new Error(`The HTTP method CONNECT is not supported by the GotScrapingHttpClient.`);
}
// eslint-disable-next-line no-async-promise-executor
return new Promise(async (resolve, reject) => {
const stream = gotScraping({ ...request, isStream: true });

stream.on('redirect', (updatedOptions: Options, redirectResponse: PlainResponse) => {
handleRedirect?.(redirectResponse, updatedOptions);
stream.on('redirect', (updatedOptions: Options, redirectResponse: any) => {
const nativeRedirectResponse = new ResponseWithUrl(redirectResponse.rawBody, {
headers: redirectResponse.headers,
status: redirectResponse.statusCode,
statusText: redirectResponse.statusMessage,
url: redirectResponse.url,
});
handleRedirect?.(nativeRedirectResponse, updatedOptions);
});

// We need to end the stream for DELETE requests, otherwise it will hang.
Expand All @@ -55,37 +92,15 @@ export class GotScrapingHttpClient implements BaseHttpClient {
stream.on('error', reject);

stream.on('response', (response: PlainResponse) => {
const result: StreamingHttpResponse = {
stream,
request,
redirectUrls: response.redirectUrls,
url: response.url,
ip: response.ip,
statusCode: response.statusCode,
headers: response.headers,
trailers: response.trailers,
complete: response.complete,
get downloadProgress() {
return stream.downloadProgress;
},
get uploadProgress() {
return stream.uploadProgress;
},
};

Object.assign(result, response); // TODO BC - remove in 4.0

resolve(result);

stream.on('end', () => {
result.complete = response.complete;

result.trailers ??= {};
Object.assign(result.trailers, response.trailers);

(result as any).rawTrailers ??= []; // TODO BC - remove in 4.0
Object.assign((result as any).rawTrailers, response.rawTrailers);
});
// Cast shouldn't be needed here, undici might have a different `ReadableStream` type
resolve(
new ResponseWithUrl(Readable.toWeb(stream) as any, {
status: response.statusCode,
statusText: response.statusMessage ?? '',
headers: response.headers as HeadersInit,
url: response.url,
}),
);
});
});
}
Expand Down
23 changes: 19 additions & 4 deletions packages/core/src/request.ts
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ export enum RequestState {
* ```
* @category Sources
*/
export class Request<UserData extends Dictionary = Dictionary> {
class CrawleeRequest<UserData extends Dictionary = Dictionary> {
/** Request ID */
id?: string;

Expand Down Expand Up @@ -193,7 +193,8 @@ export class Request<UserData extends Dictionary = Dictionary> {
this.url = url;
this.loadedUrl = loadedUrl;
this.uniqueKey =
uniqueKey || Request.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
uniqueKey ||
CrawleeRequest.computeUniqueKey({ url, method, payload, keepUrlFragment, useExtendedUniqueKey });
this.method = method;
this.payload = payload;
this.noRetry = noRetry;
Expand Down Expand Up @@ -255,6 +256,18 @@ export class Request<UserData extends Dictionary = Dictionary> {
}
}

/**
 * Builds a native `fetch` API `Request` from this Crawlee request.
 *
 * The `url`, `method` and `headers` are copied over, and `payload` is used as the request body.
 * @returns A new `fetch` API `Request` instance.
 */
public intoFetchAPIRequest(): Request {
    const { url, method, headers, payload } = this;
    const init: RequestInit = { method, headers, body: payload };
    return new Request(url, init);
}

/** Tells the crawler processing this request to skip the navigation and process the request directly. */
get skipNavigation(): boolean {
return this.userData.__crawlee?.skipNavigation ?? false;
Expand Down Expand Up @@ -398,7 +411,7 @@ export class Request<UserData extends Dictionary = Dictionary> {
}
return normalizedUrl;
}
const payloadHash = payload ? Request.hashPayload(payload) : '';
const payloadHash = payload ? CrawleeRequest.hashPayload(payload) : '';
return `${normalizedMethod}(${payloadHash}):${normalizedUrl}`;
}

Expand Down Expand Up @@ -526,10 +539,12 @@ interface ComputeUniqueKeyOptions {
useExtendedUniqueKey?: boolean;
}

export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | Request;
export type Source = (Partial<RequestOptions> & { requestsFromUrl?: string; regex?: RegExp }) | CrawleeRequest;

/** @internal */
export interface InternalSource {
requestsFromUrl: string;
regex?: RegExp;
}

export { CrawleeRequest as Request };
Loading