-
-
Notifications
You must be signed in to change notification settings - Fork 71
Description
I found a GitHub URL that, I believe, exhibits a couple of problems.
https://gist.github.com/1wErt3r/4048722
The first problem is that the handler Hero registers for `unhandledRejection`:
// Process-wide listener for promise rejections that no other code handled.
process.on('unhandledRejection', (error: Error) => {
// Ignore empty rejection reasons and errors that carry the hasBeenLoggedSymbol marker.
if (!error || error[hasBeenLoggedSymbol]) return;
// Logged without a session: sessionId is explicitly null here.
log.error('UnhandledRejection', { error, sessionId: null });
});
gets triggered, with:
2024-03-06T05:06:23.921Z ERROR [hero-core/index] UnhandledRejection { context: {}, sessionId: null, sessionName: undefined } RangeError: Maximum call stack size exceeded
at MirrorPage.onPageEvents (/Users/andy/repos/ec2/stag-secret-agent/app/node_modules/timetravel/lib/MirrorPage.ts:461:28)
at Tab.emit (node:events:517:28)
at Tab.emit (/Users/andy/repos/ec2/stag-secret-agent/app/node_modules/commons/lib/TypedEventEmitter.ts:158:18)
at FrameEnvironment.onPageRecorderEvents (/Users/andy/repos/ec2/stag-secret-agent/app/node_modules/core/lib/FrameEnvironment.ts:600:14)
at Tab.onPageCallback (/Users/andy/repos/ec2/stag-secret-agent/app/node_modules/core/lib/Tab.ts:934:47)
at Page.emit (node:events:517:28)
at Page.emit (/Users/andy/repos/ec2/stag-secret-agent/app/node_modules/commons/lib/TypedEventEmitter.ts:158:18)
at /Users/andy/repos/ec2/stag-secret-agent/app/agent/main/lib/Page.ts:263:14
at DevtoolsSession.<anonymous> (/Users/andy/repos/ec2/stag-secret-agent/app/agent/main/lib/FramesManager.ts:229:9)
The thing is, I have found that any trigger of this UnhandledRejection is so often associated with subsequent
failures and timeouts when scraping other pages that I have decided to also register this handler first:
// Any unhandled promise rejection terminates the process with a non-zero exit
// code so that the process supervisor (pm2) restarts the server.
process.on('unhandledRejection', (error: unknown) => {
  // Record the reason first; the exit below would otherwise discard it.
  console.error('UnhandledRejection, exiting', error);
  // causing pm2 to restart
  process.exit(1);
});
So that my node server immediately quits and restarts, "resetting" existing scraping sessions of course,
but this is better than causing other scrape failures, and such a reset socket is just a retry. For example,
for this and certain scraping exceptions, I have found that I can dramatically reduce my timeout
error rate for both hero.goto() and my various waitForLoad functions by forcing such abrupt
restarts of my PM2 wrapper of my node server.
In this particular case, it appears that maybe timetravel has a recording bug that blows up the stack?
It makes me wonder, regardless of the resolution of this issue, is it possible for those of us that don't
get value from timetravel, to turn it off? Would it speed things up? Reduce the risk of bugs?
But the other interesting thing about my scrape of this page is that it appears that within just
20 secs of page-loading (on my localhost mac), this page gives "screen" dimension of:
scrollHeight: 333057,
viewport: {
screenWidth: 1920,
screenHeight: 1080,
windowWidth: 1241,
windowHeight: 905
},
offsetWidth: 1241,
offsetHeight: 333056
and the outerHTML is about 7 million characters. I took a quick look at this page, and it is quite large, but is it really that big?
Let me share my snippet of test code:
import Hero, { FrameEnvironment } from '@ulixee/hero';
import { ConnectionToHeroCore } from '@ulixee/hero';
import HeroCore from '@ulixee/hero-core';
import TransportBridge from '@ulixee/net/lib/TransportBridge';
import ExecuteJsPlugin from '@ulixee/execute-js-plugin';
/** Screen and window dimensions as measured inside the page by domCapture(). */
interface ActualViewportSize {
// From window.screen.width, coerced to an unsigned 32-bit integer.
screenWidth: number;
// From window.screen.height, coerced to an unsigned 32-bit integer.
screenHeight: number;
// From window.innerWidth, coerced to an unsigned 32-bit integer.
windowWidth: number;
// From window.innerHeight, coerced to an unsigned 32-bit integer.
windowHeight: number;
}
/** Serialized snapshot of a document, as produced by domCapture(). */
interface Dom {
// window.location.href at capture time.
location: string;
html: string; // outerHTML of the document's root element (documentElement)
}
/** Everything domCapture() reports back for one frame. */
interface Result {
// Document location and serialized markup.
dom: Dom;
// Scroll height derived from document.body and/or document.scrollingElement.
scrollHeight?: number;
// offsetWidth of document.body.
offsetWidth: number;
// offsetHeight of document.body.
offsetHeight: number;
// Screen/window dimensions observed in the page.
viewport?: ActualViewportSize;
}
// Module-level handles assigned by createConnection() and released by closeConnection().
// Both are undefined until createConnection() has run.
let connection: ConnectionToHeroCore | undefined;
let core: HeroCore | undefined;
/**
 * Creates a HeroCore instance and a client connection joined to it through a
 * TransportBridge, stores both in the module-level `connection` / `core`
 * variables, and returns the client connection.
 *
 * @returns the connection to hand to `new Hero({ connectionToCore })`.
 */
async function createConnection(): Promise<ConnectionToHeroCore> {
  const transport = new TransportBridge();

  // Client side of the bridge; the connection is configured with maxConcurrency 4.
  const clientConnection = new ConnectionToHeroCore(transport.transportToCore, {
    maxConcurrency: 4,
  });

  // Core side of the bridge.
  const localCore = new HeroCore();
  localCore.addConnection(transport.transportToClient);

  // Remember both ends so closeConnection() can release them later.
  connection = clientConnection;
  core = localCore;
  return clientConnection;
}
/**
 * Disconnects the client connection and closes the core created by
 * createConnection().
 *
 * Safe to call when nothing was created (or more than once): missing handles
 * are skipped instead of raising a TypeError. The core is closed even when
 * disconnecting throws, and the module-level handles are cleared so stale
 * objects are never reused.
 */
async function closeConnection(): Promise<void> {
  const openConnection = connection;
  const openCore = core;
  // Clear first so a failure below cannot leave half-closed handles behind.
  connection = undefined;
  core = undefined;

  try {
    if (openConnection) await openConnection.disconnect();
  } finally {
    if (openCore) await openCore.close();
  }
}
/**
 * Creates a Hero instance bound to a fresh in-process core connection, with a
 * fixed viewport and the ExecuteJsPlugin registered.
 *
 * @returns the configured Hero instance; the caller is responsible for closing it.
 */
async function newHero(): Promise<Hero> {
  const connectionToCore = await createConnection();

  // some random size
  const viewport = {
    screenHeight: 1080,
    screenWidth: 1920,
    windowHeight: 905,
    windowWidth: 1241,
  };
  console.log('VIEWPORT', viewport);

  // Hero names the window dimensions `height` / `width`.
  const hero = new Hero({
    connectionToCore,
    viewport: {
      height: viewport.windowHeight,
      width: viewport.windowWidth,
      screenHeight: viewport.screenHeight,
      screenWidth: viewport.screenWidth,
    },
  });

  hero.use(ExecuteJsPlugin);
  return hero;
}
/**
 * Captures the current document: its location and full outerHTML, the scroll
 * height, the screen/window dimensions and the body's offset size.
 *
 * NOTE(review): captureDoms() passes this function to frame.executeJs, so it
 * presumably runs inside the page. Keep it self-contained (no references to
 * helpers outside this function) - confirm against the execute-js plugin docs.
 *
 * @returns the capture result; offsetWidth/offsetHeight are 0 when the
 *   document has no body.
 */
function domCapture(): Result {
  // Page globals are read through `any`; this file has no typed view of them.
  const win: any = window;
  const doc = win.document;
  const body: any = doc.body;
  const scroller: any = doc.scrollingElement;

  // `>>> 0` coerces each dimension to an unsigned 32-bit integer.
  const screen = win.screen;
  const viewport: ActualViewportSize = {
    screenWidth: screen.width >>> 0,
    screenHeight: screen.height >>> 0,
    windowWidth: win.innerWidth >>> 0,
    windowHeight: win.innerHeight >>> 0,
  };

  const dom: Dom = {
    location: win.location.href,
    html: doc.documentElement.outerHTML,
  };

  // Use whichever of body / scrollingElement exists; the larger when both do.
  let scrollHeight: number;
  if (body && scroller) scrollHeight = Math.max(body.scrollHeight, scroller.scrollHeight);
  else if (body) scrollHeight = body.scrollHeight;
  else if (scroller) scrollHeight = scroller.scrollHeight;
  else scrollHeight = 0;

  return {
    dom,
    scrollHeight,
    viewport,
    // A document without a body is tolerated for scrollHeight above, so it
    // must not throw here either.
    offsetWidth: body ? body.offsetWidth : 0,
    offsetHeight: body ? body.offsetHeight : 0,
  };
}
/** A frame plus the metadata getFrameNode() collects for it. */
interface FrameNode {
// Resolved value of frame.frameId.
id: number;
// Resolved value of frame.parentFrameId.
parentId: number;
// Resolved value of frame.isMainFrame.
isMain: boolean;
// Stored unresolved, exactly as read from the frame.
isDomContentLoaded: Promise<boolean>;
// Stored unresolved; captureDoms() awaits it before logging.
url: Promise<string>;
// Stored unresolved; captureDoms() awaits it before logging.
name: Promise<string>;
// The frame this node describes.
frame: FrameEnvironment;
// Filled in by captureDoms() with the output of domCapture().
result?: Result;
}
/**
 * Collects a frame's metadata into a FrameNode.
 *
 * The id, main-frame flag and parent id are awaited here; `url`, `name` and
 * `isDomContentLoaded` are stored as read from the frame, without awaiting.
 *
 * @param frame the frame to describe.
 * @returns the populated node (without `result`).
 */
async function getFrameNode(
  frame: FrameEnvironment,
): Promise<FrameNode> {
  const id = await frame.frameId;
  const { url, parentFrameId, name, isDomContentLoaded, isMainFrame } = frame;

  const isMain = await isMainFrame;
  const parentId = await parentFrameId;

  return { id, frame, isMain, parentId, isDomContentLoaded, url, name };
}
/**
 * Runs domCapture against the node's frame via executeJs, stores the outcome
 * on `node.result`, and logs before and after.
 *
 * @param node  frame node to capture; mutated (its `result` is set).
 * @param label free-form tag included in both log lines.
 */
async function captureDoms(node: FrameNode, label: string) {
  const frameUrl = await node.url;
  const frameName = await node.name;
  console.log(`CAPTURE ${label}, url (${frameUrl}), name (${frameName})`);

  const captured = await node.frame.executeJs(domCapture);
  node.result = captured;
  console.log('CAPTURE RESULT', label, node.result);
}
/** String-to-string header map; test() casts the response headers to this shape. */
interface Headers {
// Header name -> header value.
[key: string]: string;
}
/**
 * Waits for the frame to report the 'DomContentLoaded' load state.
 *
 * @param frame     frame to wait on.
 * @param timeoutMs passed through to waitForLoad as its `timeoutMs` option.
 * @returns true when waitForLoad completed; false when it threw (the error is
 *   logged and not rethrown).
 */
async function waitForDomLoad(
  frame: FrameEnvironment,
  timeoutMs: number,
): Promise<boolean> {
  let loaded = true;
  try {
    await frame.waitForLoad('DomContentLoaded', { timeoutMs });
  } catch (error) {
    console.log('ERROR: waitForDomLoad', error);
    loaded = false;
  }
  return loaded;
}
/**
 * Navigates to `requestUrl`, logs the response status and content type, then
 * locates the main frame, waits for its DOM to load and captures it.
 *
 * @param hero       Hero instance to drive.
 * @param requestUrl page to load.
 * @throws Error when no main frame is found among the tab's frames.
 */
async function test(hero: Hero, requestUrl: string) {
  const timeoutMs = 60e3;
  const pageResource = await hero.goto(requestUrl, { timeoutMs });
  const pageResponse = pageResource.response;

  const status: number = pageResponse.statusCode;
  console.log('GOT status', status);

  // Header names are matched case-insensitively: an exact 'Content-Type'
  // lookup finds nothing when the name arrives in lowercase.
  const pageHeaders: Headers = pageResponse.headers as Headers;
  const contentTypeKey = Object.keys(pageHeaders).find(
    key => key.toLowerCase() === 'content-type',
  );
  const contentType: string | undefined =
    contentTypeKey === undefined ? undefined : pageHeaders[contentTypeKey];
  console.log('GOT contentType', contentType);

  const tab = hero.activeTab;
  const frames = await tab.frameEnvironments;
  let mainFrame: FrameNode | undefined;
  for (const frame of frames) {
    const frameNode = await getFrameNode(frame);
    if (frameNode.isMain) mainFrame = frameNode;
  }
  if (mainFrame === undefined) throw new Error('No main frame found'); // should not happen

  // A failed wait is only logged; the capture below is attempted regardless.
  const ready = await waitForDomLoad(mainFrame.frame, 20000);
  console.log('MAIN FRAME READY', ready);
  await captureDoms(mainFrame, 'some_label');
}
/**
 * Entry point: creates a Hero, runs the scrape test against the sample gist,
 * and always releases the Hero instance and the core connection afterwards,
 * even when the test throws.
 */
async function main(): Promise<void> {
  const url = 'https://gist.github.com/1wErt3r/4048722';
  const hero = await newHero();
  try {
    await test(hero, url);
  } finally {
    // Close the connection even if closing the Hero instance fails.
    try {
      await hero.close();
    } finally {
      await closeConnection();
    }
  }
}

// Handle failure explicitly so a failed run does not surface as an unhandled rejection.
main().catch((error: unknown) => {
  console.error('ERROR: main', error);
  process.exitCode = 1;
});