Skip to content

Commit

Permalink
extension: better extractors (#8562)
Browse files Browse the repository at this point in the history
* extension: smart-ish extractors

* fix extraction

* micro-nit
  • Loading branch information
spolu authored Nov 13, 2024
1 parent 5d8cd5c commit 97231b8
Show file tree
Hide file tree
Showing 2 changed files with 292 additions and 1 deletion.
3 changes: 2 additions & 1 deletion extension/app/background.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import {
AUTH0_CLIENT_DOMAIN,
AUTH0_CLIENT_ID,
} from "./src/lib/config";
import { extractPage } from "./src/lib/extraction";
import type {
Auth0AuthorizeResponse,
AuthBackgroundMessage,
Expand Down Expand Up @@ -145,7 +146,7 @@ chrome.runtime.onMessage.addListener(
}
: {
target: { tabId: tab.id },
func: () => document.documentElement.innerText,
func: extractPage(tab.url || ""),
}
)
: [undefined];
Expand Down
290 changes: 290 additions & 0 deletions extension/app/src/lib/extraction.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
/** DOM text parsing */

/**
 * Selects the text extractor to run against a page, based on the page URL.
 *
 * The returned function takes no arguments, reads the global `document` (and `window` for the
 * default extractor) and returns the extracted text as a string.
 *
 * NOTE(review): background.ts passes the returned function as `func` next to a
 * `target: { tabId }`, which looks like a script injection where the function is serialized and
 * re-evaluated in the page. For that reason each extractor below is kept fully self-contained
 * (no references to variables declared outside its own body) — confirm before hoisting helpers.
 *
 * @param url URL of the page to extract. An empty or unparsable URL selects the default
 *   extractor instead of throwing.
 * @returns `gdocsExtractor` for `docs.google.com/document...` URLs, `defaultExtractor` otherwise.
 */
export const extractPage = (url: string) => {
  /**
   * Generic extractor: renders the page body as an indented "text DOM" made of textual [t],
   * clickable [c], input [i] and image [v] entries, prefixed by an explanatory header.
   */
  const defaultExtractor = (): string => {
    type TextDOM = {
      nodeType: "text" | "clickable" | "input" | "image" | null;
      tagName: string | null;
      element: Element;
      text: string | null;
      value: string | null;
      children: TextDOM[];
    };

    // Numeric values of Node.TEXT_NODE / Node.ELEMENT_NODE.
    const TEXT_NODE = 3;
    const ELEMENT_NODE = 1;

    // Subtrees rooted at these tags are dropped entirely.
    const SKIPPED_TAGS = ["SCRIPT", "STYLE"];

    // <input type=...> values that behave like buttons.
    const CLICKABLE_INPUTS = [
      "button",
      "checkbox",
      // "color",
      // "file",
      // "image",
      "radio",
      "reset",
      "submit",
    ];

    // <input type=...> values that accept typed content ("" covers a missing type attribute).
    const FILLABLE_INPUTS = [
      "date",
      "datetime-local",
      "email",
      "month",
      "number",
      "password",
      "range",
      "search",
      "tel",
      "text",
      "time",
      "url",
      "week",
      "",
    ];

    // Marker printed in front of each rendered node kind.
    const MARKERS: Record<NonNullable<TextDOM["nodeType"]>, string> = {
      text: "[t]",
      clickable: "[c]",
      input: "[i]",
      image: "[v]",
    };

    // Element names keep their source casing outside the HTML namespace (e.g. SVG <style>), so
    // compare case-insensitively.
    const isSkipped = (name: string): boolean =>
      SKIPPED_TAGS.includes(name.toUpperCase());

    const inputTypeOf = (el: Element): string => el.getAttribute("type") || "";

    // First non-empty attribute among placeholder, aria-label and data-placeholder.
    const hintOf = (el: Element): string | null =>
      el.getAttribute("placeholder") ||
      el.getAttribute("aria-label") ||
      el.getAttribute("data-placeholder") ||
      null;

    const isClickable = (el: Element): boolean =>
      el.tagName === "A" ||
      el.tagName === "BUTTON" ||
      el.getAttribute("role") === "button" ||
      // el.getAttribute("tabindex") ||
      Boolean(el.getAttribute("onclick")) ||
      Boolean((el as HTMLElement).onclick) ||
      (el.tagName === "INPUT" && CLICKABLE_INPUTS.includes(inputTypeOf(el)));

    const isFillable = (el: Element): boolean =>
      el.tagName === "TEXTAREA" ||
      el.getAttribute("contenteditable") === "true" ||
      (el.tagName === "INPUT" && FILLABLE_INPUTS.includes(inputTypeOf(el)));

    // Builds the (childless) TextDOM node describing a single element. Order matters: an element
    // is an image first, then clickable, then an input, and otherwise a structural node that is
    // only rendered when it carries a placeholder-like hint.
    const nodeFor = (el: Element): TextDOM => {
      const base = {
        tagName: el.tagName,
        element: el,
        children: [] as TextDOM[],
      };

      if (el.tagName === "IMG") {
        // [v]
        return {
          ...base,
          nodeType: "image",
          text: el.getAttribute("alt") || null,
          value: null,
        };
      }
      if (isClickable(el)) {
        // [c] — aria-label is used as text if available.
        return {
          ...base,
          nodeType: "clickable",
          text: el.getAttribute("aria-label"),
          value: null,
        };
      }
      if (isFillable(el)) {
        // [i] — the hint is used as text, the current value is reported separately.
        // TODO(spolu): handle select
        return {
          ...base,
          nodeType: "input",
          text: hintOf(el),
          value: (el as HTMLInputElement).value || null,
        };
      }

      const hint = hintOf(el);
      return {
        ...base,
        nodeType: hint ? "text" : null,
        text: hint,
        value: null,
      };
    };

    // Appends, in document order, one TextDOM child to `tree` per non-empty text node and per
    // element found directly under `element`, recursing into elements.
    const traverseDOM = (element: Element, tree: TextDOM): void => {
      if (isSkipped(element.tagName)) {
        return;
      }

      for (const node of Array.from(element.childNodes)) {
        if (isSkipped(node.nodeName)) {
          continue;
        }

        if (node.nodeType === TEXT_NODE) {
          // [t]
          const text = node.textContent?.trim();
          if (text) {
            tree.children.push({
              nodeType: "text",
              // `node` comes from `element.childNodes`, so `element` is its parent.
              element,
              tagName: null,
              text,
              value: null,
              children: [],
            });
          }
        } else if (node.nodeType === ELEMENT_NODE) {
          // [v] | [c] | [i] | null
          const el = node as Element;
          const child = nodeFor(el);
          tree.children.push(child);
          traverseDOM(el, child);
        }
      }
    };

    // Renders `node` and its descendants, one line per node with a non-null type. Indentation
    // only grows below a node that has at least one directly renderable child.
    const renderTree = (node: TextDOM, indent: string): string => {
      const lines: string[] = [];

      if (node.nodeType !== null) {
        let line = MARKERS[node.nodeType];
        if (node.text) {
          line += ` ${node.text}`;
        }
        if (node.nodeType === "input" && node.value) {
          line += ` {value: "${node.value}"}`;
        }
        lines.push(`${indent}${line}`);
      }

      const hasNonNullChild = node.children.some(
        (child) => child.nodeType !== null
      );
      const childIndent = hasNonNullChild ? indent + " " : indent;

      for (const child of node.children) {
        const rendered = renderTree(child, childIndent);
        if (rendered) {
          lines.push(rendered);
        }
      }

      return lines.join("\n");
    };

    const root: TextDOM = {
      nodeType: null,
      element: document.body,
      tagName: null,
      text: null,
      value: null,
      children: [],
    };
    traverseDOM(document.body, root);

    const header = [
      '// The following "text DOM" page representation is computed by striping everything but textual [t],',
      "// clickable [c], input [i], and image [v] elements. The origin DOM structure is preserved through",
      "// the indentation of these elements in the text DOM representation.",
      "//",
      `// URL: ${window.location.href}`,
    ].join("\n");

    return `${header}\n${renderTree(root, "")}`;
  };

  /**
   * Google Docs extractor. Falls back to `document.body.innerText` when no chunk can be decoded.
   */
  const gdocsExtractor = (): string => {
    // What an incredible hack. Docs always have script tags with malformed JS that contain
    // `DOCS_modelChunk`s, which can be parsed to reconstruct the entire plain-text doc.
    const CHUNK_PREFIX = "DOCS_modelChunk =";
    const parts: string[] = [];

    for (const script of Array.from(document.scripts)) {
      try {
        const source = script.innerHTML.toString();
        if (!source.startsWith(CHUNK_PREFIX)) {
          continue;
        }

        // Take everything after the prefix. (`split("=", 2)[1]` would stop at the next "=",
        // truncating any chunk whose text contains one.)
        const payload = source.slice(CHUNK_PREFIX.length).trim();

        // Keep only the first object of the chunk array and close the array by hand.
        const parsed: unknown = JSON.parse(payload.split("},{")[0] + "}]");
        const first: unknown = Array.isArray(parsed) ? parsed[0] : undefined;
        const text =
          typeof first === "object" && first !== null
            ? (first as { s?: unknown }).s
            : undefined;

        if (typeof text === "string" && text.length > 0) {
          parts.push(text);
        }
      } catch {
        // Best effort: a chunk that cannot be read or parsed is skipped.
      }
    }

    const content = parts.join("\n");
    if (content.length > 0) {
      return content;
    }

    return document.body.innerText;
  };

  // The caller may hand over "" when the tab has no URL; `new URL` throws on such input, so
  // treat anything unparsable as "no special handling".
  let parsedUrl: URL | null = null;
  try {
    parsedUrl = new URL(url);
  } catch {
    parsedUrl = null;
  }

  if (
    parsedUrl !== null &&
    parsedUrl.host === "docs.google.com" &&
    parsedUrl.pathname.startsWith("/document")
  ) {
    return gdocsExtractor;
  }
  return defaultExtractor;
};

0 comments on commit 97231b8

Please sign in to comment.