Commit b80fa77
add trampolines tutorial
1 parent 402e56d

File tree: 5 files changed, +148 -0


trampolines/README.md

Lines changed: 21 additions & 0 deletions
## Trampolines tutorial

What if you have a complex implementation of some logic that needs to be fed data from various sources, and that has to run synchronously or asynchronously depending on where you use it?
Should you duplicate the logic? Should you split it into many small functions? What if there's a complex set of conditions?

Implement `crawlAndProcessSync` and `crawlAndProcessAsync` in `crawler.js` without duplicating the main crawling implementation and without making significant changes to the body of the `crawlAndProcess` function.

![](https://i.giphy.com/media/v1.Y2lkPTc5MGI3NjExZmJ6NHAwMHVpdm5ycGVzNDhnemR1NWlnNzBqMGZnd2J3aXlqbjM0eiZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/n4FpwV0JxhYjtBjn4U/giphy.gif)

### Tips

Train of thought:
- The key idea: we need to outsource awaiting (or not awaiting) from `crawlAndProcess` to the Sync/Async functions.
- The logic still has to stop whenever something potentially asynchronous runs.
- What other primitive do we know that can pause execution mid-function?
- Yes, that's the starting point. Now implement wrappers on the outside of the function (see the sketch below).
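
If you landed on generators, here is a minimal sketch of the idea on a made-up `getUser` example (not the crawler itself): the generator yields whenever it reaches a potentially asynchronous step, and a pair of drivers decides whether to await what was yielded. The names `getUser`, `load`, `runSync`, and `runAsync` are illustrative only, not part of the exercise files.

```js
// Minimal trampoline sketch; all names here are made up for illustration.
function* getUser(id, { load }) {
  // Pause here so the driver can decide whether to await the result of `load`.
  const record = yield load(id);
  return { id, name: record.name };
}

// Async driver: awaits every yielded value before resuming the generator.
async function runAsync(iterator) {
  let result = iterator.next();
  while (!result.done) {
    result = iterator.next(await result.value);
  }
  return result.value;
}

// Sync driver: passes yielded values straight back in, never awaiting.
function runSync(iterator) {
  let result = iterator.next();
  while (!result.done) {
    result = iterator.next(result.value);
  }
  return result.value;
}

// Same generator body, two execution modes (the `load` implementations are hypothetical):
// const user1 = await runAsync(getUser(1, { load: (id) => fetchUserFromDb(id) }));
// const user2 = runSync(getUser(1, { load: (id) => usersInMemory.get(id) }));
```

The same pair of drivers can then be wrapped around a generator version of the function you actually care about.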

Let me guess, you broke recursion? You need to forward all values from it, not only the first one.
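
To illustrate that hint with a toy example unrelated to the crawler: inside a generator, a plain `yield` on a recursive call hands the driver the inner generator object itself, once; `yield*` delegates to it and forwards every value it yields.

```js
// Toy illustration of forwarding values from a recursive generator call.
function* countdown(n) {
  if (n < 0) return;
  yield n;
  // yield countdown(n - 1);  // broken: yields the inner generator object, once
  yield* countdown(n - 1); // delegation: forwards every value the inner call yields
}

console.log([...countdown(3)]); // [3, 2, 1, 0]
```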

trampolines/app.js

Lines changed: 40 additions & 0 deletions
1+
import dns from "dns/promises";
2+
import { crawlAndProcessAsync, crawlAndProcessSync } from "./crawler.js";
3+
import { inMemoryFetch } from "./fixture.js";
4+
5+
const deny = ["127.0.0.1", "192.168.0.1", "localhost"];
6+
7+
function basicDenyFilter(url) {
8+
const domain = new URL(url).hostname;
9+
return !deny.includes(domain);
10+
}
11+
12+
async function dnsDenyFilter(url) {
13+
const domain = new URL(url).hostname;
14+
try {
15+
const addresses = await dns.resolve(domain);
16+
const isDenied = addresses.some((address) => deny.includes(address));
17+
if (!isDenied) {
18+
return true;
19+
}
20+
} catch (error) {
21+
console.error(`DNS lookup failed for ${domain}:`, error);
22+
}
23+
return false;
24+
}
25+
26+
// Example usage with DNS lookup filter and actual fetch
27+
const sitemap1 = await crawlAndProcessAsync({
28+
url: "https://example.com",
29+
filterURL: dnsDenyFilter,
30+
download: (url) => fetch(url).then((response) => response.text()),
31+
});
32+
console.log(sitemap1);
33+
34+
// Synchronous usage in a faked environment
35+
const sitemap2 = crawlAndProcessSync({
36+
url: "https://example.com",
37+
filterURL: basicDenyFilter,
38+
download: inMemoryFetch,
39+
});
40+
console.log(sitemap2);

trampolines/crawler.js

Lines changed: 58 additions & 0 deletions
1+
export async function crawlAndProcessAsync(options) {
2+
return crawlAndProcess(options);
3+
}
4+
5+
export function crawlAndProcessSync(options) {
6+
return crawlAndProcess(options);
7+
}
8+
9+
async function crawlAndProcess({
10+
url,
11+
filterURL,
12+
download,
13+
14+
depth = 2,
15+
visited = new Set(),
16+
sitemap = {},
17+
memo = new Map(),
18+
}) {
19+
if (depth < 1) return;
20+
if (visited.has(url)) return;
21+
visited.add(url);
22+
23+
const html = await download(url);
24+
25+
// Extract URLs from the HTML content
26+
const urlRegex = /https:\/\/[^\s"'<]+/g;
27+
const urls = html.match(urlRegex) || [];
28+
29+
// Filter URLs using the filterURL function and memoize results
30+
const filteredUrls = [];
31+
for (const url of urls) {
32+
if (!memo.has(url)) {
33+
const isAllowed = await filterURL(url);
34+
memo.set(url, isAllowed);
35+
}
36+
if (memo.get(url)) {
37+
filteredUrls.push(url);
38+
}
39+
}
40+
41+
// Add the current URL and its filtered URLs to the sitemap
42+
sitemap[url] = filteredUrls;
43+
44+
// Recursively crawl filtered URLs
45+
for (const filteredUrl of filteredUrls) {
46+
await crawlAndProcess({
47+
url: filteredUrl,
48+
filterURL,
49+
download,
50+
depth: depth - 1,
51+
visited,
52+
sitemap,
53+
memo,
54+
});
55+
}
56+
57+
return sitemap;
58+
}

trampolines/fixture.js

Lines changed: 26 additions & 0 deletions
1+
export function inMemoryFetch(url) {
2+
const html = examplePages.get(url);
3+
if (!html) {
4+
throw new Error(`URL not found in examplePages: ${url}`);
5+
}
6+
return html;
7+
}
8+
9+
export const examplePages = new Map([
10+
[
11+
"https://example.com",
12+
`<a href="https://example.com/page1">Page 1</a>
13+
<a href="https://localhost:8080/">You</a>
14+
<a href="https://naugtur.pl/">naugtur</a>`,
15+
],
16+
[
17+
"https://example.com/page1",
18+
`<a href="https://example.com/page2">Page 2</a>`,
19+
],
20+
[
21+
"https://example.com/page2",
22+
`<a href="https://example.com/page1">Page 1</a>`,
23+
],
24+
["https://example.com/page3", "No more links here."],
25+
["https://naugtur.pl/", "Hello!"],
26+
]);

trampolines/package.json

Lines changed: 3 additions & 0 deletions
1+
{
2+
"type": "module"
3+
}
