diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 844771f..0000000 --- a/.editorconfig +++ /dev/null @@ -1,9 +0,0 @@ -root = true - -[*] -indent_style = tab -indent_size = 2 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -charset = utf-8 diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml deleted file mode 100644 index ac919a0..0000000 --- a/.github/workflows/gh-pages.yml +++ /dev/null @@ -1,48 +0,0 @@ -# If you’d like to deploy this to GitHub pages, rename this -# file to `gh-pages.yml` and read the mini-tutorial on -# https://www.11ty.dev/docs/deployment/#deploy-an-eleventy-project-to-github-pages -name: Deploy to GitHub Pages - -on: - push: - branches: - - main - pull_request: - -jobs: - deploy: - runs-on: ubuntu-22.04 - permissions: - contents: write - concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - steps: - - uses: actions/checkout@v3 - - - name: Setup Node - uses: actions/setup-node@v3 - with: - node-version: '18' - - - name: Cache npm - uses: actions/cache@v3 - with: - path: ~/.npm - key: ${{ runner.os }}-node-${{ hashFiles('**/package.json') }} - - - name: Cache Eleventy .cache - uses: actions/cache@v3 - with: - path: ./.cache - key: ${{ runner.os }}-eleventy-fetch-cache - - - - run: npm install - - run: npm run build-ghpages - - - name: Deploy - uses: peaceiris/actions-gh-pages@v3 - if: github.ref == 'refs/heads/main' - with: - github_token: ${{ secrets.GITHUB_TOKEN }} - publish_dir: ./_site diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 09fb98d..0000000 --- a/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -_site/ -node_modules/ -package-lock.json -.cache -**/.DS_Store \ No newline at end of file diff --git a/.nvmrc b/.nvmrc deleted file mode 100644 index b6a7d89..0000000 --- a/.nvmrc +++ /dev/null @@ -1 +0,0 @@ -16 diff --git a/404.html b/404.html new file mode 100644 index 0000000..3b47aea --- /dev/null +++ b/404.html @@ -0,0 +1,317 @@ + + + + + + Damien C. Tanner + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ +

Content not found. #

+

Go home.

+ + + +
+ + + + + + diff --git a/README.md b/README.md deleted file mode 100644 index 9426f25..0000000 --- a/README.md +++ /dev/null @@ -1,121 +0,0 @@ -# eleventy-base-blog v8 - -A starter repository showing how to build a blog with the [Eleventy](https://www.11ty.dev/) site generator (using the [v2.0 release](https://www.11ty.dev/blog/eleventy-v2/)). - -## Getting Started - -* [Want a more generic/detailed getting started guide?](https://www.11ty.dev/docs/getting-started/) - -1. Make a directory and navigate to it: - -``` -mkdir my-blog-name -cd my-blog-name -``` - -2. Clone this Repository - -``` -git clone https://github.com/11ty/eleventy-base-blog.git . -``` - -_Optional:_ Review `eleventy.config.js` and `_data/metadata.js` to configure the site’s options and data. - -3. Install dependencies - -``` -npm install -``` - -4. Run Eleventy - -Generate a production-ready build to the `_site` folder: - -``` -npx @11ty/eleventy -``` - -Or build and host on a local development server: - -``` -npx @11ty/eleventy --serve -``` - -Or you can run [debug mode](https://www.11ty.dev/docs/debugging/) to see all the internals. - -## Features - -- Using [Eleventy v2.0](https://www.11ty.dev/blog/eleventy-v2/) with zero-JavaScript output. - - Content is exclusively pre-rendered (this is a static site). - - Can easily [deploy to a subfolder without changing any content](https://www.11ty.dev/docs/plugins/html-base/) - - All URLs are decoupled from the content’s location on the file system. - - Configure templates via the [Eleventy Data Cascade](https://www.11ty.dev/docs/data-cascade/) -- **Performance focused**: four-hundos Lighthouse score out of the box! - - [View the Lighthouse report for the latest build](https://eleventy-base-blog.netlify.app/reports/lighthouse/) courtesy of the [Netlify Lighthouse plugin](https://github.com/netlify/netlify-plugin-lighthouse). - - _0 Cumulative Layout Shift_ - - _0ms Total Blocking Time_ -- Local development live reload provided by [Eleventy Dev Server](https://www.11ty.dev/docs/dev-server/). -- Content-driven [navigation menu](https://www.11ty.dev/docs/plugins/navigation/) -- [Image optimization](https://www.11ty.dev/docs/plugins/image/) via the `{% image %}` shortcode. - - Zero-JavaScript output. - - Support for modern image formats automatically (e.g. AVIF and WebP) - - Prefers `` markup if possible (single image format) but switches automatically to `` for multiple image formats. - - Automated `` syntax markup with `srcset` and optional `sizes` - - Includes `width`/`height` attributes to avoid [content layout shift](https://web.dev/cls/). - - Includes `loading="lazy"` for native lazy loading without JavaScript. - - Includes [`decoding="async"`](https://developer.mozilla.org/en-US/docs/Web/API/HTMLImageElement/decoding) - - Images can be co-located with blog post files. - - View the [Image plugin source code](https://github.com/11ty/eleventy-base-blog/blob/main/eleventy.config.images.js) -- Per page CSS bundles [via `eleventy-plugin-bundle`](https://github.com/11ty/eleventy-plugin-bundle). -- Built-in [syntax highlighter](https://www.11ty.dev/docs/plugins/syntaxhighlight/) (zero-JavaScript output). -- Blog Posts - - Draft posts: use `draft: true` to mark a blog post as a draft. Drafts are **only** included during `--serve`/`--watch` and are excluded from full builds. View the [Drafts plugin source code](https://github.com/11ty/eleventy-base-blog/blob/main/eleventy.config.drafts.js). 
- - Automated next/previous links - - Accessible deep links to headings -- Generated Pages - - Home, Archive, and About pages. - - [Feeds for Atom and JSON](https://www.11ty.dev/docs/plugins/rss/) - - `sitemap.xml` - - Zero-maintenance tag pages ([View on the Demo](https://eleventy-base-blog.netlify.app/tags/)) - - Content not found (404) page - -## Demos - -- [Netlify](https://eleventy-base-blog.netlify.com/) -- [GitHub Pages](https://11ty.github.io/eleventy-base-blog/) -- [Remix on Glitch](https://glitch.com/~11ty-eleventy-base-blog) -- [Cloudflare Pages](https://eleventy-base-blog-d2a.pages.dev/) - -## Deploy this to your own site - -Deploy this Eleventy site in just a few clicks on these services: - -- [Deploy this to **Netlify**](https://app.netlify.com/start/deploy?repository=https://github.com/11ty/eleventy-base-blog) -- [Deploy this to **Vercel**](https://vercel.com/import/project?template=11ty%2Feleventy-base-blog) -- Look in `.github/workflows/gh-pages.yml.sample` for information on Deploying to **GitHub Pages**. -- [Try it out on **Stackblitz**](https://stackblitz.com/github/11ty/eleventy-base-blog) -- If you run Eleventy locally you can drag your `_site` folder to [`drop.netlify.com`](https://drop.netlify.com/) to upload it without using `git`. -- Read more about [Deploying an Eleventy project](https://www.11ty.dev/docs/deployment/) to the web. - -### Implementation Notes - -- `content/about/index.md` is an example of a content page. -- `content/blog/` has the blog posts but really they can live in any directory. They need only the `posts` tag to be included in the blog posts [collection](https://www.11ty.dev/docs/collections/). -- Use the `eleventyNavigation` key (via the [Eleventy Navigation plugin](https://www.11ty.dev/docs/plugins/navigation/)) in your front matter to add a template to the top level site navigation. This is in use on `content/index.njk` and `content/about/index.md`. -- Content can be in _any template format_ (blog posts needn’t exclusively be markdown, for example). Configure your project’s supported templates in `eleventy.config.js` -> `templateFormats`. -- The `public` folder in your input directory will be copied to the output folder (via `addPassthroughCopy` in the `eleventy.config.js` file). This means `./public/css/*` will live at `./_site/css/*` after your build completes. -- Provides two content feeds: - - `content/feed/feed.njk` - - `content/feed/json.njk` -- This project uses three [Eleventy Layouts](https://www.11ty.dev/docs/layouts/): - - `_includes/layouts/base.njk`: the top level HTML structure - - `_includes/layouts/home.njk`: the home page template (wrapped into `base.njk`) - - `_includes/layouts/post.njk`: the blog post template (wrapped into `base.njk`) -- `_includes/postslist.njk` is a Nunjucks include and is a reusable component used to display a list of all the posts. `content/index.njk` has an example of how to use it. - -#### Content Security Policy - -If your site enforces a [Content Security Policy](https://developer.mozilla.org/en-US/docs/Web/HTTP/CSP) (as public-facing sites should), you have a few choices (pick one): - -1. In `base.njk`, remove `` and uncomment `` -2. Configure the server with the CSP directive `style-src: 'unsafe-inline'` (less secure). diff --git a/_data/metadata.js b/_data/metadata.js deleted file mode 100644 index 7cb3c06..0000000 --- a/_data/metadata.js +++ /dev/null @@ -1,9 +0,0 @@ -module.exports = { - title: "Damien C. 
Tanner", - url: "https://dc.tanner.me/", - language: "en", - description: "Notes from a journey of compounding curiosity.", - author: { - name: "Damien C. Tanner", - }, -}; diff --git a/_includes/layouts/base.njk b/_includes/layouts/base.njk deleted file mode 100644 index 2db5025..0000000 --- a/_includes/layouts/base.njk +++ /dev/null @@ -1,58 +0,0 @@ - - - - - - {{ title or metadata.title }} - - - {#- Atom and JSON feeds included by default #} - - - - {#- Uncomment this if you’d like folks to know that you used Eleventy to build your site! #} - {#- #} - - {#- - CSS bundles are provided via the `eleventy-plugin-bundle` plugin: - 1. You can add to them using `{% css %}` - 2. You can get from them using `{% getBundle "css" %}` or `{% getBundleFileUrl "css" %}` - 3. You can do the same for JS: {% js %}{% endjs %} and - 4. Learn more: https://github.com/11ty/eleventy-plugin-bundle - #} - - {#- Add an arbitrary string to the bundle #} - {%- css %}* { box-sizing: border-box; }{% endcss %} - {#- Add the contents of a file to the bundle #} - {%- css %}{% include "public/css/index.css" %}{% endcss %} - {#- Or add from node_modules #} - {# {%- css %}{% include "node_modules/prismjs/themes/prism-okaidia.css" %}{% endcss %} #} - - {#- Render the CSS bundle using Inlined CSS (for the fastest site performance in production) #} - - {#- Renders the CSS bundle using a separate file, if you can't set CSP directive style-src: 'unsafe-inline' #} - {#- #} - - - Skip to main content - -
- {{ metadata.title }} - - {#- Read more about `eleventy-navigation` at https://www.11ty.dev/docs/plugins/navigation/ #} - -
- -
- {{ content | safe }} -
- -
- - - - diff --git a/_includes/layouts/home.njk b/_includes/layouts/home.njk deleted file mode 100644 index 35df4f1..0000000 --- a/_includes/layouts/home.njk +++ /dev/null @@ -1,5 +0,0 @@ ---- -layout: layouts/base.njk ---- - -{{ content | safe }} diff --git a/_includes/layouts/post.njk b/_includes/layouts/post.njk deleted file mode 100644 index 9543d2d..0000000 --- a/_includes/layouts/post.njk +++ /dev/null @@ -1,28 +0,0 @@ ---- -layout: layouts/base.njk ---- -{# Only include the syntax highlighter CSS on blog posts #} -{%- css %}{% include "node_modules/prismjs/themes/prism-okaidia.css" %}{% endcss %} -{%- css %}{% include "public/css/prism-diff.css" %}{%- endcss %} -

{{ title }}

- - - -{{ content | safe }} - -{%- if collections.posts %} -{%- set previousPost = collections.posts | getPreviousCollectionItem %} -{%- set nextPost = collections.posts | getNextCollectionItem %} -{%- if nextPost or previousPost %} - -{%- endif %} -{%- endif %} diff --git a/_includes/postslist.njk b/_includes/postslist.njk deleted file mode 100644 index 99272c5..0000000 --- a/_includes/postslist.njk +++ /dev/null @@ -1,9 +0,0 @@ -{%- css %}.postlist { counter-reset: start-from {{ (postslistCounter or postslist.length) + 1 }} }{% endcss %} -
    -{% for post in postslist | reverse %} -
  1. - {% if post.data.title %}{{ post.data.title }}{% else %}{{ post.url }}{% endif %} - -
  2. -{% endfor %} -
diff --git a/blog/building-an-ai-superserver/index.html b/blog/building-an-ai-superserver/index.html new file mode 100644 index 0000000..36166a2 --- /dev/null +++ b/blog/building-an-ai-superserver/index.html @@ -0,0 +1,543 @@ + + + + + + Building an AI SuperServer for LLM training and experiments + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ +

Building an AI SuperServer for LLM training and experiments

+ + + +

Impressive new language models like Llama and Mistral have broadened the accessibility of AI training. If you want to fine-tune a model with your own data, it's now relatively easy to do with tools like Axolotl and a few dollars spent on a GPU cloud. But if you want to go deeper and train larger models or try new methods, the cloud bill can quickly rack up. Renting 8 A100s on AWS will set you back an astounding $350,000 per year! There are cheaper clouds, but they can still cost tens of thousands a year.

+

I've always enjoyed building PCs. I remember when I was 16 and my grandma bought me my first PC to assemble myself. So in the name of fun and saving money, I embarked on building an AI server so that I can more affordably do independent AI research.

+

Your options #

+

Depending on your budget and use case, there are a few routes to take when building an AI server.

+

Open frame #

+

Miner style

+

If the server is just for you and you want to keep it at home or in your basement, the most affordable option is essentially a powerful consumer PC with an open-frame case (originally designed for crypto miners). You'll be able to find a lot of advice on Reddit for this route.

+

The important things are a motherboard with plenty of x16 PCIe slots, PCIe risers with redrivers, and multiple PSUs (depending on the number of GPUs you choose). You can buy everything second hand if you like, including the GPUs. For GPUs you're best off with RTX 3090s or 4090s in this setup, and because there's no case, you won't have issues with space or airflow.

+

The benefit of this route is cost, but also the ability to start simple with a single GPU and grow as you desire by adding more.

+

Rack server #

+

Server style

+

If you're planning to train larger models, run more servers or datacenter GPUs, or just don't have anywhere to house a noisy, hot server at home, you can go the rack-mountable server route. This is the route I've gone, as our house doesn't have a basement and our internet isn't that fast. My server now lives in a datacenter where it's cooled and well connected.

+

I found fewer resources for this route, so the rest of this guide is aimed at helping you build and set up a rack-mountable GPU server.

+

Building the server #

+

Supermicro make great server systems, many built specifically for AI use cases. For example, the SuperServer 4029GP-TRT2 is a mid-range 4U dual-CPU server with 10 PCIe slots - ideal for filling with GPUs! I found a well-priced one from an IT supplier in the UK. The newer model is more expensive, but may be easier to find. Note that the model I used only has PCIe 3.0. If you are using RTX 4090s or a newer datacenter GPU, you will probably want the newer model, which supports PCIe 4.0.

+

SuperServer 4029GP-TRT2

+

It arrived at my house on a pallet. It was heavier than I expected!

+

The pallet

+

After lugging it up the stairs and reading the manual, I installed 10 RTX 3090s I bought second hand from someone who had previously been using them for mining. Note that to fit the maximum number of GPUs in a system you'll need to find blower- or turbo-style GPUs that are only two slots wide. The vast majority of 3090 and 4090 GPUs are gaming cards: they take up three slots, the power connectors exit from the top, and you won't be able to put the lid back on your server. If you can't find blower consumer GPUs, your next best bet is the RTX A6000, which is still fairly good value for money, even if it costs around 3x more than a 4090.

+

You'll also need to add the CPUs (two of them), memory and storage. I sourced everything secondhand from eBay. Most things cost no more than a few hundred dollars each. I went with 2x Intel Xeon Platinum 8160, 12x32GB DDR memory and an 8TB SSD.

+

Once everything was installed, I turned it on for the first time - what I heard could only be described as a mini jet engine. Server fans are noisy.

+

The next step was to set up the OS and environment.

+

Setting up the OS #

+

Supermicro servers have an inbuilt web UI called IPMI for accessing the server console and monitor output. There is a dedicated LAN port for IPMI on this server. You should also plug a second LAN cable into one of the main LAN ports, otherwise your server won't actually have internet access (this confused me initially).

+

It will get an IP via DHCP, so I just logged into my router to see the IP it was assigned and visited that in my browser. You'll be asked to log in: the username is 'ADMIN' and the password is printed on stickers in several places inside your server case.

+

I decided to install Ubuntu 22.04 Server. Create a bootable Ubuntu USB stick and plug it into the server. Now connect to the web UI console by going to the server's IP, clicking Remote Control > iKVM/HTML5 and clicking the button. You can now reboot the server and you'll see the BIOS screen, where you can hit an F key to choose a boot drive. Do this and select the USB.

+

The IPMI web console doesn't support pasting text. So getting your ssh pubkey over is a bit of a pain. Here's a solution I've used:

+
    +
  1. On your local computer which has your ssh pubkey on it, run cd .ssh && python -m http.server (be aware that this serves your private key over http without authentication, which isn't a great idea; a slightly safer variant is sketched after this list).
  2. +
  3. On the server, via the IPMI web console, login with the user you created when installing Ubuntu, and run wget -qO - "http://192.168.178.21:8000/id_ed25519.pub" > ~/.ssh/authorized_keys && chmod 600 .ssh/authorized_keys.
  4. +
  5. You should now be able to ssh into your server. Remember to stop the python -m http.server on your local computer now.
  6. +
+
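A slightly safer variant of step 1 (a sketch, assuming Python 3.7+ for the --directory flag) is to serve only the public key from a throwaway directory rather than the whole .ssh directory:
+
tmp=$(mktemp -d) && cp ~/.ssh/id_ed25519.pub "$tmp" && python3 -m http.server 8000 --directory "$tmp"
+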

Important system tweaks #

+

There are some tweaks we can make to improve the performance and reliability of the server. Following the tips here (archived page if Medium paywalls that page), first disable the kernel's security mitigations: on a dedicated compute box the performance penalty costs more than the risk it removes. Edit /etc/default/grub and add:

+
GRUB_CMDLINE_LINUX_DEFAULT="pti=off spectre_v2=off l1tf=off nospec_store_bypass_disable no_stf_barrier"
+
+

It's also critical to disable the IOMMU if you plan to use peer-to-peer GPU communication, e.g. for multi-GPU model training in TensorFlow or PyTorch. Also add to /etc/default/grub:

+
GRUB_CMDLINE_LINUX_DEFAULT="intel_iommu=off rcutree.rcu_idle_gp_delay=1"
+
+
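Since both sets of flags live in the same GRUB_CMDLINE_LINUX_DEFAULT variable, combine them into one assignment rather than adding two competing lines (the later one would override the earlier). A sketch of the combined entry and the commands to apply it, assuming Ubuntu's standard GRUB tooling:
+
GRUB_CMDLINE_LINUX_DEFAULT="pti=off spectre_v2=off l1tf=off nospec_store_bypass_disable no_stf_barrier intel_iommu=off rcutree.rcu_idle_gp_delay=1"
sudo update-grub
sudo reboot
+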

Check GPU P2P communication #

+

If you're using GPUs that support it, P2P communication speeds things up a lot.

+

Note that it's important to check that PCI Access Control Services (ACS) is disabled.

+

You can follow these steps to test your system's GPU P2P speed: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#gpu-to-gpu-communication

+
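A quick way to sanity-check the GPU topology and ACS state from the shell (a sketch; the exact lspci output varies by system):
+
# Show how the GPUs are connected to each other
nvidia-smi topo -m
# Any ACSCtl line showing SrcValid+ means ACS is still enabled on that PCIe bridge
sudo lspci -vvv | grep -i acsctl
+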

NVIDIA drivers and python environment #

+

We now want to get the NVIDIA drivers, CUDA and our Python environments set up.

+

I've had success using these steps to install CUDA v11.8: https://gist.github.com/MihailCosmin/affa6b1b71b43787e9228c25fe15aeba
Some people have mentioned using a newer NVIDIA driver version than the nvidia-driver-515 in the script. But beware: there's a bug in driver version 545 that prevents 3090 and 4090 cards from using P2P (see this github issue for a discussion of the problem). If you have a driver with this bug, you may find your training run stalls and times out. Version 535 worked well for me.

+
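If you suspect you've hit the P2P driver bug, NCCL's debug logging and its P2P kill-switch are handy for confirming it (a sketch; train.py stands in for whatever launches your training run):
+
# Verbose NCCL logging, then a run with P2P disabled to see whether the stall goes away
NCCL_DEBUG=INFO python train.py
NCCL_P2P_DISABLE=1 python train.py
+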

I like to use Conda with the fastchan channel for my environments. But you may enjoy a different python virtual env tool.

+

Now you can train some AI #

+

nvidia-smi

+

I'm enjoying using Axolotl for LLM fine tuning. HuggingFace Transformers is also a great place to start.

+

Datacenter trip #

+

Datacenter

+

Since the GPUs are super noisy and hot, I found a local datacenter that would colocate it for a reasonable cost. Installation was easier than I expected, although we ended up putting it on a lower slot on the rack because it was too heavy to lift half way up without a lift.

+

This Colocation Survival Guide was super helpful, as it walks you through all the aspects of colocating, from the physical setup to networking.

+

Other things #

+

Set a lower max power limit for GPUs #

+

Some people find that lowering the power limit just a bit will reduce the maximum temperature without any real performance sacrifice. I set the max power for my RTX 3090s to 300W (from 305W) by following these steps.

+
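For reference, the nvidia-smi commands involved look like this (a sketch; the limit resets at reboot unless applied from a startup service as described in the linked steps):
+
# Check the current and allowed power limits
nvidia-smi -q -d POWER | grep -i 'power limit'
# Enable persistence mode and cap every GPU at 300 W
sudo nvidia-smi -pm 1
sudo nvidia-smi -pl 300
+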

Docker bug workaround #

+

If you're planning to use Docker with the GPUs, note there's a bug on Ubuntu 22.04 which needs working around.

+

Going bigger? #

+

If you're planning to build a cluster, there is an excellent video from the Lambda team: Building a GPU cluster for AI.

+ + + +
+ +
+ + + + diff --git a/blog/index.html b/blog/index.html new file mode 100644 index 0000000..cab32ec --- /dev/null +++ b/blog/index.html @@ -0,0 +1,327 @@ + + + + + + Damien C. Tanner + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ +

Writing

+ + +
    + +
  1. + LLM tool calling as code blocks + +
  2. + +
  3. + Using LLM tool calling and long context for better RAG + +
  4. + +
  5. + Building an AI SuperServer for LLM training and experiments + +
  6. + +
+ + +
+ +
+ + + + diff --git a/blog/tools-as-code/index.html b/blog/tools-as-code/index.html new file mode 100644 index 0000000..ede1a2c --- /dev/null +++ b/blog/tools-as-code/index.html @@ -0,0 +1,632 @@ + + + + + + LLM tool calling as code blocks + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ +

LLM tool calling as code blocks

+ + + +

When building sophisticated agents that have more than a handful of tools to call, I've often found the inbuilt structured output/JSON tool calling methods provided by LLM APIs come up short.

+

Intuitively, one of the reasons for this is that when structured output is enabled, there is no room for chain of thought, free text or comments amongst the JSON output of tool calls.

+

In addition, when you're trying to compare different LLM APIs, you have to switch between different tool calling schemas.

+

LLMs are trained on a lot of code, and a tool call is really just a function call. So why not just use code blocks for tool calls?

+

When you use LLMs to output code with tool calls, you may initially think of running the code in a sandbox. But that comes with infra overhead and security concerns. Instead, what if we just parse tool calls in code blocks with regex, and then validate the function names and params before calling the internal functions?

+

I've had great results with this approach. It's easy to implement and it works with any LLM (including open source models). The chain of thought and comments next to the tool calls are especially helpful when debugging, as the LLM will explain why it decided to call a particular tool with those params.

+

You can even do clever stuff like parse the text stream from the LLM as it comes in, and call tools as they are returned, instead of waiting for the LLM to finish.

+
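Here's a minimal sketch of that streaming variant using the AI SDK's streamText (the buffering and de-duplication are simplified, and onToolCall stands in for the validation/dispatch logic shown in the full example below):
+
+import { streamText } from "ai";
+import { openai } from "@ai-sdk/openai";
+
+const toolsCallRegex = /(\w+)\(([^()]*(?:\([^()]*\)[^()]*)*)\)/g;
+
+const streamAndCallTools = async (
+	prompt: string,
+	onToolCall: (name: string, args: string) => void,
+) => {
+	const { textStream } = await streamText({ model: openai("gpt-4o"), prompt });
+	let buffer = "";
+	const alreadyCalled = new Set<string>();
+	for await (const chunk of textStream) {
+		buffer += chunk;
+		// Re-scan the buffer on each chunk; only complete calls (closing ")" received) will match
+		for (const match of buffer.matchAll(toolsCallRegex)) {
+			const [call, name, args] = match;
+			if (name && !alreadyCalled.has(call)) {
+				alreadyCalled.add(call);
+				onToolCall(name, args);
+			}
+		}
+	}
+};
+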

Here's an example of this approach that makes use of Zod and the Vercel AI SDK:

+
// Run this example:
+// npm i zod zod-to-ts relaxed-json ai @ai-sdk/openai
+// tsx tool-calls-as-ts-example.ts
+
+import { z } from "zod";
+import RJSON from "relaxed-json";
+import { printNode, zodToTs } from "zod-to-ts";
+
+type ToolsList = {
+	[key: string]: { name: string; schema: z.ZodType<unknown> };
+};
+
+const getToolsAsTypeScriptString = (toolsList: ToolsList) =>
+	Object.entries(toolsList)
+		.map(([toolName, { name, schema }]) => {
+			const { node } = zodToTs(schema, toolName);
+			const nodeString = printNode(node);
+			const tsDefAsString = `/** ${name} */ \n${toolName}(${nodeString})`;
+			return tsDefAsString;
+		})
+		.join("\n\n");
+
+const parseToolsCalledContent = ({
+	llmResponseWithToolCallsAsJsCodeblock,
+	toolsList,
+}: {
+	llmResponseWithToolCallsAsJsCodeblock: string;
+	toolsList: ToolsList;
+}) => {
+	const toolsCallRegex =
+		/(\w+)\(([^()]*(?:\([^()]*\)[^()]*)*)\)(?:\s*\/\/.*)?/g;
+	const toolsCalls =
+		llmResponseWithToolCallsAsJsCodeblock.matchAll(toolsCallRegex);
+	const validatedToolsToCall: {
+		name: string;
+		args: any;
+		originalArgs: string;
+	}[] = [];
+	for (const match of toolsCalls) {
+		// eslint-disable-next-line @typescript-eslint/no-unused-vars
+		const [_call, toolName, argString] = match;
+		// console.log(`Found match for tools call: ${toolsName}(${argString})`)
+		if (toolName && toolsList.hasOwnProperty(toolName)) {
+			const tool = toolsList[toolName as keyof typeof toolsList];
+			const argsObj = RJSON.parse(argString);
+			// Validate the arguments using the Zod schema
+			const validatedArgs = tool.schema.parse(argsObj);
+			validatedToolsToCall.push({
+				name: toolName,
+				args: validatedArgs,
+				originalArgs: argString,
+			});
+		} else {
+			console.warn(`Tool ${toolName} is not found.`);
+		}
+	}
+	return validatedToolsToCall;
+};
+
+// EXAMPLE
+import { generateText } from "ai";
+import { openai } from "@ai-sdk/openai";
+const example = async () => {
+	const tools = {
+		getWeather: {
+			name: "Get weather for location today (default) or N days in the future up to 10 days",
+			function: ({
+				location,
+				daysInFuture,
+			}: {
+				location: string;
+				daysInFuture: number;
+			}) => {
+				// TODO: Do an actual weather API call
+				return {
+					location,
+					daysInFuture,
+					weather: "sunny",
+				};
+			},
+			schema: z.object({
+				location: z.string().describe("The location to get the weather for."),
+				daysInFuture: z
+					.number()
+					.describe("The number of days in the future to get the weather for."),
+			}),
+		},
+	};
+	const toolsAsTypeScriptString = getToolsAsTypeScriptString(tools);
+	const { text: llmResponseWithToolCallsAsJsCodeblock } = await generateText({
+		model: openai("gpt-4o"),
+		prompt: `
+	AVAILABLE_TOOLS:
+	"""
+    ${toolsAsTypeScriptString}
+    """
+
+    AVAILABLE_TOOLS must be called in a single javascript codeblock. All function arguments must be on a single line.
+
+    QUESTION:
+    "What is the weather in San Francisco?"
+    `,
+	});
+	console.log("Tools schema pass to llm:\n");
+	console.log(toolsAsTypeScriptString);
+	console.log("\nResponse from llm with tool call code block:\n");
+	console.log(llmResponseWithToolCallsAsJsCodeblock);
+	const validatedToolsToCall = parseToolsCalledContent({
+		llmResponseWithToolCallsAsJsCodeblock,
+		toolsList: tools,
+	});
+	console.log("\nValidated tools to call:\n");
+	console.log(validatedToolsToCall);
+};
+
+example();
+

Example output:

+
$ tsx tool-calls-as-ts-example.ts
+Tools schema pass to llm:
+
+/** Get weather for location today (default) or N days in the future up to 10 days */
+getWeather({
+    /** The location to get the weather for. */
+    location: string;
+    /** The number of days in the future to get the weather for. */
+    daysInFuture: number;
+})
+
+Response from llm with tool call code block:
+
+```javascript
+getWeather({ location: "San Francisco", daysInFuture: 0 })
+```
+
+Validated tools to call:
+
+[
+  {
+    name: 'getWeather',
+    args: { location: 'San Francisco', daysInFuture: 0 },
+    originalArgs: '{ location: "San Francisco", daysInFuture: 0 }'
+  }
+]
+ + + +
+ +
+ + + + diff --git a/blog/your-rag-may-not-need-a-vector-store/index.html b/blog/your-rag-may-not-need-a-vector-store/index.html new file mode 100644 index 0000000..af7ac5e --- /dev/null +++ b/blog/your-rag-may-not-need-a-vector-store/index.html @@ -0,0 +1,585 @@ + + + + + + Using LLM tool calling and long context for better RAG + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ +

Using LLM tool calling and long context for better RAG

+ + + +

When building a RAG pipeline you'll probably reach for a vector store to hold embeddings of document chunks, which are then retrieved and put into context at query time. This works well if your users are asking single-fact queries where the answer can be found in a relevant document chunk. But if your users want to ask more complex questions where the answer requires information spread across the whole document or across multiple documents, retrieving chunks often leaves out critical information and can lead to inaccurate responses.

+

Relying on document chunks has been a great solution to add knowledge to LLMs with a limited context window. But context windows have grown massively over the past year, with the leading LLMs supporting context windows reaching 1M tokens. This opens the door to new approaches to RAG which are less constrained by context.

+

Whole document querying RAG #

+

Instead of retrieving document chunks, I've had success retrieving and querying whole documents. Queries like 'summarize xyz document' or 'compare document abc to xyz' yield a full and complete summary without risk of missing important details.

+

When does this approach work? It works best if your documents are all of the same type or can be put into categories, and if user queries include enough information to locate the specific document(s) the question is about.

+

For example, if your documents are client contracts, each may have a client name, date and contract type. If a user asks 'Summarize the most recent contract with Acme Inc?' we have enough information to find this document, and then use the whole document as context to fully answer their question.

+

Querying whole documents like this calls for a different RAG workflow than the common single step chunk-retrieve-query workflow. Retrieving whole documents and putting them straight into the context could fill up even a large context window.

+

Instead, we can leverage the function/tool calling ability of many LLMs to create sub-queries to query each document, which can be executed in parallel. We can even make use of cheaper and faster LLMs for these sub-queries which have to process the complete documents.

+

What does this look like in practice?

+

Create document query functions #

+

In the client contracts example, we need to be able to locate and query a client contract document. We can create a function which takes several search filters, retrieves the full text of the top matching document, and then calls an LLM (e.g. gpt-3.5-turbo) with the full document text and the query. The function should accept the filters required to find the document, e.g. client name, date range and contract type, plus a query param which is the query to send to the LLM along with the full document text.

+

There's no set way to search for these documents: you could use SQL, Elasticsearch or even embeddings. The key thing is that it should be able to handle fuzzy search filters for certain params, e.g. the client name in this case.

+

Here's an example of this function in Python:

+
def query_client_contract(client_name: str, document_type: str, query: str, from_date: str = None, to_date: str = None):
+    # Search for the document
+    document = search_client_contract(client_name, document_type, from_date, to_date)
+    # Call the LLM with the full document text and the query
+    messages = [
+        {"content": "Answer the query using the provided text.", "role": "system"},
+        {"content": document + "\n\nQuery: " + query, "role": "user"},
+    ]
+    response = client.chat.completions.create(
+        model="gpt-3.5-turbo", # Use a cheaper model for the sub-query which will process the full document
+        messages=messages,
+    )
+    return response.choices[0].message.content
+
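The search_client_contract helper is left open above; here's a minimal sketch of what it could look like with SQL and a fuzzy client-name filter (the SQLite database, the contracts table and its columns are all assumptions for illustration):
+
+import sqlite3
+
+def search_client_contract(client_name, document_type, from_date=None, to_date=None):
+    # Fuzzy match on client name, exact match on document type, optional date range
+    sql = "SELECT full_text FROM contracts WHERE client_name LIKE ? AND document_type = ?"
+    params = [f"%{client_name}%", document_type]
+    if from_date:
+        sql += " AND signed_date >= ?"
+        params.append(from_date)
+    if to_date:
+        sql += " AND signed_date <= ?"
+        params.append(to_date)
+    sql += " ORDER BY signed_date DESC LIMIT 1"
+    with sqlite3.connect("contracts.db") as conn:
+        row = conn.execute(sql, params).fetchone()
+    return row[0] if row else ""
+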

Sub-query function calls #

+

Now that we have the document query function, we are going to use OpenAI Function Calling to create sub-queries against it.

+

First we use JSON Schema to define the tool for OpenAI function calling:

+
tools = [
+	{
+		"type": "function",
+		"function": {
+			"name": "query_client_contract",
+			"description":
+				"Send the query to AI to ask the full document text. The AI response will be returned.",
+			"parameters": {
+				"type": "object",
+				"properties": {
+					"client_name": {
+						"type": "string",
+						"description": "Name of the client the contract is for.",
+					},
+					"document_type": {
+						"type": "string",
+						"enum": ["contract", "lease"],
+						"description": "The type of legal contract.",
+					},
+					"from_date": {
+						"type": "string",
+						"format": "date-time",
+						"description": "Find documents from this date.",
+					},
+					"to_date": {
+						"type": "string",
+						"format": "date-time",
+						"description": "Find documents up to this date.",
+					},
+					"query": {
+						"type": "string",
+						"description": "The query to send to the AI along with the full document text.",
+					},
+				},
+				"required": ["client_name", "document_type", "query"],
+			},
+		},
+	}
+]
+

Then we need to create a helper function to execute the function when requested by the LLM:

+
def execute_function_call(message):
+    if message.tool_calls[0].function.name == "query_client_contract":
+        args = json.loads(message.tool_calls[0].function.arguments)
+        results = query_client_contract(args["client_name"], args["document_type"], args["query"], args.get("from_date"), args.get("to_date"))
+    else:
+        results = f"Error: function {message.tool_calls[0].function.name} does not exist"
+    return results
+

Now in the main chat function, we take a user's query, and if GPT suggests a function call, we execute it and append the results to the chat messages, and then send the messages back to GPT for the final answer:

+
def ask_ai(query: str):
+    messages = [
+        {"content": "Answer the user query, calling functions if required.", "role": "system"},
+        {"content": query, "role": "user"},
+    ]
+
+    chat_response = client.chat.completions.create(
+        model="gpt-4-turbo", # Use a more powerful model for function calling
+        tools=tools,
+        tool_choice="auto", # "auto" means the model can pick between generating a message or calling a function
+        messages=messages,
+    )
+
+    assistant_message = chat_response.choices[0].message
+
+    if assistant_message.tool_calls:
+        # Record the tool call, execute it, and append the result for the follow-up request
+        messages.append({"role": assistant_message.role, "content": str(assistant_message.tool_calls[0].function)})
+        results = execute_function_call(assistant_message)
+        messages.append({"role": "function", "tool_call_id": assistant_message.tool_calls[0].id, "name": assistant_message.tool_calls[0].function.name, "content": results})
+    else:
+        messages.append({"role": assistant_message.role, "content": assistant_message.content})
+
+    second_chat_response = client.chat.completions.create(
+        model="gpt-4-turbo", # Use a more powerful model for function calling
+        tools=tools,
+        tool_choice="auto", # "auto" means the model can pick between generating a message or calling a function
+        messages=messages,
+    )
+    print(second_chat_response.choices[0].message.content)
+
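Putting it together, the earlier example query would be run as:
+
+ask_ai("Summarize the most recent contract with Acme Inc")
+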

The benefits of this approach #

+

There are several benefits to this approach. The main benefit, as discussed above, is that we are querying whole documents. For many use cases this is going to provide more complete answers for users. You can also easily extend this approach by adding more functions for different document types and data sources. GPT will call multiple functions which you can execute in parallel, and in the final GPT call we can use gpt-4-turbo to integrate the results and provide the final answer. If you do have a handful of unknown documents, you can still use the chunk-retrieve-query approach for those, and simply add a function to the tool list to query the chunked documents with a typical RAG pipeline.

+
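As a sketch of the parallel execution mentioned above (reusing query_client_contract from earlier; the per-call wrapper and the message format mirror execute_function_call):
+
+import json
+from concurrent.futures import ThreadPoolExecutor
+
+def run_single_call(tool_call):
+    # One sub-query per tool call requested by the model
+    args = json.loads(tool_call.function.arguments)
+    return query_client_contract(args["client_name"], args["document_type"], args["query"],
+                                 args.get("from_date"), args.get("to_date"))
+
+def execute_tool_calls_in_parallel(assistant_message):
+    calls = assistant_message.tool_calls
+    with ThreadPoolExecutor(max_workers=len(calls)) as pool:
+        results = list(pool.map(run_single_call, calls))
+    return [
+        {"role": "function", "tool_call_id": c.id, "name": c.function.name, "content": r}
+        for c, r in zip(calls, results)
+    ]
+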

I'm excited to see how this approach can be used in practice. I think it will be especially useful for complex questions where the answer is spread across multiple documents, or where the user query is for a summary of a document. I'd love to hear how you get on with this approach. Please reach out if you have any other ideas for how to improve this approach, or related new ideas for improving RAG.

+ + + +
+ +
+ + + + diff --git a/content/404.md b/content/404.md deleted file mode 100644 index bd51f61..0000000 --- a/content/404.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -layout: layouts/home.njk -permalink: 404.html -eleventyExcludeFromCollections: true ---- -# Content not found. - -Go home. - - diff --git a/content/blog.njk b/content/blog.njk deleted file mode 100644 index c23f212..0000000 --- a/content/blog.njk +++ /dev/null @@ -1,9 +0,0 @@ ---- -layout: layouts/home.njk -eleventyNavigation: - key: Writing - order: 1 ---- -

Writing

-{% set postslist = collections.posts %} -{% include "postslist.njk" %} \ No newline at end of file diff --git a/content/blog/blog.11tydata.js b/content/blog/blog.11tydata.js deleted file mode 100644 index 2d655b1..0000000 --- a/content/blog/blog.11tydata.js +++ /dev/null @@ -1,6 +0,0 @@ -module.exports = { - tags: [ - "posts" - ], - "layout": "layouts/post.njk", -}; diff --git a/content/blog/building-an-ai-superserver/4029GP-TRT2_angle.jpg.webp b/content/blog/building-an-ai-superserver/4029GP-TRT2_angle.jpg.webp deleted file mode 100644 index 5351783..0000000 Binary files a/content/blog/building-an-ai-superserver/4029GP-TRT2_angle.jpg.webp and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/GE6FS91XcAAbcxS.jpeg b/content/blog/building-an-ai-superserver/GE6FS91XcAAbcxS.jpeg deleted file mode 100644 index 7012b3f..0000000 Binary files a/content/blog/building-an-ai-superserver/GE6FS91XcAAbcxS.jpeg and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/GFvn4JcWkAA9V0j.jpeg b/content/blog/building-an-ai-superserver/GFvn4JcWkAA9V0j.jpeg deleted file mode 100644 index ea55101..0000000 Binary files a/content/blog/building-an-ai-superserver/GFvn4JcWkAA9V0j.jpeg and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/IMG_4750.jpeg b/content/blog/building-an-ai-superserver/IMG_4750.jpeg deleted file mode 100644 index 8fa7d6b..0000000 Binary files a/content/blog/building-an-ai-superserver/IMG_4750.jpeg and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/building-an-ai-superserver.md b/content/blog/building-an-ai-superserver/building-an-ai-superserver.md deleted file mode 100644 index d9992de..0000000 --- a/content/blog/building-an-ai-superserver/building-an-ai-superserver.md +++ /dev/null @@ -1,122 +0,0 @@ ---- -title: Building an AI SuperServer for LLM training and experiments -date: 2024-03-14 -tags: ---- - -Impressive new language models like Llama and Mistral have broadened the accessibility of AI training. If you want to fine-tune a model with your own data, it's now relatively easy to do with tools like [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) and a few dollars spent on a GPU cloud. But if you want to go deeper and train larger models or try new methods, the cloud bill can quickly rack up. Renting 8 A100's on AWS will set you back an astounding $350,000 per year! There are cheaper clouds, but they can still cost tens of thousands a year. - -I've always enjoyed building PCs. I remember when I was 16 and my grandma bought me my first PC to assemble myself. So in the name of fun and saving money, I embarked on building an AI server so that I can more affordably do independent AI research. - -# Your options - -Depending on your budget and use case, there are a few routes to take when building an AI server. - -## Open frame - -{% image "GE6FS91XcAAbcxS.jpeg", "Miner style" %} - -If the server is just for you, and you want to keep it at home or in your basement, the most affordable option is essentially a powerful consumer PC, with an open frame case (originally designed for crypto miners). You'll be able to find a lots of advice on Reddit for this route. - -The important things are a motherboard that has lots of 16x PCIe slots, PCIe risers with redrivers, and multiple PSUs (depending the number of GPUs you choose). You'll be able to buy everything second had if you like, including the GPUs. 
For GPUs you're best going with RTX 3090s or 4090s in this setup, and because there's no case, you won't have issues with space or airflow. - -The benefit if this route is cost, but also the ability to start simple with just a single GPU and grow as you desire by adding more. - -## Rack server - -{% image "superserver.jpeg", "Server style" %} - -If you're planning to train larger models, have more servers, datacenter GPUs or just don't have anywhere to house a noisy hot server at home, you can go the rack mountable server route. This is the the route I've gone, as our house doesn't have a basement and our internet isn't that fast. My server now lives in a datacenter where it's cooled and well connected. - -I found less resources on this route, so the rest of this guide is aimed at helping you build and setup a rack mountable GPU server. - -# Building the server - -Supermicro make great server systems and many specifically for AI use cases. For example the [SuperServer 4029GP-TRT2](https://www.supermicro.com/en/products/system/4u/4029/sys-4029gp-trt2.cfm) is a mid range 4U dual CPU server with 10 PCIe slots - ideal for filling with GPUs! I found a well priced one from an IT supplier in the UK. The newer model is more expensive, but may be easier to find. Note that the model I used only have PCIe 3.0. If you are using RTX 4090 or a newer datacenter GPU, you will probably want the newer model which supports PCIe 4.0. - -{% image "4029GP-TRT2_angle.jpg.webp", "SuperServer 4029GP-TRT2" %} - -It arrived at my house on a pallet. It was heavier than I expected! - -{% image "IMG_4750.jpeg", "The pallet" %} - -After lugging it up the stairs and reading the manual, I installed 10 RTX 3090s I bought second hand from someone who previously was using them for mining. Note that to fit the maximum number of GPUs in a system you'll need to find blower or turbo style GPUs that are only two slots wide. The vast majority of 3090 and 4090 GPUs are for gaming, and they will take up 3 slots and the power comes out the top and you won't be able to put the case on your server. If you can't find blower consumer GPUs, you're next best bet is the RTX A6000 which is still fairly good value for money, even if it's still 3x more than a 4090. - -You'll also need to add the CPUs (two of them), memory and storage. I sourced everything secondhand from eBay. Most things cost no more than a few hundred dollars each. I went with 2x Intel Xeon Platinum 8160, 12x32GB DDR memory and an 8TB SSD. - -Once everything was installed, I turned it on for the first time - what I heard could only be described as a mini jet engine. Server fans are noisy. - -Next step was to setup the OS and environment. - -# Setting up the OS - -Supermicro servers have in inbuilt webui called IPMI for accessing the server console and monitor output. There is a dedicated lan port for PICE on this server. You should also plug in a second lan cable to one of the main lan ports, otherwise your server won't actually have internet access (this confused me initially). - -It will find an IP with DHCP, so I just logged into my router to see the IP it was assigned and visited that in my browser. You'll be asked to login, username is 'ADMIN' and the password is printed stickers in several places in your server case. - -I decided to install Ubuntu 22.04 sever. Create a bootable Ubuntu USB stick and plug it into the server. Now connect to the webui console by going to the server's IP then clicking Remote Control > iKVM/HTML5 and click the button. 
You can now reboot the server and you'll see the BIOS popup, where you can hit an F key to choose a boot drive. Do this and select the USB. - -The IPMI web console doesn't support pasting text. So getting your ssh pubkey over is a bit of a pain. Here's a solution I've used: - -1. On your local computer with has your ssh pubkey on it, run `cd .ssh && python -m http.server` (you are about to serve your private key over http without authentication, please be aware this isn't a great idea). -2. On the server, via the IPMI web console, login with the user you created when installing Ubuntu, and run `wget -qO - "http://192.168.178.21:8000/id_ed25519.pub" > ~/.ssh/authorized_keys && chmod 600 .ssh/authorized_keys`. -3. You should now be able to ssh into your server. Remember to stop the `python -m http.server` on your local computer now. - -# Important system tweaks - -There are some tweaks we can do to improve the performance and reliability of our server. Following [the tips here](https://towardsdatascience.com/deploying-kubeflow-to-a-bare-metal-gpu-cluster-from-scratch-6865ebcde032) ([archived page](https://archive.ph/0Y2DK#selection-611.0-611.103) if Medium paywalls that page), first disable the kernel security patches on computing instances. The collateral performance penalty is much more expensive than the imposed risks. Edit /etc/default/grub and add: -``` -GRUB_CMDLINE_LINUX_DEFAULT="pti=off spectre_v2=off l1tf=off nospec_store_bypass_disable no_stf_barrier" -``` - -It's also critical to disable IOMMU if you plan peer-to-peer GPU communication, e.g., multi-GPU model training in Tensorflow or PyTorch. Also add to /etc/default/grub: -``` -GRUB_CMDLINE_LINUX_DEFAULT="intel_iommu=off rcutree.rcu_idle_gp_delay=1" -``` - -# Check GPU P2P communication - -If you're using a GPU that supports it, P2P communication speeds up things a lot. - -Note it's important check [PCI Access Control Services (ACS)](https://docs.nvidia.com/deeplearning/nccl/archives/nccl_284/user-guide/docs/troubleshooting.html#:~:text=PCI%20Access%20Control%20Services%20(ACS)¶&text=If%20PCI%20switches%20have%20ACS,done%20again%20after%z) is disabled. - -You can follow these steps to test your system's GPU P2P speed: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#gpu-to-gpu-communication - -# NVIDIA drivers and python environment - -We now want to get the NVIDIA drivers, CUDA and our Python envs setup. - -I've had success using these steps to install CUDA v11.8: https://gist.github.com/MihailCosmin/affa6b1b71b43787e9228c25fe15aeba -Some people have mentioned using a higher NVIDIA drivers version than the nvidia-driver-515 in the script. But be beware there's a bug in driver version 545 that prevents 3090 and 4090 cards from using P2P (see [this github issue](https://github.com/NVIDIA/nccl-tests/issues/117) for a discussion on the problem). If you have a driver with this bug, you may find your training run stalls and times out. Version 535 worked well for me. - -I like to use Conda with the [fastchan channel](https://www.fast.ai/posts/2021-07-15-fastconda.html) for my environments. But you may enjoy a different python virtual env tool. - -# Now you can train some AI - -{% image "nvidia-smi.jpeg", "nvidia-smi" %} - -I'm enjoying using [Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) for LLM fine tuning. [HuggingFace Transformers](https://huggingface.co/docs/transformers/index) is also a great place to start. 
- -# Datacenter trip - -{% image "datacenter.png", "Datacenter" %} - -Since the GPUs are super noisy and hot, I found a local datacenter that would colocate it for a reasonable cost. Installation was easier than I expected, although we ended up putting it on a lower slot on the rack because it was too heavy to lift half way up without a lift. - -This [Colocation Survival Guide](https://www.datacate.net/wp-content/uploads/2019/04/Colocation-Survival-Guide-6x9-with-bonus-material.pdf) was super helpful, as it walks you through all the aspects of colocating, from the physical setup to networking. - - - -# Other things - -## Set a lower max power limit for GPUs - -Some people find that lowering the power limit just a bit will reduce max temp without any real performance sacrifice. I set the max power for my RTX 3090's to 300W (from 305W) by [following these steps](https://www.reddit.com/r/Fedora/comments/11lh9nn/set_nvidia_gpu_power_and_temp_limit_on_boot/). - -## Docker bug workaround - -If you're planning to use Docker with the GPUs, note there's [a bug on Ubuntu 22.04 which needs working around](https://github.com/NVIDIA/nvidia-container-toolkit/issues/48). - -# Going bigger? - -If you're planing to build a cluster, there is an excellent video from the Lambda team: [Building a GPU cluster for AI](https://www.youtube.com/watch?v=rfu5FwncZ6s). \ No newline at end of file diff --git a/content/blog/building-an-ai-superserver/datacenter.png b/content/blog/building-an-ai-superserver/datacenter.png deleted file mode 100644 index abdd22c..0000000 Binary files a/content/blog/building-an-ai-superserver/datacenter.png and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/nvidia-smi.jpeg b/content/blog/building-an-ai-superserver/nvidia-smi.jpeg deleted file mode 100644 index 4b9f869..0000000 Binary files a/content/blog/building-an-ai-superserver/nvidia-smi.jpeg and /dev/null differ diff --git a/content/blog/building-an-ai-superserver/superserver.jpeg b/content/blog/building-an-ai-superserver/superserver.jpeg deleted file mode 100644 index 6148e67..0000000 Binary files a/content/blog/building-an-ai-superserver/superserver.jpeg and /dev/null differ diff --git a/content/blog/tools-as-code/tools-as-code.md b/content/blog/tools-as-code/tools-as-code.md deleted file mode 100644 index d62a61f..0000000 --- a/content/blog/tools-as-code/tools-as-code.md +++ /dev/null @@ -1,171 +0,0 @@ ---- -title: LLM tool calling as code blocks -date: 2025-02-11 -tags: ---- - -When building sophisticated agents that have more than a handful of tools to call, I've often found the inbuilt structured output/json tool calling methods provided by LMA APIs come up short. - -Intuitively, one of the reasons for this is that when the structured output is enabled, there is no room for chain of thought, text or inbuilt support for comments amongst the JSON output of tool calls. - -In addition, when you're trying to compare different LLM APIs, you have to switch between different tool calling schemas. - -LLMs are trained on a lot of code, and a tool call is really just a function call. So why not just use code blocks for tool calls? - -When you use LLMs to output code with tool calls, you may initially think of running the code in a sandbox. But that comes with infra overhead and security concerns. Instead, what if we just parse tool calls in code blocks with regex, and then validate the function names and params before calling the internal functions? - -I've had great results with this approach. 
It's easy to implement and it works with any LLM (including open source models). The chain of thought and comments next to the tool calls is especially helpful when debugging, as the LLM will explain why it decided to call a particular tool with those params. - -You can even do clever stuff like parse the text stream from the LLM as it comes in,and call tools as they are returned, instead of waiting for the LLM to finish. - -He's an example of this approach that makes use of zod and Vercel AI SDK: - -```typescript -// Run this example: -// npm i zod zod-to-ts relaxed-json ai @ai-sdk/openai -// tsx tool-calls-as-ts-example.ts - -import { z } from "zod"; -import RJSON from "relaxed-json"; -import { printNode, zodToTs } from "zod-to-ts"; - -type ToolsList = { - [key: string]: { name: string; schema: z.ZodType }; -}; - -const getToolsAsTypeScriptString = (toolsList: ToolsList) => - Object.entries(toolsList) - .map(([toolName, { name, schema }]) => { - const { node } = zodToTs(schema, toolName); - const nodeString = printNode(node); - const tsDefAsString = `/** ${name} */ \n${toolName}(${nodeString})`; - return tsDefAsString; - }) - .join("\n\n"); - -const parseToolsCalledContent = ({ - llmResponseWithToolCallsAsJsCodeblock, - toolsList, -}: { - llmResponseWithToolCallsAsJsCodeblock: string; - toolsList: ToolsList; -}) => { - const toolsCallRegex = - /(\w+)\(([^()]*(?:\([^()]*\)[^()]*)*)\)(?:\s*\/\/.*)?/g; - const toolsCalls = - llmResponseWithToolCallsAsJsCodeblock.matchAll(toolsCallRegex); - const validatedToolsToCall: { - name: string; - args: any; - originalArgs: string; - }[] = []; - for (const match of toolsCalls) { - // eslint-disable-next-line @typescript-eslint/no-unused-vars - const [_call, toolName, argString] = match; - // console.log(`Found match for tools call: ${toolsName}(${argString})`) - if (toolName && toolsList.hasOwnProperty(toolName)) { - const tool = toolsList[toolName as keyof typeof toolsList]; - const argsObj = RJSON.parse(argString); - // Validate the arguments using the Zod schema - const validatedArgs = tool.schema.parse(argsObj); - validatedToolsToCall.push({ - name: toolName, - args: validatedArgs, - originalArgs: argString, - }); - } else { - console.warn(`Tool ${toolName} is not found.`); - } - } - return validatedToolsToCall; -}; - -// EXAMPLE -import { generateText } from "ai"; -import { openai } from "@ai-sdk/openai"; -const example = async () => { - const tools = { - getWeather: { - name: "Get weather for location today (default) or N days in the future up to 10 days", - function: ({ - location, - daysInFuture, - }: { - location: string; - daysInFuture: number; - }) => { - // TODO: Do actualy weather API call - return { - location, - daysInFuture, - weather: "sunny", - }; - }, - schema: z.object({ - location: z.string().describe("The location to get the weather for."), - daysInFuture: z - .number() - .describe("The number of days in the future to get the weather for."), - }), - }, - }; - const toolsAsTypeScriptString = getToolsAsTypeScriptString(tools); - const { text: llmResponseWithToolCallsAsJsCodeblock } = await generateText({ - model: openai("gpt-4o"), - prompt: ` - AVAILABLE_TOOLS: - """ - ${toolsAsTypeScriptString} - """ - - AVAILABLE_TOOLS must be called in a single javascript codeblock. All function arguments must be on a single line. - - QUESTION: - "What is the weather in San Francisco?" 
- `, - }); - console.log("Tools schema pass to llm:\n"); - console.log(toolsAsTypeScriptString); - console.log("\nResponse from llm with tool call code block:\n"); - console.log(llmResponseWithToolCallsAsJsCodeblock); - const validatedToolsToCall = parseToolsCalledContent({ - llmResponseWithToolCallsAsJsCodeblock, - toolsList: tools, - }); - console.log("\nValidated tools to call:\n"); - console.log(validatedToolsToCall); -}; - -example(); -``` - -Example output: - -````js -$ tsx tool-calls-as-ts-example.ts -Tools schema pass to llm: - -/** Get weather for location today (default) or N days in the future up to 10 days */ -getWeather({ - /** The location to get the weather for. */ - location: string; - /** The number of days in the future to get the weather for. */ - daysInFuture: number; -}) - -Response from llm with tool call code block: - -```javascript -getWeather({ location: "San Francisco", daysInFuture: 0 }) -``` - -Validated tools to call: - -[ - { - name: 'getWeather', - args: { location: 'San Francisco', daysInFuture: 0 }, - originalArgs: '{ location: "San Francisco", daysInFuture: 0 }' - } -] -```` diff --git a/content/blog/your-rag-may-not-need-a-vector-store/your-rag-may-not-need-a-vector-store.md b/content/blog/your-rag-may-not-need-a-vector-store/your-rag-may-not-need-a-vector-store.md deleted file mode 100644 index 75291e3..0000000 --- a/content/blog/your-rag-may-not-need-a-vector-store/your-rag-may-not-need-a-vector-store.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -title: Using LLM tool calling and long context for better RAG -date: 2024-04-25 -tags: ---- - -When building a RAG pipeline you'll probably reach for a vector store to store embeddings of document chunks, which are then retrieved and put into context at query time. This works well if your users are asking single fact queries where the answer can be found in a relevant document chunk. But if your users want to ask more complex questions where the answer requires information spread across the whole document or across multiple documents, retreiveing chunks often leaves out critical information and can lead to inaccurate responses. - -Relying on document chunks has been a great solution to add knowledge to LLMs with a limited context window. But context windows have grown massively over the past year, with the leading LLMs supporting context windows reaching 1M tokens. This opens the door to new approaches to RAG which are less constrained by context. - -## Whole document querying RAG - -Instead of retrieving document chunks, I've had success retreiving and querying whole documents. Queries like 'summarize xyz document ' or 'compare document abc to xyz' yield a full and complete summary without risk of missing important details. - -When does this appraoch work? This approach works best if your documents are all of the same type or can be put into categories, and if the user queries include enough information to locate the specific document(s) the question is for. - -For example, if your documents are client contracts, each may have a client name, date and contract type. If a user asks 'Summarize the most recent contract with Acme Inc?' we have enough information to find this document, and then use the whole document as context to fully answer their question. - -Querying whole documents like this calls for a different RAG workflow than the common single step chunk-retrieve-query workflow. Retrieving whole documents and putting them straight into the context could fill up even a large context window. 
- -Instead, we can leverage the function/tool calling ability of many LLMs to create sub-queries to query each document, which can be executed in parallel. We can even make use of cheaper and faster LLMs for these sub-queries which have to process the complete documents. - -What does this look like in practice? - -### Create document query functions - -In the client contracts example, we would need to be able to locate and query a client contract document. We can create a function which takes several search filters, retrieves the full text of the top matching document, and then calls an LLM (e.g. gpt-3.5-turbo) with the full document text and the query. The fuction should accept the filters required to find the document e.g.: client name, date range, contract type. Plus a query param which is the query to send to the LLM with the full document text. - -There's no set way to search for these documents, you could use SQL, Elastic or even embeddings. The key thing is it should be able handle fuzzy search filters for certain params, e.g. for the client name in this case. - -Here's an example of this function in Python: - -```python -def query_client_contract(client_name: str, document_type: str, from_date: str = None, to_date: str = None, query: str): - # Search for the document - document = search_client_contract(client_name, document_type, from_date, to_date) - # Call the LLM with the full document text and the query - messages = [ - {"content": "Answer the query using the provided text.", "role": "system"}, - {"content": document + "\n\nQuery: " + query, "role": "user"}, - ] - response = client.chat.completions.create( - model="gpt-3.5-turbo", # Use a cheaper model for the sub-query which will process the full document - messages=messages, - ) - return response.choices[0].message.content -``` - -### Sub-query function calls - -Now we have the document query function, we are going to use [OpenAI Function Calling](https://platform.openai.com/docs/guides/function-calling) to create sub-queries to this function. - -First we use JSON Schema to define the tool for OpenAI function calling: - -```python -tools = [ - { - "type": "function", - "function": { - "name": "query_client_contract", - "description": - "Send the query to AI to ask the full document text. 
The AI response will be returned.", - "parameters": { - "type": "object", - "properties": { - "client_name": { - "type": "string", - "description": "Name of the client the contract is for.", - }, - "document_type": { - "type": "string", - "enum": ["contract", "lease"], - "description": "The type of legal contract.", - }, - "from_date": { - "type": "string", - "format": "date-time", - "description": "Find documents from this date.", - }, - "to_date": { - "type": "string", - "format": "date-time", - 'description': "Find documents up to this date.", - }, - }, - "required": ["client_name", "document_type"], - }, - }, - } -] -``` - -Then we need create a helper function to execute the function when requested by the LLM: - -```python -def execute_function_call(message): - if message.tool_calls[0].function.name == "query_client_contract": - args = json.loads(message.tool_calls[0].function.arguments) - results = ask_database(args["client_name"], args["document_type"], args["from_date"], args["to_date"], args["query"]) - else: - results = f"Error: function {message.tool_calls[0].function.name} does not exist" - return results -``` - -Now in the main chat function, we take a user's query, and if GPT suggests a function call, we execute it and append the results to the chat messages, and then send the messages back to GPT for the final answer: - -```python -def ask_ai(query: str): - messages = [ - {"content": "Answer the user query, calling functions if required.", "role": "system"}, - {"content": query, "role": "user"}, - ] - - chat_response = client.chat.completions.create( - model="gpt-4-turbo", # Use a more powerful model for function calling - tools=tools, - tool_choice="auto", # "auto" means the model can pick between generating a message or calling a function - messages=messages, - ) - - assistant_message = chat_response.choices[0].message - assistant_message.content = str(assistant_message.tool_calls[0].function) - messages.append({"role": assistant_message.role, "content": assistant_message.content}) - - if assistant_message.tool_calls: - results = execute_function_call(assistant_message) - messages.append({"role": "function", "tool_call_id": assistant_message.tool_calls[0].id, "name": assistant_message.tool_calls[0].function.name, "content": results}) - - second_chat_response = client.chat.completions.create( - model="gpt-4-turbo", # Use a more powerful model for function calling - tools=tools, - tool_choice="auto", # "auto" means the model can pick between generating a message or calling a function - messages=messages, - ) - print(second_chat_response.choices[0].message.content) -``` - -## The benefits of this approach - -There are several benefits to this approach. The main benefit, as discussed above, is that we are querying whole documents. For many use cases this is going to provide more complete answers for users. You can also easily extend this approach by adding more functions for different document types and data sources. GPT will call multiple functions which you can execute in parallel, and in the final GPT call we can use gpt-4-turbo to integrate the results and provide the final answer. If you do have a handful of unknown documents, you can still use the chunk-retrieve-query approach for those, and simply add a function to the tool list to query the chunked documents with a typical RAG pipeline. - -I'm excited to see how this approach can be used in practice. 
I think it will be especially useful for complex questions where the answer is spread across multiple documents, or where the user query is for a summary of a document. I'd love to hear how you get on with this approach. Please reach out if you have any other ideas for how to improve this approach, or related new ideas for improving RAG. diff --git a/content/feed/feed.11tydata.js b/content/feed/feed.11tydata.js deleted file mode 100644 index ed3fec9..0000000 --- a/content/feed/feed.11tydata.js +++ /dev/null @@ -1,3 +0,0 @@ -module.exports = { - eleventyExcludeFromCollections: true -} diff --git a/content/feed/feed.njk b/content/feed/feed.njk deleted file mode 100755 index a47a7e8..0000000 --- a/content/feed/feed.njk +++ /dev/null @@ -1,27 +0,0 @@ ---- -# Metadata comes from _data/metadata.js -permalink: /feed/feed.xml ---- - - - {{ metadata.title }} - {{ metadata.description }} - - - {{ collections.posts | getNewestCollectionItemDate | dateToRfc3339 }} - {{ metadata.url }} - - {{ metadata.author.name }} - {{ metadata.author.email }} - - {%- for post in collections.posts | reverse %} - {% set absolutePostUrl %}{{ post.url | htmlBaseUrl(metadata.url) }}{% endset %} - - {{ post.data.title }} - - {{ post.date | dateToRfc3339 }} - {{ absolutePostUrl }} - {{ post.templateContent | transformWithHtmlBase(absolutePostUrl, post.url) }} - - {%- endfor %} - diff --git a/content/feed/json.njk b/content/feed/json.njk deleted file mode 100644 index 3b33b59..0000000 --- a/content/feed/json.njk +++ /dev/null @@ -1,29 +0,0 @@ ---- -# Metadata comes from _data/metadata.js -permalink: /feed/feed.json ---- -{ - "version": "https://jsonfeed.org/version/1.1", - "title": "{{ metadata.title }}", - "language": "{{ metadata.language }}", - "home_page_url": "{{ metadata.url | addPathPrefixToFullUrl }}", - "feed_url": "{{ permalink | htmlBaseUrl(metadata.url) }}", - "description": "{{ metadata.description }}", - "author": { - "name": "{{ metadata.author.name }}", - "url": "{{ metadata.author.url }}" - }, - "items": [ - {%- for post in collections.posts | reverse %} - {%- set absolutePostUrl = post.url | htmlBaseUrl(metadata.url) %} - { - "id": "{{ absolutePostUrl }}", - "url": "{{ absolutePostUrl }}", - "title": "{{ post.data.title }}", - "content_html": {% if post.templateContent %}{{ post.templateContent | transformWithHtmlBase(absolutePostUrl, post.url) | dump | safe }}{% else %}""{% endif %}, - "date_published": "{{ post.date | dateToRfc3339 }}" - } - {% if not loop.last %},{% endif %} - {%- endfor %} - ] -} diff --git a/content/index.md b/content/index.md deleted file mode 100644 index 8a9899f..0000000 --- a/content/index.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -layout: layouts/base.njk -eleventyNavigation: - key: About Me - order: 1 ---- - -Me - -# Damien C. Tanner - -[x.com/dctanner](https://x.com/dctanner) - -**I'm the co-founder and CEO of [Layercode](https://layercode.com).** - -Previously I co-founded MediaCore (ed-tech, sold to Workday inc), Pusher (realtime messaging, sold to MessageBird) and thoughtbot (my agency merged with them). - -I also organise the [AI Engineer London Meetup](https://lu.ma/calendar/cal-npDMhGfssuQj9ZE). - -## Companies - -**[Layercode](https://layercode.com) - Voice AI platform for developers.** - -[Pusher](http://pusher.com) (Co-Founder. Acquired by MessageBird in 2020) - Realtime messaging platform. - -MediaCore (Co-Founder. Acquired by Workday in 2015) - Video platform for education. - -[New Bamboo](http://thoughtbot.com) (Co-Founder. 
Acquired by thoughtbot in 2015) - Digital agency. - -Panda (Co-Founder. Acquired by Xenon in 2013) - Cloud video processing. - -## Investing - -Some select private investments I’ve made: - -[MCJ](https://mcj.vc/) - early LP in the MCJ climate focused funds. - -[Entocycle](https://www.entocycle.com) - Insect farming for fish & animal feed. - -[Mytos](https://www.mytos.bio) - Lab automation. - -[Strateos](https://strateos.com) - A robotic cloud laboratory for the life sciences. diff --git a/content/sitemap/sitemap.xml.njk b/content/sitemap/sitemap.xml.njk deleted file mode 100644 index 4da684a..0000000 --- a/content/sitemap/sitemap.xml.njk +++ /dev/null @@ -1,14 +0,0 @@ ---- -permalink: /sitemap.xml -eleventyExcludeFromCollections: true ---- - - -{%- for page in collections.all %} - {% set absoluteUrl %}{{ page.url | htmlBaseUrl(metadata.url) }}{% endset %} - - {{ absoluteUrl }} - {{ page.date | htmlDateString }} - -{%- endfor %} - diff --git a/public/css/index.css b/css/index.css similarity index 100% rename from public/css/index.css rename to css/index.css diff --git a/public/css/message-box.css b/css/message-box.css similarity index 100% rename from public/css/message-box.css rename to css/message-box.css diff --git a/public/css/prism-diff.css b/css/prism-diff.css similarity index 100% rename from public/css/prism-diff.css rename to css/prism-diff.css diff --git a/css/prism-okaidia.css b/css/prism-okaidia.css new file mode 100644 index 0000000..cf04068 --- /dev/null +++ b/css/prism-okaidia.css @@ -0,0 +1,123 @@ +/** + * okaidia theme for JavaScript, CSS and HTML + * Loosely based on Monokai textmate theme by http://www.monokai.nl/ + * @author ocodia + */ + +code[class*="language-"], +pre[class*="language-"] { + color: #f8f8f2; + background: none; + text-shadow: 0 1px rgba(0, 0, 0, 0.3); + font-family: Consolas, Monaco, 'Andale Mono', 'Ubuntu Mono', monospace; + font-size: 1em; + text-align: left; + white-space: pre; + word-spacing: normal; + word-break: normal; + word-wrap: normal; + line-height: 1.5; + + -moz-tab-size: 4; + -o-tab-size: 4; + tab-size: 4; + + -webkit-hyphens: none; + -moz-hyphens: none; + -ms-hyphens: none; + hyphens: none; +} + +/* Code blocks */ +pre[class*="language-"] { + padding: 1em; + margin: .5em 0; + overflow: auto; + border-radius: 0.3em; +} + +:not(pre) > code[class*="language-"], +pre[class*="language-"] { + background: #272822; +} + +/* Inline code */ +:not(pre) > code[class*="language-"] { + padding: .1em; + border-radius: .3em; + white-space: normal; +} + +.token.comment, +.token.prolog, +.token.doctype, +.token.cdata { + color: #8292a2; +} + +.token.punctuation { + color: #f8f8f2; +} + +.token.namespace { + opacity: .7; +} + +.token.property, +.token.tag, +.token.constant, +.token.symbol, +.token.deleted { + color: #f92672; +} + +.token.boolean, +.token.number { + color: #ae81ff; +} + +.token.selector, +.token.attr-name, +.token.string, +.token.char, +.token.builtin, +.token.inserted { + color: #a6e22e; +} + +.token.operator, +.token.entity, +.token.url, +.language-css .token.string, +.style .token.string, +.token.variable { + color: #f8f8f2; +} + +.token.atrule, +.token.attr-value, +.token.function, +.token.class-name { + color: #e6db74; +} + +.token.keyword { + color: #66d9ef; +} + +.token.regex, +.token.important { + color: #fd971f; +} + +.token.important, +.token.bold { + font-weight: bold; +} +.token.italic { + font-style: italic; +} + +.token.entity { + cursor: help; +} diff --git a/eleventy.config.drafts.js b/eleventy.config.drafts.js deleted file 
mode 100644 index 8eb92dc..0000000 --- a/eleventy.config.drafts.js +++ /dev/null @@ -1,50 +0,0 @@ -function eleventyComputedPermalink() { - // When using `addGlobalData` and you *want* to return a function, you must nest functions like this. - // `addGlobalData` acts like a global data file and runs the top level function it receives. - return (data) => { - // Always skip during non-watch/serve builds - if(data.draft && !process.env.BUILD_DRAFTS) { - return false; - } - - return data.permalink; - } -}; - -function eleventyComputedExcludeFromCollections() { - // When using `addGlobalData` and you *want* to return a function, you must nest functions like this. - // `addGlobalData` acts like a global data file and runs the top level function it receives. - return (data) => { - // Always exclude from non-watch/serve builds - if(data.draft && !process.env.BUILD_DRAFTS) { - return true; - } - - return data.eleventyExcludeFromCollections; - } -}; - -module.exports.eleventyComputedPermalink = eleventyComputedPermalink; -module.exports.eleventyComputedExcludeFromCollections = eleventyComputedExcludeFromCollections; - -module.exports = eleventyConfig => { - eleventyConfig.addGlobalData("eleventyComputed.permalink", eleventyComputedPermalink); - eleventyConfig.addGlobalData("eleventyComputed.eleventyExcludeFromCollections", eleventyComputedExcludeFromCollections); - - let logged = false; - eleventyConfig.on("eleventy.before", ({runMode}) => { - let text = "Excluding"; - // Only show drafts in serve/watch modes - if(runMode === "serve" || runMode === "watch") { - process.env.BUILD_DRAFTS = true; - text = "Including"; - } - - // Only log once. - if(!logged) { - console.log( `[11ty/eleventy-base-blog] ${text} drafts.` ); - } - - logged = true; - }); -} diff --git a/eleventy.config.images.js b/eleventy.config.images.js deleted file mode 100644 index 122da5c..0000000 --- a/eleventy.config.images.js +++ /dev/null @@ -1,52 +0,0 @@ -const path = require("path"); -const eleventyImage = require("@11ty/eleventy-img"); - -function relativeToInputPath(inputPath, relativeFilePath) { - let split = inputPath.split("/"); - split.pop(); - - return path.resolve(split.join(path.sep), relativeFilePath); - -} - -function isFullUrl(url) { - try { - new URL(url); - return true; - } catch(e) { - return false; - } -} - -module.exports = function(eleventyConfig) { - // Eleventy Image shortcode - // https://www.11ty.dev/docs/plugins/image/ - eleventyConfig.addAsyncShortcode("image", async function imageShortcode(src, alt, widths, sizes) { - // Full list of formats here: https://www.11ty.dev/docs/plugins/image/#output-formats - // Warning: Avif can be resource-intensive so take care! - let formats = ["avif", "webp", "auto"]; - let input; - if(isFullUrl(src)) { - input = src; - } else { - input = relativeToInputPath(this.page.inputPath, src); - } - - let metadata = await eleventyImage(input, { - widths: widths || ["auto"], - formats, - outputDir: path.join(eleventyConfig.dir.output, "img"), // Advanced usage note: `eleventyConfig.dir` works here because we’re using addPlugin. 
- }); - - // TODO loading=eager and fetchpriority=high - let imageAttributes = { - alt, - sizes, - loading: "lazy", - decoding: "async", - style: "width: 100%; height: auto;", // Add this line to set the default width to 100% and maintain aspect ratio - }; - - return eleventyImage.generateHTML(metadata, imageAttributes); - }); -}; diff --git a/eleventy.config.js b/eleventy.config.js deleted file mode 100644 index 8faf085..0000000 --- a/eleventy.config.js +++ /dev/null @@ -1,143 +0,0 @@ -const { DateTime } = require("luxon"); -const markdownItAnchor = require("markdown-it-anchor"); - -const pluginRss = require("@11ty/eleventy-plugin-rss"); -const pluginSyntaxHighlight = require("@11ty/eleventy-plugin-syntaxhighlight"); -const pluginBundle = require("@11ty/eleventy-plugin-bundle"); -const pluginNavigation = require("@11ty/eleventy-navigation"); -const { EleventyHtmlBasePlugin } = require("@11ty/eleventy"); - -const pluginDrafts = require("./eleventy.config.drafts.js"); -const pluginImages = require("./eleventy.config.images.js"); - -module.exports = function (eleventyConfig) { - // Copy the contents of the `public` folder to the output folder - // For example, `./public/css/` ends up in `_site/css/` - eleventyConfig.addPassthroughCopy({ - "./public/": "/", - "./node_modules/prismjs/themes/prism-okaidia.css": "/css/prism-okaidia.css", - CNAME: "/CNAME", - }); - - // Run Eleventy when these files change: - // https://www.11ty.dev/docs/watch-serve/#add-your-own-watch-targets - - // Watch content images for the image pipeline. - eleventyConfig.addWatchTarget("content/**/*.{svg,webp,png,jpeg}"); - - // App plugins - eleventyConfig.addPlugin(pluginDrafts); - eleventyConfig.addPlugin(pluginImages); - - // Official plugins - eleventyConfig.addPlugin(pluginRss); - eleventyConfig.addPlugin(pluginSyntaxHighlight, { - preAttributes: { tabindex: 0 }, - }); - eleventyConfig.addPlugin(pluginNavigation); - eleventyConfig.addPlugin(EleventyHtmlBasePlugin); - eleventyConfig.addPlugin(pluginBundle); - - // Filters - eleventyConfig.addFilter("readableDate", (dateObj, format, zone) => { - // Formatting tokens for Luxon: https://moment.github.io/luxon/#/formatting?id=table-of-tokens - return DateTime.fromJSDate(dateObj, { zone: zone || "utc" }).toFormat( - format || "dd LLLL yyyy" - ); - }); - - eleventyConfig.addFilter("htmlDateString", (dateObj) => { - // dateObj input: https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#valid-date-string - return DateTime.fromJSDate(dateObj, { zone: "utc" }).toFormat("yyyy-LL-dd"); - }); - - // Get the first `n` elements of a collection. 
- eleventyConfig.addFilter("head", (array, n) => { - if (!Array.isArray(array) || array.length === 0) { - return []; - } - if (n < 0) { - return array.slice(n); - } - - return array.slice(0, n); - }); - - // Return the smallest number argument - eleventyConfig.addFilter("min", (...numbers) => { - return Math.min.apply(null, numbers); - }); - - // Return all the tags used in a collection - eleventyConfig.addFilter("getAllTags", (collection) => { - let tagSet = new Set(); - for (let item of collection) { - (item.data.tags || []).forEach((tag) => tagSet.add(tag)); - } - return Array.from(tagSet); - }); - - eleventyConfig.addFilter("filterTagList", function filterTagList(tags) { - return (tags || []).filter( - (tag) => ["all", "nav", "post", "posts"].indexOf(tag) === -1 - ); - }); - - // Customize Markdown library settings: - eleventyConfig.amendLibrary("md", (mdLib) => { - mdLib.use(markdownItAnchor, { - permalink: markdownItAnchor.permalink.ariaHidden({ - placement: "after", - class: "header-anchor", - symbol: "#", - ariaHidden: false, - }), - level: [1, 2, 3, 4], - slugify: eleventyConfig.getFilter("slugify"), - }); - }); - - eleventyConfig.addShortcode("currentBuildDate", () => { - return new Date().toISOString(); - }); - - // Features to make your build faster (when you need them) - - // If your passthrough copy gets heavy and cumbersome, add this line - // to emulate the file copy on the dev server. Learn more: - // https://www.11ty.dev/docs/copy/#emulate-passthrough-copy-during-serve - - // eleventyConfig.setServerPassthroughCopyBehavior("passthrough"); - - return { - // Control which files Eleventy will process - // e.g.: *.md, *.njk, *.html, *.liquid - templateFormats: ["md", "njk", "html", "liquid"], - - // Pre-process *.md files with: (default: `liquid`) - markdownTemplateEngine: "njk", - - // Pre-process *.html files with: (default: `liquid`) - htmlTemplateEngine: "njk", - - // These are all optional: - dir: { - input: "content", // default: "." - includes: "../_includes", // default: "_includes" - data: "../_data", // default: "_data" - output: "_site", - }, - - // ----------------------------------------------------------------- - // Optional items: - // ----------------------------------------------------------------- - - // If your site deploys to a subdirectory, change `pathPrefix`. - // Read more: https://www.11ty.dev/docs/config/#deploy-to-a-subdirectory-with-a-path-prefix - - // When paired with the HTML plugin https://www.11ty.dev/docs/plugins/html-base/ - // it will transform any absolute URLs in your HTML to include this - // folder name and does **not** affect where things go in the output folder. - pathPrefix: "/", - }; -}; diff --git a/feed/feed.json b/feed/feed.json new file mode 100644 index 0000000..65e3155 --- /dev/null +++ b/feed/feed.json @@ -0,0 +1,38 @@ +{ + "version": "https://jsonfeed.org/version/1.1", + "title": "Damien C. Tanner", + "language": "en", + "home_page_url": "https://dc.tanner.me/", + "feed_url": "https://dc.tanner.me/feed/feed.json", + "description": "Notes from a journey of compounding curiosity.", + "author": { + "name": "Damien C. Tanner", + "url": "" + }, + "items": [ + { + "id": "https://dc.tanner.me/blog/tools-as-code/", + "url": "https://dc.tanner.me/blog/tools-as-code/", + "title": "LLM tool calling as code blocks", + "content_html": "

When building sophisticated agents that have more than a handful of tools to call, I've often found the inbuilt structured output/JSON tool calling methods provided by LLM APIs come up short.

\n

Intuitively, one of the reasons for this is that when structured output is enabled, there is no room for chain of thought, free-form text or comments amongst the JSON output of tool calls.

\n

In addition, when you're trying to compare different LLM APIs, you have to switch between different tool calling schemas.

\n

LLMs are trained on a lot of code, and a tool call is really just a function call. So why not just use code blocks for tool calls?

\n

When you use LLMs to output code with tool calls, you may initially think of running the code in a sandbox. But that comes with infra overhead and security concerns. Instead, what if we just parse tool calls in code blocks with regex, and then validate the function names and params before calling the internal functions?

\n

I've had great results with this approach. It's easy to implement and it works with any LLM (including open source models). The chain of thought and comments next to the tool calls is especially helpful when debugging, as the LLM will explain why it decided to call a particular tool with those params.

\n

You can even do clever stuff like parse the text stream from the LLM as it comes in, and call tools as they are returned, instead of waiting for the LLM to finish.

\n

Here's an example of this approach that makes use of zod and the Vercel AI SDK:

\n
// Run this example:\n// npm i zod zod-to-ts relaxed-json ai @ai-sdk/openai\n// tsx tool-calls-as-ts-example.ts\n\nimport { z } from \"zod\";\nimport RJSON from \"relaxed-json\";\nimport { printNode, zodToTs } from \"zod-to-ts\";\n\ntype ToolsList = {\n\t[key: string]: { name: string; schema: z.ZodType<unknown> };\n};\n\nconst getToolsAsTypeScriptString = (toolsList: ToolsList) =>\n\tObject.entries(toolsList)\n\t\t.map(([toolName, { name, schema }]) => {\n\t\t\tconst { node } = zodToTs(schema, toolName);\n\t\t\tconst nodeString = printNode(node);\n\t\t\tconst tsDefAsString = `/** ${name} */ \\n${toolName}(${nodeString})`;\n\t\t\treturn tsDefAsString;\n\t\t})\n\t\t.join(\"\\n\\n\");\n\nconst parseToolsCalledContent = ({\n\tllmResponseWithToolCallsAsJsCodeblock,\n\ttoolsList,\n}: {\n\tllmResponseWithToolCallsAsJsCodeblock: string;\n\ttoolsList: ToolsList;\n}) => {\n\tconst toolsCallRegex =\n\t\t/(\\w+)\\(([^()]*(?:\\([^()]*\\)[^()]*)*)\\)(?:\\s*\\/\\/.*)?/g;\n\tconst toolsCalls =\n\t\tllmResponseWithToolCallsAsJsCodeblock.matchAll(toolsCallRegex);\n\tconst validatedToolsToCall: {\n\t\tname: string;\n\t\targs: any;\n\t\toriginalArgs: string;\n\t}[] = [];\n\tfor (const match of toolsCalls) {\n\t\t// eslint-disable-next-line @typescript-eslint/no-unused-vars\n\t\tconst [_call, toolName, argString] = match;\n\t\t// console.log(`Found match for tools call: ${toolsName}(${argString})`)\n\t\tif (toolName && toolsList.hasOwnProperty(toolName)) {\n\t\t\tconst tool = toolsList[toolName as keyof typeof toolsList];\n\t\t\tconst argsObj = RJSON.parse(argString);\n\t\t\t// Validate the arguments using the Zod schema\n\t\t\tconst validatedArgs = tool.schema.parse(argsObj);\n\t\t\tvalidatedToolsToCall.push({\n\t\t\t\tname: toolName,\n\t\t\t\targs: validatedArgs,\n\t\t\t\toriginalArgs: argString,\n\t\t\t});\n\t\t} else {\n\t\t\tconsole.warn(`Tool ${toolName} is not found.`);\n\t\t}\n\t}\n\treturn validatedToolsToCall;\n};\n\n// EXAMPLE\nimport { generateText } from \"ai\";\nimport { openai } from \"@ai-sdk/openai\";\nconst example = async () => {\n\tconst tools = {\n\t\tgetWeather: {\n\t\t\tname: \"Get weather for location today (default) or N days in the future up to 10 days\",\n\t\t\tfunction: ({\n\t\t\t\tlocation,\n\t\t\t\tdaysInFuture,\n\t\t\t}: {\n\t\t\t\tlocation: string;\n\t\t\t\tdaysInFuture: number;\n\t\t\t}) => {\n\t\t\t\t// TODO: Do actualy weather API call\n\t\t\t\treturn {\n\t\t\t\t\tlocation,\n\t\t\t\t\tdaysInFuture,\n\t\t\t\t\tweather: \"sunny\",\n\t\t\t\t};\n\t\t\t},\n\t\t\tschema: z.object({\n\t\t\t\tlocation: z.string().describe(\"The location to get the weather for.\"),\n\t\t\t\tdaysInFuture: z\n\t\t\t\t\t.number()\n\t\t\t\t\t.describe(\"The number of days in the future to get the weather for.\"),\n\t\t\t}),\n\t\t},\n\t};\n\tconst toolsAsTypeScriptString = getToolsAsTypeScriptString(tools);\n\tconst { text: llmResponseWithToolCallsAsJsCodeblock } = await generateText({\n\t\tmodel: openai(\"gpt-4o\"),\n\t\tprompt: `\n\tAVAILABLE_TOOLS:\n\t\"\"\"\n    ${toolsAsTypeScriptString}\n    \"\"\"\n\n    AVAILABLE_TOOLS must be called in a single javascript codeblock. 
All function arguments must be on a single line.\n\n    QUESTION:\n    \"What is the weather in San Francisco?\"\n    `,\n\t});\n\tconsole.log(\"Tools schema pass to llm:\\n\");\n\tconsole.log(toolsAsTypeScriptString);\n\tconsole.log(\"\\nResponse from llm with tool call code block:\\n\");\n\tconsole.log(llmResponseWithToolCallsAsJsCodeblock);\n\tconst validatedToolsToCall = parseToolsCalledContent({\n\t\tllmResponseWithToolCallsAsJsCodeblock,\n\t\ttoolsList: tools,\n\t});\n\tconsole.log(\"\\nValidated tools to call:\\n\");\n\tconsole.log(validatedToolsToCall);\n};\n\nexample();
\n

Example output:

\n
$ tsx tool-calls-as-ts-example.ts\nTools schema pass to llm:\n\n/** Get weather for location today (default) or N days in the future up to 10 days */\ngetWeather({\n    /** The location to get the weather for. */\n    location: string;\n    /** The number of days in the future to get the weather for. */\n    daysInFuture: number;\n})\n\nResponse from llm with tool call code block:\n\n```javascript\ngetWeather({ location: \"San Francisco\", daysInFuture: 0 })\n```\n\nValidated tools to call:\n\n[\n  {\n    name: 'getWeather',\n    args: { location: 'San Francisco', daysInFuture: 0 },\n    originalArgs: '{ location: \"San Francisco\", daysInFuture: 0 }'\n  }\n]
\n", + "date_published": "2025-02-11T00:00:00Z" + } + , + { + "id": "https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/", + "url": "https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/", + "title": "Using LLM tool calling and long context for better RAG", + "content_html": "

When building a RAG pipeline you'll probably reach for a vector store to store embeddings of document chunks, which are then retrieved and put into context at query time. This works well if your users are asking single-fact queries where the answer can be found in a relevant document chunk. But if your users want to ask more complex questions where the answer requires information spread across the whole document or across multiple documents, retrieving chunks often leaves out critical information and can lead to inaccurate responses.

\n

Relying on document chunks has been a great solution to add knowledge to LLMs with a limited context window. But context windows have grown massively over the past year, with the leading LLMs supporting context windows reaching 1M tokens. This opens the door to new approaches to RAG which are less constrained by context.

\n

Whole document querying RAG #

\n

Instead of retrieving document chunks, I've had success retrieving and querying whole documents. Queries like 'summarize xyz document' or 'compare document abc to xyz' yield a full and complete response without risk of missing important details.

\n

When does this approach work? It works best if your documents are all of the same type or can be put into categories, and if the user queries include enough information to locate the specific document(s) the question is about.

\n

For example, if your documents are client contracts, each may have a client name, date and contract type. If a user asks 'Summarize the most recent contract with Acme Inc?' we have enough information to find this document, and then use the whole document as context to fully answer their question.

\n

Querying whole documents like this calls for a different RAG workflow than the common single step chunk-retrieve-query workflow. Retrieving whole documents and putting them straight into the context could fill up even a large context window.

\n

Instead, we can leverage the function/tool calling ability of many LLMs to create sub-queries to query each document, which can be executed in parallel. We can even make use of cheaper and faster LLMs for these sub-queries which have to process the complete documents.

\n

What does this look like in practice?

\n

Create document query functions #

\n

In the client contracts example, we need to be able to locate and query a client contract document. We can create a function which takes several search filters, retrieves the full text of the top matching document, and then calls an LLM (e.g. gpt-3.5-turbo) with the full document text and the query. The function should accept the filters required to find the document (e.g. client name, date range, contract type), plus a query param which is the query to send to the LLM with the full document text.

\n

There's no set way to search for these documents: you could use SQL, Elastic or even embeddings. The key thing is that it should be able to handle fuzzy search filters for certain params, e.g. the client name in this case.

\n

Here's an example of this function in Python:

\n
def query_client_contract(client_name: str, document_type: str, query: str, from_date: str = None, to_date: str = None):\n    # Search for the full text of the top matching document\n    document = search_client_contract(client_name, document_type, from_date, to_date)\n    # Call the LLM with the full document text and the query\n    messages = [\n        {\"content\": \"Answer the query using the provided text.\", \"role\": \"system\"},\n        {\"content\": document + \"\\n\\nQuery: \" + query, \"role\": \"user\"},\n    ]\n    response = client.chat.completions.create(\n        model=\"gpt-3.5-turbo\",  # Use a cheaper model for the sub-query which will process the full document\n        messages=messages,\n    )\n    return response.choices[0].message.content
\n
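The `search_client_contract` helper used above could be as simple as a SQL lookup. Here's a rough sketch, assuming a hypothetical SQLite table `contracts(client_name, document_type, signed_date, full_text)` and LIKE-based fuzzy matching on the client name; swap in Elastic or embeddings as suits your data:

```python
# A rough sketch of the search helper (hypothetical schema, not a full implementation).
# Assumes a SQLite database "contracts.db" with a table:
#   contracts(client_name TEXT, document_type TEXT, signed_date TEXT, full_text TEXT)
import sqlite3

def search_client_contract(client_name: str, document_type: str, from_date: str = None, to_date: str = None) -> str:
    conn = sqlite3.connect("contracts.db")
    sql = "SELECT full_text FROM contracts WHERE client_name LIKE ? AND document_type = ?"
    params = [f"%{client_name}%", document_type]  # LIKE gives us fuzzy client-name matching
    if from_date:
        sql += " AND signed_date >= ?"
        params.append(from_date)
    if to_date:
        sql += " AND signed_date <= ?"
        params.append(to_date)
    sql += " ORDER BY signed_date DESC LIMIT 1"  # take the most recent matching document
    row = conn.execute(sql, params).fetchone()
    conn.close()
    return row[0] if row else ""
```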

Sub-query function calls #

\n

Now we have the document query function, we are going to use OpenAI Function Calling to create sub-queries to this function.

\n

First we use JSON Schema to define the tool for OpenAI function calling:

\n
tools = [\n\t{\n\t\t\"type\": \"function\",\n\t\t\"function\": {\n\t\t\t\"name\": \"query_client_contract\",\n\t\t\t\"description\":\n\t\t\t\t\"Send the query to AI to ask the full document text. The AI response will be returned.\",\n\t\t\t\"parameters\": {\n\t\t\t\t\"type\": \"object\",\n\t\t\t\t\"properties\": {\n\t\t\t\t\t\"client_name\": {\n\t\t\t\t\t\t\"type\": \"string\",\n\t\t\t\t\t\t\"description\": \"Name of the client the contract is for.\",\n\t\t\t\t\t},\n\t\t\t\t\t\"document_type\": {\n\t\t\t\t\t\t\"type\": \"string\",\n\t\t\t\t\t\t\"enum\": [\"contract\", \"lease\"],\n\t\t\t\t\t\t\"description\": \"The type of legal contract.\",\n\t\t\t\t\t},\n\t\t\t\t\t\"from_date\": {\n\t\t\t\t\t\t\"type\": \"string\",\n\t\t\t\t\t\t\"format\": \"date-time\",\n\t\t\t\t\t\t\"description\": \"Find documents from this date.\",\n\t\t\t\t\t},\n\t\t\t\t\t\"to_date\": {\n\t\t\t\t\t\t\"type\": \"string\",\n\t\t\t\t\t\t\"format\": \"date-time\",\n\t\t\t\t\t\t\"description\": \"Find documents up to this date.\",\n\t\t\t\t\t},\n\t\t\t\t\t\"query\": {\n\t\t\t\t\t\t\"type\": \"string\",\n\t\t\t\t\t\t\"description\": \"The query to send to the LLM along with the full document text.\",\n\t\t\t\t\t},\n\t\t\t\t},\n\t\t\t\t\"required\": [\"client_name\", \"document_type\", \"query\"],\n\t\t\t},\n\t\t},\n\t}\n]
\n

Then we need to create a helper function to execute the function when the LLM requests it:

\n
def execute_function_call(message):\n    if message.tool_calls[0].function.name == \"query_client_contract\":\n        args = json.loads(message.tool_calls[0].function.arguments)\n        results = query_client_contract(client_name=args[\"client_name\"], document_type=args[\"document_type\"], query=args[\"query\"], from_date=args.get(\"from_date\"), to_date=args.get(\"to_date\"))\n    else:\n        results = f\"Error: function {message.tool_calls[0].function.name} does not exist\"\n    return results
\n

Now in the main chat function, we take a user's query, and if GPT suggests a function call, we execute it and append the results to the chat messages, and then send the messages back to GPT for the final answer:

\n
def ask_ai(query: str):\n    messages = [\n        {\"content\": \"Answer the user query, calling functions if required.\", \"role\": \"system\"},\n        {\"content\": query, \"role\": \"user\"},\n    ]\n\n    chat_response = client.chat.completions.create(\n        model=\"gpt-4-turbo\", # Use a more powerful model for function calling\n        tools=tools,\n        tool_choice=\"auto\", # \"auto\" means the model can pick between generating a message or calling a function\n        messages=messages,\n    )\n\n    assistant_message = chat_response.choices[0].message\n\n    if assistant_message.tool_calls:\n        # Record the suggested call, execute it, and append the result to the conversation\n        assistant_message.content = str(assistant_message.tool_calls[0].function)\n        messages.append({\"role\": assistant_message.role, \"content\": assistant_message.content})\n        results = execute_function_call(assistant_message)\n        messages.append({\"role\": \"function\", \"tool_call_id\": assistant_message.tool_calls[0].id, \"name\": assistant_message.tool_calls[0].function.name, \"content\": results})\n\n    second_chat_response = client.chat.completions.create(\n        model=\"gpt-4-turbo\", # Use a more powerful model for function calling\n        tools=tools,\n        tool_choice=\"auto\", # \"auto\" means the model can pick between generating a message or calling a function\n        messages=messages,\n    )\n    print(second_chat_response.choices[0].message.content)
\n
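Putting it together, a hypothetical invocation (assuming the OpenAI `client` and the functions above are defined) looks like:

```python
# Hypothetical end-to-end call using the functions defined above.
ask_ai("Summarize the most recent contract with Acme Inc")
```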

The benefits of this approach #

\n

There are several benefits to this approach. The main benefit, as discussed above, is that we are querying whole documents. For many use cases this is going to provide more complete answers for users. You can also easily extend this approach by adding more functions for different document types and data sources. GPT will call multiple functions which you can execute in parallel, and in the final GPT call we can use gpt-4-turbo to integrate the results and provide the final answer. If you do have a handful of unknown documents, you can still use the chunk-retrieve-query approach for those, and simply add a function to the tool list to query the chunked documents with a typical RAG pipeline.

\n
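To make the parallel execution concrete, here's a rough sketch (my own illustration, not a full implementation) that fans the model's suggested tool calls out to a thread pool. Each sub-query is I/O-bound (an LLM API request), so threads are enough; it assumes every suggested call is `query_client_contract` as defined above:

```python
# A rough sketch: run several GPT-suggested tool calls concurrently.
import json
from concurrent.futures import ThreadPoolExecutor

def run_tool_call(tool_call):
    # Per-call helper; mirrors execute_function_call above but handles a
    # single tool call (this example only has query_client_contract).
    args = json.loads(tool_call.function.arguments)
    return query_client_contract(
        client_name=args["client_name"],
        document_type=args["document_type"],
        query=args["query"],
        from_date=args.get("from_date"),
        to_date=args.get("to_date"),
    )

def run_tool_calls_in_parallel(assistant_message, max_workers: int = 8):
    # Threads are fine here because each call just waits on an API response.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(run_tool_call, assistant_message.tool_calls))
```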

I'm excited to see how this approach can be used in practice. I think it will be especially useful for complex questions where the answer is spread across multiple documents, or where the user query is for a summary of a document. I'd love to hear how you get on with this approach. Please reach out if you have any other ideas for how to improve this approach, or related new ideas for improving RAG.

\n", + "date_published": "2024-04-25T00:00:00Z" + } + , + { + "id": "https://dc.tanner.me/blog/building-an-ai-superserver/", + "url": "https://dc.tanner.me/blog/building-an-ai-superserver/", + "title": "Building an AI SuperServer for LLM training and experiments", + "content_html": "

Impressive new language models like Llama and Mistral have broadened the accessibility of AI training. If you want to fine-tune a model with your own data, it's now relatively easy to do with tools like Axolotl and a few dollars spent on a GPU cloud. But if you want to go deeper and train larger models or try new methods, the cloud bill can quickly rack up. Renting 8 A100s on AWS will set you back an astounding $350,000 per year! There are cheaper clouds, but they can still cost tens of thousands a year.

\n

I've always enjoyed building PCs. I remember when I was 16 and my grandma bought me my first PC to assemble myself. So in the name of fun and saving money, I embarked on building an AI server so that I could more affordably do independent AI research.

\n

Your options #

\n

Depending on your budget and use case, there are a few routes to take when building an AI server.

\n

Open frame #

\n

\"Miner

\n

If the server is just for you, and you want to keep it at home or in your basement, the most affordable option is essentially a powerful consumer PC with an open frame case (originally designed for crypto miners). You'll be able to find lots of advice on Reddit for this route.

\n

The important things are a motherboard that has lots of 16x PCIe slots, PCIe risers with redrivers, and multiple PSUs (depending on the number of GPUs you choose). You can buy everything second hand if you like, including the GPUs. For GPUs you're best going with RTX 3090s or 4090s in this setup, and because there's no case, you won't have issues with space or airflow.

\n

The benefit of this route is cost, but also the ability to start simple with just a single GPU and grow as you like by adding more.

\n

Rack server #

\n

\"Server

\n

If you're planning to train larger models, have more servers or datacenter GPUs, or just don't have anywhere to house a noisy, hot server at home, you can go the rack mountable server route. This is the route I've gone, as our house doesn't have a basement and our internet isn't that fast. My server now lives in a datacenter where it's cooled and well connected.

\n

I found fewer resources on this route, so the rest of this guide is aimed at helping you build and set up a rack mountable GPU server.

\n

Building the server #

\n

Supermicro make great server systems, many designed specifically for AI use cases. For example, the SuperServer 4029GP-TRT2 is a mid range 4U dual CPU server with 10 PCIe slots - ideal for filling with GPUs! I found a well priced one from an IT supplier in the UK. The newer model is more expensive, but may be easier to find. Note that the model I used only has PCIe 3.0. If you are using RTX 4090s or a newer datacenter GPU, you will probably want the newer model which supports PCIe 4.0.

\n

\"SuperServer

\n

It arrived at my house on a pallet. It was heavier than I expected!

\n

\"The

\n

After lugging it up the stairs and reading the manual, I installed 10 RTX 3090s I bought second hand from someone who had previously been using them for mining. Note that to fit the maximum number of GPUs in a system you'll need to find blower or turbo style GPUs that are only two slots wide. The vast majority of 3090 and 4090 GPUs are designed for gaming: they take up 3 slots, the power connectors come out the top, and you won't be able to put the lid on your server. If you can't find blower style consumer GPUs, your next best bet is the RTX A6000, which is still fairly good value for money, even if it costs around 3x more than a 4090.

\n

You'll also need to add the CPUs (two of them), memory and storage. I sourced everything secondhand from eBay. Most things cost no more than a few hundred dollars each. I went with 2x Intel Xeon Platinum 8160, 12x32GB DDR memory and an 8TB SSD.

\n

Once everything was installed, I turned it on for the first time - what I heard could only be described as a mini jet engine. Server fans are noisy.

\n

The next step was to set up the OS and environment.

\n

Setting up the OS #

\n

Supermicro servers have an inbuilt web UI called IPMI for accessing the server console and monitor output. There is a dedicated LAN port for IPMI on this server. You should also plug a second LAN cable into one of the main LAN ports, otherwise your server won't actually have internet access (this confused me initially).

\n

It will get an IP via DHCP, so I just logged into my router to see the IP it was assigned and visited that in my browser. You'll be asked to log in: the username is 'ADMIN' and the password is printed on stickers in several places inside your server case.

\n

I decided to install Ubuntu 22.04 Server. Create a bootable Ubuntu USB stick and plug it into the server. Now connect to the web UI console by going to the server's IP, then clicking Remote Control > iKVM/HTML5 and clicking the button. You can now reboot the server and you'll see the BIOS prompt, where you can hit an F key to choose a boot drive. Do this and select the USB.

\n

The IPMI web console doesn't support pasting text. So getting your ssh pubkey over is a bit of a pain. Here's a solution I've used:

\n
    \n
  1. On your local computer which has your ssh pubkey on it, run cd .ssh && python -m http.server (this serves your whole .ssh directory, including your private key, over http without authentication, so please be aware this isn't a great idea).
  2. \n
  3. On the server, via the IPMI web console, log in with the user you created when installing Ubuntu, and run wget -qO - "http://192.168.178.21:8000/id_ed25519.pub" > ~/.ssh/authorized_keys && chmod 600 ~/.ssh/authorized_keys.
  4. \n
  5. You should now be able to ssh into your server. Remember to stop the python -m http.server on your local computer now.
  6. \n
\n

Important system tweaks #

\n

There are some tweaks we can make to improve the performance and reliability of our server. Following the tips here (archived page in case Medium paywalls it), first disable the kernel security mitigations on compute instances; the collateral performance penalty is much more expensive than the imposed risks. Edit /etc/default/grub and add:

\n
GRUB_CMDLINE_LINUX_DEFAULT="pti=off spectre_v2=off l1tf=off nospec_store_bypass_disable no_stf_barrier"\n
\n

It's also critical to disable the IOMMU if you plan to use peer-to-peer GPU communication, e.g. multi-GPU model training in TensorFlow or PyTorch. Also add to /etc/default/grub (these flags go on the same GRUB_CMDLINE_LINUX_DEFAULT line as above; run update-grub and reboot for the changes to take effect):

\n
GRUB_CMDLINE_LINUX_DEFAULT="intel_iommu=off rcutree.rcu_idle_gp_delay=1"\n
\n

Check GPU P2P communication #

\n

If you're using GPUs that support it, P2P communication speeds things up a lot.

\n

Note it's important to check that PCI Access Control Services (ACS) is disabled.

\n

You can follow these steps to test your system's GPU P2P speed: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#gpu-to-gpu-communication

\n
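As a quick sanity check before a long training run, you can also ask PyTorch whether each GPU pair reports peer access. This is just a rough sketch of my own, not NVIDIA's official test; `nvidia-smi topo -m` is also useful for seeing the link topology:

```python
# A quick P2P sanity check (sketch): report any GPU pair without peer access.
import torch

def check_p2p() -> None:
    n = torch.cuda.device_count()
    for i in range(n):
        for j in range(n):
            if i != j and not torch.cuda.can_device_access_peer(i, j):
                print(f"GPU {i} -> GPU {j}: no P2P access")
    print(f"Checked {n} GPUs")

if __name__ == "__main__":
    check_p2p()
```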

NVIDIA drivers and python environment #

\n

We now want to get the NVIDIA drivers, CUDA and our Python environments set up.

\n

I've had success using these steps to install CUDA v11.8: https://gist.github.com/MihailCosmin/affa6b1b71b43787e9228c25fe15aeba\nSome people have mentioned using a newer NVIDIA driver version than the nvidia-driver-515 in the script. But beware: there's a bug in driver version 545 that prevents 3090 and 4090 cards from using P2P (see this GitHub issue for a discussion of the problem). If you have a driver with this bug, you may find your training runs stall and time out. Version 535 worked well for me.

\n

I like to use Conda with the fastchan channel for my environments. But you may prefer a different Python virtual env tool.

\n

Now you can train some AI #

\n

\"nvidia-smi\"

\n

I'm enjoying using Axolotl for LLM fine-tuning. Hugging Face Transformers is also a great place to start.

\n
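If you want to verify the whole stack end to end before a real run, a minimal sketch like the one below (using a small public checkpoint such as gpt2 purely for illustration, not a real fine-tuning recipe) exercises the GPU, drivers and Transformers install with a single training step:

```python
# A minimal smoke test: one manual fine-tuning step with Hugging Face Transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Causal LM training: the labels are the input ids themselves.
batch = tokenizer("Hello AI SuperServer", return_tensors="pt").to(device)
outputs = model(**batch, labels=batch["input_ids"])
outputs.loss.backward()
optimizer.step()
print(f"loss: {outputs.loss.item():.3f}")
```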

Datacenter trip #

\n

\"Datacenter\"

\n

Since the GPUs are super noisy and hot, I found a local datacenter that would colocate the server for a reasonable cost. Installation was easier than I expected, although we ended up putting it in a lower slot on the rack because it was too heavy to lift halfway up without a lift.

\n

This Colocation Survival Guide was super helpful, as it walks you through all the aspects of colocating, from the physical setup to networking.

\n

Other things #

\n

Set a lower max power limit for GPUs #

\n

Some people find that lowering the power limit just a bit reduces the max temperature without any real performance sacrifice. I set the max power for my RTX 3090s to 300W (from 305W) by following these steps.

\n
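Here's a rough sketch of how you could script this across all cards (it assumes nvidia-smi is on the PATH and you have root privileges; adjust the wattage for your own GPUs):

```python
# Sketch: cap every GPU's power limit via nvidia-smi (requires root/sudo).
import subprocess

def set_power_limit(watts: int = 300) -> None:
    # Enable persistence mode so the limit sticks until the next reboot.
    subprocess.run(["nvidia-smi", "-pm", "1"], check=True)
    gpu_indexes = subprocess.run(
        ["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"],
        capture_output=True, text=True, check=True,
    ).stdout.split()
    for idx in gpu_indexes:
        subprocess.run(["nvidia-smi", "-i", idx, "-pl", str(watts)], check=True)

set_power_limit(300)
```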

Docker bug workaround #

\n

If you're planning to use Docker with the GPUs, note there's a bug on Ubuntu 22.04 which needs working around.

\n

Going bigger? #

\n

If you're planning to build a cluster, there is an excellent video from the Lambda team: Building a GPU cluster for AI.

\n", + "date_published": "2024-03-14T00:00:00Z" + } + + ] +} diff --git a/feed/feed.xml b/feed/feed.xml new file mode 100644 index 0000000..0b095b7 --- /dev/null +++ b/feed/feed.xml @@ -0,0 +1,354 @@ + + + Damien C. Tanner + Notes from a journey of compounding curiosity. + + + 2025-02-11T00:00:00Z + https://dc.tanner.me/ + + Damien C. Tanner + + + + + LLM tool calling as code blocks + + 2025-02-11T00:00:00Z + https://dc.tanner.me/blog/tools-as-code/ + <p>When building sophisticated agents that have more than a handful of tools to call, I've often found the inbuilt structured output/json tool calling methods provided by LMA APIs come up short.</p> +<p>Intuitively, one of the reasons for this is that when the structured output is enabled, there is no room for chain of thought, text or inbuilt support for comments amongst the JSON output of tool calls.</p> +<p>In addition, when you're trying to compare different LLM APIs, you have to switch between different tool calling schemas.</p> +<p>LLMs are trained on a lot of code, and a tool call is really just a function call. So why not just use code blocks for tool calls?</p> +<p>When you use LLMs to output code with tool calls, you may initially think of running the code in a sandbox. But that comes with infra overhead and security concerns. Instead, what if we just parse tool calls in code blocks with regex, and then validate the function names and params before calling the internal functions?</p> +<p>I've had great results with this approach. It's easy to implement and it works with any LLM (including open source models). The chain of thought and comments next to the tool calls is especially helpful when debugging, as the LLM will explain why it decided to call a particular tool with those params.</p> +<p>You can even do clever stuff like parse the text stream from the LLM as it comes in,and call tools as they are returned, instead of waiting for the LLM to finish.</p> +<p>He's an example of this approach that makes use of zod and Vercel AI SDK:</p> +<pre class="language-typescript" tabindex="0"><code class="language-typescript"><span class="token comment">// Run this example:</span> +<span class="token comment">// npm i zod zod-to-ts relaxed-json ai @ai-sdk/openai</span> +<span class="token comment">// tsx tool-calls-as-ts-example.ts</span> + +<span class="token keyword">import</span> <span class="token punctuation">{</span> z <span class="token punctuation">}</span> <span class="token keyword">from</span> <span class="token string">"zod"</span><span class="token punctuation">;</span> +<span class="token keyword">import</span> <span class="token constant">RJSON</span> <span class="token keyword">from</span> <span class="token string">"relaxed-json"</span><span class="token punctuation">;</span> +<span class="token keyword">import</span> <span class="token punctuation">{</span> printNode<span class="token punctuation">,</span> zodToTs <span class="token punctuation">}</span> <span class="token keyword">from</span> <span class="token string">"zod-to-ts"</span><span class="token punctuation">;</span> + +<span class="token keyword">type</span> <span class="token class-name">ToolsList</span> <span class="token operator">=</span> <span class="token punctuation">{</span> + <span class="token punctuation">[</span>key<span class="token operator">:</span> <span class="token builtin">string</span><span class="token punctuation">]</span><span class="token operator">:</span> <span class="token punctuation">{</span> name<span class="token operator">:</span> 
<span class="token builtin">string</span><span class="token punctuation">;</span> schema<span class="token operator">:</span> z<span class="token punctuation">.</span>ZodType<span class="token operator">&lt;</span><span class="token builtin">unknown</span><span class="token operator">></span> <span class="token punctuation">}</span><span class="token punctuation">;</span> +<span class="token punctuation">}</span><span class="token punctuation">;</span> + +<span class="token keyword">const</span> <span class="token function-variable function">getToolsAsTypeScriptString</span> <span class="token operator">=</span> <span class="token punctuation">(</span>toolsList<span class="token operator">:</span> ToolsList<span class="token punctuation">)</span> <span class="token operator">=></span> + Object<span class="token punctuation">.</span><span class="token function">entries</span><span class="token punctuation">(</span>toolsList<span class="token punctuation">)</span> + <span class="token punctuation">.</span><span class="token function">map</span><span class="token punctuation">(</span><span class="token punctuation">(</span><span class="token punctuation">[</span>toolName<span class="token punctuation">,</span> <span class="token punctuation">{</span> name<span class="token punctuation">,</span> schema <span class="token punctuation">}</span><span class="token punctuation">]</span><span class="token punctuation">)</span> <span class="token operator">=></span> <span class="token punctuation">{</span> + <span class="token keyword">const</span> <span class="token punctuation">{</span> node <span class="token punctuation">}</span> <span class="token operator">=</span> <span class="token function">zodToTs</span><span class="token punctuation">(</span>schema<span class="token punctuation">,</span> toolName<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> nodeString <span class="token operator">=</span> <span class="token function">printNode</span><span class="token punctuation">(</span>node<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> tsDefAsString <span class="token operator">=</span> <span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string">/** </span><span class="token interpolation"><span class="token interpolation-punctuation punctuation">${</span>name<span class="token interpolation-punctuation punctuation">}</span></span><span class="token string"> */ \n</span><span class="token interpolation"><span class="token interpolation-punctuation punctuation">${</span>toolName<span class="token interpolation-punctuation punctuation">}</span></span><span class="token string">(</span><span class="token interpolation"><span class="token interpolation-punctuation punctuation">${</span>nodeString<span class="token interpolation-punctuation punctuation">}</span></span><span class="token string">)</span><span class="token template-punctuation string">`</span></span><span class="token punctuation">;</span> + <span class="token keyword">return</span> tsDefAsString<span class="token punctuation">;</span> + <span class="token punctuation">}</span><span class="token punctuation">)</span> + <span class="token punctuation">.</span><span class="token function">join</span><span class="token punctuation">(</span><span class="token string">"\n\n"</span><span class="token punctuation">)</span><span class="token 
punctuation">;</span> + +<span class="token keyword">const</span> <span class="token function-variable function">parseToolsCalledContent</span> <span class="token operator">=</span> <span class="token punctuation">(</span><span class="token punctuation">{</span> + llmResponseWithToolCallsAsJsCodeblock<span class="token punctuation">,</span> + toolsList<span class="token punctuation">,</span> +<span class="token punctuation">}</span><span class="token operator">:</span> <span class="token punctuation">{</span> + llmResponseWithToolCallsAsJsCodeblock<span class="token operator">:</span> <span class="token builtin">string</span><span class="token punctuation">;</span> + toolsList<span class="token operator">:</span> ToolsList<span class="token punctuation">;</span> +<span class="token punctuation">}</span><span class="token punctuation">)</span> <span class="token operator">=></span> <span class="token punctuation">{</span> + <span class="token keyword">const</span> toolsCallRegex <span class="token operator">=</span> + <span class="token regex"><span class="token regex-delimiter">/</span><span class="token regex-source language-regex">(\w+)\(([^()]*(?:\([^()]*\)[^()]*)*)\)(?:\s*\/\/.*)?</span><span class="token regex-delimiter">/</span><span class="token regex-flags">g</span></span><span class="token punctuation">;</span> + <span class="token keyword">const</span> toolsCalls <span class="token operator">=</span> + llmResponseWithToolCallsAsJsCodeblock<span class="token punctuation">.</span><span class="token function">matchAll</span><span class="token punctuation">(</span>toolsCallRegex<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> validatedToolsToCall<span class="token operator">:</span> <span class="token punctuation">{</span> + name<span class="token operator">:</span> <span class="token builtin">string</span><span class="token punctuation">;</span> + args<span class="token operator">:</span> <span class="token builtin">any</span><span class="token punctuation">;</span> + originalArgs<span class="token operator">:</span> <span class="token builtin">string</span><span class="token punctuation">;</span> + <span class="token punctuation">}</span><span class="token punctuation">[</span><span class="token punctuation">]</span> <span class="token operator">=</span> <span class="token punctuation">[</span><span class="token punctuation">]</span><span class="token punctuation">;</span> + <span class="token keyword">for</span> <span class="token punctuation">(</span><span class="token keyword">const</span> match <span class="token keyword">of</span> toolsCalls<span class="token punctuation">)</span> <span class="token punctuation">{</span> + <span class="token comment">// eslint-disable-next-line @typescript-eslint/no-unused-vars</span> + <span class="token keyword">const</span> <span class="token punctuation">[</span>_call<span class="token punctuation">,</span> toolName<span class="token punctuation">,</span> argString<span class="token punctuation">]</span> <span class="token operator">=</span> match<span class="token punctuation">;</span> + <span class="token comment">// console.log(`Found match for tools call: ${toolsName}(${argString})`)</span> + <span class="token keyword">if</span> <span class="token punctuation">(</span>toolName <span class="token operator">&amp;&amp;</span> toolsList<span class="token punctuation">.</span><span class="token function">hasOwnProperty</span><span class="token 
punctuation">(</span>toolName<span class="token punctuation">)</span><span class="token punctuation">)</span> <span class="token punctuation">{</span> + <span class="token keyword">const</span> tool <span class="token operator">=</span> toolsList<span class="token punctuation">[</span>toolName <span class="token keyword">as</span> <span class="token keyword">keyof</span> <span class="token keyword">typeof</span> toolsList<span class="token punctuation">]</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> argsObj <span class="token operator">=</span> <span class="token constant">RJSON</span><span class="token punctuation">.</span><span class="token function">parse</span><span class="token punctuation">(</span>argString<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token comment">// Validate the arguments using the Zod schema</span> + <span class="token keyword">const</span> validatedArgs <span class="token operator">=</span> tool<span class="token punctuation">.</span>schema<span class="token punctuation">.</span><span class="token function">parse</span><span class="token punctuation">(</span>argsObj<span class="token punctuation">)</span><span class="token punctuation">;</span> + validatedToolsToCall<span class="token punctuation">.</span><span class="token function">push</span><span class="token punctuation">(</span><span class="token punctuation">{</span> + name<span class="token operator">:</span> toolName<span class="token punctuation">,</span> + args<span class="token operator">:</span> validatedArgs<span class="token punctuation">,</span> + originalArgs<span class="token operator">:</span> argString<span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token punctuation">}</span> <span class="token keyword">else</span> <span class="token punctuation">{</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">warn</span><span class="token punctuation">(</span><span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string">Tool </span><span class="token interpolation"><span class="token interpolation-punctuation punctuation">${</span>toolName<span class="token interpolation-punctuation punctuation">}</span></span><span class="token string"> is not found.</span><span class="token template-punctuation string">`</span></span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token punctuation">}</span> + <span class="token punctuation">}</span> + <span class="token keyword">return</span> validatedToolsToCall<span class="token punctuation">;</span> +<span class="token punctuation">}</span><span class="token punctuation">;</span> + +<span class="token comment">// EXAMPLE</span> +<span class="token keyword">import</span> <span class="token punctuation">{</span> generateText <span class="token punctuation">}</span> <span class="token keyword">from</span> <span class="token string">"ai"</span><span class="token punctuation">;</span> +<span class="token keyword">import</span> <span class="token punctuation">{</span> openai <span class="token punctuation">}</span> <span class="token keyword">from</span> <span class="token string">"@ai-sdk/openai"</span><span class="token punctuation">;</span> +<span class="token 
keyword">const</span> <span class="token function-variable function">example</span> <span class="token operator">=</span> <span class="token keyword">async</span> <span class="token punctuation">(</span><span class="token punctuation">)</span> <span class="token operator">=></span> <span class="token punctuation">{</span> + <span class="token keyword">const</span> tools <span class="token operator">=</span> <span class="token punctuation">{</span> + getWeather<span class="token operator">:</span> <span class="token punctuation">{</span> + name<span class="token operator">:</span> <span class="token string">"Get weather for location today (default) or N days in the future up to 10 days"</span><span class="token punctuation">,</span> + <span class="token function-variable function">function</span><span class="token operator">:</span> <span class="token punctuation">(</span><span class="token punctuation">{</span> + location<span class="token punctuation">,</span> + daysInFuture<span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token operator">:</span> <span class="token punctuation">{</span> + location<span class="token operator">:</span> <span class="token builtin">string</span><span class="token punctuation">;</span> + daysInFuture<span class="token operator">:</span> <span class="token builtin">number</span><span class="token punctuation">;</span> + <span class="token punctuation">}</span><span class="token punctuation">)</span> <span class="token operator">=></span> <span class="token punctuation">{</span> + <span class="token comment">// TODO: Do actualy weather API call</span> + <span class="token keyword">return</span> <span class="token punctuation">{</span> + location<span class="token punctuation">,</span> + daysInFuture<span class="token punctuation">,</span> + weather<span class="token operator">:</span> <span class="token string">"sunny"</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">;</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + schema<span class="token operator">:</span> z<span class="token punctuation">.</span><span class="token function">object</span><span class="token punctuation">(</span><span class="token punctuation">{</span> + location<span class="token operator">:</span> z<span class="token punctuation">.</span><span class="token function">string</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">.</span><span class="token function">describe</span><span class="token punctuation">(</span><span class="token string">"The location to get the weather for."</span><span class="token punctuation">)</span><span class="token punctuation">,</span> + daysInFuture<span class="token operator">:</span> z + <span class="token punctuation">.</span><span class="token function">number</span><span class="token punctuation">(</span><span class="token punctuation">)</span> + <span class="token punctuation">.</span><span class="token function">describe</span><span class="token punctuation">(</span><span class="token string">"The number of days in the future to get the weather for."</span><span class="token punctuation">)</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">)</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token 
punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> toolsAsTypeScriptString <span class="token operator">=</span> <span class="token function">getToolsAsTypeScriptString</span><span class="token punctuation">(</span>tools<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> <span class="token punctuation">{</span> text<span class="token operator">:</span> llmResponseWithToolCallsAsJsCodeblock <span class="token punctuation">}</span> <span class="token operator">=</span> <span class="token keyword">await</span> <span class="token function">generateText</span><span class="token punctuation">(</span><span class="token punctuation">{</span> + model<span class="token operator">:</span> <span class="token function">openai</span><span class="token punctuation">(</span><span class="token string">"gpt-4o"</span><span class="token punctuation">)</span><span class="token punctuation">,</span> + prompt<span class="token operator">:</span> <span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string"> + AVAILABLE_TOOLS: + """ + </span><span class="token interpolation"><span class="token interpolation-punctuation punctuation">${</span>toolsAsTypeScriptString<span class="token interpolation-punctuation punctuation">}</span></span><span class="token string"> + """ + + AVAILABLE_TOOLS must be called in a single javascript codeblock. All function arguments must be on a single line. + + QUESTION: + "What is the weather in San Francisco?" + </span><span class="token template-punctuation string">`</span></span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span><span class="token string">"Tools schema pass to llm:\n"</span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span>toolsAsTypeScriptString<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span><span class="token string">"\nResponse from llm with tool call code block:\n"</span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span>llmResponseWithToolCallsAsJsCodeblock<span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token keyword">const</span> validatedToolsToCall <span class="token operator">=</span> <span class="token function">parseToolsCalledContent</span><span class="token punctuation">(</span><span class="token punctuation">{</span> + llmResponseWithToolCallsAsJsCodeblock<span class="token punctuation">,</span> + toolsList<span class="token operator">:</span> tools<span class="token punctuation">,</span> + <span class="token punctuation">}</span><span 
class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span><span class="token string">"\nValidated tools to call:\n"</span><span class="token punctuation">)</span><span class="token punctuation">;</span> + <span class="token builtin">console</span><span class="token punctuation">.</span><span class="token function">log</span><span class="token punctuation">(</span>validatedToolsToCall<span class="token punctuation">)</span><span class="token punctuation">;</span> +<span class="token punctuation">}</span><span class="token punctuation">;</span> + +<span class="token function">example</span><span class="token punctuation">(</span><span class="token punctuation">)</span><span class="token punctuation">;</span></code></pre> +<p>Example output:</p> +<pre class="language-js" tabindex="0"><code class="language-js">$ tsx tool<span class="token operator">-</span>calls<span class="token operator">-</span><span class="token keyword">as</span><span class="token operator">-</span>ts<span class="token operator">-</span>example<span class="token punctuation">.</span>ts +Tools schema pass to llm<span class="token operator">:</span> + +<span class="token comment">/** Get weather for location today (default) or N days in the future up to 10 days */</span> +<span class="token function">getWeather</span><span class="token punctuation">(</span><span class="token punctuation">{</span> + <span class="token comment">/** The location to get the weather for. */</span> + <span class="token literal-property property">location</span><span class="token operator">:</span> string<span class="token punctuation">;</span> + <span class="token comment">/** The number of days in the future to get the weather for. 
*/</span> + <span class="token literal-property property">daysInFuture</span><span class="token operator">:</span> number<span class="token punctuation">;</span> +<span class="token punctuation">}</span><span class="token punctuation">)</span> + +Response from llm <span class="token keyword">with</span> tool call code block<span class="token operator">:</span> + +<span class="token template-string"><span class="token template-punctuation string">`</span><span class="token template-punctuation string">`</span></span><span class="token template-string"><span class="token template-punctuation string">`</span><span class="token string">javascript +getWeather({ location: "San Francisco", daysInFuture: 0 }) +</span><span class="token template-punctuation string">`</span></span><span class="token template-string"><span class="token template-punctuation string">`</span><span class="token template-punctuation string">`</span></span> + +Validated tools to call<span class="token operator">:</span> + +<span class="token punctuation">[</span> + <span class="token punctuation">{</span> + <span class="token literal-property property">name</span><span class="token operator">:</span> <span class="token string">'getWeather'</span><span class="token punctuation">,</span> + <span class="token literal-property property">args</span><span class="token operator">:</span> <span class="token punctuation">{</span> <span class="token literal-property property">location</span><span class="token operator">:</span> <span class="token string">'San Francisco'</span><span class="token punctuation">,</span> <span class="token literal-property property">daysInFuture</span><span class="token operator">:</span> <span class="token number">0</span> <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token literal-property property">originalArgs</span><span class="token operator">:</span> <span class="token string">'{ location: "San Francisco", daysInFuture: 0 }'</span> + <span class="token punctuation">}</span> +<span class="token punctuation">]</span></code></pre> + + + + + Using LLM tool calling and long context for better RAG + + 2024-04-25T00:00:00Z + https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/ + <p>When building a RAG pipeline you'll probably reach for a vector store to store embeddings of document chunks, which are then retrieved and put into context at query time. This works well if your users are asking single fact queries where the answer can be found in a relevant document chunk. But if your users want to ask more complex questions where the answer requires information spread across the whole document or across multiple documents, retrieving chunks often leaves out critical information and can lead to inaccurate responses.</p> +<p>Relying on document chunks has been a great solution to add knowledge to LLMs with a limited context window. But context windows have grown massively over the past year, with the leading LLMs supporting context windows reaching 1M tokens. This opens the door to new approaches to RAG which are less constrained by context.</p> +<h2 id="whole-document-querying-rag" tabindex="-1">Whole document querying RAG <a class="header-anchor" href="https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/">#</a></h2> +<p>Instead of retrieving document chunks, I've had success retrieving and querying whole documents. Queries like 'summarize xyz document' or 'compare document abc to xyz' yield a full and complete summary without risk of missing important details.</p> +<p>When does this approach work? This approach works best if your documents are all of the same type or can be put into categories, and if the user queries include enough information to locate the specific document(s) the question is for.</p> +<p>For example, if your documents are client contracts, each may have a client name, date and contract type. If a user asks 'Summarize the most recent contract with Acme Inc?', we have enough information to find this document, and then use the whole document as context to fully answer their question.</p> +<p>Querying whole documents like this calls for a different RAG workflow than the common single-step chunk-retrieve-query workflow. Retrieving whole documents and putting them straight into the context could fill up even a large context window.</p> +<p>Instead, we can leverage the function/tool calling ability of many LLMs to create sub-queries to query each document, which can be executed in parallel. We can even make use of cheaper and faster LLMs for these sub-queries which have to process the complete documents.</p>
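+<p>As a rough sketch, once the model returns several tool calls, the sub-queries can be fanned out with a thread pool; here <code>execute_one</code> is just a placeholder for whatever executes a single tool call (for example, a per-call version of the <code>execute_function_call</code> helper shown later):</p> +<pre><code>from concurrent.futures import ThreadPoolExecutor + +def run_sub_queries_in_parallel(tool_calls, execute_one, max_workers=8): +    # Each sub-query sends a whole document to a cheaper LLM, so run them concurrently. +    with ThreadPoolExecutor(max_workers=max_workers) as pool: +        return list(pool.map(execute_one, tool_calls)) +</code></pre>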
+<p>What does this look like in practice?</p> +<h3 id="create-document-query-functions" tabindex="-1">Create document query functions <a class="header-anchor" href="https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/">#</a></h3> +<p>In the client contracts example, we would need to be able to locate and query a client contract document. We can create a function which takes several search filters, retrieves the full text of the top matching document, and then calls an LLM (e.g. gpt-3.5-turbo) with the full document text and the query. The function should accept the filters required to find the document, e.g. client name, date range, contract type, plus a query param which is the query to send to the LLM with the full document text.</p> +<p>There's no set way to search for these documents; you could use SQL, Elastic or even embeddings. The key thing is it should be able to handle fuzzy search filters for certain params, e.g.
for the client name in this case.</p> +<p>Here's an example of this function in Python:</p> +<pre class="language-python" tabindex="0"><code class="language-python"><span class="token keyword">def</span> <span class="token function">query_client_contract</span><span class="token punctuation">(</span>client_name<span class="token punctuation">:</span> <span class="token builtin">str</span><span class="token punctuation">,</span> document_type<span class="token punctuation">:</span> <span class="token builtin">str</span><span class="token punctuation">,</span> from_date<span class="token punctuation">:</span> <span class="token builtin">str</span> <span class="token operator">=</span> <span class="token boolean">None</span><span class="token punctuation">,</span> to_date<span class="token punctuation">:</span> <span class="token builtin">str</span> <span class="token operator">=</span> <span class="token boolean">None</span><span class="token punctuation">,</span> query<span class="token punctuation">:</span> <span class="token builtin">str</span><span class="token punctuation">)</span><span class="token punctuation">:</span> + <span class="token comment"># Search for the document</span> + document <span class="token operator">=</span> search_client_contract<span class="token punctuation">(</span>client_name<span class="token punctuation">,</span> document_type<span class="token punctuation">,</span> from_date<span class="token punctuation">,</span> to_date<span class="token punctuation">)</span> + <span class="token comment"># Call the LLM with the full document text and the query</span> + messages <span class="token operator">=</span> <span class="token punctuation">[</span> + <span class="token punctuation">{</span><span class="token string">"content"</span><span class="token punctuation">:</span> <span class="token string">"Answer the query using the provided text."</span><span class="token punctuation">,</span> <span class="token string">"role"</span><span class="token punctuation">:</span> <span class="token string">"system"</span><span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">{</span><span class="token string">"content"</span><span class="token punctuation">:</span> document <span class="token operator">+</span> <span class="token string">"\n\nQuery: "</span> <span class="token operator">+</span> query<span class="token punctuation">,</span> <span class="token string">"role"</span><span class="token punctuation">:</span> <span class="token string">"user"</span><span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">]</span> + response <span class="token operator">=</span> client<span class="token punctuation">.</span>chat<span class="token punctuation">.</span>completions<span class="token punctuation">.</span>create<span class="token punctuation">(</span> + model<span class="token operator">=</span><span class="token string">"gpt-3.5-turbo"</span><span class="token punctuation">,</span> <span class="token comment"># Use a cheaper model for the sub-query which will process the full document</span> + messages<span class="token operator">=</span>messages<span class="token punctuation">,</span> + <span class="token punctuation">)</span> + <span class="token keyword">return</span> response<span class="token punctuation">.</span>choices<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span 
class="token punctuation">.</span>message<span class="token punctuation">.</span>content</code></pre> +<h3 id="sub-query-function-calls" tabindex="-1">Sub-query function calls <a class="header-anchor" href="https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/">#</a></h3> +<p>Now we have the document query function, we are going to use <a href="https://platform.openai.com/docs/guides/function-calling">OpenAI Function Calling</a> to create sub-queries to this function.</p> +<p>First we use JSON Schema to define the tool for OpenAI function calling:</p> +<pre class="language-python" tabindex="0"><code class="language-python">tools <span class="token operator">=</span> <span class="token punctuation">[</span> + <span class="token punctuation">{</span> + <span class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"function"</span><span class="token punctuation">,</span> + <span class="token string">"function"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"name"</span><span class="token punctuation">:</span> <span class="token string">"query_client_contract"</span><span class="token punctuation">,</span> + <span class="token string">"description"</span><span class="token punctuation">:</span> + <span class="token string">"Send the query to AI to ask the full document text. The AI response will be returned."</span><span class="token punctuation">,</span> + <span class="token string">"parameters"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"object"</span><span class="token punctuation">,</span> + <span class="token string">"properties"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"client_name"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"string"</span><span class="token punctuation">,</span> + <span class="token string">"description"</span><span class="token punctuation">:</span> <span class="token string">"Name of the client the contract is for."</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token string">"document_type"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"string"</span><span class="token punctuation">,</span> + <span class="token string">"enum"</span><span class="token punctuation">:</span> <span class="token punctuation">[</span><span class="token string">"contract"</span><span class="token punctuation">,</span> <span class="token string">"lease"</span><span class="token punctuation">]</span><span class="token punctuation">,</span> + <span class="token string">"description"</span><span class="token punctuation">:</span> <span class="token string">"The type of legal contract."</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token string">"from_date"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span 
class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"string"</span><span class="token punctuation">,</span> + <span class="token string">"format"</span><span class="token punctuation">:</span> <span class="token string">"date-time"</span><span class="token punctuation">,</span> + <span class="token string">"description"</span><span class="token punctuation">:</span> <span class="token string">"Find documents from this date."</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token string">"to_date"</span><span class="token punctuation">:</span> <span class="token punctuation">{</span> + <span class="token string">"type"</span><span class="token punctuation">:</span> <span class="token string">"string"</span><span class="token punctuation">,</span> + <span class="token string">"format"</span><span class="token punctuation">:</span> <span class="token string">"date-time"</span><span class="token punctuation">,</span> + <span class="token string">'description'</span><span class="token punctuation">:</span> <span class="token string">"Find documents up to this date."</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token string">"required"</span><span class="token punctuation">:</span> <span class="token punctuation">[</span><span class="token string">"client_name"</span><span class="token punctuation">,</span> <span class="token string">"document_type"</span><span class="token punctuation">]</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">}</span> +<span class="token punctuation">]</span></code></pre> +<p>Then we need create a helper function to execute the function when requested by the LLM:</p> +<pre class="language-python" tabindex="0"><code class="language-python"><span class="token keyword">def</span> <span class="token function">execute_function_call</span><span class="token punctuation">(</span>message<span class="token punctuation">)</span><span class="token punctuation">:</span> + <span class="token keyword">if</span> message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>function<span class="token punctuation">.</span>name <span class="token operator">==</span> <span class="token string">"query_client_contract"</span><span class="token punctuation">:</span> + args <span class="token operator">=</span> json<span class="token punctuation">.</span>loads<span class="token punctuation">(</span>message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>function<span class="token punctuation">.</span>arguments<span class="token punctuation">)</span> + results <span class="token operator">=</span> ask_database<span class="token punctuation">(</span>args<span class="token punctuation">[</span><span class="token string">"client_name"</span><span class="token 
punctuation">]</span><span class="token punctuation">,</span> args<span class="token punctuation">[</span><span class="token string">"document_type"</span><span class="token punctuation">]</span><span class="token punctuation">,</span> args<span class="token punctuation">[</span><span class="token string">"from_date"</span><span class="token punctuation">]</span><span class="token punctuation">,</span> args<span class="token punctuation">[</span><span class="token string">"to_date"</span><span class="token punctuation">]</span><span class="token punctuation">,</span> args<span class="token punctuation">[</span><span class="token string">"query"</span><span class="token punctuation">]</span><span class="token punctuation">)</span> + <span class="token keyword">else</span><span class="token punctuation">:</span> + results <span class="token operator">=</span> <span class="token string-interpolation"><span class="token string">f"Error: function </span><span class="token interpolation"><span class="token punctuation">{</span>message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>function<span class="token punctuation">.</span>name<span class="token punctuation">}</span></span><span class="token string"> does not exist"</span></span> + <span class="token keyword">return</span> results</code></pre> +<p>Now in the main chat function, we take a user's query, and if GPT suggests a function call, we execute it and append the results to the chat messages, and then send the messages back to GPT for the final answer:</p> +<pre class="language-python" tabindex="0"><code class="language-python"><span class="token keyword">def</span> <span class="token function">ask_ai</span><span class="token punctuation">(</span>query<span class="token punctuation">:</span> <span class="token builtin">str</span><span class="token punctuation">)</span><span class="token punctuation">:</span> + messages <span class="token operator">=</span> <span class="token punctuation">[</span> + <span class="token punctuation">{</span><span class="token string">"content"</span><span class="token punctuation">:</span> <span class="token string">"Answer the user query, calling functions if required."</span><span class="token punctuation">,</span> <span class="token string">"role"</span><span class="token punctuation">:</span> <span class="token string">"system"</span><span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">{</span><span class="token string">"content"</span><span class="token punctuation">:</span> query<span class="token punctuation">,</span> <span class="token string">"role"</span><span class="token punctuation">:</span> <span class="token string">"user"</span><span class="token punctuation">}</span><span class="token punctuation">,</span> + <span class="token punctuation">]</span> + + chat_response <span class="token operator">=</span> client<span class="token punctuation">.</span>chat<span class="token punctuation">.</span>completions<span class="token punctuation">.</span>create<span class="token punctuation">(</span> + model<span class="token operator">=</span><span class="token string">"gpt-4-turbo"</span><span class="token punctuation">,</span> <span class="token comment"># Use a more powerful model for function calling</span> + tools<span class="token operator">=</span>tools<span class="token 
punctuation">,</span> + tool_choice<span class="token operator">=</span><span class="token string">"auto"</span><span class="token punctuation">,</span> <span class="token comment"># "auto" means the model can pick between generating a message or calling a function</span> + messages<span class="token operator">=</span>messages<span class="token punctuation">,</span> + <span class="token punctuation">)</span> + + assistant_message <span class="token operator">=</span> chat_response<span class="token punctuation">.</span>choices<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>message + assistant_message<span class="token punctuation">.</span>content <span class="token operator">=</span> <span class="token builtin">str</span><span class="token punctuation">(</span>assistant_message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>function<span class="token punctuation">)</span> + messages<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">{</span><span class="token string">"role"</span><span class="token punctuation">:</span> assistant_message<span class="token punctuation">.</span>role<span class="token punctuation">,</span> <span class="token string">"content"</span><span class="token punctuation">:</span> assistant_message<span class="token punctuation">.</span>content<span class="token punctuation">}</span><span class="token punctuation">)</span> + + <span class="token keyword">if</span> assistant_message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">:</span> + results <span class="token operator">=</span> execute_function_call<span class="token punctuation">(</span>assistant_message<span class="token punctuation">)</span> + messages<span class="token punctuation">.</span>append<span class="token punctuation">(</span><span class="token punctuation">{</span><span class="token string">"role"</span><span class="token punctuation">:</span> <span class="token string">"function"</span><span class="token punctuation">,</span> <span class="token string">"tool_call_id"</span><span class="token punctuation">:</span> assistant_message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span><span class="token builtin">id</span><span class="token punctuation">,</span> <span class="token string">"name"</span><span class="token punctuation">:</span> assistant_message<span class="token punctuation">.</span>tool_calls<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>function<span class="token punctuation">.</span>name<span class="token punctuation">,</span> <span class="token string">"content"</span><span class="token punctuation">:</span> results<span class="token punctuation">}</span><span class="token punctuation">)</span> + + second_chat_response <span class="token operator">=</span> client<span class="token punctuation">.</span>chat<span class="token punctuation">.</span>completions<span class="token punctuation">.</span>create<span class="token punctuation">(</span> + model<span class="token 
operator">=</span><span class="token string">"gpt-4-turbo"</span><span class="token punctuation">,</span> <span class="token comment"># Use a more powerful model for function calling</span> + tools<span class="token operator">=</span>tools<span class="token punctuation">,</span> + tool_choice<span class="token operator">=</span><span class="token string">"auto"</span><span class="token punctuation">,</span> <span class="token comment"># "auto" means the model can pick between generating a message or calling a function</span> + messages<span class="token operator">=</span>messages<span class="token punctuation">,</span> + <span class="token punctuation">)</span> + <span class="token keyword">print</span><span class="token punctuation">(</span>second_chat_response<span class="token punctuation">.</span>choices<span class="token punctuation">[</span><span class="token number">0</span><span class="token punctuation">]</span><span class="token punctuation">.</span>message<span class="token punctuation">.</span>content<span class="token punctuation">)</span></code></pre> +<h2 id="the-benefits-of-this-approach" tabindex="-1">The benefits of this approach <a class="header-anchor" href="https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/">#</a></h2> +<p>There are several benefits to this approach. The main benefit, as discussed above, is that we are querying whole documents. For many use cases this is going to provide more complete answers for users. You can also easily extend this approach by adding more functions for different document types and data sources. GPT will call multiple functions which you can execute in parallel, and in the final GPT call we can use gpt-4-turbo to integrate the results and provide the final answer. If you do have a handful of unknown documents, you can still use the chunk-retrieve-query approach for those, and simply add a function to the tool list to query the chunked documents with a typical RAG pipeline.</p> +<p>I'm excited to see how this approach can be used in practice. I think it will be especially useful for complex questions where the answer is spread across multiple documents, or where the user query is for a summary of a document. I'd love to hear how you get on with this approach. Please reach out if you have any other ideas for how to improve this approach, or related new ideas for improving RAG.</p> + + + + + Building an AI SuperServer for LLM training and experiments + + 2024-03-14T00:00:00Z + https://dc.tanner.me/blog/building-an-ai-superserver/ + <p>Impressive new language models like Llama and Mistral have broadened the accessibility of AI training. If you want to fine-tune a model with your own data, it's now relatively easy to do with tools like <a href="https://github.com/OpenAccess-AI-Collective/axolotl">Axolotl</a> and a few dollars spent on a GPU cloud. But if you want to go deeper and train larger models or try new methods, the cloud bill can quickly rack up. Renting 8 A100's on AWS will set you back an astounding $350,000 per year! There are cheaper clouds, but they can still cost tens of thousands a year.</p> +<p>I've always enjoyed building PCs. I remember when I was 16 and my grandma bought me my first PC to assemble myself. 
So in the name of fun and saving money, I embarked on building an AI server so that I can more affordably do independent AI research.</p> +<h1 id="your-options" tabindex="-1">Your options <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>Depending on your budget and use case, there are a few routes to take when building an AI server.</p> +<h2 id="open-frame" tabindex="-1">Open frame <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h2> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/l6X6xCxgFa-2048.avif 2048w"><source type="image/webp" srcset="https://dc.tanner.me/img/l6X6xCxgFa-2048.webp 2048w"><img alt="Miner style" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/l6X6xCxgFa-2048.jpeg" width="2048" height="1536"></picture></p> +<p>If the server is just for you, and you want to keep it at home or in your basement, the most affordable option is essentially a powerful consumer PC, with an open frame case (originally designed for crypto miners). You'll be able to find lots of advice on Reddit for this route.</p> +<p>The important things are a motherboard that has lots of 16x PCIe slots, PCIe risers with redrivers, and multiple PSUs (depending on the number of GPUs you choose). You'll be able to buy everything second hand if you like, including the GPUs. For GPUs you're best going with RTX 3090s or 4090s in this setup, and because there's no case, you won't have issues with space or airflow.</p> +<p>The benefit of this route is cost, but also the ability to start simple with just a single GPU and grow as you desire by adding more.</p> +<h2 id="rack-server" tabindex="-1">Rack server <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h2> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/U4DUeJHjp3-1280.avif 1280w"><source type="image/webp" srcset="https://dc.tanner.me/img/U4DUeJHjp3-1280.webp 1280w"><img alt="Server style" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/U4DUeJHjp3-1280.jpeg" width="1280" height="960"></picture></p> +<p>If you're planning to train larger models, have more servers, datacenter GPUs or just don't have anywhere to house a noisy hot server at home, you can go the rack mountable server route. This is the route I've gone, as our house doesn't have a basement and our internet isn't that fast. My server now lives in a datacenter where it's cooled and well connected.</p> +<p>I found fewer resources on this route, so the rest of this guide is aimed at helping you build and set up a rack mountable GPU server.</p> +<h1 id="building-the-server" tabindex="-1">Building the server <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>Supermicro make great server systems and many specifically for AI use cases. For example the <a href="https://www.supermicro.com/en/products/system/4u/4029/sys-4029gp-trt2.cfm">SuperServer 4029GP-TRT2</a> is a mid-range 4U dual CPU server with 10 PCIe slots - ideal for filling with GPUs! I found a well priced one from an IT supplier in the UK. The newer model is more expensive, but may be easier to find. Note that the model I used only has PCIe 3.0.
If you are using RTX 4090 or a newer datacenter GPU, you will probably want the newer model which supports PCIe 4.0.</p> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/rKkyq27rxN-960.avif 960w"><img alt="SuperServer 4029GP-TRT2" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/rKkyq27rxN-960.webp" width="960" height="720"></picture></p> +<p>It arrived at my house on a pallet. It was heavier than I expected!</p> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/x5qFkxTOGi-1280.avif 1280w"><source type="image/webp" srcset="https://dc.tanner.me/img/x5qFkxTOGi-1280.webp 1280w"><img alt="The pallet" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/x5qFkxTOGi-1280.jpeg" width="1280" height="960"></picture></p> +<p>After lugging it up the stairs and reading the manual, I installed 10 RTX 3090s I bought second hand from someone who was previously using them for mining. Note that to fit the maximum number of GPUs in a system you'll need to find blower or turbo style GPUs that are only two slots wide. The vast majority of 3090 and 4090 GPUs are for gaming; they take up 3 slots and the power connectors come out of the top, so you won't be able to put the case on your server. If you can't find blower consumer GPUs, your next best bet is the RTX A6000, which is still fairly good value for money, even if it's still 3x more than a 4090.</p> +<p>You'll also need to add the CPUs (two of them), memory and storage. I sourced everything secondhand from eBay. Most things cost no more than a few hundred dollars each. I went with 2x Intel Xeon Platinum 8160, 12x32GB DDR memory and an 8TB SSD.</p> +<p>Once everything was installed, I turned it on for the first time - what I heard could only be described as a mini jet engine. Server fans are noisy.</p> +<p>The next step was to set up the OS and environment.</p> +<h1 id="setting-up-the-os" tabindex="-1">Setting up the OS <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>Supermicro servers have an inbuilt web UI called IPMI for accessing the server console and monitor output. There is a dedicated LAN port for IPMI on this server. You should also plug in a second LAN cable to one of the main LAN ports, otherwise your server won't actually have internet access (this confused me initially).</p> +<p>It will find an IP with DHCP, so I just logged into my router to see the IP it was assigned and visited that in my browser. You'll be asked to log in; the username is 'ADMIN' and the password is printed on stickers in several places inside your server case.</p> +<p>I decided to install Ubuntu 22.04 Server. Create a bootable Ubuntu USB stick and plug it into the server. Now connect to the web UI console by going to the server's IP then clicking Remote Control &gt; iKVM/HTML5 and clicking the button. You can now reboot the server and you'll see the BIOS popup, where you can hit an F key to choose a boot drive. Do this and select the USB.</p> +<p>The IPMI web console doesn't support pasting text, so getting your ssh pubkey over is a bit of a pain. Here's a solution I've used:</p> +<ol> +<li>On your local computer which has your ssh pubkey on it, run <code>cd .ssh &amp;&amp; python -m http.server</code> (you are about to serve your private key over http without authentication; be aware this isn't a great idea).</li> +<li>On the server, via the IPMI web console, log in with the user you created when installing Ubuntu, and run <code>wget -qO - &quot;http://192.168.178.21:8000/id_ed25519.pub&quot; &gt; ~/.ssh/authorized_keys &amp;&amp; chmod 600 .ssh/authorized_keys</code>.</li> +<li>You should now be able to ssh into your server. Remember to stop the <code>python -m http.server</code> on your local computer now.</li> +</ol>
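+<p>Alternatively, as a rough sketch: if password SSH login is still enabled on the fresh Ubuntu install (it is by default), <code>ssh-copy-id</code> from your local machine does the same job without serving your keys over HTTP. Here <code>youruser</code> and <code>SERVER_IP</code> are placeholders for your own username and the server's address:</p> +<pre><code># Copies your local public key into ~/.ssh/authorized_keys on the server +ssh-copy-id youruser@SERVER_IP +</code></pre>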
+<h1 id="important-system-tweaks" tabindex="-1">Important system tweaks <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>There are some tweaks we can do to improve the performance and reliability of our server. Following <a href="https://towardsdatascience.com/deploying-kubeflow-to-a-bare-metal-gpu-cluster-from-scratch-6865ebcde032">the tips here</a> (<a href="https://archive.ph/0Y2DK#selection-611.0-611.103">archived page</a> if Medium paywalls that page), first disable the kernel security patches on computing instances. The collateral performance penalty is much more expensive than the imposed risks. Edit /etc/default/grub and add:</p> +<pre><code>GRUB_CMDLINE_LINUX_DEFAULT=&quot;pti=off spectre_v2=off l1tf=off nospec_store_bypass_disable no_stf_barrier&quot; +</code></pre> +<p>It's also critical to disable IOMMU if you plan to use peer-to-peer GPU communication, e.g., multi-GPU model training in TensorFlow or PyTorch. Also add to /etc/default/grub:</p> +<pre><code>GRUB_CMDLINE_LINUX_DEFAULT=&quot;intel_iommu=off rcutree.rcu_idle_gp_delay=1&quot; +</code></pre> +<p>After editing /etc/default/grub, run <code>sudo update-grub</code> and reboot for these kernel flags to take effect.</p> +<h1 id="check-gpu-p2p-communication" tabindex="-1">Check GPU P2P communication <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>If you're using a GPU that supports it, P2P communication speeds things up a lot.</p> +<p>Note it's important to check that <a href="https://docs.nvidia.com/deeplearning/nccl/archives/nccl_284/user-guide/docs/troubleshooting.html#:~:text=PCI%20Access%20Control%20Services%20(ACS)%C2%B6&amp;text=If%20PCI%20switches%20have%20ACS,done%20again%20after%25z">PCI Access Control Services (ACS)</a> is disabled.</p> +<p>You can follow these steps to test your system's GPU P2P speed: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#gpu-to-gpu-communication</p> +<h1 id="nvidia-drivers-and-python-environment" tabindex="-1">NVIDIA drivers and python environment <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>We now want to get the NVIDIA drivers, CUDA and our Python envs set up.</p> +<p>I've had success using these steps to install CUDA v11.8: https://gist.github.com/MihailCosmin/affa6b1b71b43787e9228c25fe15aeba +Some people have mentioned using a higher NVIDIA driver version than the nvidia-driver-515 in the script. But beware: there's a bug in driver version 545 that prevents 3090 and 4090 cards from using P2P (see <a href="https://github.com/NVIDIA/nccl-tests/issues/117">this GitHub issue</a> for a discussion on the problem). If you have a driver with this bug, you may find your training run stalls and times out. Version 535 worked well for me.</p>
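+<p>If you'd rather use Ubuntu's packaged drivers than the script's installer, something along these lines should work (a sketch, assuming the standard Ubuntu 22.04 repositories):</p> +<pre><code># Install the 535 driver branch and reboot, then check the GPUs are visible +sudo apt update +sudo apt install -y nvidia-driver-535 +sudo reboot +nvidia-smi +</code></pre>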
+<p>I like to use Conda with the <a href="https://www.fast.ai/posts/2021-07-15-fastconda.html">fastchan channel</a> for my environments. But you may prefer a different Python virtual env tool.</p> +<h1 id="now-you-can-train-some-ai" tabindex="-1">Now you can train some AI <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/WfDXmhx_Ty-1482.avif 1482w"><source type="image/webp" srcset="https://dc.tanner.me/img/WfDXmhx_Ty-1482.webp 1482w"><img alt="nvidia-smi" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/WfDXmhx_Ty-1482.jpeg" width="1482" height="1338"></picture></p> +<p>I'm enjoying using <a href="https://github.com/OpenAccess-AI-Collective/axolotl">Axolotl</a> for LLM fine-tuning. <a href="https://huggingface.co/docs/transformers/index">HuggingFace Transformers</a> is also a great place to start.</p> +<h1 id="datacenter-trip" tabindex="-1">Datacenter trip <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p><picture><source type="image/avif" srcset="https://dc.tanner.me/img/Fd4oi2rcxB-1922.avif 1922w"><source type="image/webp" srcset="https://dc.tanner.me/img/Fd4oi2rcxB-1922.webp 1922w"><img alt="Datacenter" loading="lazy" decoding="async" style="width: 100%; height: auto;" src="https://dc.tanner.me/img/Fd4oi2rcxB-1922.png" width="1922" height="1294"></picture></p> +<p>Since the GPUs are super noisy and hot, I found a local datacenter that would colocate it for a reasonable cost. Installation was easier than I expected, although we ended up putting it on a lower slot on the rack because it was too heavy to lift halfway up without a lift.</p> +<p>This <a href="https://www.datacate.net/wp-content/uploads/2019/04/Colocation-Survival-Guide-6x9-with-bonus-material.pdf">Colocation Survival Guide</a> was super helpful, as it walks you through all the aspects of colocating, from the physical setup to networking.</p> +<h1 id="other-things" tabindex="-1">Other things <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<h2 id="set-a-lower-max-power-limit-for-gpus" tabindex="-1">Set a lower max power limit for GPUs <a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h2> +<p>Some people find that lowering the power limit just a bit will reduce max temp without any real performance sacrifice. I set the max power for my RTX 3090s to 300W (from 305W) by <a href="https://www.reddit.com/r/Fedora/comments/11lh9nn/set_nvidia_gpu_power_and_temp_limit_on_boot/">following these steps</a>.</p>
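+<p>The linked steps set this up to run on boot; as a rough sketch, the one-off commands are roughly as follows (the limit resets on reboot if you only run them once):</p> +<pre><code># Enable persistence mode, then cap the power limit at 300W (add -i &lt;gpu-id&gt; to target a single GPU) +sudo nvidia-smi -pm 1 +sudo nvidia-smi -pl 300 +</code></pre>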
<a class="header-anchor" href="https://dc.tanner.me/blog/building-an-ai-superserver/">#</a></h1> +<p>If you're planing to build a cluster, there is an excellent video from the Lambda team: <a href="https://www.youtube.com/watch?v=rfu5FwncZ6s">Building a GPU cluster for AI</a>.</p> + + + diff --git a/public/img/.gitkeep b/img/.gitkeep similarity index 100% rename from public/img/.gitkeep rename to img/.gitkeep diff --git a/img/Fd4oi2rcxB-1922.avif b/img/Fd4oi2rcxB-1922.avif new file mode 100644 index 0000000..f2bd735 Binary files /dev/null and b/img/Fd4oi2rcxB-1922.avif differ diff --git a/img/Fd4oi2rcxB-1922.png b/img/Fd4oi2rcxB-1922.png new file mode 100644 index 0000000..277fadb Binary files /dev/null and b/img/Fd4oi2rcxB-1922.png differ diff --git a/img/Fd4oi2rcxB-1922.webp b/img/Fd4oi2rcxB-1922.webp new file mode 100644 index 0000000..32587be Binary files /dev/null and b/img/Fd4oi2rcxB-1922.webp differ diff --git a/img/U4DUeJHjp3-1280.avif b/img/U4DUeJHjp3-1280.avif new file mode 100644 index 0000000..7f7cdb3 Binary files /dev/null and b/img/U4DUeJHjp3-1280.avif differ diff --git a/img/U4DUeJHjp3-1280.jpeg b/img/U4DUeJHjp3-1280.jpeg new file mode 100644 index 0000000..c8675fd Binary files /dev/null and b/img/U4DUeJHjp3-1280.jpeg differ diff --git a/img/U4DUeJHjp3-1280.webp b/img/U4DUeJHjp3-1280.webp new file mode 100644 index 0000000..7e042e9 Binary files /dev/null and b/img/U4DUeJHjp3-1280.webp differ diff --git a/img/WfDXmhx_Ty-1482.avif b/img/WfDXmhx_Ty-1482.avif new file mode 100644 index 0000000..a179843 Binary files /dev/null and b/img/WfDXmhx_Ty-1482.avif differ diff --git a/img/WfDXmhx_Ty-1482.jpeg b/img/WfDXmhx_Ty-1482.jpeg new file mode 100644 index 0000000..92c52db Binary files /dev/null and b/img/WfDXmhx_Ty-1482.jpeg differ diff --git a/img/WfDXmhx_Ty-1482.webp b/img/WfDXmhx_Ty-1482.webp new file mode 100644 index 0000000..cf301b8 Binary files /dev/null and b/img/WfDXmhx_Ty-1482.webp differ diff --git a/public/img/damien-square-small.jpeg b/img/damien-square-small.jpeg similarity index 100% rename from public/img/damien-square-small.jpeg rename to img/damien-square-small.jpeg diff --git a/img/l6X6xCxgFa-2048.avif b/img/l6X6xCxgFa-2048.avif new file mode 100644 index 0000000..32c561a Binary files /dev/null and b/img/l6X6xCxgFa-2048.avif differ diff --git a/img/l6X6xCxgFa-2048.jpeg b/img/l6X6xCxgFa-2048.jpeg new file mode 100644 index 0000000..fb08d97 Binary files /dev/null and b/img/l6X6xCxgFa-2048.jpeg differ diff --git a/img/l6X6xCxgFa-2048.webp b/img/l6X6xCxgFa-2048.webp new file mode 100644 index 0000000..5325c55 Binary files /dev/null and b/img/l6X6xCxgFa-2048.webp differ diff --git a/img/rKkyq27rxN-960.avif b/img/rKkyq27rxN-960.avif new file mode 100644 index 0000000..2940ee5 Binary files /dev/null and b/img/rKkyq27rxN-960.avif differ diff --git a/img/rKkyq27rxN-960.webp b/img/rKkyq27rxN-960.webp new file mode 100644 index 0000000..15b6e16 Binary files /dev/null and b/img/rKkyq27rxN-960.webp differ diff --git a/img/x5qFkxTOGi-1280.avif b/img/x5qFkxTOGi-1280.avif new file mode 100644 index 0000000..09981b4 Binary files /dev/null and b/img/x5qFkxTOGi-1280.avif differ diff --git a/img/x5qFkxTOGi-1280.jpeg b/img/x5qFkxTOGi-1280.jpeg new file mode 100644 index 0000000..661c227 Binary files /dev/null and b/img/x5qFkxTOGi-1280.jpeg differ diff --git a/img/x5qFkxTOGi-1280.webp b/img/x5qFkxTOGi-1280.webp new file mode 100644 index 0000000..a2b50be Binary files /dev/null and b/img/x5qFkxTOGi-1280.webp differ diff --git a/index.html b/index.html new file mode 100644 index 
0000000..6630f16 --- /dev/null +++ b/index.html @@ -0,0 +1,321 @@ + + + + + + Damien C. Tanner + + + + + + + + Skip to main content + +
+ Damien C. Tanner + +
+ +
+ Me +

Damien C. Tanner #

+

x.com/dctanner

+

I'm the co-founder and CEO of Layercode.

+

Previously I co-founded MediaCore (ed-tech, sold to Workday inc), Pusher (realtime messaging, sold to MessageBird) and thoughtbot (my agency merged with them).

+

I also organise the AI Engineer London Meetup.

+

Companies #

+

Layercode - Voice AI platform for developers.

+

Pusher (Co-Founder. Acquired by MessageBird in 2020) - Realtime messaging platform.

+

MediaCore (Co-Founder. Acquired by Workday in 2015) - Video platform for education.

+

New Bamboo (Co-Founder. Acquired by thoughtbot in 2015) - Digital agency.

+

Panda (Co-Founder. Acquired by Xenon in 2013) - Cloud video processing.

+

Investing #

+

Some select private investments I’ve made:

+

MCJ - early LP in the MCJ climate focused funds.

+

Entocycle - Insect farming for fish & animal feed.

+

Mytos - Lab automation.

+

Strateos - A robotic cloud laboratory for the life sciences.

+ +
+ +
+ + + + diff --git a/netlify.toml b/netlify.toml deleted file mode 100644 index 5a8db0f..0000000 --- a/netlify.toml +++ /dev/null @@ -1,24 +0,0 @@ -[build] - publish = "_site" - command = "npm run build" - -[[plugins]] - - # Opt-in to the Netlify Lighthouse plugin (choose one): - - # 1. Go to your site on https://app.netlify.com and navigate to the Integrations tab, search for the `Lighthouse` plugin - # 2. Or via `npm install -D @netlify/plugin-lighthouse` - - # Read more: https://github.com/netlify/netlify-plugin-lighthouse - - package = "@netlify/plugin-lighthouse" - - # optional, fails build when a category is below a threshold - [plugins.inputs.thresholds] - performance = 1.0 - accessibility = 1.0 - best-practices = 1.0 - seo = 1.0 - - [plugins.inputs] - output_path = "reports/lighthouse/index.html" diff --git a/package.json b/package.json deleted file mode 100644 index 9d8fb4d..0000000 --- a/package.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "name": "eleventy-base-blog", - "version": "8.0.0", - "description": "A starter repository for a blog web site using the Eleventy site generator.", - "scripts": { - "build": "npx @11ty/eleventy", - "build-ghpages": "npx @11ty/eleventy", - "start": "npx @11ty/eleventy --serve --quiet", - "debug": "DEBUG=Eleventy* npx @11ty/eleventy", - "debugstart": "DEBUG=Eleventy* npx @11ty/eleventy --serve --quiet", - "benchmark": "DEBUG=Eleventy:Benchmark* npx @11ty/eleventy" - }, - "repository": { - "type": "git", - "url": "git://github.com/11ty/eleventy-base-blog.git" - }, - "author": { - "name": "Zach Leatherman", - "email": "zachleatherman@gmail.com", - "url": "https://zachleat.com/" - }, - "license": "MIT", - "engines": { - "node": ">=14" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/11ty" - }, - "bugs": { - "url": "https://github.com/11ty/eleventy-base-blog/issues" - }, - "homepage": "https://github.com/11ty/eleventy-base-blog#readme", - "devDependencies": { - "@11ty/eleventy": "^2.0.1", - "@11ty/eleventy-img": "^3.1.1", - "@11ty/eleventy-navigation": "^0.3.5", - "@11ty/eleventy-plugin-bundle": "^1.0.4", - "@11ty/eleventy-plugin-rss": "^1.2.0", - "@11ty/eleventy-plugin-syntaxhighlight": "^5.0.0", - "luxon": "^3.3.0", - "markdown-it-anchor": "^8.6.7" - } -} diff --git a/sitemap.xml b/sitemap.xml new file mode 100644 index 0000000..d27587e --- /dev/null +++ b/sitemap.xml @@ -0,0 +1,28 @@ + + + + + https://dc.tanner.me/blog/building-an-ai-superserver/ + 2024-03-14 + + + + https://dc.tanner.me/blog/your-rag-may-not-need-a-vector-store/ + 2024-04-25 + + + + https://dc.tanner.me/blog/tools-as-code/ + 2025-02-11 + + + + https://dc.tanner.me/blog/ + 2025-09-30 + + + + https://dc.tanner.me/ + 2025-09-30 + +