Skip to content

Commit

Permalink
Replace Node.js's Buffer with native Typed Arrays on the client-side
Browse files Browse the repository at this point in the history
Vite does not ship a Buffer polyfill out of the box, and most of the
functionality required here can be implemented with native APIs. The one
exception is the byte-wise compare, for which I had to write my own function.
  • Loading branch information
ZauberNerd committed Jul 8, 2023
1 parent 4b91a19 commit ff9f351
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 10 deletions.
1 change: 0 additions & 1 deletion app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
"@msgpack/msgpack": "^3.0.0-beta2",
"@reduxjs/toolkit": "^1.9.3",
"broadcast-channel": "^4.20.2",
"buffer": "^6.0.3",
"comlink": "^4.4.1",
"events": "^3.3.0",
"idb-keyval": "^6.2.0",
Expand Down
23 changes: 14 additions & 9 deletions app/src/core/tokenizer/bpe.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import { compareUint8Array } from "../utils";

const MAX_NUM_THREADS = 128;

type MergeRange = { start: number, end: number };

const textDecoder = new TextDecoder();
const textEncoder = new TextEncoder();

export class RankMap {
private values = new Map<string, number>();

Expand All @@ -14,23 +19,23 @@ export class RankMap {
}

public set(bytes: Uint8Array, rank: number) {
const key = Buffer.from(bytes).toString();
const key = textDecoder.decode(bytes);
this.values.set(key, rank);
}

public get(bytes: Uint8Array) {
const key = Buffer.from(bytes).toString();
const key = textDecoder.decode(bytes);
return this.values.get(key);
}

public keys() {
return Array.from(this.values.keys()).map(k => Buffer.from(k));
return Array.from(this.values.keys()).map(k => textEncoder.encode(k));
}

public inverted() {
const inverted = new Map<number, Uint8Array>();
for (const [key, value] of Array.from(this.values.entries())) {
inverted.set(value, new Uint8Array(Buffer.from(key)));
inverted.set(value, textEncoder.encode(key));
}
return inverted;
}
Expand Down Expand Up @@ -100,10 +105,10 @@ export class CoreBPE {
const decoder: Map<number, Uint8Array> = encoder.inverted();

const specialTokensDecoder: Map<number, Uint8Array> = new Map(
Array.from(specialTokensEncoder.entries()).map(([k, v]) => [v, new Uint8Array(Buffer.from(k))])
Array.from(specialTokensEncoder.entries()).map(([k, v]) => [v, textEncoder.encode(k)])
);
const sortedTokenBytes: Uint8Array[] = Array.from(encoder.keys());
sortedTokenBytes.sort((a, b) => Buffer.compare(a, b));
sortedTokenBytes.sort((a, b) => compareUint8Array(a, b));

this.encoder = encoder;
this.specialTokensEncoder = specialTokensEncoder;
Expand Down Expand Up @@ -136,7 +141,7 @@ export class CoreBPE {
const ret: number[] = [];
let match: RegExpExecArray | null;
while ((match = regex.exec(text)) !== null) {
const piece = new Uint8Array(Buffer.from(match[0]));
const piece = textEncoder.encode(match[0]);
const token = this.encoder.get(piece);
if (token !== undefined) {
ret.push(token);
Expand Down Expand Up @@ -167,7 +172,7 @@ export class CoreBPE {
const end = nextSpecial === null ? text.length : nextSpecial.index;
let match: RegExpExecArray | null;
while ((match = regex.exec(text.slice(start, end))) !== null) {
const piece = new Uint8Array(Buffer.from(match[0]));
const piece = textEncoder.encode(match[0]);
const token = this.encoder.get(piece);
if (token !== undefined) {
lastPieceTokenLen = 1;
Expand Down Expand Up @@ -208,7 +213,7 @@ export class CoreBPE {
if (token !== undefined) {
return token;
}
const pieceStr = Buffer.from(piece).toString("utf-8");
const pieceStr = textDecoder.decode(piece);
if (this.specialTokensEncoder.has(pieceStr)) {
return this.specialTokensEncoder.get(pieceStr)!;
}
Expand Down
23 changes: 23 additions & 0 deletions app/src/core/utils/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,29 @@ export function cloneArrayBuffer(buffer: ArrayBuffer): ArrayBuffer {
return newBuffer;
}

/**
 * Orders two byte arrays lexicographically, one byte at a time.
 *
 * The first differing byte inside the shared prefix decides the result. When one
 * array is a strict prefix of the other, the shorter array sorts first.
 *
 * @param a - Left-hand byte array.
 * @param b - Right-hand byte array.
 * @returns `-1` when `a` sorts before `b`, `1` when `a` sorts after `b`, and `0` when both hold the same bytes.
 */
export function compareUint8Array(a: Uint8Array, b: Uint8Array): number {
  // Identical references need no byte scan; fall through to the shared `0` result.
  if (a !== b) {
    const sharedLength = Math.min(a.byteLength, b.byteLength);

    let idx = 0;
    while (idx < sharedLength) {
      const left = a[idx];
      const right = b[idx];
      if (left !== right) {
        return left < right ? -1 : 1;
      }
      idx += 1;
    }

    // Shared prefix is equal: the length alone determines the order.
    if (a.byteLength !== b.byteLength) {
      return a.byteLength < b.byteLength ? -1 : 1;
    }
  }

  return 0;
}

/**
* Shares the specified text using the Web Share API if available in the user's browser.
*
Expand Down

0 comments on commit ff9f351

Please sign in to comment.