tc39 · bakkot · Jun 26, 2023 · Jun 26, 2023 · Jun 26, 2023 · Jun 26, 2023
diff --git a/playground/index-raw.html b/playground/index-raw.html
@@ -97,9 +97,10 @@ <h3>Options</h3>
 </code></pre>
 
 <h3>Streaming</h3>
-<p>Two additional methods, <code>toPartialBase64</code> and <code>fromPartialBase64</code>, allow encoding and decoding chunks of base64. This requires managing state, which is handled by returning a <code>{ result, extra }</code> pair. The options bag for these methods takes two additional arguments, one which specifies whether more data is expected and one which specifies any extra values returned by a previous call.</p>
+<p>Two additional methods, <code>toPartialBase64</code> and <code>fromPartialBase64</code>, allow encoding and decoding chunks of base64. This requires managing state, which is handled by returning a <code>{ result, extra }</code> pair; the <code>extra</code> argument must be round-tripped by the user to the next call as part of the options bag.</p>
+<p><code>toPartialBase64</code> requires specifying a <code>more</code> parameter, which should be true for all but the final chunk, in order to generate the final characters and any necessary padding.</p>
 <p>These methods are intended for lower-level use and are less convenient to use.</p>
-<p>Streaming versions of the hex APIs are not included since they are straightforward to do manually.</p>
+<p>Streaming versions of the hex APIs are not included at this time. That may change.</p>
 
 <p>Streaming an ArrayBuffer into chunks of base64 strings:</p>
 <pre class="language-js"><code class="language-js">
@@ -116,26 +117,26 @@ <h3>Streaming</h3>
 }
 ({ result } = extra.toPartialBase64({ more: false }));
 resultChunks.push(result);
+// result chunks are guaranteed to be correctly-padded base64 strings
+
 console.log(resultChunks);
 // ['mpmZmZmZ', 'uT+amZmZ', 'mZnJPzMz', 'MzMzM9M/', 'mpmZmZmZ', '', '2T8=']
 </code></pre>
 
 <p>Streaming base64 strings into Uint8Arrays:</p>
 <pre class="language-js"><code class="language-js">
 let chunks = ['mpmZmZmZuT+am', 'ZmZmZnJPzMz', 'MzMz', 'M9M/mpmZmZmZ', '2T8='];
-// individual chunks are not necessarily correctly-padded base64 strings
+// individual chunks are not required to be correctly-padded base64 strings
 
 let output = new Uint8Array(new ArrayBuffer(0, { maxByteLength: 1024 }));
 let result, extra;
 for (let c of chunks) {
-  ({ result, extra } = Uint8Array.fromPartialBase64(c, { more: true, extra }));
+  ({ result, extra, } = Uint8Array.fromPartialBase64(c, { extra }));
   let offset = output.length;
   let newLength = offset + result.length;
   output.buffer.resize(newLength);
   output.set(result, offset);
 }
-// if padding was optional,
-// you'd need to do a final `fromPartialBase64` call here with `more: false`
 
 console.log(new Float64Array(output.buffer));
 // Float64Array([0.1, 0.2, 0.3, 0.4])
@@ -191,6 +192,76 @@ <h3>Streaming</h3>
   .pipeTo(sink);
 </code></pre>
 
+<h3>Decoding base64 strings to an existing Uint8Array</h3>
+<p><code>fromPartialBase64</code> additionally allows you to write data into an existing buffer. This is handled by an <code>into</code> argument in the options bag, together with <code>inputOffset</code> and <code>outputOffset</code> arguments, as well as <code>read</code> and <code>written</code> return values.</p>
+
+<pre class="language-js"><code class="language-js">
+let input = 'SGVsbG8gV29ybGQ=';
+let buffer = new Uint8Array(4);
+let outputOffset = 0;
+let inputOffset = 0;
+let extra, written, read;
+
+while (inputOffset < input.length) {
+  0, { extra, written, read } = Uint8Array.fromPartialBase64(input, {
+    extra,
+    into: buffer,
+    inputOffset,
+    outputOffset,
+  });
+
+  inputOffset += read;
+  outputOffset += written;
+
+  if (outputOffset === buffer.length) {
+    // output is full; consume it
+    console.log([...buffer]);
+    outputOffset = 0;
+  }
+}
+if (outputOffset > 0) {
+  console.log([...buffer].slice(0, outputOffset));
+}
+</code></pre>
+
+<p>This can be combined with streaming as follows:</p>
+<pre class="language-js"><code class="language-js">
+let chunks = ['VGhpcyB', 'pcyBzb2', '1lIGV4YW1w', 'bGUgZGF0YS4='];
+let output = new Uint8Array(4);
+let outputOffset = 0;
+let extra;
+for (let chunk of chunks) {
+  let written, read;
+  let inputOffset = 0;
+
+  while (inputOffset < chunk.length) {
+    0, { extra, written, read } = Uint8Array.fromPartialBase64(chunk, {
+      extra,
+      into: output,
+      inputOffset,
+      outputOffset,
+    });
+
+    inputOffset += read;
+    outputOffset += written;
+
+    if (outputOffset === output.length) {
+      // output is full; consume it
+      console.log([...output]);
+      outputOffset = 0;
+    }
+  }
+}
+if (outputOffset > 0) {
+  console.log([...output].slice(0, outputOffset));
+}
+</code></pre>
+
+<p>This is guaranteed to fill the provided Uint8Array if enough input is available to do so.</p>
+
+<h3>Decoding hex strings to an existing Uint8Array</h3>
+<p>At the moment there is no facility to do so. We will likely add a <code>fromPartialHex</code> method similar to <code>fromPartialBase64</code> which provides this ability.</p>
+
 <footer>
   <p>Thanks for reading! If you got this far, you should try out the proposal in your browser's developer tools on this page, and submit feedback on <a href="https://github.com/tc39/proposal-arraybuffer-base64">GitHub</a>.</p>
 </footer>
diff --git a/playground/polyfill-core.mjs b/playground/polyfill-core.mjs
@@ -20,7 +20,7 @@ function assert(condition, message) {
   }
 }
 
-function alphabetFromIdentifier(alphabet) {
+export function alphabetFromIdentifier(alphabet) {
   if (alphabet === 'base64') {
     return base64Characters;
   } else if (alphabet === 'base64url') {
@@ -77,101 +77,136 @@ export function uint8ArrayToBase64(arr, alphabetIdentifier = 'base64', more = fa
   }
 }
 
-export function base64ToUint8Array(str, alphabetIdentifier = 'base64', more = false, origExtra = null) {
-  if (typeof str !== 'string') {
-    throw new TypeError('expected str to be a string');
-  }
-  let alphabet = alphabetFromIdentifier(alphabetIdentifier);
-  more = !!more;
-  if (origExtra != null) {
-    if (typeof origExtra !== 'string') {
-      throw new TypeError('expected extra to be a string');
+// this is extremely inefficient, but easy to reason about
+// actual implementations should use something more efficient except possibly at boundaries
+function decodeOneBase64Character(extraBitCount, extraBits, alphabetMap, char) {
+  let val = alphabetMap.get(char);
+  switch (extraBitCount) {
+    case 0: {
+      // i.e., this is the first of 4 characters
+      return { extraBitCount: 6, extraBits: val, byte: null };
     }
-    str = origExtra + str;
-  }
-  let map = new Map(alphabet.split('').map((c, i) => [c, i]));
-
-  let extra;
-  if (more) {
-    let padding = str.length % 4;
-    if (padding === 0) {
-      extra = '';
-    } else {
-      extra = str.slice(-padding);
-      str = str.slice(0, -padding)
+    case 2: {
+      // i.e., this is the 4th of 4 characters
+      return { extraBitCount: 0, extraBits: 0, byte: (extraBits << 6) | val };
     }
-  } else {
-    // todo opt-in optional padding
-    if (str.length % 4 !== 0) {
-      throw new Error('not correctly padded');
+    case 4: {
+      // i.e., this is the 3rd of 4 characters
+      return { extraBitCount: 2, extraBits: val & 0b11, byte: (extraBits << 4) | ((val & 0b111100) >> 2) };
+    }
+    case 6: {
+      // i.e., this is the 2nd of 4 characters
+      return { extraBitCount: 4, extraBits: val & 0b1111, byte: (extraBits << 2) | ((val & 0b110000) >> 4) };
+    }
+    default: {
+      throw new Error(`unreachable: extraBitCount ${extraBitCount}`);
     }
-    extra = null;
-  }
-  assert(str.length % 4 === 0, 'str.length % 4 === 0');
-  if (str.endsWith('==')) {
-    str = str.slice(0, -2);
-  } else if (str.endsWith('=')) {
-    str = str.slice(0, -1);
   }
+}
 
-  let result = [];
-  let i = 0;
-  for (; i + 3 < str.length; i += 4) {
-    let c1 = str[i];
-    let c2 = str[i + 1];
-    let c3 = str[i + 2];
-    let c4 = str[i + 3];
-    if ([c1, c2, c3, c4].some(c => !map.has(c))) {
-      throw new Error('bad character');
-    }
-    let triplet =
-      (map.get(c1) << 18) +
-      (map.get(c2) << 12) +
-      (map.get(c3) << 6) +
-      map.get(c4);
 
-    result.push(
-      (triplet >> 16) & 255,
-      (triplet >> 8) & 255,
-      triplet & 255
-    );
-  }
-  // TODO if we want to be _really_ pedantic, following the RFC, we should enforce the extra 2-4 bits are 0
-  if (i + 2 === str.length) {
-    // the `==` case
-    let c1 = str[i];
-    let c2 = str[i + 1];
-    if ([c1, c2].some(c => !map.has(c))) {
-      throw new Error('bad character');
+// TODO simplify
+function countFullBytesInBase64StringIncludingExtraBits(str, extraBitCount) {
+  if (str === '=' && extraBitCount === 0) {
+    // special case arising when a `=` char is the second half of a `==` pair
+    return 0;
+  }
+  let paddingCharCount = str.endsWith('==') ? 2 : str.endsWith('=') ? 1 : 0;
+  let fullChunks = Math.floor((str.length - paddingCharCount) / 4);
+  let bytesFromFullChunks = fullChunks * 3;
+  if (paddingCharCount === 2) {
+    let extraCharCount = (str.length - 2) % 4;
+    let isCorrectlyPadded =
+      (extraCharCount === 0 && extraBitCount === 4)
+      || (extraCharCount === 1 && extraBitCount === 6)
+      || (extraCharCount === 2 && extraBitCount === 0)
+      || (extraCharCount === 3 && extraBitCount === 2);
+    if (!isCorrectlyPadded) {
+      throw new Error('string is incorrectly padded');
     }
-    let triplet =
-      (map.get(c1) << 18) +
-      (map.get(c2) << 12);
-    result.push((triplet >> 16) & 255);
-  } else if (i + 3 === str.length) {
-    // the `=` case
-    let c1 = str[i];
-    let c2 = str[i + 1];
-    let c3 = str[i + 2];
-    if ([c1, c2, c3].some(c => !map.has(c))) {
-      throw new Error('bad character');
+    let bytesFromExtraChars =
+      extraCharCount === 0 ? 0
+      : extraCharCount === 1 ? 1
+      : extraCharCount === 2 ? 1
+      : extraCharCount === 3 ? 2
+      : unreachable();
+    return bytesFromFullChunks + bytesFromExtraChars;
+  } else if (paddingCharCount === 1) {
+    let extraCharCount = (str.length - 1) % 4;
+    let isCorrectlyPadded = // the '||' cases arise when the string is cut off halfway through a `==` pair
+      (extraCharCount === 0 && (extraBitCount === 2 || extraBitCount === 4))
+      || (extraCharCount === 1 && (extraBitCount === 4 || extraBitCount === 6))
+      || (extraCharCount === 2 && (extraBitCount === 6 || extraBitCount === 0))
+      || (extraCharCount === 3 && (extraBitCount === 0 || extraBitCount === 2));
+    if (!isCorrectlyPadded) {
+      throw new Error('string is incorrectly padded');
     }
-    let triplet =
-      (map.get(c1) << 18) +
-      (map.get(c2) << 12) +
-      (map.get(c3) << 6);
-    result.push(
-      (triplet >> 16) & 255,
-      (triplet >> 8) & 255,
-    );
+    let bytesFromExtraChars =
+      extraCharCount === 0 ? 0
+      : extraCharCount === 1 ? 1
+      : extraCharCount === 2 ? (extraBitCount === 6 ? 2 : 1)
+      : extraCharCount === 3 ? 2
+      : unreachable();
+    return bytesFromFullChunks + bytesFromExtraChars;
   } else {
-    assert(i === str.length);
+    let extraCharCount = (str.length) % 4;
+    let bytesFromExtraChars =
+      extraCharCount === 0 ? 0 // 0 bits from overflow, plus extra bits
+      : extraCharCount === 1 ? (extraBitCount === 0 ? 0 : 1) // 6 bits from overflow, plus extra bits
+      : extraCharCount === 2 ? (extraBitCount === 4 || extraBitCount === 6 ? 2 : 1) // 12 bits from overflow, plus extra bits
+      : extraCharCount === 3 ? (extraBitCount === 6 ? 3 : 2) // 18 bits from overflow, plus extra bits
+      : unreachable();
+    return bytesFromFullChunks + bytesFromExtraChars;
   }
+}
 
-  return {
-    result: new Uint8Array(result),
-    extra,
-  };
+export function base64ToUint8Array(str, alphabet, into = null, extraBitCount = 0, extraBits = 0, inputOffset = 0, outputOffset = 0) {
+  let alphabetMap = new Map(alphabet.split('').map((c, i) => [c, i]));
+  str = str.slice(inputOffset);
+  let codepoints = [...str]; // NB does not validate characters before inputOffset - should it? probably already been validated, but might be faster to just run on the whole string
+  if (codepoints.some(((c, i) => c === '=' && !(i === codepoints.length - 1 || i === codepoints.length - 2) || c !== '=' && !alphabetMap.has(c)))) {
+    throw new Error('bad character');
+  }
+  let totalBytesForChunk = countFullBytesInBase64StringIncludingExtraBits(str, extraBitCount); // also kinda validates padding, if present
+  let bytesToWrite;
+  let outputIndex;
+  if (into == null) {
+    into = new Uint8Array(totalBytesForChunk);
+    bytesToWrite = totalBytesForChunk;
+  } else {
+    bytesToWrite = Math.min(into.length - outputOffset, totalBytesForChunk);
+    // TODO error if bytesToWrite is ≤ 0, maybe?
+  }
+  let byte;
+  let written = 0;
+  let read = 0;
+  while (written < bytesToWrite) {
+    let char = str[read];
+    if (char === '=') {
+      throw new Error('unreachable');
+    }
+    0, { extraBitCount, extraBits, byte } = decodeOneBase64Character(extraBitCount, extraBits, alphabetMap, char);
+    ++read;
+    if (byte != null) {
+      into[outputOffset + written] = byte;
+      ++written;
+    }
+  }
+  if (read < str.length && str[read] === '=') {
+    read = str.length;
+    // TODO if we want to be really pedantic, check extraBits === 0 here
+    if (extraBitCount === 0 || extraBitCount === 6) {
+      throw new Error('unreachable: malformed padding (checked earlier)');
+    }
+  }
+  if (read < str.length && extraBitCount === 0) {
+    // we can read one more character and store it in extra
+    let char = str[read];
+    0, { extraBitCount, extraBits } = decodeOneBase64Character(extraBitCount, extraBits, alphabetMap, char);
+    ++read;
+  }
+  let extra = extraBitCount === 0 ? void 0 : { count: extraBitCount, bits: extraBits };
+  return { result: into, read, written, extra };
 }
 
 export function uint8ArrayToHex(arr) {