feat(napi/parser): add getUtf16ByteOffset API; update README (#7772)

oxc-project · Dec 10, 2024 · 9157a0e · 9157a0e
1 parent 7dcf6b4
commit 9157a0e
Show file tree

Hide file tree

Showing 4 changed files with 58 additions and 14 deletions.
diff --git a/napi/parser/index.d.ts b/napi/parser/index.d.ts
@@ -7,6 +7,8 @@ export declare class MagicString {
   getSourceText(start: number, end: number): string
   /** Get 0-based line and column number from utf8 offset. */
   getLineColumnNumber(offset: number): LineColumn
+  /** Get the UTF-16 offset, counted in 16-bit code units, from a UTF-8 byte offset. */
+  getUtf16ByteOffset(offset: number): number
   length(): number
   toString(): string
   append(input: string): this

diff --git a/napi/parser/src/magic_string.rs b/napi/parser/src/magic_string.rs
@@ -66,6 +66,14 @@ impl MagicString {
         LineColumn { line, column }
     }
 
+    /// Get the UTF-16 offset, counted in 16-bit code units, from a UTF-8 byte offset.
+    #[napi]
+    pub fn get_utf16_byte_offset(&mut self, offset: u32) -> u32 {
+        let source_text = self.cell.borrow_owner();
+        // TODO(perf): this is obviously slow; every call re-encodes the whole prefix before `offset`.
+        source_text[..offset as usize].encode_utf16().count() as u32
+    }
+
     #[napi]
     pub fn length(&self) -> u32 {
         self.cell.borrow_dependent().len() as u32

diff --git a/napi/parser/test/magic_string.test.ts b/napi/parser/test/magic_string.test.ts
@@ -4,7 +4,7 @@ import type { StringLiteral, VariableDeclaration } from '../index.js';
 import { parseSync } from '../index.js';
 
 describe('simple', () => {
-  const code = 'const s: String = "测试"';
+  const code = 'const s: String = /* 🤨 */ "测试"';
 
   it('calls magic string APIs', () => {
     // `oxc` holds a magic string instance on the Rust side.
@@ -22,11 +22,14 @@ describe('simple', () => {
     // Access line and column number from utf8 offset.
     expect(ms.getLineColumnNumber(start)).toStrictEqual({
       line: 0,
-      column: 19,
+      column: 28,
     });
 
+    // Get UTF16 offsets.
+    expect(code.substring(ms.getUtf16ByteOffset(start), ms.getUtf16ByteOffset(end))).toEqual('测试');
+
     // Magic string manipulation.
     ms.remove(start, end).append(';');
-    expect(ms.toString()).toEqual('const s: String = "";');
+    expect(ms.toString()).toEqual('const s: String = /* 🤨 */ "";');
   });
 });
diff --git a/npm/oxc-parser/README.md b/npm/oxc-parser/README.md
@@ -1,20 +1,51 @@
-# The JavaScript Oxidation Compiler
+# Oxc Parser
 
-See `index.d.ts` for `parseSync` and `parseAsync` API.
+## Features
+
+- Returns ESM information.
+- Built-in `magic-string` on the Rust side exposed through N-API.
+- "clever" approach to overcome the Rust UTF8 vs JavaScript UTF16 length problem.
+
+## Caveat
+
+The parser alone does not fully check for syntax errors that are associated with semantic data (symbols and scopes).
+The full compiler is needed in such cases, as the compiler does an additional semantic pass.
+
+With this caveat, `oxc-parser` is best suited for parser plugins,
+where you need quick access to ESM information, as well as fast `magic-string` operations.
+
+## API
 
 ```javascript
-import assert from 'assert';
-import oxc from 'oxc-parser';
+import oxc from './index.js';
 
-const sourceText = "let foo: Foo = 'foo';";
-// Filename extension is used to determine which dialect to parse source as.
+// The emoji makes the span of `import.meta.url` differ between UTF8 and UTF16.
+const code = 'const url: String = /* 🤨 */ import.meta.url;';
+
+// File extension is used to determine which dialect to parse source as.
 const filename = 'test.tsx';
 
-test(oxc.parseSync(filename, sourceText, options));
-test(await oxc.parseAsync(filename, sourceText, options));
+const result = oxc.parseSync(filename, code);
+// or `await oxc.parseAsync(filename, code)`
+
+// An array of errors, if any.
+console.log(result.errors);
+
+// AST and comments.
+console.log(result.program, result.comments);
+
+// ESM information - imports, exports, `import.meta`s.
+console.log(result.module);
+
+// A `magic-string` instance for accessing and manipulating the source text.
+// All returned spans are in UTF8 offsets, which cannot be used directly on JavaScript strings.
+// JavaScript string lengths and indices are counted in UTF16 code units.
+const ms = result.magicString;
 
-function test(ret) {
-  assert(ret.program.body.length == 1);
-  assert(ret.errors.length == 0);
+for (const span of result.module.importMetas) {
+  // Extra methods for accessing the source text through spans with UTF8 offsets.
+  console.log(ms.getSourceText(span.start, span.end)); // prints `import.meta`
+  console.log(ms.getLineColumnNumber(span.start)); // prints `{ line: 0, column: 29 }`
+  console.log(code.substring(ms.getUtf16ByteOffset(span.start)).startsWith('import.meta.url')); // prints `true`
 }
 ```