Skip to content

Commit

Permalink
feat(napi/parser): add getUtf16ByteOffset API; update README (#7772)
Browse files Browse the repository at this point in the history
  • Loading branch information
Boshen committed Dec 10, 2024
1 parent 7dcf6b4 commit 9157a0e
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 14 deletions.
2 changes: 2 additions & 0 deletions napi/parser/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ export declare class MagicString {
getSourceText(start: number, end: number): string
/** Get 0-based line and column number from utf8 offset. */
getLineColumnNumber(offset: number): LineColumn
/** Get the UTF-16 offset (usable as a JavaScript string index) from a UTF-8 byte offset. */
getUtf16ByteOffset(offset: number): number
length(): number
toString(): string
append(input: string): this
Expand Down
8 changes: 8 additions & 0 deletions napi/parser/src/magic_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ impl MagicString {
LineColumn { line, column }
}

/// Convert a UTF-8 byte offset into the source text to the matching UTF-16 offset.
///
/// The returned value is the number of UTF-16 code units needed to encode the
/// part of the source text that precedes `offset`.
///
/// NOTE(review): the range slice below presumably panics when `offset` is past
/// the end of the text or not on a character boundary — confirm against callers.
#[napi]
pub fn get_utf16_byte_offset(&mut self, offset: u32) -> u32 {
    let utf8_end = offset as usize;
    // TODO(perf): walks the whole prefix on every call, which is obviously slow.
    let prefix = &self.cell.borrow_owner()[..utf8_end];
    prefix.chars().map(char::len_utf16).sum::<usize>() as u32
}

#[napi]
pub fn length(&self) -> u32 {
self.cell.borrow_dependent().len() as u32
Expand Down
9 changes: 6 additions & 3 deletions napi/parser/test/magic_string.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import type { StringLiteral, VariableDeclaration } from '../index.js';
import { parseSync } from '../index.js';

describe('simple', () => {
const code = 'const s: String = "测试"';
const code = 'const s: String = /* 🤨 */ "测试"';

it('calls magic string APIs', () => {
// `oxc` holds a magic string instance on the Rust side.
Expand All @@ -22,11 +22,14 @@ describe('simple', () => {
// Access line and column number from utf8 offset.
expect(ms.getLineColumnNumber(start)).toStrictEqual({
line: 0,
column: 19,
column: 28,
});

// Get UTF16 offsets.
expect(code.substring(ms.getUtf16ByteOffset(start), ms.getUtf16ByteOffset(end))).toEqual('测试');

// Magic string manipulation.
ms.remove(start, end).append(';');
expect(ms.toString()).toEqual('const s: String = "";');
expect(ms.toString()).toEqual('const s: String = /* 🤨 */ "";');
});
});
53 changes: 42 additions & 11 deletions npm/oxc-parser/README.md
Original file line number Diff line number Diff line change
@@ -1,20 +1,51 @@
# The JavaScript Oxidation Compiler
# Oxc Parser

See `index.d.ts` for `parseSync` and `parseAsync` API.
## Features

- Returns ESM information.
- Built-in `magic-string` on the Rust side exposed through N-API.
- "clever" approach to overcome the Rust UTF8 vs JavaScript UTF16 length problem.

## Caveat

The parser alone does not fully check for syntax errors that are associated with semantic data (symbols and scopes).
The full compiler is needed for such cases, as it performs an additional semantic pass.

With this caveat, `oxc-parser` is best suited for parser plugins,
where you need quick access to ESM information, as well as fast `magic-string` operations.

## API

```javascript
import assert from 'assert';
import oxc from 'oxc-parser';
import oxc from './index.js';

const sourceText = "let foo: Foo = 'foo';";
// Filename extension is used to determine which dialect to parse source as.
// The emoji makes the span of `import.meta.url` differ between UTF8 and UTF16.
const code = 'const url: String = /* 🤨 */ import.meta.url;';

// File extension is used to determine which dialect to parse source as.
const filename = 'test.tsx';

test(oxc.parseSync(filename, sourceText, options));
test(await oxc.parseAsync(filename, sourceText, options));
const result = oxc.parseSync(filename, code);
// or `await oxc.parseAsync(filename, code)`

// An array of errors, if any.
console.log(result.errors);

// AST and comments.
console.log(result.program, result.comments);

// ESM information - imports, exports, `import.meta`s.
console.log(result.module);

// A `magic-string` instance for accessing and manipulating the source text.
// All returned spans are UTF8 offsets, which cannot be used directly on JavaScript strings,
// because JavaScript string indices and lengths are measured in UTF16 code units.
const ms = result.magicString;

function test(ret) {
assert(ret.program.body.length == 1);
assert(ret.errors.length == 0);
for (const span of result.module.importMetas) {
// Extra methods for accessing the source text through spans with UTF8 offsets.
console.log(ms.getSourceText(span.start, span.end)); // prints `import.meta`
console.log(ms.getLineColumnNumber(span.start)); // prints `{ line: 0, column: 29 }`
console.log(code.substring(ms.getUtf16ByteOffset(span.start)).startsWith('import.meta.url')); // prints `true`
}
```

0 comments on commit 9157a0e

Please sign in to comment.