Merge 407ebc9 into c3ba744
jevancc authored Jan 10, 2021
2 parents c3ba744 + 407ebc9 commit cd1b07f
Showing 14 changed files with 762 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -45,6 +45,10 @@ See [Debugging](./docs/debugging.md).
If you want to develop on the web assembly side you can run `yarn serve` and then go
to <http://localhost:8080>.

### boa-unicode

Boa uses the `boa-unicode` library to query Unicode character properties and classes in the lexer and parser. See [boa_unicode/README.md](./boa_unicode/README.md) for development details and more information.
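
A minimal sketch of how the lexer uses it, assuming the `UnicodeProperties` trait from `boa/src/syntax/lexer/identifier.rs` is in scope:

```rust
use boa_unicode::UnicodeProperties;

// ECMAScript IdentifierStart: `$`, `_`, or any code point with the
// Unicode `ID_Start` property.
fn is_identifier_start(ch: char) -> bool {
    ch == '$' || ch == '_' || ch.is_id_start()
}

fn main() {
    assert!(is_identifier_start('x'));
    assert!(is_identifier_start('Ѐ')); // U+0400 has the ID_Start property
    assert!(!is_identifier_start('😀')); // emoji are not ID_Start
}
```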

### Setup

#### VSCode Plugins
14 changes: 14 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -4,6 +4,7 @@ members = [
"boa_cli",
"boa_wasm",
"boa_tester",
"boa_unicode",
]

# The release profile, used for `cargo build --release`.
1 change: 1 addition & 0 deletions boa/Cargo.toml
@@ -21,6 +21,7 @@ vm = []
console = []

[dependencies]
boa_unicode = { path = "../boa_unicode" }
gc = { version = "0.3.6", features = ["derive"] }
serde = { version = "1.0.118", features = ["derive"] }
serde_json = "1.0.61"
41 changes: 34 additions & 7 deletions boa/src/syntax/lexer/identifier.rs
@@ -8,6 +8,7 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use boa_unicode::UnicodeProperties;
use core::convert::TryFrom;
use std::io::Read;
use std::str;
@@ -44,6 +45,38 @@ impl Identifier {
pub(super) fn new(init: char) -> Self {
Self { init }
}

/// Checks if a character is IdentifierStart as per ECMAScript standards.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-names-and-keywords
pub(super) fn is_identifier_start(ch: u32) -> bool {
matches!(ch, 0x0024 /* $ */ | 0x005F /* _ */)
|| if let Ok(ch) = char::try_from(ch) {
ch.is_id_start()
} else {
false
}
}

/// Checks if a character is IdentifierPart as per ECMAScript standards.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-names-and-keywords
fn is_identifier_part(ch: u32) -> bool {
matches!(
ch,
0x0024 /* $ */ | 0x005F /* _ */ | 0x200C /* <ZWNJ> */ | 0x200D /* <ZWJ> */
) || if let Ok(ch) = char::try_from(ch) {
ch.is_id_continue()
} else {
false
}
}
}

impl<R> Tokenizer<R> for Identifier {
@@ -58,13 +91,7 @@ impl<R> Tokenizer<R> for Identifier {
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));

cursor.take_while_char_pred(&mut buf, &|c: u32| {
if let Ok(c) = char::try_from(c) {
c.is_alphabetic() || c.is_digit(10) || c == '_'
} else {
false
}
})?;
cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;

let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
12 changes: 6 additions & 6 deletions boa/src/syntax/lexer/mod.rs
@@ -199,12 +199,6 @@ impl<R> Lexer<R> {
)),
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_alphabetic() || c == '$' || c == '_' => {
Identifier::new(c).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
@@ -252,6 +246,12 @@
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
46 changes: 46 additions & 0 deletions boa/src/syntax/lexer/tests.rs
@@ -70,6 +70,52 @@ fn check_multi_line_comment() {
expect_tokens(&mut lexer, &expected);
}

#[test]
fn check_identifier() {
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
let mut lexer = Lexer::new(s.as_bytes());

let expected = [
TokenKind::identifier("x"),
TokenKind::identifier("x1"),
TokenKind::identifier("_x"),
TokenKind::identifier("$x"),
TokenKind::identifier("__"),
TokenKind::identifier("$$"),
TokenKind::identifier("Ѐ"),
TokenKind::identifier("ЀЀ"),
TokenKind::identifier("x\u{200C}\u{200D}"),
];

expect_tokens(&mut lexer, &expected);
}

#[test]
fn check_invalid_identifier_start() {
let invalid_identifier_starts = ["\u{200C}", "\u{200D}", "😀"];

for s in invalid_identifier_starts.iter() {
let mut lexer = Lexer::new(s.as_bytes());
lexer
.next()
.expect_err("Invalid identifier start not rejected as expected");
}
}

#[test]
fn check_invalid_identifier_part() {
let invalid_identifier_parts = [" ", "\n", ".", "*", "😀", "\u{007F}"];

for part in invalid_identifier_parts.iter() {
let s = String::from("x") + part;
let mut lexer = Lexer::new(s.as_bytes());
assert_eq!(
lexer.next().unwrap().unwrap().kind(),
&TokenKind::identifier("x")
);
}
}

#[test]
fn check_string() {
let s = "'aaa' \"bbb\"";
14 changes: 14 additions & 0 deletions boa_unicode/Cargo.toml
@@ -0,0 +1,14 @@
[package]
name = "boa_unicode"
version = "0.10.0"
authors = ["boa-dev"]
description = "Boa is a JavaScript lexer, parser and Just-in-Time compiler written in Rust. Currently, it has support for some of the language."
repository = "https://github.com/boa-dev/boa"
keywords = ["javascript", "compiler", "lexer", "parser", "unicode"]
categories = ["parsing"]
license = "Unlicense/MIT"
exclude = ["../.vscode/*", "../Dockerfile", "../Makefile", "../.editorConfig"]
edition = "2018"

[dependencies]
unicode-general-category = "0.3.0"
26 changes: 26 additions & 0 deletions boa_unicode/README.md
@@ -0,0 +1,26 @@
# boa-unicode

`boa-unicode` defines the trait that provides methods for querying Unicode properties and classes for identifiers. These properties are used to determine whether a code point (`char`) is valid as the start or part of an identifier in the lexer and parser.

Current version: Unicode 13.0.0
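
A minimal sketch of the trait in use (assuming `UnicodeProperties` is implemented for `char`, as in the lexer):

```rust
use boa_unicode::UnicodeProperties;

fn main() {
    // `is_id_start`/`is_id_continue` query the Unicode ID_Start and
    // ID_Continue properties.
    assert!('x'.is_id_start());
    assert!('1'.is_id_continue()); // digits may continue an identifier...
    assert!(!'1'.is_id_start()); // ...but not start one
    assert!(!' '.is_id_continue());
}
```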

## Development

The Unicode character tables used to query properties are generated by `build_tables.js`. This script depends on [Node.js](https://nodejs.org/en/) and [rustfmt](https://github.com/rust-lang/rustfmt). You can run the script with:

```
$ node build_tables.js
```

or with [npm](https://www.npmjs.com/):

```
$ npm run build-tables
```

The configurations are defined as constants in the script. Please check the comments in `build_tables.js` for more information.
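
For illustration, a minimal sketch of the transformation: the sample `PropList.txt` lines below are real Unicode 13.0.0 `Other_ID_Start` entries, and the generated table shown covers only those sample lines.

```
1885..1886    ; Other_ID_Start # Mn   [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118          ; Other_ID_Start # Sm       SCRIPT CAPITAL P
```

```rust
// Emitted into src/tables.rs: ranges are expanded to individual code
// points, sorted, and written as a `&[char]` table.
pub static OTHER_ID_START: &[char] = &['\u{1885}', '\u{1886}', '\u{2118}'];
```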

## More Info

- [Unicode® Standard Annex #31 - UNICODE IDENTIFIER AND PATTERN SYNTAX](https://unicode.org/reports/tr31/)
- [Unicode® Standard Annex #44 - UNICODE CHARACTER DATABASE](https://unicode.org/reports/tr44/)
132 changes: 132 additions & 0 deletions boa_unicode/build_tables.js
@@ -0,0 +1,132 @@
#!/usr/bin/env node
/**
* This file is used to generate the Rust source code with tables for Unicode properties and classes.
*
* This script downloads the content of `PropList.txt` from the remote server, parses the file, extracts
* the target properties, builds the char tables, and writes them to the output Rust file. It also
* formats the output file with `rustfmt`. Please make sure `rustfmt` is available in the environment.
*
* Update and run this script when {@link https://unicode.org/reports/tr44/|Unicode® Standard Annex #44} is updated, and
* always check that the latest standard meets the {@link https://tc39.es/ecma262/#sec-names-and-keywords|spec of ECMAScript}.
*
* Run this script with command `node ./build_tables.js` or `npm run build-tables`.
*
* Version: Unicode 13.0.0
*/

const fs = require("fs");
const path = require("path");
const https = require("https");
const child_process = require("child_process");

/**
* The URL to download the content of `PropList.txt` through HTTP GET.
*
* Please make sure the content follows the UCD file format defined in
* {@link http://unicode.org/reports/tr44/#UCD_Files|UAX#44}.
*
* @constant {string}
*/
const PROPLIST_TXT_URL =
"https://www.unicode.org/Public/13.0.0/ucd/PropList.txt";

/**
* The target properties to process, given as tuples. The first element is the property to search for.
* The second element is the table variable name in the output Rust file.
*
* @constant {[string, string][]}
*/
const TARGET_PROPERTIES = [
["Pattern_Syntax", "PATTERN_SYNTAX"],
["Other_ID_Continue", "OTHER_ID_CONTINUE"],
["Other_ID_Start", "OTHER_ID_START"],
["Pattern_White_Space", "PATTERN_WHITE_SPACE"],
];

/**
* The path of the output Rust file.
*
* @constant {string}
*/
const OUTPUT_FILE = path.join(__dirname, "./src/tables.rs");

/**
* The doc comment to add to the beginning of the output Rust file.
*
* @constant {string}
*/
const OUTPUT_FILE_DOC_COMMENT = `
//! This module implements the unicode lookup tables for identifier and pattern syntax.
//! Version: Unicode 13.0.0
//!
//! This file is generated by \`boa_unicode/build_tables.js\`. Please do not modify it directly.
//!
//! More information:
//! - [Unicode® Standard Annex #44][uax44]
//!
//! [uax44]: http://unicode.org/reports/tr44
`.trim();

https
.get(PROPLIST_TXT_URL, (res) => {
let text = "";

res.on("data", (chunk) => {
text += chunk;
});

res.on("end", () => {
buildRustFile(text);
});
})
.on("error", (err) => {
console.log(`Failed to get 'PropList.txt': ${err.message}`);
})
.end();

function buildRustFile(propListText) {
const dataRegex = /(^|\n)(?<codePointStart>[0-9A-F]+)(\.\.(?<codePointEnd>[0-9A-F]+))?\s*;\s*(?<property>[^\s]+)/gi;
const data = [...propListText.matchAll(dataRegex)].map(
(match) => match.groups
);

const rustVariables = TARGET_PROPERTIES.map(
([propertyName, rustTableName]) => {
const codePoints = data
.filter(({ property }) => property === propertyName)
.map(({ codePointStart, codePointEnd }) => [
codePointStart,
codePointEnd ?? codePointStart,
])
.map(([codePointStart, codePointEnd]) => [
parseInt(codePointStart, 16),
parseInt(codePointEnd, 16),
])
.reduce((codePoints, [codePointStart, codePointEnd]) => {
for (let cp = codePointStart; cp <= codePointEnd; cp++) {
codePoints.push(cp);
}
return codePoints;
}, []);

codePoints.sort((a, b) => a - b);
const rustTable = `&[${codePoints
.map((cp) => `'\\u{${cp.toString(16).padStart(4, "0").toUpperCase()}}'`)
.join(",")}]`;
const rustVariable = `pub static ${rustTableName}: &[char] = ${rustTable};`;

console.log(`${propertyName}: ${codePoints.length} code points`);
return rustVariable;
}
);

const rustFile = `${OUTPUT_FILE_DOC_COMMENT}\n\n${rustVariables.join(
"\n\n"
)}`;

console.log("Writing output file...");
fs.writeFileSync(OUTPUT_FILE, rustFile);

console.log("Running rustfmt...");
child_process.execSync(`rustfmt ${OUTPUT_FILE}`);
}
5 changes: 5 additions & 0 deletions boa_unicode/package.json
@@ -0,0 +1,5 @@
{
"scripts": {
"build-tables": "node ./build_tables.js"
}
}