Merge 407ebc9 into c3ba744
jevancc authored Jan 10, 2021
2 parents c3ba744 + 407ebc9 commit cd1b07f
Showing 14 changed files with 762 additions and 13 deletions.
4 changes: 4 additions & 0 deletions CONTRIBUTING.md
@@ -45,6 +45,10 @@ See [Debugging](./docs/debugging.md).
If you want to develop on the web assembly side you can run `yarn serve` and then go
to <http://localhost:8080>.

### boa-unicode

Boa uses the `boa-unicode` library to query Unicode character properties and classes in the lexer and parser. See [boa_unicode/README.md](./boa_unicode/README.md) for development details and more information.
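
A minimal sketch of how the lexer uses it, assuming the `UnicodeProperties` trait from `boa/src/syntax/lexer/identifier.rs` is in scope:

```rust
use boa_unicode::UnicodeProperties;

// ECMAScript IdentifierStart: `$`, `_`, or any code point with the
// Unicode `ID_Start` property.
fn is_identifier_start(ch: char) -> bool {
    ch == '$' || ch == '_' || ch.is_id_start()
}

fn main() {
    assert!(is_identifier_start('x'));
    assert!(is_identifier_start('Ѐ')); // U+0400 has the ID_Start property
    assert!(!is_identifier_start('😀')); // emoji are not ID_Start
}
```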

### Setup

#### VSCode Plugins
14 changes: 14 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -4,6 +4,7 @@ members = [
"boa_cli",
"boa_wasm",
"boa_tester",
"boa_unicode",
]

# The release profile, used for `cargo build --release`.
1 change: 1 addition & 0 deletions boa/Cargo.toml
@@ -21,6 +21,7 @@ vm = []
console = []

[dependencies]
boa_unicode = { path = "../boa_unicode" }
gc = { version = "0.3.6", features = ["derive"] }
serde = { version = "1.0.118", features = ["derive"] }
serde_json = "1.0.61"
41 changes: 34 additions & 7 deletions boa/src/syntax/lexer/identifier.rs
@@ -8,6 +8,7 @@ use crate::{
lexer::{Token, TokenKind},
},
};
use boa_unicode::UnicodeProperties;
use core::convert::TryFrom;
use std::io::Read;
use std::str;
@@ -44,6 +45,38 @@ impl Identifier {
pub(super) fn new(init: char) -> Self {
Self { init }
}

/// Checks if a character is IdentifierStart as per ECMAScript standards.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-names-and-keywords
pub(super) fn is_identifier_start(ch: u32) -> bool {
matches!(ch, 0x0024 /* $ */ | 0x005F /* _ */)
|| if let Ok(ch) = char::try_from(ch) {
ch.is_id_start()
} else {
false
}
}

/// Checks if a character is IdentifierPart as per ECMAScript standards.
///
/// More information:
/// - [ECMAScript reference][spec]
///
/// [spec]: https://tc39.es/ecma262/#sec-names-and-keywords
fn is_identifier_part(ch: u32) -> bool {
matches!(
ch,
0x0024 /* $ */ | 0x005F /* _ */ | 0x200C /* <ZWNJ> */ | 0x200D /* <ZWJ> */
) || if let Ok(ch) = char::try_from(ch) {
ch.is_id_continue()
} else {
false
}
}
}

impl<R> Tokenizer<R> for Identifier {
@@ -58,13 +91,7 @@ impl<R> Tokenizer<R> for Identifier {
self.init.encode_utf8(&mut init_buf);
buf.extend(init_buf.iter().take(self.init.len_utf8()));

cursor.take_while_char_pred(&mut buf, &|c: u32| {
if let Ok(c) = char::try_from(c) {
c.is_alphabetic() || c.is_digit(10) || c == '_'
} else {
false
}
})?;
cursor.take_while_char_pred(&mut buf, &Self::is_identifier_part)?;

let token_str = unsafe { str::from_utf8_unchecked(buf.as_slice()) };
let tk = match token_str {
12 changes: 6 additions & 6 deletions boa/src/syntax/lexer/mod.rs
@@ -199,12 +199,6 @@ impl<R> Lexer<R> {
)),
'"' | '\'' => StringLiteral::new(c).lex(&mut self.cursor, start),
'`' => TemplateLiteral.lex(&mut self.cursor, start),
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_alphabetic() || c == '$' || c == '_' => {
Identifier::new(c).lex(&mut self.cursor, start)
}
';' => Ok(Token::new(
Punctuator::Semicolon.into(),
Span::new(start, self.cursor.pos()),
@@ -252,6 +246,12 @@
'=' | '*' | '+' | '-' | '%' | '|' | '&' | '^' | '<' | '>' | '!' | '~' | '?' => {
Operator::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if c.is_digit(10) => {
NumberLiteral::new(next_ch as u8).lex(&mut self.cursor, start)
}
_ if Identifier::is_identifier_start(c as u32) => {
Identifier::new(c).lex(&mut self.cursor, start)
}
_ => {
let details = format!(
"unexpected '{}' at line {}, column {}",
46 changes: 46 additions & 0 deletions boa/src/syntax/lexer/tests.rs
@@ -70,6 +70,52 @@ fn check_multi_line_comment() {
expect_tokens(&mut lexer, &expected);
}

#[test]
fn check_identifier() {
let s = "x x1 _x $x __ $$ Ѐ ЀЀ x\u{200C}\u{200D}";
let mut lexer = Lexer::new(s.as_bytes());

let expected = [
TokenKind::identifier("x"),
TokenKind::identifier("x1"),
TokenKind::identifier("_x"),
TokenKind::identifier("$x"),
TokenKind::identifier("__"),
TokenKind::identifier("$$"),
TokenKind::identifier("Ѐ"),
TokenKind::identifier("ЀЀ"),
TokenKind::identifier("x\u{200C}\u{200D}"),
];

expect_tokens(&mut lexer, &expected);
}

#[test]
fn check_invalid_identifier_start() {
let invalid_identifier_starts = ["\u{200C}", "\u{200D}", "😀"];

for s in invalid_identifier_starts.iter() {
let mut lexer = Lexer::new(s.as_bytes());
lexer
.next()
.expect_err("Invalid identifier start not rejected as expected");
}
}

#[test]
fn check_invalid_identifier_part() {
let invalid_identifier_parts = [" ", "\n", ".", "*", "😀", "\u{007F}"];

for part in invalid_identifier_parts.iter() {
let s = String::from("x") + part;
let mut lexer = Lexer::new(s.as_bytes());
assert_eq!(
lexer.next().unwrap().unwrap().kind(),
&TokenKind::identifier("x")
);
}
}

#[test]
fn check_string() {
let s = "'aaa' \"bbb\"";
14 changes: 14 additions & 0 deletions boa_unicode/Cargo.toml
@@ -0,0 +1,14 @@
[package]
name = "boa_unicode"
version = "0.10.0"
authors = ["boa-dev"]
description = "Boa is a JavaScript lexer, parser and Just-in-Time compiler written in Rust. Currently, it has support for some of the language."
repository = "https://github.com/boa-dev/boa"
keywords = ["javascript", "compiler", "lexer", "parser", "unicode"]
categories = ["parsing"]
license = "Unlicense/MIT"
exclude = ["../.vscode/*", "../Dockerfile", "../Makefile", "../.editorConfig"]
edition = "2018"

[dependencies]
unicode-general-category = "0.3.0"
26 changes: 26 additions & 0 deletions boa_unicode/README.md
@@ -0,0 +1,26 @@
# boa-unicode

`boa-unicode` defines the trait that provides methods for querying Unicode properties and classes for identifiers. These properties are used to determine whether a code point (`char`) is valid as the start or part of an identifier in the lexer and parser.

Current version: Unicode 13.0.0
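
A minimal sketch of the trait in use (assuming `UnicodeProperties` is implemented for `char`, as in the lexer):

```rust
use boa_unicode::UnicodeProperties;

fn main() {
    // `is_id_start`/`is_id_continue` query the Unicode ID_Start and
    // ID_Continue properties.
    assert!('x'.is_id_start());
    assert!('1'.is_id_continue()); // digits may continue an identifier...
    assert!(!'1'.is_id_start()); // ...but not start one
    assert!(!' '.is_id_continue());
}
```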

## Development

The Unicode character tables used to query properties are generated by `build_tables.js`. This script depends on [Node.js](https://nodejs.org/en/) and [rustfmt](https://github.com/rust-lang/rustfmt). You can run the script with:

```
$ node build_tables.js
```

or with [npm](https://www.npmjs.com/):

```
$ npm run build-tables
```

The configurations are defined as constants in the script. Please check the comments in `build_tables.js` for more information.
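
For illustration, a minimal sketch of the transformation: the sample `PropList.txt` lines below are real Unicode 13.0.0 `Other_ID_Start` entries, and the generated table shown covers only those sample lines.

```
1885..1886    ; Other_ID_Start # Mn   [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118          ; Other_ID_Start # Sm       SCRIPT CAPITAL P
```

```rust
// Emitted into src/tables.rs: ranges are expanded to individual code
// points, sorted, and written as a `&[char]` table.
pub static OTHER_ID_START: &[char] = &['\u{1885}', '\u{1886}', '\u{2118}'];
```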

## More Info

- [Unicode® Standard Annex #31 - UNICODE IDENTIFIER AND PATTERN SYNTAX](https://unicode.org/reports/tr31/)
- [Unicode® Standard Annex #44 - UNICODE CHARACTER DATABASE](https://unicode.org/reports/tr44/)
132 changes: 132 additions & 0 deletions boa_unicode/build_tables.js
@@ -0,0 +1,132 @@
#!/usr/bin/env node
/**
* This file is used to generate the Rust source code with tables for Unicode properties and classes.
*
* This script downloads the content of `PropList.txt` from the remote server, parses the file, extracts
* the target properties, builds the char tables, and writes them to the output Rust file. It also
* formats the output file with `rustfmt`. Please make sure `rustfmt` is available in the environment.
*
* Update and run this script when {@link https://unicode.org/reports/tr44/|Unicode® Standard Annex #44} is updated, and
* always check that the latest standard meets the {@link https://tc39.es/ecma262/#sec-names-and-keywords|spec of ECMAScript}.
*
* Run this script with command `node ./build_tables.js` or `npm run build-tables`.
*
* Version: Unicode 13.0.0
*/

const fs = require("fs");
const path = require("path");
const https = require("https");
const child_process = require("child_process");

/**
* The URL to download the content of `PropList.txt` through HTTP GET.
*
* Please make sure the content follows the UCD file format defined in
* {@link http://unicode.org/reports/tr44/#UCD_Files|UAX#44}.
*
* @constant {string}
*/
const PROPLIST_TXT_URL =
"https://www.unicode.org/Public/13.0.0/ucd/PropList.txt";

/**
* The target properties to process, given as tuples. The first element is the property to search for.
* The second element is the table variable name in the output Rust file.
*
* @constant {[string, string][]}
*/
const TARGET_PROPERTIES = [
["Pattern_Syntax", "PATTERN_SYNTAX"],
["Other_ID_Continue", "OTHER_ID_CONTINUE"],
["Other_ID_Start", "OTHER_ID_START"],
["Pattern_White_Space", "PATTERN_WHITE_SPACE"],
];

/**
* The path of the output Rust file.
*
* @constant {string}
*/
const OUTPUT_FILE = path.join(__dirname, "./src/tables.rs");

/**
* The doc comment to add to the beginning of the output Rust file.
*
* @constant {string}
*/
const OUTPUT_FILE_DOC_COMMENT = `
//! This module implements the unicode lookup tables for identifier and pattern syntax.
//! Version: Unicode 13.0.0
//!
//! This file is generated by \`boa_unicode/build_tables.js\`. Please do not modify it directly.
//!
//! More information:
//! - [Unicode® Standard Annex #44][uax44]
//!
//! [uax44]: http://unicode.org/reports/tr44
`.trim();

https
.get(PROPLIST_TXT_URL, (res) => {
let text = "";

res.on("data", (chunk) => {
text += chunk;
});

res.on("end", () => {
buildRustFile(text);
});
})
.on("error", (err) => {
console.log(`Failed to get 'PropList.txt': ${err.message}`);
})
.end();

function buildRustFile(propListText) {
const dataRegex = /(^|\n)(?<codePointStart>[0-9A-F]+)(\.\.(?<codePointEnd>[0-9A-F]+))?\s*;\s*(?<property>[^\s]+)/gi;
const data = [...propListText.matchAll(dataRegex)].map(
(match) => match.groups
);

const rustVariables = TARGET_PROPERTIES.map(
([propertyName, rustTableName]) => {
const codePoints = data
.filter(({ property }) => property === propertyName)
.map(({ codePointStart, codePointEnd }) => [
codePointStart,
codePointEnd ?? codePointStart,
])
.map(([codePointStart, codePointEnd]) => [
parseInt(codePointStart, 16),
parseInt(codePointEnd, 16),
])
.reduce((codePoints, [codePointStart, codePointEnd]) => {
for (let cp = codePointStart; cp <= codePointEnd; cp++) {
codePoints.push(cp);
}
return codePoints;
}, []);

codePoints.sort((a, b) => a - b);
const rustTable = `&[${codePoints
.map((cp) => `'\\u{${cp.toString(16).padStart(4, "0").toUpperCase()}}'`)
.join(",")}]`;
const rustVariable = `pub static ${rustTableName}: &[char] = ${rustTable};`;

console.log(`${propertyName}: ${codePoints.length} code points`);
return rustVariable;
}
);

const rustFile = `${OUTPUT_FILE_DOC_COMMENT}\n\n${rustVariables.join(
"\n\n"
)}`;

console.log("Writing output file...");
fs.writeFileSync(OUTPUT_FILE, rustFile);

console.log("Running rustfmt...");
child_process.execSync(`rustfmt ${OUTPUT_FILE}`);
}
5 changes: 5 additions & 0 deletions boa_unicode/package.json
@@ -0,0 +1,5 @@
{
"scripts": {
"build-tables": "node ./build_tables.js"
}
}