Allow ZWJ and ZWNJ to continue identifiers as specified

2021-04-24 18:45:40 -07:00 · 2021-04-24 18:45:40 -07:00 · 3ac9719b17
--- a/packages/adl/compiler/charcode.ts
+++ b/packages/adl/compiler/charcode.ts
@ -223,6 +223,19 @@ export function isAsciiIdentifierContinue(ch: number): boolean {
  );
 }

+/**
+ * Returns whether `codePoint` may begin an identifier.
+ *
+ * ASCII code points are checked with `isAsciiIdentifierStart`; code points
+ * above `CharCode.MaxAscii` are checked with `isNonAsciiIdentifierStart`.
+ */
+export function isIdentifierStart(codePoint: number): boolean {
+  return (
+    isAsciiIdentifierStart(codePoint) ||
+    (codePoint > CharCode.MaxAscii && isNonAsciiIdentifierStart(codePoint))
+  );
+}
+
 export function isIdentifierContinue(codePoint: number) {
  return (
    isAsciiIdentifierContinue(codePoint) ||
--- a/packages/adl/compiler/nonascii.ts
+++ b/packages/adl/compiler/nonascii.ts
@ -4,15 +4,19 @@
 //
 // Based on:
 //  - http://www.unicode.org/reports/tr31/
-//  - https://www.ecma-international.org/ecma-262/6.0/#sec-names-and-keywords
+//  - https://www.ecma-international.org/ecma-262/11.0/#sec-names-and-keywords
 //
 // ADL's identifier naming rules are currently the same as JavaScript's.
+//

 /**
 * @internal
 *
 * Map of non-ascii characters that are valid at the start of an identifier.
 * Each pair of numbers represents an inclusive range of code points.
+ *
+ * Corresponds to code points outside the ASCII range with property ID_Start or
+ * Other_ID_Start.
 */
 // prettier-ignore
 export const nonAsciiIdentifierStartMap: readonly number[] = [
@ -641,8 +645,12 @@ export const nonAsciiIdentifierStartMap: readonly number[] = [
 /**
 * @internal
 *
- * Map of non-ascii chacters that are valid after the first character in and identifier.
- * Each pair of numbers represents an inclusive range of code points.
+ * Map of non-ascii chacters that are valid after the first character in and
+ * identifier. Each pair of numbers represents an inclusive range of code
+ * points.
+ *
+ * Corresponds to code points outside the ASCII range with property ID_Continue,
+ * Other_ID_Start, or Other_ID_Continue, plus ZWNJ and ZWJ.
 */
 //prettier-ignore
 export const nonAsciiIdentifierContinueMap: readonly number[] = [
@ -943,6 +951,7 @@ export const nonAsciiIdentifierContinueMap: readonly number[] = [
  0x1fe0, 0x1fec,
  0x1ff2, 0x1ff4,
  0x1ff6, 0x1ffc,
+  0x200c, 0x200d,
  0x203f, 0x2040,
  0x2054, 0x2054,
  0x2071, 0x2071,
--- a/packages/adl/scripts/regen-nonascii.js
+++ b/packages/adl/scripts/regen-nonascii.js
@ -9,8 +9,11 @@ import { fileURLToPath } from "url";
 const MIN_NONASCII_CODEPOINT = 0x80;
 const MAX_UNICODE_CODEPOINT = 0x10ffff;

-const isStartRegex = /[\p{ID_Start}\u{2118}\u{212E}\u{309B}\u{309C}]/u;
-const isContinueRegex = /[\p{ID_Continue}\u{00B7}\u{0387}\u{19DA}\u{1369}\u{136A}\u{136B}\u{136C}\u{136D}\u{136E}\u{136F}\u{1370}\u{1371}]/u;
+// \p{ID_Start} already includes the Other_ID_Start code points.
+const isStartRegex = /[\p{ID_Start}]/u;
+
+// \p{ID_Continue} already includes Other_ID_Start and Other_ID_Continue; ZWNJ and ZWJ are added explicitly.
+const isContinueRegex = /[\p{ID_Continue}\u{200c}\u{200d}]/u;

 function isStart(c) {
  return isStartRegex.test(c);
@ -50,15 +53,19 @@ const src = `//
 //
 // Based on:
 //  - http://www.unicode.org/reports/tr31/
-//  - https://www.ecma-international.org/ecma-262/6.0/#sec-names-and-keywords
+//  - https://www.ecma-international.org/ecma-262/11.0/#sec-names-and-keywords
 //
 // ADL's identifier naming rules are currently the same as JavaScript's.
+//

 /**
 * @internal
 *
 * Map of non-ascii characters that are valid at the start of an identifier.
 * Each pair of numbers represents an inclusive range of code points.
+ *
+ * Corresponds to code points outside the ASCII range with property ID_Start or
+ * Other_ID_Start.
 */
 // prettier-ignore
 export const nonAsciiIdentifierStartMap: readonly number[] = [
@ -68,8 +75,12 @@ ${formatPairs(startMap)}
 /**
 * @internal
 *
- * Map of non-ascii chacters that are valid after the first character in and identifier.
- * Each pair of numbers represents an inclusive range of code points.
+ * Map of non-ascii chacters that are valid after the first character in and
+ * identifier. Each pair of numbers represents an inclusive range of code
+ * points.
+ *
+ * Corresponds to code points outside the ASCII range with property ID_Continue,
+ * Other_ID_Start, or Other_ID_Continue, plus ZWNJ and ZWJ.
 */
 //prettier-ignore
 export const nonAsciiIdentifierContinueMap: readonly number[] = [
--- a/packages/adl/test/test-parser.ts
+++ b/packages/adl/test/test-parser.ts
@ -250,7 +250,13 @@ describe("syntax", () => {
  });

  describe("non-ascii identifiers", () => {
-    parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲42Banana {}"]);
+    parseEach([
+      "model Incompréhensible {}",
+      "model 𐌰𐌲 {}",
+      "model Banana𐌰𐌲42Banana {}",
+      "model deaf\u{200c}ly {}", // ZWNJ (U+200C) inside an identifier
+      "model क्\u{200d}ष {}", // ZWJ (U+200D) inside an identifier, escaped so it stays visible
+    ]);
    parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
  });
 });
--- a/packages/adl/test/test-scanner.ts
+++ b/packages/adl/test/test-scanner.ts
@ -1,6 +1,7 @@
 import assert from "assert";
 import { readFile } from "fs/promises";
 import { URL } from "url";
+import { isIdentifierContinue, isIdentifierStart } from "../compiler/charcode.js";
 import { throwOnError } from "../compiler/diagnostics.js";
 import {
  createScanner,
@ -311,6 +312,63 @@ describe("scanner", () => {
    assert.strictEqual(TokenDisplay[Token.Identifier], "<identifier>");
  });

+  // Search for Other_ID_Start in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+  const otherIdStart = [0x1885, 0x1886, 0x2118, 0x212e, 0x309b, 0x309c];
+
+  // Search for Other_ID_Continue in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
+  const otherIdContinue = [
+    0x00b7,
+    0x0387,
+    0x1369,
+    0x136a,
+    0x136b,
+    0x136c,
+    0x136d,
+    0x136e,
+    0x136f,
+    0x1370,
+    0x1371,
+    0x19da,
+  ];
+
+  // Formats a code point in the conventional U+XXXX notation (upper-case hex,
+  // zero-padded to at least four digits) so that failure messages read the
+  // same way as the hand-written U+200C / U+200D messages below.
+  function formatCodePoint(codePoint: number): string {
+    return `U+${codePoint.toString(16).toUpperCase().padStart(4, "0")}`;
+  }
+
+  it("allows additional identifier start characters", () => {
+    assert(isIdentifierStart("$".codePointAt(0)!), "'$' should be allowed to start identifier.");
+    assert(isIdentifierStart("_".codePointAt(0)!), "'_' should be allowed to start identifier.");
+
+    for (const codePoint of otherIdStart) {
+      assert(
+        isIdentifierStart(codePoint),
+        `${formatCodePoint(codePoint)} should be allowed to start identifier.`
+      );
+    }
+  });
+
+  it("allows additional identifier continuation characters", () => {
+    //prettier-ignore
+    assert(isIdentifierContinue("$".codePointAt(0)!), "'$' should be allowed to continue identifier.");
+    //prettier-ignore
+    assert(isIdentifierContinue("_".codePointAt(0)!), "'_' should be allowed to continue identifier.");
+
+    // Every code point that may start an identifier may also continue one.
+    for (const codePoint of [...otherIdStart, ...otherIdContinue]) {
+      assert(
+        isIdentifierContinue(codePoint),
+        `${formatCodePoint(codePoint)} should be allowed to continue identifier.`
+      );
+    }
+
+    // ZWNJ and ZWJ are accepted in addition to the ID_Continue code points.
+    assert(isIdentifierContinue(0x200c), "U+200C (ZWNJ) should be allowed to continue identifier.");
+    assert(isIdentifierContinue(0x200d), "U+200D (ZWJ) should be allowed to continue identifier.");
+  });
+
  it("scans this file", async () => {
    const text = await readFile(new URL(import.meta.url), "utf-8");
    tokens(text, function () {