Fix bugs with non-ascii identifiers (#474)

Nick Guerrera 2021-04-22 13:13:32 -07:00 committed by GitHub
Parent ea281c601c
Commit 78661fdbe8
6 changed files with 113 additions and 40 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -54,10 +54,10 @@ export const Message = {
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
} as const,
InvalidToken: {
InvalidCharacter: {
code: 1108,
severity: "error",
text: "Invalid token: '{0}'",
text: "Invalid character.",
} as const,
};
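The renamed diagnostic drops the "{0}" placeholder because the offending text is no longer echoed back. A minimal sketch of how an entry of this shape could be turned into a readable diagnostic; formatMessage and its placeholder handling are illustrative assumptions, not the compiler's actual diagnostics API:

// Hedged sketch (not the repository's diagnostics.js): format a message entry,
// substituting "{0}"-style placeholders only when arguments are supplied. The
// new InvalidCharacter message takes no arguments, unlike the old InvalidToken.
interface MessageEntry {
  readonly code: number;
  readonly severity: "error" | "warning";
  readonly text: string;
}

function formatMessage(msg: MessageEntry, args: string[] = []): string {
  const text = msg.text.replace(/{(\d+)}/g, (_, i) => args[Number(i)] ?? "");
  return `${msg.severity} ${msg.code}: ${text}`;
}

// formatMessage({ code: 1108, severity: "error", text: "Invalid character." })
// => "error 1108: Invalid character."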

View file

@@ -1012,7 +1012,7 @@ export function parse(code: string | SourceFile) {
return false;
}
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | undefined {
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | Token.None {
for (const tok of options) {
if (token() === tok) {
nextToken();
@@ -1020,7 +1020,7 @@ export function parse(code: string | SourceFile) {
}
}
errorTokenIsNotOneOf(options);
return undefined;
return Token.None;
}
function errorTokenIsNotOneOf(options: Token[]) {
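Returning Token.None (which is 0) instead of undefined keeps the return type a plain Token, so callers can switch on the result without first narrowing away undefined. A rough illustration of the calling pattern this enables; the caller and the Token.Colon / Token.Equals names are hypothetical, not code from parser.ts:

// Hypothetical caller: with Token.None as the sentinel, the result can be fed
// straight into a switch; no `if (tok === undefined)` narrowing is needed.
const tok = parseExpectedOneOf(Token.Colon, Token.Equals);
switch (tok) {
  case Token.Colon:
    // parse a type annotation...
    break;
  case Token.Equals:
    // parse a default value...
    break;
  case Token.None:
    break; // error already reported by errorTokenIsNotOneOf
}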

View file

@@ -1,11 +1,14 @@
import {
CharacterCodes,
isAsciiIdentifierContinue,
isAsciiIdentifierStart,
isBinaryDigit,
isDigit,
isHexDigit,
isIdentifierPart,
isIdentifierStart,
isIdentifierContinue,
isLineBreak,
isNonAsciiIdentifierContinue,
isNonAsciiIdentifierStart,
isWhiteSpaceSingleLine,
} from "./character-codes.js";
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
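The identifier predicates are now split into ASCII fast-path checks and non-ASCII fallbacks that work on whole code points. A rough sketch of how such predicates might look, assuming Unicode property escapes; the real character-codes.js may use different character sets or lookup tables:

// Hedged sketch only; not the repository's character-codes.js.
function isAsciiIdentifierStartSketch(ch: number): boolean {
  // Exact ASCII set is an assumption: letters and underscore.
  return (
    (ch >= 0x41 && ch <= 0x5a) || // A-Z
    (ch >= 0x61 && ch <= 0x7a) || // a-z
    ch === 0x5f                   // _
  );
}

function isNonAsciiIdentifierStartSketch(codePoint: number): boolean {
  // Assumes Unicode ID_Start semantics via property escapes.
  return /\p{ID_Start}/u.test(String.fromCodePoint(codePoint));
}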
@@ -17,7 +20,7 @@ const mergeConflictMarkerLength = 7;
export enum Token {
None = 0,
Unknown = 1,
Invalid = 1,
EndOfFile = 2,
// Trivia
@@ -81,7 +84,7 @@ const MaxStatementKeyword = Token.OpKeyword;
export const TokenDisplay: readonly string[] = [
"<none>",
"<unknown>",
"<invalid>",
"<end of file>",
"<single-line comment>",
"<multi-line comment>",
@@ -130,6 +133,8 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
["false", Token.FalseKeyword],
]);
export const maxKeywordLength = 9;
export interface Scanner {
/** The source code being scanned. */
readonly file: SourceFile;
@@ -202,7 +207,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source;
const input = file.text;
let position = 0;
let token = Token.Unknown;
let token = Token.Invalid;
let tokenPosition = -1;
let tokenValue: string | undefined = undefined;
let tokenFlags = TokenFlags.None;
@@ -233,6 +238,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = t);
}
function utf16CodeUnits(codePoint: number) {
return codePoint >= 0x10000 ? 2 : 1;
}
function getTokenText() {
return input.substring(tokenPosition, position);
}
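utf16CodeUnits returns how many UTF-16 code units a code point occupies, which is exactly how far position must advance: code points at or above U+10000 are stored as surrogate pairs. A worked example (illustrative, not repository code):

// Supplementary-plane characters occupy two UTF-16 code units, so the scanner
// must advance position by two to stay on a code point boundary.
"𐌰".codePointAt(0)!.toString(16); // "10330" — a Gothic letter above 0xFFFF
"𐌰".length;                        // 2 — stored as a surrogate pair
utf16CodeUnits(0x10330);           // 2
utf16CodeUnits(0xe9);              // 1 — "é" fits in a single code unit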
@@ -331,7 +340,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
case CharacterCodes.asterisk:
return scanMultiLineComment();
}
return invalidToken();
return scanInvalidCharacter();
case CharacterCodes._0:
switch (lookAhead(1)) {
@@ -376,16 +385,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return scanString();
default:
return isIdentifierStart(ch) ? scanIdentifier() : invalidToken();
return scanIdentifierOrKeyword();
}
}
return (token = Token.EndOfFile);
}
function invalidToken() {
token = next(Token.Unknown);
error(Message.InvalidToken, [getTokenText()]);
function scanInvalidCharacter() {
const codePoint = input.codePointAt(position)!;
token = next(Token.Invalid, utf16CodeUnits(codePoint));
error(Message.InvalidCharacter);
return token;
}
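This is one of the bugs being fixed: the old invalidToken() always advanced by a single UTF-16 code unit, so an unsupported character outside the Basic Multilingual Plane (such as the 😢 in the new parser test) was split across its surrogate pair, and the raw input text was echoed into the message. The new version consumes the whole code point and reports the fixed "Invalid character." diagnostic. A rough illustration of the difference (standalone sketch, not scanner code):

// 😢 is a single code point (0x1f622) encoded as two UTF-16 code units.
const input = "😢x";
input.length;          // 3: two surrogate halves plus "x"
input.codePointAt(0)!; // 0x1f622
// Old behavior: position += 1 lands in the middle of the surrogate pair.
// New behavior: position += utf16CodeUnits(0x1f622) === 2, landing on "x".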
@@ -728,8 +738,57 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return result;
}
function scanIdentifier() {
scanUntil((ch) => !isIdentifierPart(ch));
return (token = Keywords.get(getTokenValue()) ?? Token.Identifier);
function scanIdentifierOrKeyword() {
let ch = input.charCodeAt(position);
if (!isAsciiIdentifierStart(ch)) {
return scanNonAsciiIdentifier();
}
do {
position++;
if (eof()) {
break;
}
ch = input.charCodeAt(position);
} while (isAsciiIdentifierContinue(ch));
if (!eof() && ch > CharacterCodes.maxAsciiCharacter) {
const codePoint = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(codePoint)) {
return scanNonAsciiIdentifierContinue(codePoint);
}
}
if (position - tokenPosition <= maxKeywordLength) {
const value = getTokenValue();
const keyword = Keywords.get(value);
if (keyword) {
return (token = keyword);
}
}
return (token = Token.Identifier);
}
function scanNonAsciiIdentifier() {
let codePoint = input.codePointAt(position)!;
return isNonAsciiIdentifierStart(codePoint)
? scanNonAsciiIdentifierContinue(codePoint)
: scanInvalidCharacter();
}
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
let codePoint = startCodePoint;
do {
position += utf16CodeUnits(codePoint);
if (eof()) {
break;
}
codePoint = input.codePointAt(position)!;
} while (isIdentifierContinue(codePoint));
return (token = Token.Identifier);
}
}
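scanIdentifierOrKeyword takes an ASCII fast path (charCodeAt plus isAsciiIdentifierContinue), falls over to the code-point-aware loop only when it meets a character above the ASCII range, and bounds the keyword lookup by maxKeywordLength so long identifiers never hit the map. A self-contained sketch of that shape, assuming regex-based character classes rather than the project's real predicates:

// Standalone sketch of the ASCII-fast-path / non-ASCII-fallback pattern; the
// identifier character classes here are assumptions, not ADL's actual rules.
function scanIdentifierSketch(text: string, start: number): number {
  let pos = start;
  // Fast path: one UTF-16 code unit at a time while the input stays ASCII.
  while (pos < text.length) {
    const ch = text.charCodeAt(pos);
    if (ch > 0x7f) break;                         // possible non-ASCII identifier part
    if (!/[A-Za-z0-9_]/.test(text[pos])) return pos;
    pos++;
  }
  // Slow path: advance by whole code points using Unicode identifier classes.
  while (pos < text.length) {
    const cp = text.codePointAt(pos)!;
    if (!/\p{ID_Continue}/u.test(String.fromCodePoint(cp))) break;
    pos += cp >= 0x10000 ? 2 : 1;                 // surrogate pairs take two units
  }
  return pos; // index just past the identifier
}

// e.g. scanIdentifierSketch("Banana𐌰𐌲Banana ", 0) scans past the Gothic letters
// and stops at the trailing space.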

View file

@@ -216,6 +216,11 @@ describe("syntax", () => {
],
]);
});
describe("non-ascii identifiers", () => {
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
});
});
function parseEach(cases: string[]) {

View file

@@ -8,6 +8,7 @@ import {
isPunctuation,
isStatementKeyword,
Keywords,
maxKeywordLength,
Token,
TokenDisplay,
} from "../compiler/scanner.js";
@@ -226,7 +227,9 @@ describe("scanner", () => {
// check that keywords have appropriate display
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
let maxKeywordLengthFound = -1;
for (const [name, token] of Keywords.entries()) {
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
assert.strictEqual(TokenDisplay[token], `'${name}'`);
assert(isKeyword(token), `${name} should be classified as a keyword`);
if (!nonStatementKeywords.includes(token)) {
@@ -234,6 +237,8 @@ describe("scanner", () => {
}
}
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
// check single character punctuation
for (let i = 33; i <= 126; i++) {
const str = String.fromCharCode(i);
@@ -241,7 +246,7 @@ describe("scanner", () => {
if (
token !== Token.StringLiteral &&
token !== Token.Identifier &&
token !== Token.Unknown &&
token !== Token.Invalid &&
token !== Token.NumericLiteral
) {
assert.strictEqual(TokenDisplay[token], `'${str}'`);
@@ -252,7 +257,7 @@ describe("scanner", () => {
// check the rest
assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'");
assert.strictEqual(TokenDisplay[Token.None], "<none>");
assert.strictEqual(TokenDisplay[Token.Unknown], "<unknown>");
assert.strictEqual(TokenDisplay[Token.Invalid], "<invalid>");
assert.strictEqual(TokenDisplay[Token.EndOfFile], "<end of file>");
assert.strictEqual(TokenDisplay[Token.SingleLineComment], "<single-line comment>");
assert.strictEqual(TokenDisplay[Token.MultiLineComment], "<multi-line comment>");