Fix scanning edge cases and optimize slightly

* Bug fixes
  * tokenValue() was wrong if an identifier ever came right after a
    string literal.
  * There was an incorrect and confusing end-of-file error if file
    ends with numeric literal. Generally hardened and cleaned up EOF
    checking throughout.
  * Digits weren't allowed in identifiers containing non-ASCII characters.

* Perf
  * Add more ASCII fast paths
  * Inline scanUntil everywhere, and simplify each use to what it
    actually needed
  * Avoid eager substring allocation and map lookup for keywords in
    more cases
This commit is contained in:
Nick Guerrera 2021-04-24 18:17:28 -07:00
Родитель 9063e1b63f
Коммит 5fb28d6378
7 изменённых файлов: 425 добавлений и 294 удалений

Просмотреть файл

@ -12,13 +12,13 @@ export const enum CharCode {
LineSeparator = 0x2028,
ParagraphSeparator = 0x2029,
// ASCII whitespace
// ASCII whitespace excluding line breaks
Space = 0x20,
FormFeed = 0x0c,
Tab = 0x09,
VerticalTab = 0x0b,
// Non-ASCII whitespace
// Non-ASCII whitespace excluding line breaks
ByteOrderMark = 0xfeff, // currently allowed anywhere
NextLine = 0x0085, // not considered a line break, mirroring ECMA-262
NonBreakingSpace = 0x00a0,
@ -144,15 +144,24 @@ export const enum CharCode {
Tilde = 0x7e,
}
/** Does not include line breaks. For that, see isWhiteSpaceLike. */
export function isWhiteSpaceSingleLine(ch: number): boolean {
// Note: nextLine is in the Zs space, and should be considered to be a whitespace.
// It is explicitly not a line-break as it isn't in the exact set specified by EcmaScript.
/** True for the ASCII line terminators: LF (\n) and CR (\r). */
export function isAsciiLineBreak(ch: number) {
  // Use strict equality consistently; the original mixed `===` and `==`.
  return ch === CharCode.LineFeed || ch === CharCode.CarriageReturn;
}
/** True for ASCII whitespace that is not a line break (space, tab, VT, FF). */
export function isAsciiWhiteSpaceSingleLine(ch: number): boolean {
  // The FormFeed comparison was listed twice; each character is checked once.
  return (
    ch === CharCode.Space ||
    ch === CharCode.Tab ||
    ch === CharCode.VerticalTab ||
    ch === CharCode.FormFeed
  );
}
export function isNonAsciiWhiteSpaceSingleLine(ch: number) {
// Note: nextLine is in the Zs space, and should be considered to be a
// whitespace. It is explicitly not a line-break as it isn't in the exact set
// inherited by ADL from JavaScript.
return (
ch === CharCode.NonBreakingSpace ||
ch === CharCode.NextLine ||
ch === CharCode.Ogham ||
@ -164,17 +173,23 @@ export function isWhiteSpaceSingleLine(ch: number): boolean {
);
}
export function isLineBreak(ch: number): boolean {
// Other new line or line
// breaking characters are treated as white space but not as line terminators.
/** True for the non-ASCII line terminators: U+2028 (LS) and U+2029 (PS). */
export function isNonAsciiLineBreak(ch: number) {
  // Other Unicode "new line"-like characters are treated as white space but
  // not as line terminators.
  switch (ch) {
    case CharCode.LineSeparator:
    case CharCode.ParagraphSeparator:
      return true;
    default:
      return false;
  }
}
export function isWhiteSpaceSingleLine(ch: number) {
return (
ch === CharCode.LineFeed ||
ch === CharCode.CarriageReturn ||
ch === CharCode.LineSeparator ||
ch === CharCode.ParagraphSeparator
isAsciiWhiteSpaceSingleLine(ch) ||
(ch > CharCode.MaxAscii && isNonAsciiWhiteSpaceSingleLine(ch))
);
}
/** True for any line terminator, ASCII or non-ASCII. */
export function isLineBreak(ch: number): boolean {
  // ASCII fast path first; only characters above MaxAscii take the slow path.
  if (isAsciiLineBreak(ch)) {
    return true;
  }
  return ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch);
}
/** True for the ASCII decimal digits '0' through '9'. */
export function isDigit(ch: number): boolean {
  return CharCode._0 <= ch && ch <= CharCode._9;
}
@ -210,7 +225,7 @@ export function isAsciiIdentifierContinue(ch: number): boolean {
/** True if `codePoint` may appear after the first character of an identifier. */
export function isIdentifierContinue(codePoint: number) {
  // NOTE(review): a stale isAsciiIdentifierStart(codePoint) check was left in
  // alongside this one; the continue set is assumed to cover the ASCII start
  // set (letters/underscore), making the start check redundant — confirm
  // against isAsciiIdentifierStart/isAsciiIdentifierContinue definitions.
  return (
    isAsciiIdentifierContinue(codePoint) ||
    (codePoint > CharCode.MaxAscii && isNonAsciiIdentifierContinue(codePoint))
  );
}

Просмотреть файл

@ -1,5 +1,5 @@
import { AssertionError } from "assert";
import { CharCode } from "./charcode.js";
import { CharCode, isNonAsciiLineBreak } from "./charcode.js";
import { Message } from "./messages.js";
import { Diagnostic, Node, SourceFile, SourceLocation, Sym, SyntaxKind, Type } from "./types.js";
@ -113,7 +113,7 @@ export function createSourceFile(text: string, path: string): SourceFile {
};
function getLineStarts() {
return (lineStarts = lineStarts ?? scanLineStarts());
return (lineStarts = lineStarts ?? scanLineStarts(text));
}
function getLineAndCharacterOfPosition(position: number) {
@ -136,57 +136,6 @@ export function createSourceFile(text: string, path: string): SourceFile {
character: position - starts[line],
};
}
function scanLineStarts() {
const starts = [];
let start = 0;
let pos = 0;
while (pos < text.length) {
const ch = text.charCodeAt(pos);
pos++;
switch (ch) {
case CharCode.CarriageReturn:
if (text.charCodeAt(pos) === CharCode.LineFeed) {
pos++;
}
// fallthrough
case CharCode.LineFeed:
case CharCode.LineSeparator:
case CharCode.ParagraphSeparator:
starts.push(start);
start = pos;
break;
}
}
starts.push(start);
return starts;
}
/**
* Search sorted array of numbers for the given value. If found, return index
* in array where value was found. If not found, return a negative number that
* is the bitwise complement of the index where value would need to be inserted
* to keep the array sorted.
*/
function binarySearch(array: readonly number[], value: number) {
let low = 0;
let high = array.length - 1;
while (low <= high) {
const middle = low + ((high - low) >> 1);
const v = array[middle];
if (v < value) {
low = middle + 1;
} else if (v > value) {
high = middle - 1;
} else {
return middle;
}
}
return ~low;
}
}
export function getSourceLocation(target: DiagnosticTarget): SourceLocation {
@ -328,3 +277,58 @@ function format(text: string, args?: (string | number)[]): [string, Error?] {
function isNotUndefined<T>(value: T | undefined): value is T {
return value !== undefined;
}
/**
 * Compute the start offset of every line in `text`. A new line begins after
 * each LF, CR, CRLF pair, or non-ASCII line break (LS/PS); the final line's
 * start is always recorded, even for empty text.
 */
function scanLineStarts(text: string): number[] {
  const starts: number[] = [];
  let lineStart = 0;
  let pos = 0;
  while (pos < text.length) {
    const ch = text.charCodeAt(pos);
    pos++;
    let endsLine = false;
    if (ch === CharCode.CarriageReturn) {
      // Treat CRLF as a single line terminator.
      if (text.charCodeAt(pos) === CharCode.LineFeed) {
        pos++;
      }
      endsLine = true;
    } else if (ch === CharCode.LineFeed) {
      endsLine = true;
    } else if (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch)) {
      endsLine = true;
    }
    if (endsLine) {
      starts.push(lineStart);
      lineStart = pos;
    }
  }
  starts.push(lineStart);
  return starts;
}
/**
 * Binary search a sorted array of numbers for `value`. Returns the index at
 * which the value was found; when absent, returns the bitwise complement (~)
 * of the index where it would be inserted to keep the array sorted.
 */
function binarySearch(array: readonly number[], value: number) {
  let lo = 0;
  let hi = array.length - 1;
  while (lo <= hi) {
    // Midpoint computed without overflow risk for large indices.
    const mid = lo + ((hi - lo) >> 1);
    const probe = array[mid];
    if (probe === value) {
      return mid;
    }
    if (probe < value) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return ~lo;
}

Просмотреть файл

@ -8,50 +8,50 @@ export const Message = {
DigitExpected: {
code: 1100,
severity: "error",
text: "Digit expected (0-9)",
text: "Digit expected.",
} as const,
HexDigitExpected: {
code: 1101,
severity: "error",
text: "Hex Digit expected (0-F)",
text: "Hexadecimal digit expected.",
} as const,
BinaryDigitExpected: {
code: 1102,
severity: "error",
text: "Binary Digit expected (0,1)",
text: "Binary digit expected.",
} as const,
UnexpectedEndOfFile: {
Unterminated: {
code: 1103,
severity: "error",
text: "Unexpected end of file while searching for '{0}'",
text: "Unterminated {0}.",
} as const,
InvalidEscapeSequence: {
code: 1104,
severity: "error",
text: "Invalid escape sequence",
text: "Invalid escape sequence.",
} as const,
NoNewLineAtStartOfTripleQuotedString: {
code: 1105,
severity: "error",
text: "String content in triple quotes must begin on a new line",
text: "String content in triple quotes must begin on a new line.",
} as const,
NoNewLineAtEndOfTripleQuotedString: {
code: 1106,
severity: "error",
text: "Closing triple quotes must begin on a new line",
text: "Closing triple quotes must begin on a new line.",
} as const,
InconsistentTripleQuoteIndentation: {
code: 1107,
severity: "error",
text:
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes.",
} as const,
InvalidCharacter: {

Просмотреть файл

@ -280,7 +280,7 @@ export function parse(code: string | SourceFile) {
stmts.push(parseUsingStatement());
break;
case Token.EndOfFile:
error("End of file reached without '}'.");
parseExpected(Token.CloseBrace);
return stmts;
case Token.Semicolon:
reportInvalidDecorators(decorators, "empty statement");
@ -990,9 +990,8 @@ export function parse(code: string | SourceFile) {
if (realPositionOfLastError === realPos) {
return;
}
realPositionOfLastError = realPos;
parseErrorInNextFinishedNode = true;
reportDiagnostic(message, location);
}
@ -1001,6 +1000,9 @@ export function parse(code: string | SourceFile) {
target: DiagnosticTarget,
args?: (string | number)[]
) {
if (typeof message === "string" || message.severity === "error") {
parseErrorInNextFinishedNode = true;
}
const diagnostic = createDiagnostic(message, target, args);
parseDiagnostics.push(diagnostic);
}

Просмотреть файл

@ -9,6 +9,8 @@ import {
isLineBreak,
isNonAsciiIdentifierContinue,
isNonAsciiIdentifierStart,
isNonAsciiLineBreak,
isNonAsciiWhiteSpaceSingleLine,
isWhiteSpaceSingleLine,
} from "./charcode.js";
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
@ -82,6 +84,7 @@ const MaxPunctuation = Token.At;
const MinStatementKeyword = Token.ImportKeyword;
const MaxStatementKeyword = Token.OpKeyword;
/** @internal */
export const TokenDisplay: readonly string[] = [
"<none>",
"<invalid>",
@ -122,6 +125,7 @@ export const TokenDisplay: readonly string[] = [
"'false'",
];
/** @internal */
export const Keywords: ReadonlyMap<string, Token> = new Map([
["import", Token.ImportKeyword],
["model", Token.ModelKeyword],
@ -133,7 +137,13 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
["false", Token.FalseKeyword],
]);
export const maxKeywordLength = 9;
/** @internal */
export const enum KeywordLimit {
MinLength = 2,
MaxLength = 9,
MinStartChar = CharCode.e,
MaxStartChar = CharCode.u,
}
export interface Scanner {
/** The source code being scanned. */
@ -264,32 +274,12 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
}
// fallthrough
case CharCode.LineFeed:
case CharCode.LineSeparator:
case CharCode.ParagraphSeparator:
return next(Token.NewLine);
case CharCode.Space:
case CharCode.Tab:
case CharCode.VerticalTab:
case CharCode.FormFeed:
case CharCode.Space:
case CharCode.NonBreakingSpace:
case CharCode.Ogham:
case CharCode.EnQuad:
case CharCode.EmQuad:
case CharCode.EnSpace:
case CharCode.EmSpace:
case CharCode.ThreePerEmSpace:
case CharCode.FourPerEmSpace:
case CharCode.SixPerEmSpace:
case CharCode.FigureSpace:
case CharCode.PunctuationSpace:
case CharCode.ThinSpace:
case CharCode.HairSpace:
case CharCode.ZeroWidthSpace:
case CharCode.NarrowNoBreakSpace:
case CharCode.MathematicalSpace:
case CharCode.IdeographicSpace:
case CharCode.ByteOrderMark:
return scanWhitespace();
case CharCode.OpenParen:
@ -382,16 +372,45 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
: next(Token.Bar);
case CharCode.DoubleQuote:
return scanString();
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote
? scanTripleQuotedString()
: scanString();
default:
return scanIdentifierOrKeyword();
if (isAsciiIdentifierStart(ch)) {
return scanIdentifierOrKeyword();
}
if (ch <= CharCode.MaxAscii) {
return scanInvalidCharacter();
}
return scanNonAsciiToken();
}
}
return (token = Token.EndOfFile);
}
// Slow path for a character above MaxAscii: classify it as a line break,
// whitespace, identifier start, or invalid character.
function scanNonAsciiToken() {
  const ch = input.charCodeAt(position);
  if (isNonAsciiLineBreak(ch)) {
    return next(Token.NewLine);
  }
  if (isNonAsciiWhiteSpaceSingleLine(ch)) {
    return scanWhitespace();
  }
  // Identifier classification works on full code points so that surrogate
  // pairs are handled as single characters.
  const codePoint = input.codePointAt(position)!;
  if (isNonAsciiIdentifierStart(codePoint)) {
    return scanNonAsciiIdentifierContinue(codePoint);
  }
  return scanInvalidCharacter();
}
function scanInvalidCharacter() {
const codePoint = input.codePointAt(position)!;
token = next(Token.Invalid, utf16CodeUnits(codePoint));
@ -423,152 +442,184 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
// The current character is known to be whitespace; consume it and any run of
// single-line whitespace that follows, stopping at EOF.
function scanWhitespace(): Token {
  do {
    position++;
  } while (!eof() && isWhiteSpaceSingleLine(input.charCodeAt(position)));
  return (token = Token.Whitespace);
}
function scanDigits() {
while (isDigit(input.charCodeAt(position))) {
position++;
}
}
function scanNumber() {
scanDigits();
let ch = input.charCodeAt(position);
if (ch === CharCode.Dot) {
position++;
scanDigits();
}
ch = input.charCodeAt(position);
if (ch === CharCode.e) {
position++;
ch = input.charCodeAt(position);
if (ch === CharCode.Plus || ch == CharCode.Minus) {
position++;
ch = input.charCodeAt(position);
}
if (isDigit(ch)) {
position++;
scanDigits();
} else {
error(Message.DigitExpected);
scanKnownDigits();
if (!eof()) {
switch (input.charCodeAt(position)) {
case CharCode.Dot:
scanFractionAndExponent();
break;
case CharCode.e:
scanExponent();
break;
}
}
return (token = Token.NumericLiteral);
}
function scanHexNumber() {
if (!isHexDigit(lookAhead(2))) {
error(Message.HexDigitExpected);
return next(Token.NumericLiteral, 2);
}
// The caller has verified the current character is a digit; consume it and
// every following digit, stopping at EOF.
function scanKnownDigits() {
  do {
    position++;
  } while (!eof() && isDigit(input.charCodeAt(position)));
}
// Consume a possibly-empty run of digits; no error if none are present.
function scanOptionalDigits() {
  if (!eof() && isDigit(input.charCodeAt(position))) {
    scanKnownDigits();
  }
}
// Consume a run of digits, reporting DigitExpected when none is present —
// including when the input ends here.
function scanRequiredDigits() {
  if (eof() || !isDigit(input.charCodeAt(position))) {
    error(Message.DigitExpected);
    return;
  }
  scanKnownDigits();
}
// Scan the part of a numeric literal after the decimal point, plus an
// optional exponent. Digits after '.' are optional (e.g. `3.` scans cleanly).
function scanFractionAndExponent() {
  position++; // consume '.'
  scanOptionalDigits();
  if (!eof() && input.charCodeAt(position) === CharCode.e) {
    scanExponent();
  }
}
// Scan an exponent suffix: 'e', an optional sign, then required digits.
// An EOF right after 'e' is reported as DigitExpected.
function scanExponent() {
  position++; // consume 'e'
  if (eof()) {
    error(Message.DigitExpected);
    return;
  }
  const ch = input.charCodeAt(position);
  if (ch === CharCode.Plus || ch === CharCode.Minus) {
    position++; // consume optional sign
  }
  scanRequiredDigits();
}
// Scan a hexadecimal literal starting at '0x'. A bare `0x` reports
// HexDigitExpected but still produces a NumericLiteral token.
function scanHexNumber() {
  position += 2; // consume '0x'
  if (eof() || !isHexDigit(input.charCodeAt(position))) {
    error(Message.HexDigitExpected);
    return (token = Token.NumericLiteral);
  }
  // Removed stray leftover lines that advanced position by two extra
  // characters and re-scanned via scanUntil after the loop had already
  // consumed all hex digits.
  do {
    position++;
  } while (!eof() && isHexDigit(input.charCodeAt(position)));
  return (token = Token.NumericLiteral);
}
function scanBinaryNumber() {
if (!isBinaryDigit(lookAhead(2))) {
error(Message.BinaryDigitExpected);
return next(Token.NumericLiteral, 2);
}
position += 2; // consume '0b'
if (eof() || !isBinaryDigit(input.charCodeAt(position))) {
error(Message.BinaryDigitExpected);
return (token = Token.NumericLiteral);
}
do {
position++;
} while (!eof() && isBinaryDigit(input.charCodeAt(position)));
position += 2;
scanUntil((ch) => !isBinaryDigit(ch), "Binary Digit");
return (token = Token.NumericLiteral);
}
function scanUntil(
predicate: (char: number) => boolean,
expectedClose?: string,
consumeClose?: number
) {
let ch: number;
function scanSingleLineComment() {
position += 2; // consume '//'
do {
position++;
if (eof()) {
if (expectedClose) {
error(Message.UnexpectedEndOfFile, [expectedClose]);
}
while (!eof()) {
if (isLineBreak(input.charCodeAt(position))) {
break;
}
ch = input.charCodeAt(position);
} while (!predicate(ch));
if (consumeClose) {
position += consumeClose;
position++;
}
}
function scanSingleLineComment() {
scanUntil(isLineBreak);
return (token = Token.SingleLineComment);
}
// Scan a '/* ... */' comment. Reaching EOF before the closing '*/' reports
// an Unterminated error but still yields a MultiLineComment token.
function scanMultiLineComment() {
  // Removed a leftover scanUntil(...) call that duplicated the loop below.
  position += 2; // consume '/*'
  while (!eof()) {
    if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) {
      position += 2; // consume '*/'
      return (token = Token.MultiLineComment);
    }
    position++;
  }
  error(Message.Unterminated, ["comment"]);
  return (token = Token.MultiLineComment);
}
function scanString() {
let quoteLength = 1;
let closing = '"';
let isEscaping = false;
position++; // consume '"'
const tripleQuoted =
lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
if (tripleQuoted) {
tokenFlags |= TokenFlags.TripleQuoted;
quoteLength = 3;
position += 2;
closing = '"""';
loop: while (!eof()) {
const ch = input.charCodeAt(position);
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
position++;
}
break;
case CharCode.Backslash:
tokenFlags |= TokenFlags.Escaped;
position++;
if (eof()) {
break loop;
}
break;
case CharCode.DoubleQuote:
position++;
return (token = Token.StringLiteral);
}
position++;
}
scanUntil(
(ch) => {
if (isEscaping) {
isEscaping = false;
return false;
}
error(Message.Unterminated, ["string literal"]);
return (token = Token.StringLiteral);
}
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
}
return false;
function scanTripleQuotedString() {
tokenFlags |= TokenFlags.TripleQuoted;
position += 3; // consume '"""'
case CharCode.Backslash:
isEscaping = true;
tokenFlags |= TokenFlags.Escaped;
return false;
case CharCode.DoubleQuote:
if (tripleQuoted) {
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
}
return true;
default:
return false;
}
},
closing,
quoteLength
);
loop: while (!eof()) {
const ch = input.charCodeAt(position);
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
position++;
}
break;
case CharCode.Backslash:
tokenFlags |= TokenFlags.Escaped;
position++;
if (eof()) {
break loop;
}
break;
case CharCode.DoubleQuote:
if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) {
position += 3;
return (token = Token.StringLiteral);
}
break;
}
position++;
}
error(Message.Unterminated, ["string literal"]);
return (token = Token.StringLiteral);
}
@ -576,11 +627,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
if (tokenValue !== undefined) {
return tokenValue;
}
return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText());
}
if (token !== Token.StringLiteral) {
return (tokenValue = getTokenText());
}
function getStringTokenValue() {
// strip quotes
const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1;
let value = input.substring(tokenPosition + quoteLength, position - quoteLength);
@ -729,30 +779,28 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
}
function scanIdentifierOrKeyword() {
let ch = input.charCodeAt(position);
if (!isAsciiIdentifierStart(ch)) {
return scanNonAsciiIdentifier();
}
const startChar = input.charCodeAt(position);
let ch = startChar;
do {
position++;
if (eof()) {
break;
}
ch = input.charCodeAt(position);
} while (isAsciiIdentifierContinue(ch));
} while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
if (!eof() && ch > CharCode.MaxAscii) {
if (ch > CharCode.MaxAscii) {
const codePoint = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(codePoint)) {
return scanNonAsciiIdentifierContinue(codePoint);
}
}
if (position - tokenPosition <= maxKeywordLength) {
const value = getTokenValue();
const keyword = Keywords.get(value);
const length = position - tokenPosition;
if (
length >= KeywordLimit.MinLength &&
length <= KeywordLimit.MaxLength &&
startChar >= KeywordLimit.MinStartChar &&
startChar <= KeywordLimit.MaxStartChar
) {
tokenValue = getTokenText();
const keyword = Keywords.get(tokenValue);
if (keyword) {
return (token = keyword);
}
@ -761,23 +809,11 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = Token.Identifier);
}
// The current character is non-ASCII: read the full code point and decide
// whether it can begin an identifier; otherwise emit an invalid character.
function scanNonAsciiIdentifier() {
  // `const` — the code point is never reassigned.
  const codePoint = input.codePointAt(position)!;
  return isNonAsciiIdentifierStart(codePoint)
    ? scanNonAsciiIdentifierContinue(codePoint)
    : scanInvalidCharacter();
}
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
let codePoint = startCodePoint;
do {
position += utf16CodeUnits(codePoint);
if (eof()) {
break;
}
codePoint = input.codePointAt(position)!;
} while (isIdentifierContinue(codePoint));
} while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!)));
return (token = Token.Identifier);
}

Просмотреть файл

@ -219,8 +219,38 @@ describe("syntax", () => {
]);
});
describe("unterminated tokens", () => {
parseErrorEach([
['model X = "banana', [/Unterminated string literal/]],
['model X = "banana\\', [/Unterminated string literal/]],
['model X = """\nbanana', [/Unterminated string literal/]],
['model X = """\nbanana\\', [/Unterminated string literal/]],
["/* Yada yada yada", [/Unterminated comment/]],
["123.0e", [/Digit expected/]],
["123.e", [/Digit expected/]],
["123e", [/Digit expected/]],
["0b", [/Binary digit expected/]],
["0x", [/Hexadecimal digit expected/]],
]);
});
describe("terminated tokens at EOF with missing semicolon", () => {
parseErrorEach([
["model X = 0x10101", [/';' expected/]],
["model X = 0xBEEF", [/';' expected/]],
["model X = 123", [/';' expected/]],
["model X = 123.", [/';' expected/]],
["model X = 123e45", [/';' expected/]],
["model X = 123.45", [/';' expected/]],
["model X = 123.45e2", [/';' expected/]],
["model X = Banana", [/';' expected/]],
['model X = "Banana"', [/';' expected/]],
['model X = """\nBanana\n"""', [/';' expected/]],
]);
});
describe("non-ascii identifiers", () => {
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲42Banana {}"]);
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
});
});

Просмотреть файл

@ -7,14 +7,22 @@ import {
isKeyword,
isPunctuation,
isStatementKeyword,
KeywordLimit,
Keywords,
maxKeywordLength,
Token,
TokenDisplay,
} from "../compiler/scanner.js";
import { LineAndCharacter } from "../compiler/types.js";
type TokenEntry = [Token, string?, number?, LineAndCharacter?];
type TokenEntry = [
Token,
string?,
{
pos?: number;
line?: number;
character?: number;
value?: string;
}?
];
function tokens(text: string, onError = throwOnError): TokenEntry[] {
const scanner = createScanner(text, onError);
@ -25,8 +33,11 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
result.push([
scanner.token,
scanner.getTokenText(),
scanner.tokenPosition,
scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
{
pos: scanner.tokenPosition,
value: scanner.getTokenValue(),
...scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
},
]);
} while (!scanner.eof());
@ -38,26 +49,43 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
}
function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
for (const [
index,
[expectedToken, expectedText, expectedPosition, expectedLineAndCharacter],
] of expecting.entries()) {
const [token, text, position, lineAndCharacter] = tokens[index];
for (const [index, [expectedToken, expectedText, expectedAdditional]] of expecting.entries()) {
const [token, text, additional] = tokens[index];
assert.strictEqual(Token[token], Token[expectedToken], `Token ${index} must match`);
if (expectedText) {
assert.strictEqual(text, expectedText, `Token ${index} test must match`);
}
if (expectedPosition) {
assert.strictEqual(position, expectedPosition, `Token ${index} position must match`);
if (expectedAdditional?.pos) {
assert.strictEqual(
additional!.pos,
expectedAdditional.pos,
`Token ${index} position must match`
);
}
if (expectedLineAndCharacter) {
assert.deepStrictEqual(
lineAndCharacter,
expectedLineAndCharacter,
`Token ${index} line and character must match`
if (expectedAdditional?.line) {
assert.strictEqual(
additional!.line,
expectedAdditional.line,
`Token ${index} line must match`
);
}
if (expectedAdditional?.character) {
assert.strictEqual(
additional!.character,
expectedAdditional?.character,
`Token ${index} character must match`
);
}
if (expectedAdditional?.value) {
assert.strictEqual(
additional!.value,
expectedAdditional.value,
`Token ${index} value must match`
);
}
}
@ -66,16 +94,16 @@ function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
describe("scanner", () => {
/** verifies that we can scan tokens and get back some output. */
it("smoketest", () => {
const all = tokens("\tthis is a test");
const all = tokens('\tthis is "a" test');
verify(all, [
[Token.Whitespace],
[Token.Identifier, "this"],
[Token.Identifier, "this", { value: "this" }],
[Token.Whitespace],
[Token.Identifier, "is"],
[Token.Identifier, "is", { value: "is" }],
[Token.Whitespace],
[Token.Identifier, "a"],
[Token.StringLiteral, '"a"', { value: "a" }],
[Token.Whitespace],
[Token.Identifier, "test"],
[Token.Identifier, "test", { value: "test" }],
]);
});
@ -130,7 +158,7 @@ describe("scanner", () => {
});
it("scans numeric literals", () => {
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000");
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000 3. 2.e3");
verify(all, [
[Token.NumericLiteral, "42"],
[Token.Whitespace],
@ -143,6 +171,11 @@ describe("scanner", () => {
[Token.NumericLiteral, "314.0e-2"],
[Token.Whitespace],
[Token.NumericLiteral, "1e+1000"],
[Token.Whitespace],
// https://github.com/Azure/adl/issues/488 - we may want to disallow these
[Token.NumericLiteral, "3."],
[Token.Whitespace],
[Token.NumericLiteral, "2.e3"],
]);
});
@ -184,34 +217,34 @@ describe("scanner", () => {
it("provides token position", () => {
const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x");
verify(all, [
[Token.Identifier, "a", 0, { line: 0, character: 0 }],
[Token.Whitespace, " ", 1, { line: 0, character: 1 }],
[Token.Identifier, "x", 2, { line: 0, character: 2 }],
[Token.NewLine, "\r", 3, { line: 0, character: 3 }],
[Token.Identifier, "a", { pos: 0, line: 0, character: 0 }],
[Token.Whitespace, " ", { pos: 1, line: 0, character: 1 }],
[Token.Identifier, "x", { pos: 2, line: 0, character: 2 }],
[Token.NewLine, "\r", { pos: 3, line: 0, character: 3 }],
[Token.Identifier, "aa", 4, { line: 1, character: 0 }],
[Token.Whitespace, " ", 6, { line: 1, character: 2 }],
[Token.Identifier, "x", 7, { line: 1, character: 3 }],
[Token.NewLine, "\r\n", 8, { line: 1, character: 4 }],
[Token.Identifier, "aa", { pos: 4, line: 1, character: 0 }],
[Token.Whitespace, " ", { pos: 6, line: 1, character: 2 }],
[Token.Identifier, "x", { pos: 7, line: 1, character: 3 }],
[Token.NewLine, "\r\n", { pos: 8, line: 1, character: 4 }],
[Token.Identifier, "aaa", 10, { line: 2, character: 0 }],
[Token.Whitespace, " ", 13, { line: 2, character: 3 }],
[Token.Identifier, "x", 14, { line: 2, character: 4 }],
[Token.NewLine, "\n", 15, { line: 2, character: 5 }],
[Token.Identifier, "aaa", { pos: 10, line: 2, character: 0 }],
[Token.Whitespace, " ", { pos: 13, line: 2, character: 3 }],
[Token.Identifier, "x", { pos: 14, line: 2, character: 4 }],
[Token.NewLine, "\n", { pos: 15, line: 2, character: 5 }],
[Token.Identifier, "aaaa", 16, { line: 3, character: 0 }],
[Token.Whitespace, " ", 20, { line: 3, character: 4 }],
[Token.Identifier, "x", 21, { line: 3, character: 5 }],
[Token.NewLine, "\u{2028}", 22, { line: 3, character: 6 }],
[Token.Identifier, "aaaa", { pos: 16, line: 3, character: 0 }],
[Token.Whitespace, " ", { pos: 20, line: 3, character: 4 }],
[Token.Identifier, "x", { pos: 21, line: 3, character: 5 }],
[Token.NewLine, "\u{2028}", { pos: 22, line: 3, character: 6 }],
[Token.Identifier, "aaaaa", 23, { line: 4, character: 0 }],
[Token.Whitespace, " ", 28, { line: 4, character: 5 }],
[Token.Identifier, "x", 29, { line: 4, character: 6 }],
[Token.NewLine, "\u{2029}", 30, { line: 4, character: 7 }],
[Token.Identifier, "aaaaa", { pos: 23, line: 4, character: 0 }],
[Token.Whitespace, " ", { pos: 28, line: 4, character: 5 }],
[Token.Identifier, "x", { pos: 29, line: 4, character: 6 }],
[Token.NewLine, "\u{2029}", { pos: 30, line: 4, character: 7 }],
[Token.Identifier, "aaaaaa", 31, { line: 5, character: 0 }],
[Token.Whitespace, " ", 37, { line: 5, character: 6 }],
[Token.Identifier, "x", 38, { line: 5, character: 7 }],
[Token.Identifier, "aaaaaa", { pos: 31, line: 5, character: 0 }],
[Token.Whitespace, " ", { pos: 37, line: 5, character: 6 }],
[Token.Identifier, "x", { pos: 38, line: 5, character: 7 }],
]);
});
@ -225,11 +258,19 @@ describe("scanner", () => {
`Token enum has ${tokenCount} elements but TokenDisplay array has ${tokenDisplayCount}.`
);
// check that keywords have appropriate display
// check that keywords have appropriate display and limits
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
let maxKeywordLengthFound = -1;
let minKeywordLengthFound = Number.MAX_SAFE_INTEGER;
let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER;
let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER;
let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER;
for (const [name, token] of Keywords.entries()) {
minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length);
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0));
maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0));
assert.strictEqual(TokenDisplay[token], `'${name}'`);
assert(isKeyword(token), `${name} should be classified as a keyword`);
if (!nonStatementKeywords.includes(token)) {
@ -237,7 +278,10 @@ describe("scanner", () => {
}
}
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
assert.strictEqual(minKeywordLengthFound, KeywordLimit.MinLength);
assert.strictEqual(maxKeywordLengthFound, KeywordLimit.MaxLength);
assert.strictEqual(minKeywordStartCharFound, KeywordLimit.MinStartChar);
assert.strictEqual(maxKeywordStartCharFound, KeywordLimit.MaxStartChar);
// check single character punctuation
for (let i = 33; i <= 126; i++) {