Fix bugs with non-ascii identifiers (#474)

Nick Guerrera 2021-04-22 13:13:32 -07:00 committed by GitHub
Parent ea281c601c
Commit 78661fdbe8
6 changed files with 113 additions and 40 deletions

File diffs are hidden because one or more lines are too long

View file

@@ -54,10 +54,10 @@ export const Message = {
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
} as const,
InvalidToken: {
InvalidCharacter: {
code: 1108,
severity: "error",
text: "Invalid token: '{0}'",
text: "Invalid character.",
} as const,
};
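The renamed diagnostic drops the "{0}" placeholder because the offending text is no longer echoed back. A minimal sketch of how an entry of this shape could be turned into a readable diagnostic; formatMessage and its placeholder handling are illustrative assumptions, not the compiler's actual diagnostics API:

// Hedged sketch (not the repository's diagnostics.js): format a message entry,
// substituting "{0}"-style placeholders only when arguments are supplied. The
// new InvalidCharacter message takes no arguments, unlike the old InvalidToken.
interface MessageEntry {
  readonly code: number;
  readonly severity: "error" | "warning";
  readonly text: string;
}

function formatMessage(msg: MessageEntry, args: string[] = []): string {
  const text = msg.text.replace(/{(\d+)}/g, (_, i) => args[Number(i)] ?? "");
  return `${msg.severity} ${msg.code}: ${text}`;
}

// formatMessage({ code: 1108, severity: "error", text: "Invalid character." })
// => "error 1108: Invalid character."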

View file

@@ -1012,7 +1012,7 @@ export function parse(code: string | SourceFile) {
return false;
}
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | undefined {
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | Token.None {
for (const tok of options) {
if (token() === tok) {
nextToken();
@@ -1020,7 +1020,7 @@ export function parse(code: string | SourceFile) {
}
}
errorTokenIsNotOneOf(options);
return undefined;
return Token.None;
}
function errorTokenIsNotOneOf(options: Token[]) {
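Returning Token.None (which is 0) instead of undefined keeps the return type a plain Token, so callers can switch on the result without first narrowing away undefined. A rough illustration of the calling pattern this enables; the caller and the Token.Colon / Token.Equals names are hypothetical, not code from parser.ts:

// Hypothetical caller: with Token.None as the sentinel, the result can be fed
// straight into a switch; no `if (tok === undefined)` narrowing is needed.
const tok = parseExpectedOneOf(Token.Colon, Token.Equals);
switch (tok) {
  case Token.Colon:
    // parse a type annotation...
    break;
  case Token.Equals:
    // parse a default value...
    break;
  case Token.None:
    break; // error already reported by errorTokenIsNotOneOf
}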

View file

@@ -1,11 +1,14 @@
import {
CharacterCodes,
isAsciiIdentifierContinue,
isAsciiIdentifierStart,
isBinaryDigit,
isDigit,
isHexDigit,
isIdentifierPart,
isIdentifierStart,
isIdentifierContinue,
isLineBreak,
isNonAsciiIdentifierContinue,
isNonAsciiIdentifierStart,
isWhiteSpaceSingleLine,
} from "./character-codes.js";
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
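The identifier predicates are now split into ASCII fast-path checks and non-ASCII fallbacks that work on whole code points. A rough sketch of how such predicates might look, assuming Unicode property escapes; the real character-codes.js may use different character sets or lookup tables:

// Hedged sketch only; not the repository's character-codes.js.
function isAsciiIdentifierStartSketch(ch: number): boolean {
  // Exact ASCII set is an assumption: letters and underscore.
  return (
    (ch >= 0x41 && ch <= 0x5a) || // A-Z
    (ch >= 0x61 && ch <= 0x7a) || // a-z
    ch === 0x5f                   // _
  );
}

function isNonAsciiIdentifierStartSketch(codePoint: number): boolean {
  // Assumes Unicode ID_Start semantics via property escapes.
  return /\p{ID_Start}/u.test(String.fromCodePoint(codePoint));
}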
@@ -17,7 +20,7 @@ const mergeConflictMarkerLength = 7;
export enum Token {
None = 0,
Unknown = 1,
Invalid = 1,
EndOfFile = 2,
// Trivia
@@ -81,7 +84,7 @@ const MaxStatementKeyword = Token.OpKeyword;
export const TokenDisplay: readonly string[] = [
"<none>",
"<unknown>",
"<invalid>",
"<end of file>",
"<single-line comment>",
"<multi-line comment>",
@@ -130,6 +133,8 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
["false", Token.FalseKeyword],
]);
export const maxKeywordLength = 9;
export interface Scanner {
/** The source code being scanned. */
readonly file: SourceFile;
@@ -202,7 +207,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source;
const input = file.text;
let position = 0;
let token = Token.Unknown;
let token = Token.Invalid;
let tokenPosition = -1;
let tokenValue: string | undefined = undefined;
let tokenFlags = TokenFlags.None;
@@ -233,6 +238,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = t);
}
function utf16CodeUnits(codePoint: number) {
return codePoint >= 0x10000 ? 2 : 1;
}
function getTokenText() {
return input.substring(tokenPosition, position);
}
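utf16CodeUnits returns how many UTF-16 code units a code point occupies, which is exactly how far position must advance: code points at or above U+10000 are stored as surrogate pairs. A worked example (illustrative, not repository code):

// Supplementary-plane characters occupy two UTF-16 code units, so the scanner
// must advance position by two to stay on a code point boundary.
"𐌰".codePointAt(0)!.toString(16); // "10330" — a Gothic letter above 0xFFFF
"𐌰".length;                        // 2 — stored as a surrogate pair
utf16CodeUnits(0x10330);           // 2
utf16CodeUnits(0xe9);              // 1 — "é" fits in a single code unit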
@@ -331,7 +340,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
case CharacterCodes.asterisk:
return scanMultiLineComment();
}
return invalidToken();
return scanInvalidCharacter();
case CharacterCodes._0:
switch (lookAhead(1)) {
@@ -376,16 +385,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return scanString();
default:
return isIdentifierStart(ch) ? scanIdentifier() : invalidToken();
return scanIdentifierOrKeyword();
}
}
return (token = Token.EndOfFile);
}
function invalidToken() {
token = next(Token.Unknown);
error(Message.InvalidToken, [getTokenText()]);
function scanInvalidCharacter() {
const codePoint = input.codePointAt(position)!;
token = next(Token.Invalid, utf16CodeUnits(codePoint));
error(Message.InvalidCharacter);
return token;
}
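This is one of the bugs being fixed: the old invalidToken() always advanced by a single UTF-16 code unit, so an unsupported character outside the Basic Multilingual Plane (such as the 😢 in the new parser test) was split across its surrogate pair, and the raw input text was echoed into the message. The new version consumes the whole code point and reports the fixed "Invalid character." diagnostic. A rough illustration of the difference (standalone sketch, not scanner code):

// 😢 is a single code point (0x1f622) encoded as two UTF-16 code units.
const input = "😢x";
input.length;          // 3: two surrogate halves plus "x"
input.codePointAt(0)!; // 0x1f622
// Old behavior: position += 1 lands in the middle of the surrogate pair.
// New behavior: position += utf16CodeUnits(0x1f622) === 2, landing on "x".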
@@ -728,8 +738,57 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return result;
}
function scanIdentifier() {
scanUntil((ch) => !isIdentifierPart(ch));
return (token = Keywords.get(getTokenValue()) ?? Token.Identifier);
function scanIdentifierOrKeyword() {
let ch = input.charCodeAt(position);
if (!isAsciiIdentifierStart(ch)) {
return scanNonAsciiIdentifier();
}
do {
position++;
if (eof()) {
break;
}
ch = input.charCodeAt(position);
} while (isAsciiIdentifierContinue(ch));
if (!eof() && ch > CharacterCodes.maxAsciiCharacter) {
const codePoint = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(codePoint)) {
return scanNonAsciiIdentifierContinue(codePoint);
}
}
if (position - tokenPosition <= maxKeywordLength) {
const value = getTokenValue();
const keyword = Keywords.get(value);
if (keyword) {
return (token = keyword);
}
}
return (token = Token.Identifier);
}
function scanNonAsciiIdentifier() {
let codePoint = input.codePointAt(position)!;
return isNonAsciiIdentifierStart(codePoint)
? scanNonAsciiIdentifierContinue(codePoint)
: scanInvalidCharacter();
}
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
let codePoint = startCodePoint;
do {
position += utf16CodeUnits(codePoint);
if (eof()) {
break;
}
codePoint = input.codePointAt(position)!;
} while (isIdentifierContinue(codePoint));
return (token = Token.Identifier);
}
}
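scanIdentifierOrKeyword takes an ASCII fast path (charCodeAt plus isAsciiIdentifierContinue), falls over to the code-point-aware loop only when it meets a character above the ASCII range, and bounds the keyword lookup by maxKeywordLength so long identifiers never hit the map. A self-contained sketch of that shape, assuming regex-based character classes rather than the project's real predicates:

// Standalone sketch of the ASCII-fast-path / non-ASCII-fallback pattern; the
// identifier character classes here are assumptions, not ADL's actual rules.
function scanIdentifierSketch(text: string, start: number): number {
  let pos = start;
  // Fast path: one UTF-16 code unit at a time while the input stays ASCII.
  while (pos < text.length) {
    const ch = text.charCodeAt(pos);
    if (ch > 0x7f) break;                         // possible non-ASCII identifier part
    if (!/[A-Za-z0-9_]/.test(text[pos])) return pos;
    pos++;
  }
  // Slow path: advance by whole code points using Unicode identifier classes.
  while (pos < text.length) {
    const cp = text.codePointAt(pos)!;
    if (!/\p{ID_Continue}/u.test(String.fromCodePoint(cp))) break;
    pos += cp >= 0x10000 ? 2 : 1;                 // surrogate pairs take two units
  }
  return pos; // index just past the identifier
}

// e.g. scanIdentifierSketch("Banana𐌰𐌲Banana ", 0) scans past the Gothic letters
// and stops at the trailing space.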

View file

@@ -216,6 +216,11 @@ describe("syntax", () => {
],
]);
});
describe("non-ascii identifiers", () => {
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
});
});
function parseEach(cases: string[]) {

View file

@@ -8,6 +8,7 @@ import {
isPunctuation,
isStatementKeyword,
Keywords,
maxKeywordLength,
Token,
TokenDisplay,
} from "../compiler/scanner.js";
@@ -226,7 +227,9 @@ describe("scanner", () => {
// check that keywords have appropriate display
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
let maxKeywordLengthFound = -1;
for (const [name, token] of Keywords.entries()) {
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
assert.strictEqual(TokenDisplay[token], `'${name}'`);
assert(isKeyword(token), `${name} should be classified as a keyword`);
if (!nonStatementKeywords.includes(token)) {
@@ -234,6 +237,8 @@ describe("scanner", () => {
}
}
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
// check single character punctuation
for (let i = 33; i <= 126; i++) {
const str = String.fromCharCode(i);
@@ -241,7 +246,7 @@ describe("scanner", () => {
if (
token !== Token.StringLiteral &&
token !== Token.Identifier &&
token !== Token.Unknown &&
token !== Token.Invalid &&
token !== Token.NumericLiteral
) {
assert.strictEqual(TokenDisplay[token], `'${str}'`);
@@ -252,7 +257,7 @@ describe("scanner", () => {
// check the rest
assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'");
assert.strictEqual(TokenDisplay[Token.None], "<none>");
assert.strictEqual(TokenDisplay[Token.Unknown], "<unknown>");
assert.strictEqual(TokenDisplay[Token.Invalid], "<invalid>");
assert.strictEqual(TokenDisplay[Token.EndOfFile], "<end of file>");
assert.strictEqual(TokenDisplay[Token.SingleLineComment], "<single-line comment>");
assert.strictEqual(TokenDisplay[Token.MultiLineComment], "<multi-line comment>");