Fix bugs with non-ascii identifiers (#474)
This commit is contained in:
Родитель
ea281c601c
Коммит
78661fdbe8
Различия файлов скрыты, потому что одна или несколько строк слишком длинны
|
@ -54,10 +54,10 @@ export const Message = {
|
|||
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
|
||||
} as const,
|
||||
|
||||
InvalidToken: {
|
||||
InvalidCharacter: {
|
||||
code: 1108,
|
||||
severity: "error",
|
||||
text: "Invalid token: '{0}'",
|
||||
text: "Invalid character.",
|
||||
} as const,
|
||||
};
|
||||
|
||||
|
|
|
@ -1012,7 +1012,7 @@ export function parse(code: string | SourceFile) {
|
|||
return false;
|
||||
}
|
||||
|
||||
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | undefined {
|
||||
function parseExpectedOneOf<T extends Token[]>(...options: T): T[number] | Token.None {
|
||||
for (const tok of options) {
|
||||
if (token() === tok) {
|
||||
nextToken();
|
||||
|
@ -1020,7 +1020,7 @@ export function parse(code: string | SourceFile) {
|
|||
}
|
||||
}
|
||||
errorTokenIsNotOneOf(options);
|
||||
return undefined;
|
||||
return Token.None;
|
||||
}
|
||||
|
||||
function errorTokenIsNotOneOf(options: Token[]) {
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
import {
|
||||
CharacterCodes,
|
||||
isAsciiIdentifierContinue,
|
||||
isAsciiIdentifierStart,
|
||||
isBinaryDigit,
|
||||
isDigit,
|
||||
isHexDigit,
|
||||
isIdentifierPart,
|
||||
isIdentifierStart,
|
||||
isIdentifierContinue,
|
||||
isLineBreak,
|
||||
isNonAsciiIdentifierContinue,
|
||||
isNonAsciiIdentifierStart,
|
||||
isWhiteSpaceSingleLine,
|
||||
} from "./character-codes.js";
|
||||
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
|
||||
|
@ -17,7 +20,7 @@ const mergeConflictMarkerLength = 7;
|
|||
|
||||
export enum Token {
|
||||
None = 0,
|
||||
Unknown = 1,
|
||||
Invalid = 1,
|
||||
EndOfFile = 2,
|
||||
|
||||
// Trivia
|
||||
|
@ -81,7 +84,7 @@ const MaxStatementKeyword = Token.OpKeyword;
|
|||
|
||||
export const TokenDisplay: readonly string[] = [
|
||||
"<none>",
|
||||
"<unknown>",
|
||||
"<invalid>",
|
||||
"<end of file>",
|
||||
"<single-line comment>",
|
||||
"<multi-line comment>",
|
||||
|
@ -130,6 +133,8 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
|
|||
["false", Token.FalseKeyword],
|
||||
]);
|
||||
|
||||
export const maxKeywordLength = 9;
|
||||
|
||||
export interface Scanner {
|
||||
/** The source code being scanned. */
|
||||
readonly file: SourceFile;
|
||||
|
@ -202,7 +207,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source;
|
||||
const input = file.text;
|
||||
let position = 0;
|
||||
let token = Token.Unknown;
|
||||
let token = Token.Invalid;
|
||||
let tokenPosition = -1;
|
||||
let tokenValue: string | undefined = undefined;
|
||||
let tokenFlags = TokenFlags.None;
|
||||
|
@ -233,6 +238,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
return (token = t);
|
||||
}
|
||||
|
||||
/**
 * Returns how many UTF-16 code units are needed to encode `codePoint`.
 *
 * @param codePoint A Unicode code point (e.g. as returned by `String.prototype.codePointAt`).
 * @returns 2 when the code point is 0x10000 or above, otherwise 1.
 */
function utf16CodeUnits(codePoint: number) {
|
||||
// Code points from 0x10000 upward are encoded in UTF-16 as a surrogate pair.
return codePoint >= 0x10000 ? 2 : 1;
|
||||
}
|
||||
|
||||
/**
 * Returns the slice of the scanned input from `tokenPosition` up to, but not
 * including, `position`.
 *
 * NOTE(review): `tokenPosition` is presumably the start offset of the current
 * token and `position` the offset just past it -- confirm against the code that
 * updates them (not visible in this hunk).
 */
function getTokenText() {
|
||||
return input.substring(tokenPosition, position);
|
||||
}
|
||||
|
@ -331,7 +340,7 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
case CharacterCodes.asterisk:
|
||||
return scanMultiLineComment();
|
||||
}
|
||||
return invalidToken();
|
||||
return scanInvalidCharacter();
|
||||
|
||||
case CharacterCodes._0:
|
||||
switch (lookAhead(1)) {
|
||||
|
@ -376,16 +385,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
return scanString();
|
||||
|
||||
default:
|
||||
return isIdentifierStart(ch) ? scanIdentifier() : invalidToken();
|
||||
return scanIdentifierOrKeyword();
|
||||
}
|
||||
}
|
||||
|
||||
return (token = Token.EndOfFile);
|
||||
}
|
||||
|
||||
function invalidToken() {
|
||||
token = next(Token.Unknown);
|
||||
error(Message.InvalidToken, [getTokenText()]);
|
||||
/**
 * Produces a `Token.Invalid` token for the character at `position` and reports
 * `Message.InvalidCharacter`.
 *
 * @returns The value assigned to `token` by the `next(...)` call.
 */
function scanInvalidCharacter() {
|
||||
// Read a full code point (not a single UTF-16 code unit) so that the length
// passed to `next` is 2 for a character encoded as a surrogate pair.
// NOTE(review): the `!` assumes `position` is inside the input -- confirm all callers guarantee that.
const codePoint = input.codePointAt(position)!;
|
||||
// NOTE(review): `next` is only partly visible here; presumably its second
// argument is the number of code units to advance by -- confirm.
token = next(Token.Invalid, utf16CodeUnits(codePoint));
|
||||
error(Message.InvalidCharacter);
|
||||
return token;
|
||||
}
|
||||
|
||||
|
@ -728,8 +738,57 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
return result;
|
||||
}
|
||||
|
||||
function scanIdentifier() {
|
||||
scanUntil((ch) => !isIdentifierPart(ch));
|
||||
return (token = Keywords.get(getTokenValue()) ?? Token.Identifier);
|
||||
/**
 * Scans the token starting at `position` as an identifier or keyword.
 *
 * ASCII identifier characters are consumed one UTF-16 code unit at a time.
 * As soon as a non-ASCII character is involved, scanning is handed off to
 * `scanNonAsciiIdentifier` / `scanNonAsciiIdentifierContinue`, which work on
 * whole code points.
 *
 * @returns The keyword token when the scanned ASCII text is found in `Keywords`,
 * otherwise `Token.Identifier`, or whatever the non-ASCII helpers return.
 */
function scanIdentifierOrKeyword() {
|
||||
let ch = input.charCodeAt(position);
|
||||

||||
// Anything that is not an ASCII identifier start is delegated:
// scanNonAsciiIdentifier decides between a non-ASCII identifier and an invalid character.
if (!isAsciiIdentifierStart(ch)) {
|
||||
return scanNonAsciiIdentifier();
|
||||
}
|
||||

||||
// Consume the run of ASCII identifier characters. On end of input the loop
// exits via `break` with `ch` still holding the last character read, which is
// why the check below tests eof() again before looking at `ch`.
do {
|
||||
position++;
|
||||
if (eof()) {
|
||||
break;
|
||||
}
|
||||
ch = input.charCodeAt(position);
|
||||
} while (isAsciiIdentifierContinue(ch));
|
||||

||||
// The ASCII run stopped on a non-ASCII code unit: re-read it as a full code
// point and, if it can continue an identifier, switch to the code-point scanner.
if (!eof() && ch > CharacterCodes.maxAsciiCharacter) {
|
||||
const codePoint = input.codePointAt(position)!;
|
||||
if (isNonAsciiIdentifierContinue(codePoint)) {
|
||||
return scanNonAsciiIdentifierContinue(codePoint);
|
||||
}
|
||||
}
|
||||

||||
// Only all-ASCII tokens reach this point. Skip the keyword lookup entirely
// when the token is longer than the longest keyword.
if (position - tokenPosition <= maxKeywordLength) {
|
||||
// NOTE(review): getTokenValue is not visible in this hunk; presumably it
// returns the text of the token just scanned -- confirm.
const value = getTokenValue();
|
||||
const keyword = Keywords.get(value);
|
||||
// Truthiness check: a missing map entry (undefined) falls through to Identifier.
if (keyword) {
|
||||
return (token = keyword);
|
||||
}
|
||||
}
|
||||

||||
return (token = Token.Identifier);
|
||||
}
|
||||
|
||||
/**
 * Handles a token whose first character is not an ASCII identifier start.
 *
 * @returns The result of `scanNonAsciiIdentifierContinue` when the code point at
 * `position` satisfies `isNonAsciiIdentifierStart`; otherwise the result of
 * `scanInvalidCharacter`.
 */
function scanNonAsciiIdentifier() {
|
||||
// NOTE(review): `codePoint` is never reassigned in this function, so `const` would suffice.
let codePoint = input.codePointAt(position)!;
|
||||
return isNonAsciiIdentifierStart(codePoint)
|
||||
? scanNonAsciiIdentifierContinue(codePoint)
|
||||
: scanInvalidCharacter();
|
||||
}
|
||||
|
||||
/**
 * Consumes `startCodePoint` and then every following code point for which
 * `isIdentifierContinue` holds, advancing `position` by whole code points.
 *
 * `startCodePoint` itself is not re-validated here; both visible call sites
 * check it before calling.
 *
 * @param startCodePoint The code point located at the current `position`.
 * @returns Always `Token.Identifier` (no keyword lookup is performed on this path).
 */
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
|
||||
let codePoint = startCodePoint;
|
||||

||||
do {
|
||||
// Step over the current code point: 2 code units for a surrogate pair, else 1.
position += utf16CodeUnits(codePoint);
|
||||
if (eof()) {
|
||||
break;
|
||||
}
|
||||
codePoint = input.codePointAt(position)!;
|
||||
} while (isIdentifierContinue(codePoint));
|
||||

||||
return (token = Token.Identifier);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -216,6 +216,11 @@ describe("syntax", () => {
|
|||
],
|
||||
]);
|
||||
});
|
||||
|
||||
// Parser-level cases for identifiers containing non-ASCII characters.
// NOTE(review): parseEach / parseErrorEach are defined outside this hunk; presumably
// the former expects each input to parse cleanly and the latter expects diagnostics
// matching the given patterns -- confirm.
describe("non-ascii identifiers", () => {
|
||||
// Inputs: a non-ASCII letter inside an ASCII name, a name made only of characters
// that need two UTF-16 code units each, and such characters between ASCII runs.
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
|
||||
// An emoji where an identifier is expected should produce the "Invalid character" diagnostic.
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
|
||||
});
|
||||
});
|
||||
|
||||
function parseEach(cases: string[]) {
|
||||
|
|
|
@ -8,6 +8,7 @@ import {
|
|||
isPunctuation,
|
||||
isStatementKeyword,
|
||||
Keywords,
|
||||
maxKeywordLength,
|
||||
Token,
|
||||
TokenDisplay,
|
||||
} from "../compiler/scanner.js";
|
||||
|
@ -226,7 +227,9 @@ describe("scanner", () => {
|
|||
|
||||
// check that keywords have appropriate display
|
||||
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
|
||||
let maxKeywordLengthFound = -1;
|
||||
for (const [name, token] of Keywords.entries()) {
|
||||
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
|
||||
assert.strictEqual(TokenDisplay[token], `'${name}'`);
|
||||
assert(isKeyword(token), `${name} should be classified as a keyword`);
|
||||
if (!nonStatementKeywords.includes(token)) {
|
||||
|
@ -234,6 +237,8 @@ describe("scanner", () => {
|
|||
}
|
||||
}
|
||||
|
||||
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
|
||||
|
||||
// check single character punctuation
|
||||
for (let i = 33; i <= 126; i++) {
|
||||
const str = String.fromCharCode(i);
|
||||
|
@ -241,7 +246,7 @@ describe("scanner", () => {
|
|||
if (
|
||||
token !== Token.StringLiteral &&
|
||||
token !== Token.Identifier &&
|
||||
token !== Token.Unknown &&
|
||||
token !== Token.Invalid &&
|
||||
token !== Token.NumericLiteral
|
||||
) {
|
||||
assert.strictEqual(TokenDisplay[token], `'${str}'`);
|
||||
|
@ -252,7 +257,7 @@ describe("scanner", () => {
|
|||
// check the rest
|
||||
assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'");
|
||||
assert.strictEqual(TokenDisplay[Token.None], "<none>");
|
||||
assert.strictEqual(TokenDisplay[Token.Unknown], "<unknown>");
|
||||
assert.strictEqual(TokenDisplay[Token.Invalid], "<invalid>");
|
||||
assert.strictEqual(TokenDisplay[Token.EndOfFile], "<end of file>");
|
||||
assert.strictEqual(TokenDisplay[Token.SingleLineComment], "<single-line comment>");
|
||||
assert.strictEqual(TokenDisplay[Token.MultiLineComment], "<multi-line comment>");
|
||||
|
|
Загрузка…
Ссылка в новой задаче