Continuing to implement

2000-01-25 22:57:32 +00:00 · 2000-01-25 22:57:32 +00:00 · 0f9dbf7003
--- a/js/js2/parser.cpp
+++ b/js/js2/parser.cpp
@ -18,6 +18,8 @@
 // Rights Reserved.

 #include "parser.h"
+#include "world.h"
+
 namespace JS = JavaScript;


@ -28,75 +30,226 @@ namespace JS = JavaScript;

 // Create a Reader reading characters from begin up to but not including end.
+// The current line is initially taken to start at begin.
 JS::Reader::Reader(const char16 *begin, const char16 *end):
-	begin(begin), p(begin), end(end), nGetsPastEnd(0)
+	begin(begin), p(begin), end(end), lineStart(begin), nGetsPastEnd(0)
 {
 	ASSERT(begin <= end);
+  #ifdef DEBUG
+	// No recording is in progress; recordChar() and endRecording() assert on this.
+	recordString = 0;
+  #endif
 }


-// Unread the last character.
-void JS::Reader::unget()
+// Unread the last n characters.  unget cannot be called to back up past the position
+// of the last call to beginLine().
+void JS::Reader::unget(uint32 n)
 {
-	if (nGetsPastEnd)
-		--nGetsPastEnd;
-	else {
-		ASSERT(p != begin);
-		--p;
+	// Gets that ran past the end of input only bumped nGetsPastEnd and did not
+	// advance p, so those are undone first.
+	if (nGetsPastEnd) {
+		if (nGetsPastEnd >= n) {
+			nGetsPastEnd -= n;
+			return;
 		}
+		n -= nGetsPastEnd;
+		nGetsPastEnd = 0;
+	}
+	// Whatever remains of n is undone by moving p back within the buffer.
+	// NOTE(review): this checks against the start of the buffer, not against lineStart.
+	ASSERT(p >= begin + n);
+	p -= n;
 }


-// Set s to the characters read in after the mark but before the current position
-// and then delete the Reader mark.
-void JS::Reader::unmark(String &s)
+// Return the characters read in from position begin inclusive to position end
+// exclusive relative to the current line.  begin <= end <= charPos() is required.
+// Inside this function the parameters begin and end hide the Reader members of the
+// same names; both offsets are applied to lineStart.
+JS::String JS::Reader::extract(uint32 begin, uint32 end) const
 {
-	ASSERT(markPos);
-	s.assign(markPos, p);
-	markPos = 0;
+	// charPos() itself asserts that the reader has not run past the end of input.
+	ASSERT(begin <= end && end + nGetsPastEnd <= charPos());
+	return String(lineStart + begin, lineStart + end);
+}
+
+
+// Start recording into recordString.  Every character subsequently handed to
+// recordChar() is appended to the end of recordString.  Recording ends when
+// endRecording() or beginLine() is called.
+// Recording is cheap as long as the characters handed to recordChar() are exactly
+// the characters returned by get():  in that case only buffer positions are tracked
+// and the String is not filled in until endRecording() is called or recordChar()
+// is given a character that differs from the buffer contents.
+void JS::Reader::beginRecording(String &recordString)
+{
+	const char16 *const start = p;
+	recordPos = start;
+	recordBase = start;
+	Reader::recordString = &recordString;
+}
+
+
+// Append ch to the string being recorded.
+void JS::Reader::recordChar(char16 ch)
+{
+	ASSERT(recordString);
+	if (recordPos) {
+		// Still in step with the buffer:  if ch is the next buffered character,
+		// advancing the position is all that is needed.
+		if (recordPos != end && *recordPos == ch) {
+			++recordPos;
+			return;
+		}
+		// First discrepancy:  copy what has matched so far and stop tracking the buffer.
+		recordString->assign(recordBase, recordPos);
+		recordPos = 0;
+	}
+	*recordString += ch;
+}
+
+
+// Stop recording and return the String that was passed to the last beginRecording() call.
+// If every recorded character matched the buffer, the String is filled in here.
+JS::String &JS::Reader::endRecording()
+{
+	ASSERT(recordString);
+	String &recorded = *recordString;
+	recordString = 0;
+	if (recordPos)
+		recorded.assign(recordBase, recordPos);
+	return recorded;
 }


 // Refill the source buffer after running off the end.  Get and return
 // the next character.
-// The default implementation just returns ueof.
-JS::wint_t JS::Reader::underflow()
+// The default implementation just returns char16eof.
+JS::char16orEOF JS::Reader::underflow()
 {
+	// Count the overshoot so that unget() can undo it without moving p.
 	++nGetsPastEnd;
-	return ueof;
+	return char16eof;
 }


 // Perform a peek when begin == end.
-JS::wint_t JS::Reader::peekUnderflow()
+// (Reader::peek() calls this when p == end.)
+JS::char16orEOF JS::Reader::peekUnderflow()
 {
-	wint_t ch = underflow();
+	char16orEOF ch = underflow();
+	// underflow() acts like a get(); push the result back so that the peek consumes nothing.
 	unget();
 	return ch;
 }


-// Create a StringReader reading characters from a copy of the given String.
-JS::StringReader::StringReader(const String &s):
-	str(s)
+// Create a StringReader reading characters from s.
+// source describes the origin of string s and may be used for error messages.
+JS::StringReader::StringReader(const String &s, const String &source):
+	str(s), source(source)
 {
+	// The buffer points into the member copy str, not into the caller's s.
 	const char16 *begin = str.data();
 	setBuffer(begin, begin, begin + str.size());
 }


+// Return the description of the text's origin that was given to the constructor.
+JS::String JS::StringReader::sourceFile() const
+{
+	return String(source);
+}
+
+
 //
 // Lexer
 //


-// Create a new Lexer using the provided Reader.
-JS::Lexer::Lexer(Reader &reader): reader(reader)
+// Make chars own a freshly allocated copy of s, releasing any String it held before.
+void JS::Token::setChars(const String &s)
+{
+	chars.reset(new String(s));
+}
+
+
+// One row of the keyword table:  the spelling of a keyword and the token kind it lexes as.
+struct KeywordInit {
+	const char *name;					// Null-terminated ASCII name of keyword
+	JS::Token::Kind tokenKind;			// Keyword's number
+};
+
+// Table consumed by initKeywords().  Each keyword must appear exactly once.
+static KeywordInit keywordInits[] = {
+  // Reserved words
+	{"abstract", JS::Token::Abstract},
+	{"break", JS::Token::Break},
+	{"case", JS::Token::Case},
+	{"catch", JS::Token::Catch},
+	{"class", JS::Token::Class},
+	{"const", JS::Token::Const},
+	{"continue", JS::Token::Continue},
+	{"debugger", JS::Token::Debugger},
+	{"default", JS::Token::Default},
+	{"delete", JS::Token::Delete},
+	{"do", JS::Token::Do},
+	{"else", JS::Token::Else},
+	{"enum", JS::Token::Enum},
+	{"eval", JS::Token::Eval},
+	{"export", JS::Token::Export},
+	{"extends", JS::Token::Extends},
+	{"false", JS::Token::False},
+	{"final", JS::Token::Final},
+	{"finally", JS::Token::Finally},
+	{"for", JS::Token::For},
+	{"function", JS::Token::Function},
+	{"goto", JS::Token::Goto},
+	{"if", JS::Token::If},
+	{"implements", JS::Token::Implements},
+	{"import", JS::Token::Import},
+	{"in", JS::Token::In},
+	{"instanceof", JS::Token::Instanceof},
+	{"native", JS::Token::Native},
+	{"new", JS::Token::New},
+	{"null", JS::Token::Null},
+	{"package", JS::Token::Package},
+	{"private", JS::Token::Private},
+	{"protected", JS::Token::Protected},
+	{"public", JS::Token::Public},
+	{"return", JS::Token::Return},
+	{"static", JS::Token::Static},
+	{"super", JS::Token::Super},
+	{"switch", JS::Token::Switch},
+	{"synchronized", JS::Token::Synchronized},
+	{"this", JS::Token::This},
+	{"throw", JS::Token::Throw},
+	{"throws", JS::Token::Throws},
+	{"transient", JS::Token::Transient},
+	{"true", JS::Token::True},
+	{"try", JS::Token::Try},
+	{"typeof", JS::Token::Typeof},
+	{"var", JS::Token::Var},
+	{"volatile", JS::Token::Volatile},
+	{"while", JS::Token::While},
+	{"with", JS::Token::With},
+  // Non-reserved words
+	{"box", JS::Token::Box},
+	{"constructor", JS::Token::Constructor},
+	{"field", JS::Token::Field},
+	{"get", JS::Token::Get},
+	{"language", JS::Token::Language},
+	{"local", JS::Token::Local},
+	{"method", JS::Token::Method},
+	{"override", JS::Token::Override},
+	{"set", JS::Token::Set},
+	{"version", JS::Token::Version}
+};
+
+
+// Register every entry of keywordInits in world.identifiers, tagging the
+// identifier of that name with the keyword's token kind.
+void JS::initKeywords(World &world)
+{
+	for (const KeywordInit *entry = keywordInits,
+			*stop = keywordInits + sizeof(keywordInits)/sizeof(keywordInits[0]);
+		 entry != stop; ++entry) {
+		world.identifiers[widenCString(entry->name)].tokenKind = entry->tokenKind;
+	}
+}
+
+
+
+// Create a new Lexer using the provided Reader and interning identifiers, keywords, and regular
+// expressions in the designated world.
+// The token buffer starts out empty and the first line is numbered 1.
+JS::Lexer::Lexer(Reader &reader, World &world): reader(reader), world(world)
 {
 	nextToken = tokens;
 	nTokensFwd = 0;
  #ifdef DEBUG
 	nTokensBack = 0;
  #endif
+	lineNum = 1;
+	lexingUnit = false;
 }


@ -156,9 +309,536 @@ void JS::Lexer::unget()
 }


+// Throw a SyntaxError Exception located backUp characters before the Reader's current position:
+// backUp 0 means the next character the Reader would return, backUp 1 the character it
+// returned last, and so on.  The Exception also carries the source description, the current
+// line number, and the text of the current line up to (not including) its line break.
+void JS::Lexer::syntaxError(const char *message, uint backUp)
+{
+	reader.unget(backUp);
+	const uint32 errorPos = reader.charPos();
+
+	// Read to the end of the line so that the whole line can be quoted.
+	for (;;) {
+		char16orEOF ch = reader.get();
+		if (ch == char16eof || isLineBreak(char16orEOFToChar16(ch)))
+			break;
+	}
+	reader.unget();		// Step back over the line break or end-of-input marker
+
+	Exception e(Exception::SyntaxError, widenCString(message), reader.sourceFile(), lineNum, errorPos,
+				reader.extract(0, reader.charPos()));
+	throw e;
+}
+
+
+// Read the next character from the reader, skipping Unicode format-control (Cf) characters.
+// Characters below firstFormatChar are returned directly; anything else goes through
+// internalGetChar(), which does the actual format-character test.
+inline JS::char16orEOF JS::Lexer::getChar()
+{
+	const char16orEOF ch = reader.get();
+	return static_cast<uint32>(ch) < firstFormatChar ? ch : internalGetChar(ch);
+}
+
+// Slow path of getChar():  ch has just been read.  Keep reading for as long as the
+// character in hand is a format-control character and return the first one that is not.
+JS::char16orEOF JS::Lexer::internalGetChar(char16orEOF ch)
+{
+	for (; isFormat(char16orEOFToChar16(ch)); ch = reader.get())
+		;
+	return ch;
+}
+
+
+// Return the next character without consuming it.  Unicode format-control (Cf) characters
+// in front of it are consumed and discarded.
+// Characters below firstFormatChar are returned directly; anything else goes through
+// internalPeekChar(), which does the actual format-character test.
+inline JS::char16orEOF JS::Lexer::peekChar()
+{
+	const char16orEOF ch = reader.peek();
+	return static_cast<uint32>(ch) < firstFormatChar ? ch : internalPeekChar(ch);
+}
+
+// Slow path of peekChar():  ch has just been peeked.  While the peeked character is a
+// format-control character, consume it and peek again; return the first one that is not.
+JS::char16orEOF JS::Lexer::internalPeekChar(char16orEOF ch)
+{
+	for (; isFormat(char16orEOFToChar16(ch)); ch = reader.peek())
+		reader.get();
+	return ch;
+}
+
+
+// Consume the next character if and only if it equals ch, and report whether it did.
+// Unicode format-control (Cf) characters in front of it are read and discarded either way.
+bool JS::Lexer::testChar(char16 ch)
+{
+	if (peekChar() != ch)
+		return false;
+	reader.get();
+	return true;
+}
+
+
+// A backslash has been read.  Read the rest of the escape code.
+// Return the interpreted escaped character.  Throw an exception if the escape is not valid.
+// If unicodeOnly is true, allow only \uxxxx escapes.
+// The recognized escapes are:  \0 (only when not followed by a decimal digit), \b, \f, \n,
+// \r, \t, \v, \x followed by two hex digits, \u followed by four hex digits, and a backslash
+// followed by any character that is neither alphanumeric nor a line break, which stands
+// for that character itself.
+char16 JS::Lexer::lexEscape(bool unicodeOnly)
+{
+	char16orEOF ch = getChar();
+	int nDigits;
+
+	if (!unicodeOnly || ch == 'u')
+		switch (ch) {
+		  case '0':
+			// Make sure that the next character isn't a digit.
+			ch = peekChar();
+			if (!isASCIIDecimalDigit(char16orEOFToChar16(ch)))
+				return 0x00;
+			getChar();	// Point to the next character in the error message
+			goto error;	// Do not fall into case 'b':  a digit after the 0 makes the escape invalid
+		  case 'b':
+			return 0x08;
+		  case 'f':
+			return 0x0C;
+		  case 'n':
+			return 0x0A;
+		  case 'r':
+			return 0x0D;
+		  case 't':
+			return 0x09;
+		  case 'v':
+			return 0x0B;
+		  case 'x':
+			nDigits = 2;
+			goto lexHex;
+		  case 'u':
+			nDigits = 4;
+		  lexHex:
+			{
+				// Accumulate exactly nDigits hexadecimal digits, most significant first.
+				uint32 n = 0;
+				while (nDigits--) {
+					ch = getChar();
+					uint digit;
+					if (!isASCIIHexDigit(char16orEOFToChar16(ch), digit))
+						goto error;
+					n = (n << 4) | digit;
+				}
+				return char16(n);
+			}
+		default:
+			// Any other character escapes to itself unless it is alphanumeric or a line break.
+			if (ch != char16eof) {
+				CharInfo chi(char16orEOFToChar16(ch));
+				if (!isAlphanumeric(chi) && !isLineBreak(chi))
+					return char16orEOFToChar16(ch);
+			}
+		}
+  error:
+	syntaxError("Bad escape code");
+	return 0;	// Not reached when syntaxError throws, as it does in this file
+}
+
+
+// Lex an identifier into s and report whether any \uxxxx escape occurred in it.
+// Normally the first character must be an identifier-leading character; when
+// allowLeadingDigit is true the first character is held to the same rule as all later
+// ones, so it may also be a digit.
+// The first character that does not belong to the identifier is pushed back, unless it
+// came from an escape, in which case a syntax error is raised.
+bool JS::Lexer::lexIdentifier(String &s, bool allowLeadingDigit)
+{
+	bool sawEscape = false;
+	bool pastFirstChar = allowLeadingDigit;
+
+	reader.beginRecording(s);
+	for (;;) {
+		const char16orEOF raw = getChar();
+		const bool escaped = raw == '\\';
+		char16orEOF cooked = raw;
+		if (escaped) {
+			cooked = lexEscape(true);
+			sawEscape = true;
+		}
+
+		CharInfo info(char16orEOFToChar16(cooked));
+		const bool accepted = pastFirstChar ? isIdContinuing(info) : isIdLeading(info);
+		if (!accepted) {
+			if (escaped)
+				syntaxError("Identifier escape expands into non-identifier character");
+			else
+				reader.unget();
+			break;
+		}
+		reader.recordChar(char16orEOFToChar16(cooked));
+		pastFirstChar = true;
+	}
+	reader.endRecording();
+	return sawEscape;
+}
+
+
+// Read a numeric literal and store its characters in nextToken->chars.
+// (nextToken->value is not computed by this function.)
+// Return true if the numeric literal is followed by a unit, but don't read the unit yet.
+// Accepted forms are 0x/0X followed by one or more hex digits, and decimal digits with at
+// most one decimal point and an optional exponent (e or E, optional sign, one or more digits).
+// A leading 0 followed directly by another decimal digit is a syntax error.
+bool JS::Lexer::lexNumeral()
+{
+	int radix = 10;
+	int hasDecimalPoint = 0;
+	String s;
+	uint digit;
+
+	reader.beginRecording(s);
+	char16orEOF ch = getChar();
+	if (ch == '0') {
+		reader.recordChar('0');
+		ch = getChar();
+		// Clearing bit 0x20 folds 'x' onto 'X'.  The parentheses are required because
+		// == binds more tightly than &.
+		if ((ch & ~0x20) == 'X') {
+			uint32 pos = reader.charPos();
+			char16orEOF ch2 = getChar();
+			if (isASCIIHexDigit(char16orEOFToChar16(ch2), digit)) {
+				reader.recordChar(char16orEOFToChar16(ch));
+				do {
+					reader.recordChar(char16orEOFToChar16(ch2));
+					ch2 = getChar();
+				} while (isASCIIHexDigit(char16orEOFToChar16(ch2), digit));
+				ch = ch2;
+			} else
+				reader.backUpTo(pos);	// No hex digit follows:  the number is just "0" and the x starts a unit
+			goto done;
+		} else if (isASCIIDecimalDigit(char16orEOFToChar16(ch))) {
+			syntaxError("Numeric constant syntax error");
+		}
+	}
+	// Digits and at most one decimal point; a second '.' ends the number.
+	while (isASCIIDecimalDigit(char16orEOFToChar16(ch)) || (ch == '.' && !hasDecimalPoint++)) {
+		reader.recordChar(char16orEOFToChar16(ch));
+		ch = getChar();
+	}
+	// Clearing bit 0x20 folds 'e' onto 'E'; parenthesized for the same precedence reason as above.
+	if ((ch & ~0x20) == 'E') {
+		uint32 pos = reader.charPos();
+		char16orEOF ch2 = getChar();
+		char16 sign = 0;
+		if (ch2 == '+' || ch2 == '-') {
+			sign = char16orEOFToChar16(ch2);
+			ch2 = getChar();
+		}
+		if (isASCIIDecimalDigit(char16orEOFToChar16(ch2))) {
+			reader.recordChar(char16orEOFToChar16(ch));
+			if (sign)
+				reader.recordChar(sign);
+			do {
+				reader.recordChar(char16orEOFToChar16(ch2));
+				ch2 = getChar();
+			} while (isASCIIDecimalDigit(char16orEOFToChar16(ch2)));
+			ch = ch2;
+		} else
+			reader.backUpTo(pos);	// Not an exponent after all:  the e starts a unit
+	}
+	
+  done:
+	// At this point the reader is just past the character ch, which is the first non-formatting character
+	// that is not part of the number.
+	reader.endRecording();
+	nextToken->setChars(s);
+	reader.unget();
+	ASSERT(ch == reader.peek());
+	return isIdContinuing(char16orEOFToChar16(ch)) || ch == '\\';
+}
+
+
+// Lex the body of a string literal and return its contents.
+// The opening quote has already been read and is passed in as separator; reading stops
+// after the matching closing quote.  Format-control characters are dropped, escapes are
+// interpreted by lexEscape(), and a line break or end of input before the closing quote
+// is a syntax error.
+JS::String JS::Lexer::lexString(char16 separator)
+{
+	String result;
+
+	reader.beginRecording(result);
+	for (;;) {
+		char16orEOF ch = reader.get();
+		if (ch == separator)
+			break;
+		CharInfo chi(char16orEOFToChar16(ch));
+		if (isFormat(chi))
+			continue;
+		if (ch == '\\')
+			ch = lexEscape(false);
+		else if (ch == char16eof || isLineBreak(chi))
+			syntaxError("Unterminated string literal");
+		reader.recordChar(char16orEOFToChar16(ch));
+	}
+	reader.endRecording();
+	return result;
+}
+
+
+// Read a regular expression literal.  Store the regular expression in nextToken->identifier
+// and the flags in nextToken->chars.
+// The opening slash has already been read.
+// The body runs up to the first slash that is not immediately preceded by an unconsumed
+// backslash; a line break or end of input before that slash is a syntax error.  The flags
+// are lexed like an identifier that may start with a digit.
+void JS::Lexer::lexRegExp()
+{
+	String s;
+	char16orEOF prevCh = 0;
+
+	reader.beginRecording(s);
+	while (true) {
+		char16orEOF ch = getChar();
+    	CharInfo chi(char16orEOFToChar16(ch));
+		if (ch == char16eof || isLineBreak(chi))
+			syntaxError("Unterminated regular expression literal");
+		if (prevCh == '\\') {
+			reader.recordChar(char16orEOFToChar16(ch));
+			// Ignore slashes and backslashes immediately after a backslash.
+			// (This comment must not end in a backslash character:  that would splice
+			// the following line of code into the comment.)
+			prevCh = 0;
+		} else if (ch != '/') {
+			reader.recordChar(char16orEOFToChar16(ch));
+			prevCh = ch;
+		} else
+			break;
+	}
+	reader.endRecording();
+	nextToken->identifier = &world.identifiers[s];
+	
+	String flags;
+	lexIdentifier(flags, true);
+	nextToken->setChars(flags);
+}
+
+
 // Read a token from the Reader and store it at *nextToken.
 // If the Reader reached the end of file, store a Token whose Kind is End.
+// White space, format-control characters, line breaks and comments before the token are
+// skipped; the token's lineBreak flag is set if a line break was skipped on the way.
+// A slash that does not start a comment is lexed as a regular expression when preferRegExp
+// is true and as a division (or /=) operator otherwise.
+// A character that cannot start any token is reported as a syntax error.
 void JS::Lexer::lexToken(bool preferRegExp)
 {
-}
+	Token &t = *nextToken;
+	t.lineBreak = false;
+	t.identifier = 0;
+	t.chars.reset();
+	t.value = 0;
+	Token::Kind kind;

+  next:
+	char16orEOF ch = reader.get();
+	char16orEOF ch2;
+	CharInfo chi(char16orEOFToChar16(ch));
+
+	switch (cGroup(chi)) {
+      case CharInfo::FormatGroup:
+      case CharInfo::WhiteGroup:
+		goto next;
+
+      case CharInfo::IdGroup:
+		t.charPos = reader.charPos() - 1;
+      readIdentifier:
+		{
+			reader.unget();
+			String s;
+			bool hasEscape = lexIdentifier(s, false);
+			t.identifier = &world.identifiers[s];
+			// An identifier spelled with an escape is never treated as a keyword.
+			kind = hasEscape ? Token::Id : t.identifier->tokenKind;
+		}
+		break;
+
+      case CharInfo::NonIdGroup:
+      case CharInfo::IdContinueGroup:
+		t.charPos = reader.charPos() - 1;
+		switch (ch) {
+		  case '(':
+			kind = Token::OpenParenthesis;	// (
+			break;
+		  case ')':
+			kind = Token::CloseParenthesis;	// )
+			break;
+		  case '[':
+			kind = Token::OpenBracket;		// [
+			break;
+		  case ']':
+			kind = Token::CloseBracket;		// ]
+			break;
+		  case '{':
+			kind = Token::OpenBrace;		// {
+			break;
+		  case '}':
+			kind = Token::CloseBrace;		// }
+			break;
+		  case ',':
+			kind = Token::Comma;			// ,
+			break;
+		  case ';':
+			kind = Token::Semicolon;		// ;
+			break;
+		  case '.':
+			kind = Token::Dot;				// .
+			ch2 = getChar();
+			if (isASCIIDecimalDigit(char16orEOFToChar16(ch2))) {
+				reader.backUpTo(t.charPos);
+				goto number;				// decimal point
+			} else if (ch2 == '.') {
+				kind = Token::DoubleDot;	// ..
+				if (testChar('.'))
+					kind = Token::TripleDot; // ...
+			} else
+				reader.unget();
+			break;
+		  case ':':
+			kind = Token::Colon;			// :
+			if (testChar(':'))
+				kind = Token::DoubleColon;	// ::
+			break;
+		  case '#':
+			kind = Token::Pound;			// #
+			break;
+		  case '@':
+			kind = Token::At;				// @
+			break;
+		  case '?':
+			kind = Token::Question;			// ?
+			break;
+
+		  case '~':
+			kind = Token::Complement;		// ~
+			break;
+		  case '!':
+			kind = Token::Not;				// !
+			if (testChar('=')) {
+				kind = Token::NotEqual;		// !=
+				if (testChar('='))
+					kind = Token::NotIdentical; // !==
+			}
+			break;
+
+		  case '*':
+			kind = Token::Times;			// * *=
+		  tryAssignment:
+			// Relies on the assignment operators being at a constant offset from the plain operators.
+			if (testChar('='))
+				kind = Token::Kind(kind + Token::TimesEquals - Token::Times);
+			break;
+
+		  case '/':
+			kind = Token::Divide;			// /
+			ch = getChar();
+			if (ch == '/') {				// // comment
+				do {
+					ch = reader.get();
+					if (ch == char16eof)
+						goto endOfInput;
+				} while (!isLineBreak(char16orEOFToChar16(ch)));
+				goto endOfLine;
+			} else if (ch == '*') {			// /* comment */
+				ch = 0;
+				do {
+					ch2 = ch;
+					ch = getChar();
+					if (isLineBreak(char16orEOFToChar16(ch))) {
+						reader.beginLine();
+						++lineNum;
+						t.lineBreak = true;
+					}
+					if (ch == char16eof)
+						syntaxError("Unterminated /* comment");
+				} while (ch != '/' || ch2 != '*');
+				goto next;
+			} else {
+				reader.unget();
+				if (preferRegExp) {			// Regular expression
+					kind = Token::RegExp;
+					lexRegExp();
+				} else
+					 goto tryAssignment;	// /=
+			}
+			break;
+
+		  case '%':
+			kind = Token::Modulo;			// %
+			goto tryAssignment;				// %=
+
+		  case '+':
+			kind = Token::Plus;				// +
+			if (testChar('+'))
+				kind = Token::Increment;	// ++
+			else
+				goto tryAssignment;			// +=
+			break;
+
+		  case '-':
+			kind = Token::Minus;			// -
+			ch = getChar();
+			if (ch == '-')
+				kind = Token::Decrement;	// --
+			else if (ch == '>')
+				kind = Token::Arrow;		// ->
+			else {
+				reader.unget();
+				goto tryAssignment;			// -=
+			}
+			break;
+	
+		  case '&':
+			kind = Token::And;				// & && &= &&=
+		  logical:
+			// Relies on the logical operators being at a constant offset from the bitwise operators.
+			if (testChar(char16orEOFToChar16(ch)))
+				kind = Token::Kind(kind - Token::And + Token::LogicalAnd);
+			goto tryAssignment;
+		  case '^':
+			kind = Token::Xor;				// ^ ^^ ^= ^^=
+			goto logical;
+		  case '|':
+			kind = Token::Or;				// | || |= ||=
+			goto logical;
+
+		  case '=':
+			kind = Token::Assignment;		// =
+			if (testChar('=')) {
+				kind = Token::Equal;		// ==
+				if (testChar('='))
+					kind = Token::Identical; // ===
+			}
+			break;
+
+		  case '<':
+			kind = Token::LessThan;			// <
+			if (testChar('<')) {
+				kind = Token::LeftShift;	// <<
+				goto tryAssignment;			// <<=
+			}
+		  comparison:
+			if (testChar('='))				// <= >=
+				kind = Token::Kind(kind + Token::LessThanOrEqual - Token::LessThan);
+			break;
+		  case '>':
+			kind = Token::GreaterThan;		// >
+			if (testChar('>')) {
+				kind = Token::RightShift;	// >>
+				if (testChar('>'))
+					kind = Token::LogicalRightShift; // >>>
+				goto tryAssignment;			// >>= >>>=
+			}
+			goto comparison;
+
+		  case '\\':
+			goto readIdentifier;			// An identifier that starts with an escape
+
+		  case '\'':
+		  case '"':
+			kind = Token::Str;				// 'string' "string"
+			t.setChars(lexString(char16orEOFToChar16(ch)));
+			break;
+
+		  case '0':
+		  case '1':
+		  case '2':
+		  case '3':
+		  case '4':
+		  case '5':
+		  case '6':
+		  case '7':
+		  case '8':
+		  case '9':
+			reader.unget();					// Number
+		  number:
+			kind = Token::Num;
+			lexNumeral();
+			break;
+
+		  case char16eof:
+		  endOfInput:
+			kind = Token::End;
+			break;
+
+		  default:
+			// No token starts with this character.  Without this case kind would be
+			// stored into the token below without ever having been set.
+			syntaxError("Illegal character");
+		}
+		break;
+
+      case CharInfo::LineBreakGroup:
+      endOfLine:
+		reader.beginLine();
+		++lineNum;
+		t.lineBreak = true;
+		goto next;
+	}
+	t.kind = kind;
+	t.lineNum = lineNum;
+}
--- a/js/js2/parser.h
+++ b/js/js2/parser.h
@ -21,22 +21,28 @@
 #define parser_h

 #include "utilities.h"
-#include "world.h"

 namespace JavaScript {

+	class StringAtom;
+	class World;
+
 //
 // Reader
 //

 	// A Reader reads Unicode characters from some source -- either a file or a string.
-	// get() returns all of the characters followed by a ueof.
+	// get() returns all of the characters followed by a char16eof.
 	class Reader {
 		const char16 *begin;			// Beginning of current buffer
 		const char16 *p;				// Position in current buffer
 		const char16 *end;				// End of current buffer
-		const char16 *markPos;			// Pointer to mark in current buffer or null if no mark
-		uint32 nGetsPastEnd;			// Number of times ueof has been returned
+		const char16 *lineStart;		// Pointer to start of current line
+		uint32 nGetsPastEnd;			// Number of times char16eof has been returned
+
+		String *recordString;			// String, if any, into which recordChar() records characters
+		const char16 *recordBase;		// Position of last beginRecording() call
+		const char16 *recordPos;		// Position of last recordChar() call; nil if a discrepancy occurred
 		
 	  protected:
 		Reader(): nGetsPastEnd(0) {}
@ -46,54 +52,70 @@ namespace JavaScript {
 	    Reader(const Reader&);			// No copy constructor
 	    void operator=(const Reader&);	// No assignment operator
 	  public:
-	#ifdef DEBUG
-		~Reader() {ASSERT(!markPos);}
-	#endif

-		wint_t get();
-		wint_t peek();
-		void unget();
+		char16orEOF get();
+		char16orEOF peek();
+		void unget(uint32 n = 1);
 		
-		void mark();
-		void unmark();
-		void unmark(String &s);
-		bool marked() const {return markPos;}
+		void beginLine();
+		uint32 charPos() const;
+		void backUpTo(uint32 pos);
+
+		String extract(uint32 begin, uint32 end) const;
+		void beginRecording(String &recordString);
+		void recordChar(char16 ch);
+		String &endRecording();
+		
+		virtual String sourceFile() const = 0; // A description of the source code that caused the error

 	  protected:
 		void setBuffer(const char16 *begin, const char16 *p, const char16 *end);
-		virtual wint_t underflow();
-		wint_t peekUnderflow();
+		virtual char16orEOF underflow();
+		char16orEOF peekUnderflow();
 	};


-	// Get and return the next character or ueof if at end of input.
-	inline wint_t Reader::get()
+	// Get and return the next character or char16eof if at end of input.
+	inline char16orEOF Reader::get()
 	{
 		if (p != end)
 			return *p++;
+		// Buffer exhausted:  defer to the virtual underflow().
 		return underflow();
 	}

-	// Return the next character without consuming it.  Return ueof if at end of input.
-	inline wint_t Reader::peek()
+	// Return the next character without consuming it.  Return char16eof if at end of input.
+	inline char16orEOF Reader::peek()
 	{
 		if (p != end)
 			return *p;
+		// Buffer exhausted:  peekUnderflow() calls underflow() and then ungets the result.
 		return peekUnderflow();
 	}

-	// Mark the current position in the Reader.
-	inline void Reader::mark()
+
+	// Set the beginning of the current line.  unget cannot be subsequently called past this point.
+	// charPos(), extract() and backUpTo() measure their offsets from this position afterwards.
+	inline void Reader::beginLine()
 	{
-		ASSERT(!markPos);
-		markPos = p;
+		lineStart = p;
+	  #ifdef DEBUG
+		// Clear the recording target so that a later recordChar() or endRecording()
+		// without a new beginRecording() trips its assertion.
+		recordString = 0;
+	  #endif
 	}

-	// Delete the Reader mark.
-	inline void Reader::unmark()
+	// Return the character offset relative to the current line.  This cannot be called
+	// if the current position is past the end of the input.
+	inline uint32 Reader::charPos() const
 	{
-		ASSERT(markPos);
-		markPos = 0;
+		// The default underflow() counts gets past the end in nGetsPastEnd without
+		// advancing p, so p - lineStart would not reflect them.
+		ASSERT(!nGetsPastEnd);
+		return static_cast<uint32>(p - lineStart);
+	}
+
+
+	// Back up to the given character offset relative to the current line.
+	// pos must not lie beyond the current position.  Unlike charPos(), this may be called
+	// after get() has run past the end of input; any such overshoot is discarded.
+	inline void Reader::backUpTo(uint32 pos)
+	{
+		// Measure against p directly rather than through charPos():  charPos() asserts
+		// !nGetsPastEnd, which would fire when backing up after reading past the end.
+		ASSERT(pos <= static_cast<uint32>(p - lineStart));
+		p = lineStart + pos;
+		nGetsPastEnd = 0;
 	}


@ -103,14 +125,21 @@ namespace JavaScript {
 		Reader::begin = begin;
 		Reader::p = p;
 		Reader::end = end;
+		lineStart = begin;
+	  #ifdef DEBUG
+		recordString = 0;
+	  #endif
 	}


 	// A Reader that reads from a String.
 	class StringReader: public Reader {
 		const String str;
+		// Description of where str came from; returned by sourceFile()
+		const String source;
+
 	  public:
-		StringReader(const String &s);
+		// s is copied into str; source is copied into the member of the same name.
+		StringReader(const String &s, const String &source);
+		String sourceFile() const;
 	};


@ -122,7 +151,6 @@ namespace JavaScript {
 	  public:
 		enum Kind {
 			End,						// End of token stream
-			Error,						// Lexer error

 			Id,							// Non-keyword identifier (may be same as a keyword if it contains an escape code)
 			Num,						// Numeral
@ -165,12 +193,12 @@ namespace JavaScript {
 			LogicalAnd,					// &&
 			LogicalXor,					// ^^
 			LogicalOr,					// ||
-			And,						// &
+			And,						// &	// These must be at constant offsets from LogicalAnd ... LogicalOr
 			Xor,						// ^
 			Or,							// |

 			Assignment,					// =
-			TimesEquals,				// *=
+			TimesEquals,				// *=	// These must be at constant offsets from Times ... Or
 			DivideEquals,				// /=
 			ModuloEquals,				// %=
 			PlusEquals,					// +=
@ -189,7 +217,7 @@ namespace JavaScript {
 			NotEqual,					// !=
 			LessThan,					// <
 			LessThanOrEqual,			// <=
-			GreaterThan,				// >
+			GreaterThan,				// >	// >, >= must be at constant offsets from <, <=
 			GreaterThanOrEqual,			// >=
 			Identical,					// ===
 			NotIdentical,				// !==
@ -268,13 +296,19 @@ namespace JavaScript {
 		StringAtom *identifier;			// The token's characters (identifiers, keywords, and regular expressions only)
 		auto_ptr<String> chars;			// The token's characters (strings, numbers, and regular expression flags only)
 		float64 value;					// The token's value (numbers only)
+		
+		void setChars(const String &s);
 	};


+	void initKeywords(World &world);
+	
+
 	class Lexer {
 		static const int tokenBufferSize = 3;	// Token lookahead buffer size
 	  public:
 		Reader &reader;
+		World &world;
 	  private:
 		Token tokens[tokenBufferSize];	// Circular buffer of recently read or lookahead tokens
 		Token *nextToken;				// Address of next Token in the circular buffer to be returned by get()
@ -283,16 +317,31 @@ namespace JavaScript {
 		int nTokensBack;				// Number of Tokens on which unget() can be called; these Tokens are beind nextToken
 		bool savedPreferRegExp[tokenBufferSize]; // Circular buffer of saved values of preferRegExp to get() calls
 	  #endif
+		uint32 lineNum;					// Current line number
+		bool lexingUnit;				// True if lexing a unit identifier immediately following a number

 	  public:
-		Lexer(Reader &reader);
+		Lexer(Reader &reader, World &world);
 		
 		Token &get(bool preferRegExp);
 		const Token &peek(bool preferRegExp);
 		void unget();

 	  private:
+		void syntaxError(const char *message, uint backUp = 1);
+		char16orEOF getChar();
+		char16orEOF internalGetChar(char16orEOF ch);
+		char16orEOF peekChar();
+		char16orEOF internalPeekChar(char16orEOF ch);
+		bool testChar(char16 ch);
+
+		char16 lexEscape(bool unicodeOnly);
+		bool lexIdentifier(String &s, bool allowLeadingDigit);
+		bool lexNumeral();
+		String lexString(char16 separator);
+		void lexRegExp();
 		void lexToken(bool preferRegExp);
+	  public:
 	};
 }
 #endif
--- a/js2/src/parser.cpp
+++ b/js2/src/parser.cpp
@ -18,6 +18,8 @@
 // Rights Reserved.

 #include "parser.h"
+#include "world.h"
+
 namespace JS = JavaScript;


@ -28,75 +30,226 @@ namespace JS = JavaScript;

 // Create a Reader reading characters from begin up to but not including end.
+// The current line is initially taken to start at begin.
 JS::Reader::Reader(const char16 *begin, const char16 *end):
-	begin(begin), p(begin), end(end), nGetsPastEnd(0)
+	begin(begin), p(begin), end(end), lineStart(begin), nGetsPastEnd(0)
 {
 	ASSERT(begin <= end);
+  #ifdef DEBUG
+	// No recording is in progress; recordChar() and endRecording() assert on this.
+	recordString = 0;
+  #endif
 }


-// Unread the last character.
-void JS::Reader::unget()
+// Unread the last n characters.  unget cannot be called to back up past the position
+// of the last call to beginLine().
+void JS::Reader::unget(uint32 n)
 {
-	if (nGetsPastEnd)
-		--nGetsPastEnd;
-	else {
-		ASSERT(p != begin);
-		--p;
+	// Gets that ran past the end of input only bumped nGetsPastEnd and did not
+	// advance p, so those are undone first.
+	if (nGetsPastEnd) {
+		if (nGetsPastEnd >= n) {
+			nGetsPastEnd -= n;
+			return;
 		}
+		n -= nGetsPastEnd;
+		nGetsPastEnd = 0;
+	}
+	// Whatever remains of n is undone by moving p back within the buffer.
+	// NOTE(review): this checks against the start of the buffer, not against lineStart.
+	ASSERT(p >= begin + n);
+	p -= n;
 }


-// Set s to the characters read in after the mark but before the current position
-// and then delete the Reader mark.
-void JS::Reader::unmark(String &s)
+// Return the characters read in from position begin inclusive to position end
+// exclusive relative to the current line.  begin <= end <= charPos() is required.
+// Inside this function the parameters begin and end hide the Reader members of the
+// same names; both offsets are applied to lineStart.
+JS::String JS::Reader::extract(uint32 begin, uint32 end) const
 {
-	ASSERT(markPos);
-	s.assign(markPos, p);
-	markPos = 0;
+	// charPos() itself asserts that the reader has not run past the end of input.
+	ASSERT(begin <= end && end + nGetsPastEnd <= charPos());
+	return String(lineStart + begin, lineStart + end);
+}
+
+
+// Start recording into recordString.  Every character subsequently handed to
+// recordChar() is appended to the end of recordString.  Recording ends when
+// endRecording() or beginLine() is called.
+// Recording is cheap as long as the characters handed to recordChar() are exactly
+// the characters returned by get():  in that case only buffer positions are tracked
+// and the String is not filled in until endRecording() is called or recordChar()
+// is given a character that differs from the buffer contents.
+void JS::Reader::beginRecording(String &recordString)
+{
+	const char16 *const start = p;
+	recordPos = start;
+	recordBase = start;
+	Reader::recordString = &recordString;
+}
+
+
+// Append ch to the string being recorded.
+void JS::Reader::recordChar(char16 ch)
+{
+	ASSERT(recordString);
+	if (recordPos) {
+		// Still in step with the buffer:  if ch is the next buffered character,
+		// advancing the position is all that is needed.
+		if (recordPos != end && *recordPos == ch) {
+			++recordPos;
+			return;
+		}
+		// First discrepancy:  copy what has matched so far and stop tracking the buffer.
+		recordString->assign(recordBase, recordPos);
+		recordPos = 0;
+	}
+	*recordString += ch;
+}
+
+
+// Stop recording and return the String that was passed to the last beginRecording() call.
+// If every recorded character matched the buffer, the String is filled in here.
+JS::String &JS::Reader::endRecording()
+{
+	ASSERT(recordString);
+	String &recorded = *recordString;
+	recordString = 0;
+	if (recordPos)
+		recorded.assign(recordBase, recordPos);
+	return recorded;
 }


 // Refill the source buffer after running off the end.  Get and return
 // the next character.
-// The default implementation just returns ueof.
-JS::wint_t JS::Reader::underflow()
+// The default implementation just returns char16eof.
+JS::char16orEOF JS::Reader::underflow()
 {
+	// Count the overshoot so that unget() can undo it without moving p.
 	++nGetsPastEnd;
-	return ueof;
+	return char16eof;
 }


 // Perform a peek when begin == end.
-JS::wint_t JS::Reader::peekUnderflow()
+// (Reader::peek() calls this when p == end.)
+JS::char16orEOF JS::Reader::peekUnderflow()
 {
-	wint_t ch = underflow();
+	char16orEOF ch = underflow();
+	// underflow() acts like a get(); push the result back so that the peek consumes nothing.
 	unget();
 	return ch;
 }


-// Create a StringReader reading characters from a copy of the given String.
-JS::StringReader::StringReader(const String &s):
-	str(s)
+// Create a StringReader reading characters from s.
+// source describes the origin of string s and may be used for error messages.
+JS::StringReader::StringReader(const String &s, const String &source):
+	str(s), source(source)
 {
+	// The buffer points into the member copy str, not into the caller's s.
 	const char16 *begin = str.data();
 	setBuffer(begin, begin, begin + str.size());
 }


+// Return the description of the text's origin that was given to the constructor.
+JS::String JS::StringReader::sourceFile() const
+{
+	return String(source);
+}
+
+
 //
 // Lexer
 //


-// Create a new Lexer using the provided Reader.
-JS::Lexer::Lexer(Reader &reader): reader(reader)
+// Make chars own a freshly allocated copy of s, releasing any String it held before.
+void JS::Token::setChars(const String &s)
+{
+	chars.reset(new String(s));
+}
+
+
+// One row of the keyword table:  the spelling of a keyword and the token kind it lexes as.
+struct KeywordInit {
+	const char *name;					// Null-terminated ASCII name of keyword
+	JS::Token::Kind tokenKind;			// Keyword's number
+};
+
+// Table consumed by initKeywords().  Each keyword must appear exactly once.
+static KeywordInit keywordInits[] = {
+  // Reserved words
+	{"abstract", JS::Token::Abstract},
+	{"break", JS::Token::Break},
+	{"case", JS::Token::Case},
+	{"catch", JS::Token::Catch},
+	{"class", JS::Token::Class},
+	{"const", JS::Token::Const},
+	{"continue", JS::Token::Continue},
+	{"debugger", JS::Token::Debugger},
+	{"default", JS::Token::Default},
+	{"delete", JS::Token::Delete},
+	{"do", JS::Token::Do},
+	{"else", JS::Token::Else},
+	{"enum", JS::Token::Enum},
+	{"eval", JS::Token::Eval},
+	{"export", JS::Token::Export},
+	{"extends", JS::Token::Extends},
+	{"false", JS::Token::False},
+	{"final", JS::Token::Final},
+	{"finally", JS::Token::Finally},
+	{"for", JS::Token::For},
+	{"function", JS::Token::Function},
+	{"goto", JS::Token::Goto},
+	{"if", JS::Token::If},
+	{"implements", JS::Token::Implements},
+	{"import", JS::Token::Import},
+	{"in", JS::Token::In},
+	{"instanceof", JS::Token::Instanceof},
+	{"native", JS::Token::Native},
+	{"new", JS::Token::New},
+	{"null", JS::Token::Null},
+	{"package", JS::Token::Package},
+	{"private", JS::Token::Private},
+	{"protected", JS::Token::Protected},
+	{"public", JS::Token::Public},
+	{"return", JS::Token::Return},
+	{"static", JS::Token::Static},
+	{"super", JS::Token::Super},
+	{"switch", JS::Token::Switch},
+	{"synchronized", JS::Token::Synchronized},
+	{"this", JS::Token::This},
+	{"throw", JS::Token::Throw},
+	{"throws", JS::Token::Throws},
+	{"transient", JS::Token::Transient},
+	{"true", JS::Token::True},
+	{"try", JS::Token::Try},
+	{"typeof", JS::Token::Typeof},
+	{"var", JS::Token::Var},
+	{"volatile", JS::Token::Volatile},
+	{"while", JS::Token::While},
+	{"with", JS::Token::With},
+  // Non-reserved words
+	{"box", JS::Token::Box},
+	{"constructor", JS::Token::Constructor},
+	{"field", JS::Token::Field},
+	{"get", JS::Token::Get},
+	{"language", JS::Token::Language},
+	{"local", JS::Token::Local},
+	{"method", JS::Token::Method},
+	{"override", JS::Token::Override},
+	{"set", JS::Token::Set},
+	{"version", JS::Token::Version}
+};
+
+
+// Register every entry of keywordInits in world.identifiers, tagging the
+// identifier of that name with the keyword's token kind.
+void JS::initKeywords(World &world)
+{
+	for (const KeywordInit *entry = keywordInits,
+			*stop = keywordInits + sizeof(keywordInits)/sizeof(keywordInits[0]);
+		 entry != stop; ++entry) {
+		world.identifiers[widenCString(entry->name)].tokenKind = entry->tokenKind;
+	}
+}
+
+
+
+// Create a new Lexer using the provided Reader and interning identifiers, keywords, and regular
+// expressions in the designated world.
+// The token buffer starts out empty and the first line is numbered 1.
+JS::Lexer::Lexer(Reader &reader, World &world): reader(reader), world(world)
 {
 	nextToken = tokens;
 	nTokensFwd = 0;
  #ifdef DEBUG
 	nTokensBack = 0;
  #endif
+	lineNum = 1;
+	lexingUnit = false;
 }


@ -156,9 +309,536 @@ void JS::Lexer::unget()
 }


+// Throw a SyntaxError Exception located at the backUp-th last character read by the Reader:
+// backUp 0 means the next character the Reader would return, backUp 1 means the character it
+// returned most recently, and so forth.
+// The exception carries the widened message, the Reader's source description, the current line
+// number, the error's offset within the line, and the text of the line up to (not including)
+// its line break or the end of input.
+void JS::Lexer::syntaxError(const char *message, uint backUp)
+{
+	// Rewind to the offending character and note where it sits in the current line.
+	reader.unget(backUp);
+	uint32 errorPos = reader.charPos();
+
+	// Read forward to the end of the line so that the whole line can be quoted.
+	for (;;) {
+		char16orEOF c = reader.get();
+		if (c == char16eof || isLineBreak(char16orEOFToChar16(c)))
+			break;
+	}
+	reader.unget();		// Step back over the line break or end-of-input marker
+
+	uint32 lineEnd = reader.charPos();
+	throw Exception(Exception::SyntaxError, widenCString(message), reader.sourceFile(), lineNum, errorPos,
+					reader.extract(0, lineEnd));
+}
+
+
+// Read and return the next character from the reader, skipping any Unicode format-control (Cf)
+// characters.  Values below firstFormatChar are returned directly; everything else (including
+// char16eof once converted to uint32, if it is that large) is filtered by internalGetChar().
+inline JS::char16orEOF JS::Lexer::getChar()
+{
+	const char16orEOF c = reader.get();
+	if (static_cast<uint32>(c) < firstFormatChar)
+		return c;
+	return internalGetChar(c);
+}
+
+// Slow path of getChar().  ch has already been read from the reader; while it is a format
+// character, replace it with the next character read.  Return the first non-format character.
+JS::char16orEOF JS::Lexer::internalGetChar(char16orEOF ch)
+{
+	for (;;) {
+		if (!isFormat(char16orEOFToChar16(ch)))
+			return ch;
+		ch = reader.get();
+	}
+}
+
+
+// Return the next character from the reader without consuming it.  Unicode format-control (Cf)
+// characters in front of it are read and discarded.  Values below firstFormatChar are returned
+// directly; everything else is filtered by internalPeekChar().
+inline JS::char16orEOF JS::Lexer::peekChar()
+{
+	const char16orEOF c = reader.peek();
+	if (static_cast<uint32>(c) < firstFormatChar)
+		return c;
+	return internalPeekChar(c);
+}
+
+// Slow path of peekChar().  ch is the character currently being peeked; while it is a format
+// character, consume it and peek at the one after it.  Return the first non-format character,
+// which is left unread.
+JS::char16orEOF JS::Lexer::internalPeekChar(char16orEOF ch)
+{
+	for (;;) {
+		if (!isFormat(char16orEOFToChar16(ch)))
+			return ch;
+		reader.get();
+		ch = reader.peek();
+	}
+}
+
+
+// Conditionally consume one character.  Format-control (Cf) characters in front of the next
+// character are read and discarded by peekChar().  If the next character equals ch, read it and
+// return true; otherwise leave it unread and return false.
+bool JS::Lexer::testChar(char16 ch)
+{
+	if (peekChar() != ch)
+		return false;
+	reader.get();
+	return true;
+}
+
+
+// A backslash has been read.  Read the rest of the escape code.
+// Return the interpreted escaped character.  Throw an exception (via syntaxError) if the escape
+// is not valid.
+// If unicodeOnly is true, allow only the u escape followed by four hex digits.
+//
+// Escapes accepted when unicodeOnly is false:
+//   0 not followed by a decimal digit (yields 0x00); b f n r t v;
+//   x followed by two hex digits; u followed by four hex digits;
+//   any character that is neither alphanumeric nor a line break, which stands for itself.
+char16 JS::Lexer::lexEscape(bool unicodeOnly)
+{
+	char16orEOF ch = getChar();
+	int nDigits;
+
+	if (!unicodeOnly || ch == 'u')
+		switch (ch) {
+		  case '0':
+			// Make sure that the next character isn't a digit.
+			ch = peekChar();
+			if (!isASCIIDecimalDigit(char16orEOFToChar16(ch)))
+				return 0x00;
+			getChar();	// Point to the next character in the error message
+			// Leave the switch to report the error.  Without this break, control fell through
+			// into case 'b' and the invalid escape was silently turned into a backspace.
+			break;
+		  case 'b':
+			return 0x08;
+		  case 'f':
+			return 0x0C;
+		  case 'n':
+			return 0x0A;
+		  case 'r':
+			return 0x0D;
+		  case 't':
+			return 0x09;
+		  case 'v':
+			return 0x0B;
+		  case 'x':
+			nDigits = 2;
+			goto lexHex;
+		  case 'u':
+			nDigits = 4;
+		  lexHex:
+			{
+				// Accumulate exactly nDigits hex digits, most significant first.
+				uint32 n = 0;
+				while (nDigits--) {
+					ch = getChar();
+					uint digit;
+					if (!isASCIIHexDigit(char16orEOFToChar16(ch), digit))
+						goto error;
+					n = (n << 4) | digit;
+				}
+				return char16(n);
+			}
+		  default:
+			// Any other non-alphanumeric, non-line-break character escapes to itself.
+			if (ch != char16eof) {
+				CharInfo chi(char16orEOFToChar16(ch));
+				if (!isAlphanumeric(chi) && !isLineBreak(chi))
+					return char16orEOFToChar16(ch);
+			}
+		}
+  error:
+	syntaxError("Bad escape code");
+	return 0;	// Not reached; syntaxError always throws
+}
+
+
+// Read an identifier into s, stopping in front of the first character that cannot be part of
+// it.  Return true if at least one escape code was encountered.
+// Only the first character is held to the identifier-leading rule; if allowLeadingDigit is
+// true, even the first character merely has to be an identifier-continuing character.
+// An escape that decodes to a character not allowed at its position is a syntax error.
+bool JS::Lexer::lexIdentifier(String &s, bool allowLeadingDigit)
+{
+	bool sawEscape = false;
+
+	reader.beginRecording(s);
+	for (;;) {
+		const char16orEOF raw = getChar();
+		const bool escaped = raw == '\\';
+		char16orEOF decoded = raw;
+		if (escaped) {
+			decoded = lexEscape(true);
+			sawEscape = true;
+		}
+
+		CharInfo info(char16orEOFToChar16(decoded));
+		const bool accepted = allowLeadingDigit ? isIdContinuing(info) : isIdLeading(info);
+		if (!accepted) {
+			if (escaped)
+				syntaxError("Identifier escape expands into non-identifier character");
+			else
+				reader.unget();		// Leave the terminating character for the next token
+			break;
+		}
+		reader.recordChar(char16orEOFToChar16(decoded));
+		allowLeadingDigit = true;	// Every character after the first may be a continuing one
+	}
+	reader.endRecording();
+	return sawEscape;
+}
+
+
+// Read a numeric literal and store its characters in nextToken->chars.
+// NOTE(review): the contract also mentions nextToken->value, but nothing in this function
+// assigns it yet.
+// Return true if the numeric literal is followed by a unit, but don't read the unit yet.
+//
+// Forms handled below:
+//   - 0x or 0X followed by one or more hex digits.  If no hex digit follows the x, only the 0
+//     is taken and the x is left unread.
+//   - Decimal digits with at most one '.', optionally followed by an exponent: e or E, an
+//     optional sign, and one or more decimal digits.  If no digit follows, the e is left unread.
+//   - A leading 0 immediately followed by another decimal digit is a syntax error.
+bool JS::Lexer::lexNumeral()
+{
+	int hasDecimalPoint = 0;
+	String s;
+	uint digit;
+
+	reader.beginRecording(s);
+	char16orEOF ch = getChar();
+	if (ch == '0') {
+		reader.recordChar('0');
+		ch = getChar();
+		// Clearing bit 0x20 folds 'x' onto 'X'.  The parentheses are required: == binds more
+		// tightly than &, so the unparenthesized form always evaluated to 0.
+		if ((ch & ~0x20) == 'X') {
+			uint32 pos = reader.charPos();
+			char16orEOF ch2 = getChar();
+			if (isASCIIHexDigit(char16orEOFToChar16(ch2), digit)) {
+				reader.recordChar(char16orEOFToChar16(ch));
+				do {
+					reader.recordChar(char16orEOFToChar16(ch2));
+					ch2 = getChar();
+				} while (isASCIIHexDigit(char16orEOFToChar16(ch2), digit));
+				ch = ch2;
+			} else
+				reader.backUpTo(pos);	// Not a hex literal; ch is still the x
+			goto done;
+		} else if (isASCIIDecimalDigit(char16orEOFToChar16(ch))) {
+			syntaxError("Numeric constant syntax error");
+		}
+	}
+	// Integer and fraction digits; only the first '.' is accepted.
+	while (isASCIIDecimalDigit(char16orEOFToChar16(ch)) || (ch == '.' && !hasDecimalPoint++)) {
+		reader.recordChar(char16orEOFToChar16(ch));
+		ch = getChar();
+	}
+	// Same case-folding test as above, for 'e' and 'E'.
+	if ((ch & ~0x20) == 'E') {
+		uint32 pos = reader.charPos();
+		char16orEOF ch2 = getChar();
+		char16 sign = 0;
+		if (ch2 == '+' || ch2 == '-') {
+			sign = char16orEOFToChar16(ch2);
+			ch2 = getChar();
+		}
+		if (isASCIIDecimalDigit(char16orEOFToChar16(ch2))) {
+			reader.recordChar(char16orEOFToChar16(ch));
+			if (sign)
+				reader.recordChar(sign);
+			do {
+				reader.recordChar(char16orEOFToChar16(ch2));
+				ch2 = getChar();
+			} while (isASCIIDecimalDigit(char16orEOFToChar16(ch2)));
+			ch = ch2;
+		} else
+			reader.backUpTo(pos);	// Not an exponent; ch is still the e
+	}
+	
+  done:
+	// At this point the reader is just past the character ch, which is the first non-formatting character
+	// that is not part of the number.
+	reader.endRecording();
+	nextToken->setChars(s);
+	reader.unget();
+	ASSERT(ch == reader.peek());
+	return isIdContinuing(char16orEOFToChar16(ch)) || ch == '\\';
+}
+
+
+// Read the body of a string literal and return it as a String.
+// The opening quote has already been read; separator is that quote character, and the literal
+// ends at the next unescaped occurrence of it (which is consumed but not stored).
+// Format-control characters are dropped, escapes are replaced by the characters they denote,
+// and reaching a line break or the end of input first is a syntax error.
+JS::String JS::Lexer::lexString(char16 separator)
+{
+	String contents;
+
+	reader.beginRecording(contents);
+	for (;;) {
+		char16orEOF c = reader.get();
+		if (c == separator)
+			break;
+
+		CharInfo info(char16orEOFToChar16(c));
+		if (isFormat(info))
+			continue;
+
+		if (c == '\\')
+			c = lexEscape(false);
+		else if (c == char16eof || isLineBreak(info))
+			syntaxError("Unterminated string literal");
+		reader.recordChar(char16orEOFToChar16(c));
+	}
+	reader.endRecording();
+	return contents;
+}
+
+
+// Read a regular expression literal.  Store the regular expression body (the characters between
+// the slashes, with backslashes kept as written) in nextToken->identifier and the flags that
+// follow the closing slash in nextToken->chars.
+// The opening slash has already been read.
+// Reaching a line break or the end of input before the closing slash is a syntax error.
+void JS::Lexer::lexRegExp()
+{
+	String s;
+	char16orEOF prevCh = 0;
+
+	reader.beginRecording(s);
+	while (true) {
+		char16orEOF ch = getChar();
+		CharInfo chi(char16orEOFToChar16(ch));
+		if (ch == char16eof || isLineBreak(chi))
+			syntaxError("Unterminated regular expression literal");
+		if (prevCh == '\\') {
+			// The character right after a backslash is taken literally, so an escaped slash does
+			// not end the literal and an escaped backslash does not escape what follows it.
+			// (Keep this comment from ending in a backslash character: that would splice the
+			// following source line into the comment.)
+			reader.recordChar(char16orEOFToChar16(ch));
+			prevCh = 0;
+		} else if (ch != '/') {
+			reader.recordChar(char16orEOFToChar16(ch));
+			prevCh = ch;
+		} else
+			break;		// Unescaped closing slash
+	}
+	reader.endRecording();
+	nextToken->identifier = &world.identifiers[s];
+	
+	// The flags are lexed like an identifier that is allowed to start with a digit.
+	String flags;
+	lexIdentifier(flags, true);
+	nextToken->setChars(flags);
+}
+
+
 // Read a token from the Reader and store it at *nextToken.
 // If the Reader reached the end of file, store a Token whose Kind is End.
+//
+// preferRegExp resolves the meaning of a / that does not start a comment: if true, it begins a
+// regular expression literal; if false, it is the operator / or /=.
+// White space, format-control characters, and comments are skipped.  The token's lineBreak
+// field is set if a line break was crossed while skipping them.
+// Throws a syntax error (via syntaxError) on an unterminated /* comment, on a character that
+// matches none of the cases below, or on any error raised by the lex* helpers.
 void JS::Lexer::lexToken(bool preferRegExp)
 {
-}
+	Token &t = *nextToken;
+	t.lineBreak = false;
+	t.identifier = 0;
+	t.chars.reset();
+	t.value = 0;
+	Token::Kind kind;

+  next:
+	char16orEOF ch = reader.get();
+	char16orEOF ch2;
+	CharInfo chi(char16orEOFToChar16(ch));
+
+	switch (cGroup(chi)) {
+      case CharInfo::FormatGroup:
+      case CharInfo::WhiteGroup:
+    	goto next;
+
+      case CharInfo::IdGroup:
+    	t.charPos = reader.charPos() - 1;
+      readIdentifier:
+    	{
+	    	reader.unget();
+	    	String s;
+    		bool hasEscape = lexIdentifier(s, false);
+	    	t.identifier = &world.identifiers[s];
+	    	// An identifier spelled with an escape is never treated as a keyword.
+	    	kind = hasEscape ? Token::Id : t.identifier->tokenKind;
+    	}
+    	break;
+
+      case CharInfo::NonIdGroup:
+      case CharInfo::IdContinueGroup:
+    	// NOTE(review): if ch is char16eof, charPos() is being called after a read past the
+    	// end of input, which Reader::charPos() says is not allowed -- confirm.
+    	t.charPos = reader.charPos() - 1;
+    	switch (ch) {
+		  case '(':
+			kind = Token::OpenParenthesis;	// (
+			break;
+		  case ')':
+			kind = Token::CloseParenthesis;	// )
+			break;
+		  case '[':
+			kind = Token::OpenBracket;		// [
+			break;
+		  case ']':
+			kind = Token::CloseBracket;		// ]
+			break;
+		  case '{':
+			kind = Token::OpenBrace;		// {
+			break;
+		  case '}':
+			kind = Token::CloseBrace;		// }
+			break;
+		  case ',':
+			kind = Token::Comma;			// ,
+			break;
+		  case ';':
+			kind = Token::Semicolon;		// ;
+			break;
+		  case '.':
+			kind = Token::Dot;				// .
+			ch2 = getChar();
+			if (isASCIIDecimalDigit(char16orEOFToChar16(ch2))) {
+				reader.backUpTo(t.charPos);
+				goto number;				// decimal point
+			} else if (ch2 == '.') {
+				kind = Token::DoubleDot;	// ..
+				if (testChar('.'))
+					kind = Token::TripleDot; // ...
+			} else
+				reader.unget();
+			break;
+		  case ':':
+			kind = Token::Colon;			// :
+			if (testChar(':'))
+				kind = Token::DoubleColon;	// ::
+			break;
+		  case '#':
+			kind = Token::Pound;			// #
+			break;
+		  case '@':
+			kind = Token::At;				// @
+			break;
+		  case '?':
+			kind = Token::Question;			// ?
+			break;
+
+		  case '~':
+			kind = Token::Complement;		// ~
+			break;
+		  case '!':
+			kind = Token::Not;				// !
+			if (testChar('=')) {
+				kind = Token::NotEqual;		// !=
+				if (testChar('='))
+					kind = Token::NotIdentical; // !==
+			}
+			break;
+
+		  case '*':
+			kind = Token::Times;			// * *=
+		  tryAssignment:
+			// Relies on each operator's assignment form sitting at the same distance from it
+			// in Token::Kind as TimesEquals does from Times.
+			if (testChar('='))
+				kind = Token::Kind(kind + Token::TimesEquals - Token::Times);
+			break;
+
+		  case '/':
+			kind = Token::Divide;			// /
+			ch = getChar();
+			if (ch == '/') {				// // comment
+				do {
+					ch = reader.get();
+					if (ch == char16eof)
+						goto endOfInput;
+				} while (!isLineBreak(char16orEOFToChar16(ch)));
+				goto endOfLine;
+			} else if (ch == '*') {			// /* comment */
+				ch = 0;
+				do {
+					ch2 = ch;
+					ch = getChar();
+					if (isLineBreak(char16orEOFToChar16(ch))) {
+						reader.beginLine();
+						++lineNum;
+						t.lineBreak = true;
+					}
+					if (ch == char16eof)
+						syntaxError("Unterminated /* comment");
+				} while (ch != '/' || ch2 != '*');
+				goto next;
+			} else {
+				reader.unget();
+				if (preferRegExp) {			// Regular expression
+					kind = Token::RegExp;
+					lexRegExp();
+				} else
+					 goto tryAssignment;	// /=
+			}
+			break;
+
+		  case '%':
+			kind = Token::Modulo;			// %
+			goto tryAssignment;				// %=
+
+		  case '+':
+			kind = Token::Plus;				// +
+			if (testChar('+'))
+				kind = Token::Increment;	// ++
+			else
+				goto tryAssignment;			// +=
+			break;
+
+		  case '-':
+			kind = Token::Minus;			// -
+			ch = getChar();
+			if (ch == '-')
+				kind = Token::Decrement;	// --
+			else if (ch == '>')
+				kind = Token::Arrow;		// ->
+			else {
+				reader.unget();
+				goto tryAssignment;			// -=
+			}
+			break;
+	
+		  case '&':
+			kind = Token::And;				// & && &= &&=
+		  logical:
+			// ch still holds the operator character, so this tests for its doubled form.
+			if (testChar(char16orEOFToChar16(ch)))
+				kind = Token::Kind(kind - Token::And + Token::LogicalAnd);
+			goto tryAssignment;
+		  case '^':
+			kind = Token::Xor;				// ^ ^^ ^= ^^=
+			goto logical;
+		  case '|':
+			kind = Token::Or;				// | || |= ||=
+			goto logical;
+
+		  case '=':
+			kind = Token::Assignment;		// =
+			if (testChar('=')) {
+				kind = Token::Equal;		// ==
+				if (testChar('='))
+					kind = Token::Identical; // ===
+			}
+			break;
+
+		  case '<':
+			kind = Token::LessThan;			// <
+			if (testChar('<')) {
+				kind = Token::LeftShift;	// <<
+				goto tryAssignment;			// <<=
+			}
+		  comparison:
+			if (testChar('='))				// <= >=
+				kind = Token::Kind(kind + Token::LessThanOrEqual - Token::LessThan);
+			break;
+		  case '>':
+			kind = Token::GreaterThan;		// >
+			if (testChar('>')) {
+				kind = Token::RightShift;	// >>
+				if (testChar('>'))
+					kind = Token::LogicalRightShift; // >>>
+				goto tryAssignment;			// >>= >>>=
+			}
+			goto comparison;
+
+		  case '\\':
+			goto readIdentifier;			// An identifier that starts with an escape
+
+		  case '\'':
+		  case '"':
+			kind = Token::Str;				// 'string' "string"
+			t.setChars(lexString(char16orEOFToChar16(ch)));
+			break;
+
+		  case '0':
+		  case '1':
+		  case '2':
+		  case '3':
+		  case '4':
+		  case '5':
+		  case '6':
+		  case '7':
+		  case '8':
+		  case '9':
+			reader.unget();					// Number
+		  number:
+			kind = Token::Num;
+			lexNumeral();
+			break;
+
+		  case char16eof:
+		  endOfInput:
+			kind = Token::End;
+			break;
+
+		  default:
+			// No token starts with this character.  Report it rather than falling out of the
+			// switch with kind uninitialized.
+			syntaxError("Illegal character");
+    	}
+    	break;
+
+      case CharInfo::LineBreakGroup:
+      endOfLine:
+		reader.beginLine();
+		++lineNum;
+		t.lineBreak = true;
+		goto next;
+	}
+	t.kind = kind;
+	t.lineNum = lineNum;
+}
--- a/js2/src/parser.h
+++ b/js2/src/parser.h
@ -21,22 +21,28 @@
 #define parser_h

 #include "utilities.h"
-#include "world.h"

 namespace JavaScript {

+	class StringAtom;
+	class World;
+
 //
 // Reader
 //

 	// A Reader reads Unicode characters from some source -- either a file or a string.
-	// get() returns all of the characters followed by a ueof.
+	// get() returns all of the characters followed by a char16eof.
 	class Reader {
 		const char16 *begin;			// Beginning of current buffer
 		const char16 *p;				// Position in current buffer
 		const char16 *end;				// End of current buffer
-		const char16 *markPos;			// Pointer to mark in current buffer or null if no mark
-		uint32 nGetsPastEnd;			// Number of times ueof has been returned
+		const char16 *lineStart;		// Pointer to start of current line
+		uint32 nGetsPastEnd;			// Number of times char16eof has been returned
+
+		String *recordString;			// String, if any, into which recordChar() records characters
+		const char16 *recordBase;		// Position of last beginRecording() call
+		const char16 *recordPos;		// Position of last recordChar() call; nil if a discrepancy occurred
 		
 	  protected:
 		Reader(): nGetsPastEnd(0) {}
@ -46,54 +52,70 @@ namespace JavaScript {
 	    Reader(const Reader&);			// No copy constructor
 	    void operator=(const Reader&);	// No assignment operator
 	  public:
-	#ifdef DEBUG
-		~Reader() {ASSERT(!markPos);}
-	#endif

-		wint_t get();
-		wint_t peek();
-		void unget();
+		char16orEOF get();
+		char16orEOF peek();
+		void unget(uint32 n = 1);
 		
-		void mark();
-		void unmark();
-		void unmark(String &s);
-		bool marked() const {return markPos;}
+		void beginLine();
+		uint32 charPos() const;
+		void backUpTo(uint32 pos);
+
+		String extract(uint32 begin, uint32 end) const;
+		void beginRecording(String &recordString);
+		void recordChar(char16 ch);
+		String &endRecording();
+		
+		virtual String sourceFile() const = 0; // A description of the source code that caused the error

 	  protected:
 		void setBuffer(const char16 *begin, const char16 *p, const char16 *end);
-		virtual wint_t underflow();
-		wint_t peekUnderflow();
+		virtual char16orEOF underflow();
+		char16orEOF peekUnderflow();
 	};


-	// Get and return the next character or ueof if at end of input.
-	inline wint_t Reader::get()
+	// Get and return the next character, advancing past it; once the current buffer is used up, return the result of underflow() instead (char16eof if at end of input).
+	inline char16orEOF Reader::get()
 	{
 		if (p != end)
 			return *p++;
 		return underflow();
 	}

-	// Return the next character without consuming it.  Return ueof if at end of input.
-	inline wint_t Reader::peek()
+	// Return the next character without consuming it; once the current buffer is used up, return the result of peekUnderflow() instead (char16eof if at end of input).
+	inline char16orEOF Reader::peek()
 	{
 		if (p != end)
 			return *p;
 		return peekUnderflow();
 	}

-	// Mark the current position in the Reader.
-	inline void Reader::mark()
+
+	// Set the beginning of the current line to the current read position; charPos(), backUpTo(), and extract() offsets are relative to it.  unget cannot be subsequently called past this point.
+	inline void Reader::beginLine()
 	{
-		ASSERT(!markPos);
-		markPos = p;
+		lineStart = p;
+	  #ifdef DEBUG
+		recordString = 0;	// Debug builds only: drop the recording target (presumably so a stale recording can be detected -- confirm)
+	  #endif
 	}

-	// Delete the Reader mark.
-	inline void Reader::unmark()
+	// Return the offset of the current read position from the start of the current line (lineStart).  This cannot be called
+	// if the current position is past the end of the input; debug builds assert on that.
+	inline uint32 Reader::charPos() const
 	{
-		ASSERT(markPos);
-		markPos = 0;
+		ASSERT(!nGetsPastEnd);	// Reads past the end are tracked in nGetsPastEnd, which p - lineStart does not reflect
+		return static_cast<uint32>(p - lineStart);
+	}
+
+
+	// Back up to the given character offset relative to the current line.  pos must not lie
+	// beyond the current position in the buffer.  Any reads past the end of the input are
+	// forgotten, so this may be called even after char16eof has been returned.
+	inline void Reader::backUpTo(uint32 pos)
+	{
+		// Compare against p directly instead of calling charPos(): charPos() asserts that no
+		// read went past the end of input, which is exactly the state this function must be
+		// able to undo.
+		ASSERT(pos <= static_cast<uint32>(p - lineStart));
+		p = lineStart + pos;
+		nGetsPastEnd = 0;
 	}


@ -103,14 +125,21 @@ namespace JavaScript {
 		Reader::begin = begin;
 		Reader::p = p;
 		Reader::end = end;
+		lineStart = begin;
+	  #ifdef DEBUG
+		recordString = 0;
+	  #endif
 	}


 	// A Reader that reads from a String.
 	class StringReader: public Reader {
 		const String str;
+		const String source;			// NOTE(review): presumably the description handed back by sourceFile() -- confirm against the constructor
+
 	  public:
-		StringReader(const String &s);
+		StringReader(const String &s, const String &source);	// s supplies the characters to read; source accompanies them (see the member above)
+		String sourceFile() const;		// Implements the pure virtual Reader::sourceFile()
 	};


@ -122,7 +151,6 @@ namespace JavaScript {
 	  public:
 		enum Kind {
 			End,						// End of token stream
-			Error,						// Lexer error

 			Id,							// Non-keyword identifier (may be same as a keyword if it contains an escape code)
 			Num,						// Numeral
@ -165,12 +193,12 @@ namespace JavaScript {
 			LogicalAnd,					// &&
 			LogicalXor,					// ^^
 			LogicalOr,					// ||
-			And,						// &
+			And,						// &	// These must be at constant offsets from LogicalAnd ... LogicalOr
 			Xor,						// ^
 			Or,							// |

 			Assignment,					// =
-			TimesEquals,				// *=
+			TimesEquals,				// *=	// These must be at constant offsets from Times ... Or
 			DivideEquals,				// /=
 			ModuloEquals,				// %=
 			PlusEquals,					// +=
@ -189,7 +217,7 @@ namespace JavaScript {
 			NotEqual,					// !=
 			LessThan,					// <
 			LessThanOrEqual,			// <=
-			GreaterThan,				// >
+			GreaterThan,				// >	// >, >= must be at constant offsets from <, <=
 			GreaterThanOrEqual,			// >=
 			Identical,					// ===
 			NotIdentical,				// !==
@ -268,13 +296,19 @@ namespace JavaScript {
 		StringAtom *identifier;			// The token's characters (identifiers, keywords, and regular expressions only)
 		auto_ptr<String> chars;			// The token's characters (strings, numbers, and regular expression flags only)
 		float64 value;					// The token's value (numbers only)
+		
+		void setChars(const String &s);
 	};


+	void initKeywords(World &world);
+	
+
 	class Lexer {
 		static const int tokenBufferSize = 3;	// Token lookahead buffer size
 	  public:
 		Reader &reader;
+		World &world;
 	  private:
 		Token tokens[tokenBufferSize];	// Circular buffer of recently read or lookahead tokens
 		Token *nextToken;				// Address of next Token in the circular buffer to be returned by get()
@ -283,16 +317,31 @@ namespace JavaScript {
 		int nTokensBack;				// Number of Tokens on which unget() can be called; these Tokens are beind nextToken
 		bool savedPreferRegExp[tokenBufferSize]; // Circular buffer of saved values of preferRegExp to get() calls
 	  #endif
+		uint32 lineNum;					// Current line number
+		bool lexingUnit;				// True if lexing a unit identifier immediately following a number

 	  public:
-		Lexer(Reader &reader);
+		Lexer(Reader &reader, World &world);
 		
 		Token &get(bool preferRegExp);
 		const Token &peek(bool preferRegExp);
 		void unget();

 	  private:
+		void syntaxError(const char *message, uint backUp = 1);
+		char16orEOF getChar();
+		char16orEOF internalGetChar(char16orEOF ch);
+		char16orEOF peekChar();
+		char16orEOF internalPeekChar(char16orEOF ch);
+		bool testChar(char16 ch);
+
+		char16 lexEscape(bool unicodeOnly);
+		bool lexIdentifier(String &s, bool allowLeadingDigit);
+		bool lexNumeral();
+		String lexString(char16 separator);
+		void lexRegExp();
 		void lexToken(bool preferRegExp);
+	  public:
 	};
 }
 #endif