зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1875096 - Part 1: Only apply StepBackToLeadSurrogate for atom matches. r=iain
It's no longer necessary to adjust `lastIndex` before calling into irregexp, because irregexp was fixed a while ago to correctly handle split surrogate pairs, cf. `v8::internal::RegExpCompiler::OptionallyStepBackToLeadSurrogate`. That means we only need to adjust `lastIndex` when performing atom matches, which allows us to remove `StepBackToLeadSurrogate` from code-gen and to move the corresponding C++ code from "builtin/RegExp.cpp" to "vm/RegExpObject.cpp". Differential Revision: https://phabricator.services.mozilla.com/D198821
This commit is contained in:
Родитель
16a6dcdaf3
Коммит
f33228184b
|
@ -19,7 +19,6 @@
|
|||
#include "js/PropertySpec.h"
|
||||
#include "js/RegExpFlags.h" // JS::RegExpFlag, JS::RegExpFlags
|
||||
#include "util/StringBuffer.h"
|
||||
#include "util/Unicode.h"
|
||||
#include "vm/Interpreter.h"
|
||||
#include "vm/JSContext.h"
|
||||
#include "vm/RegExpObject.h"
|
||||
|
@ -1064,28 +1063,6 @@ const JSPropertySpec js::regexp_static_props[] = {
|
|||
JS_SELF_HOSTED_SYM_GET(species, "$RegExpSpecies", 0),
|
||||
JS_PS_END};
|
||||
|
||||
template <typename CharT>
|
||||
static bool IsTrailSurrogateWithLeadSurrogateImpl(Handle<JSLinearString*> input,
|
||||
size_t index) {
|
||||
JS::AutoCheckCannotGC nogc;
|
||||
MOZ_ASSERT(index > 0 && index < input->length());
|
||||
const CharT* inputChars = input->chars<CharT>(nogc);
|
||||
|
||||
return unicode::IsTrailSurrogate(inputChars[index]) &&
|
||||
unicode::IsLeadSurrogate(inputChars[index - 1]);
|
||||
}
|
||||
|
||||
static bool IsTrailSurrogateWithLeadSurrogate(Handle<JSLinearString*> input,
|
||||
int32_t index) {
|
||||
if (index <= 0 || size_t(index) >= input->length()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return input->hasLatin1Chars()
|
||||
? IsTrailSurrogateWithLeadSurrogateImpl<Latin1Char>(input, index)
|
||||
: IsTrailSurrogateWithLeadSurrogateImpl<char16_t>(input, index);
|
||||
}
|
||||
|
||||
/*
|
||||
* ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad 21.2.5.2.2
|
||||
* steps 3, 9-14, except 12.a.i, 12.c.i.1.
|
||||
|
@ -1122,36 +1099,7 @@ static RegExpRunStatus ExecuteRegExp(JSContext* cx, HandleObject regexp,
|
|||
|
||||
/* Steps 4-8 performed by the caller. */
|
||||
|
||||
/* Step 10. */
|
||||
if (reobj->unicode() || reobj->unicodeSets()) {
|
||||
/*
|
||||
* ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad
|
||||
* 21.2.2.2 step 2.
|
||||
* Let listIndex be the index into Input of the character that was
|
||||
* obtained from element index of str.
|
||||
*
|
||||
* In the spec, pattern match is performed with decoded Unicode code
|
||||
* points, but our implementation performs it with UTF-16 encoded
|
||||
* string. In step 2, we should decrement lastIndex (index) if it
|
||||
* points the trail surrogate that has corresponding lead surrogate.
|
||||
*
|
||||
* var r = /\uD83D\uDC38/ug;
|
||||
* r.lastIndex = 1;
|
||||
* var str = "\uD83D\uDC38";
|
||||
* var result = r.exec(str); // pattern match starts from index 0
|
||||
* print(result.index); // prints 0
|
||||
*
|
||||
* Note: this doesn't match the current spec text and result in
|
||||
* different values for `result.index` under certain conditions.
|
||||
* However, the spec will change to match our implementation's
|
||||
* behavior. See https://github.com/tc39/ecma262/issues/128.
|
||||
*/
|
||||
if (IsTrailSurrogateWithLeadSurrogate(input, lastIndex)) {
|
||||
lastIndex--;
|
||||
}
|
||||
}
|
||||
|
||||
/* Steps 3, 11-14, except 12.a.i, 12.c.i.1. */
|
||||
/* Steps 3, 10-14, except 12.a.i, 12.c.i.1. */
|
||||
RegExpRunStatus status =
|
||||
ExecuteRegExpImpl(cx, res, &re, input, lastIndex, matches);
|
||||
if (status == RegExpRunStatus_Error) {
|
||||
|
|
|
@ -1814,70 +1814,6 @@ static Address RegExpPairCountAddress(MacroAssembler& masm,
|
|||
MatchPairs::offsetOfPairCount());
|
||||
}
|
||||
|
||||
// When the unicode flag is set, if lastIndex points to a trail
|
||||
// surrogate, we should step back to the corresponding lead surrogate.
|
||||
// See ExecuteRegExp in builtin/RegExp.cpp for more detail.
|
||||
static void StepBackToLeadSurrogate(MacroAssembler& masm, Register regexpShared,
|
||||
Register input, Register lastIndex,
|
||||
Register temp1, Register temp2) {
|
||||
Label done;
|
||||
|
||||
// If neither unicode flag is set, there is nothing to do.
|
||||
masm.branchTest32(
|
||||
Assembler::Zero, Address(regexpShared, RegExpShared::offsetOfFlags()),
|
||||
Imm32(int32_t(JS::RegExpFlag::Unicode | JS::RegExpFlag::UnicodeSets)),
|
||||
&done);
|
||||
|
||||
// If the input is latin1, there can't be any surrogates.
|
||||
masm.branchLatin1String(input, &done);
|
||||
|
||||
// Check if |lastIndex > 0 && lastIndex < input->length()|.
|
||||
// lastIndex should already have no sign here.
|
||||
masm.branchTest32(Assembler::Zero, lastIndex, lastIndex, &done);
|
||||
masm.loadStringLength(input, temp1);
|
||||
masm.branch32(Assembler::AboveOrEqual, lastIndex, temp1, &done);
|
||||
|
||||
// For TrailSurrogateMin ≤ x ≤ TrailSurrogateMax and
|
||||
// LeadSurrogateMin ≤ x ≤ LeadSurrogateMax, the following
|
||||
// equations hold.
|
||||
//
|
||||
// SurrogateMin ≤ x ≤ SurrogateMax
|
||||
// <> SurrogateMin ≤ x ≤ SurrogateMin + 2^10 - 1
|
||||
// <> ((x - SurrogateMin) >>> 10) = 0 where >>> is an unsigned-shift
|
||||
// See Hacker's Delight, section 4-1 for details.
|
||||
//
|
||||
// ((x - SurrogateMin) >>> 10) = 0
|
||||
// <> floor((x - SurrogateMin) / 1024) = 0
|
||||
// <> floor((x / 1024) - (SurrogateMin / 1024)) = 0
|
||||
// <> floor(x / 1024) = SurrogateMin / 1024
|
||||
// <> floor(x / 1024) * 1024 = SurrogateMin
|
||||
// <> (x >>> 10) << 10 = SurrogateMin
|
||||
// <> x & ~(2^10 - 1) = SurrogateMin
|
||||
|
||||
constexpr char16_t SurrogateMask = 0xFC00;
|
||||
|
||||
Register charsReg = temp1;
|
||||
masm.loadStringChars(input, charsReg, CharEncoding::TwoByte);
|
||||
|
||||
// Check if input[lastIndex] is trail surrogate.
|
||||
masm.loadChar(charsReg, lastIndex, temp2, CharEncoding::TwoByte);
|
||||
masm.and32(Imm32(SurrogateMask), temp2);
|
||||
masm.branch32(Assembler::NotEqual, temp2, Imm32(unicode::TrailSurrogateMin),
|
||||
&done);
|
||||
|
||||
// Check if input[lastIndex-1] is lead surrogate.
|
||||
masm.loadChar(charsReg, lastIndex, temp2, CharEncoding::TwoByte,
|
||||
-int32_t(sizeof(char16_t)));
|
||||
masm.and32(Imm32(SurrogateMask), temp2);
|
||||
masm.branch32(Assembler::NotEqual, temp2, Imm32(unicode::LeadSurrogateMin),
|
||||
&done);
|
||||
|
||||
// Move lastIndex back to lead surrogate.
|
||||
masm.sub32(Imm32(1), lastIndex);
|
||||
|
||||
masm.bind(&done);
|
||||
}
|
||||
|
||||
static void UpdateRegExpStatics(MacroAssembler& masm, Register regexp,
|
||||
Register input, Register lastIndex,
|
||||
Register staticsReg, Register temp1,
|
||||
|
@ -2067,9 +2003,6 @@ static bool PrepareAndExecuteRegExp(MacroAssembler& masm, Register regexp,
|
|||
masm.branchTestUndefined(Assembler::Equal, sharedSlot, failure);
|
||||
masm.unboxNonDouble(sharedSlot, regexpReg, JSVAL_TYPE_PRIVATE_GCTHING);
|
||||
|
||||
// Update lastIndex if necessary.
|
||||
StepBackToLeadSurrogate(masm, regexpReg, input, lastIndex, temp2, temp3);
|
||||
|
||||
// Handle Atom matches
|
||||
Label notAtom, checkSuccess;
|
||||
masm.branchPtr(Assembler::Equal,
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include "js/RegExp.h"
|
||||
#include "js/RegExpFlags.h" // JS::RegExpFlags
|
||||
#include "util/StringBuffer.h"
|
||||
#include "util/Unicode.h"
|
||||
#include "vm/MatchPairs.h"
|
||||
#include "vm/PlainObject.h"
|
||||
#include "vm/RegExpStatics.h"
|
||||
|
@ -782,12 +783,57 @@ bool RegExpShared::markedForTierUp() const {
|
|||
return ticks_ == 0;
|
||||
}
|
||||
|
||||
// When either unicode flag is set and |index| points to a trail surrogate,
|
||||
// step back to the corresponding lead surrogate.
|
||||
static size_t StepBackToLeadSurrogate(const JSLinearString* input,
|
||||
size_t index) {
|
||||
// |index| must be a position within a two-byte string, otherwise it can't
|
||||
// point to the trail surrogate of a surrogate pair.
|
||||
if (index == 0 || index >= input->length() || input->hasLatin1Chars()) {
|
||||
return index;
|
||||
}
|
||||
|
||||
/*
|
||||
* ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad
|
||||
* 21.2.2.2 step 2.
|
||||
* Let listIndex be the index into Input of the character that was obtained
|
||||
* from element index of str.
|
||||
*
|
||||
* In the spec, pattern match is performed with decoded Unicode code points,
|
||||
* but our implementation performs it with UTF-16 encoded strings. In step 2,
|
||||
* we should decrement lastIndex (index) if it points to a trail surrogate
|
||||
* that has a corresponding lead surrogate.
|
||||
*
|
||||
* var r = /\uD83D\uDC38/ug;
|
||||
* r.lastIndex = 1;
|
||||
* var str = "\uD83D\uDC38";
|
||||
* var result = r.exec(str); // pattern match starts from index 0
|
||||
* print(result.index); // prints 0
|
||||
*
|
||||
* Note: This doesn't match the current spec text and results in different
|
||||
* values for `result.index` under certain conditions. However, the spec will
|
||||
* change to match our implementation's behavior.
|
||||
* See https://github.com/tc39/ecma262/issues/128.
|
||||
*/
|
||||
JS::AutoCheckCannotGC nogc;
|
||||
const auto* chars = input->twoByteChars(nogc);
|
||||
if (unicode::IsTrailSurrogate(chars[index]) &&
|
||||
unicode::IsLeadSurrogate(chars[index - 1])) {
|
||||
index--;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
static RegExpRunStatus ExecuteAtomImpl(RegExpShared* re, JSLinearString* input,
|
||||
size_t start, MatchPairs* matches) {
|
||||
MOZ_ASSERT(re->pairCount() == 1);
|
||||
size_t length = input->length();
|
||||
size_t searchLength = re->patternAtom()->length();
|
||||
|
||||
if (re->unicode() || re->unicodeSets()) {
|
||||
start = StepBackToLeadSurrogate(input, start);
|
||||
}
|
||||
|
||||
if (re->sticky()) {
|
||||
// First part checks size_t overflow.
|
||||
if (searchLength + start < searchLength || searchLength + start > length) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче