Bug 1875096 - Part 1: Only apply StepBackToLeadSurrogate for atom matches. r=iain

It's no longer necessary to adjust `lastIndex` before calling into irregexp,
because irregexp was fixed a while ago to correctly handle split surrogate
pairs, cf. `v8::internal::RegExpCompiler::OptionallyStepBackToLeadSurrogate`.

That means we only need to adjust `lastIndex` when performing atom matches,
which allows us to remove `StepBackToLeadSurrogate` from code-gen and move the
corresponding C++ code from "builtin/RegExp.cpp" to "vm/RegExpObject.cpp".

Differential Revision: https://phabricator.services.mozilla.com/D198821
This commit is contained in:
André Bargull 2024-01-18 10:26:02 +00:00
Родитель 16a6dcdaf3
Коммит f33228184b
3 изменённых файлов: 47 добавлений и 120 удалений

Просмотреть файл

@ -19,7 +19,6 @@
#include "js/PropertySpec.h"
#include "js/RegExpFlags.h" // JS::RegExpFlag, JS::RegExpFlags
#include "util/StringBuffer.h"
#include "util/Unicode.h"
#include "vm/Interpreter.h"
#include "vm/JSContext.h"
#include "vm/RegExpObject.h"
@ -1064,28 +1063,6 @@ const JSPropertySpec js::regexp_static_props[] = {
JS_SELF_HOSTED_SYM_GET(species, "$RegExpSpecies", 0),
JS_PS_END};
// Returns true when the code unit at |index| is a trail surrogate and the code
// unit immediately before it is a lead surrogate. |index| must lie strictly
// inside the string (asserted below), so both reads are in bounds.
template <typename CharT>
static bool IsTrailSurrogateWithLeadSurrogateImpl(Handle<JSLinearString*> input,
                                                  size_t index) {
  MOZ_ASSERT(index > 0 && index < input->length());

  // The raw character pointer is only used while |nogc| is alive.
  JS::AutoCheckCannotGC nogc;
  const CharT* units = input->chars<CharT>(nogc);

  if (!unicode::IsTrailSurrogate(units[index])) {
    return false;
  }
  return unicode::IsLeadSurrogate(units[index - 1]);
}
// Checks whether |index| points at the trail half of a surrogate pair in
// |input|. Positions at or before the start of the string, or at or past its
// end, are never considered part of a pair.
static bool IsTrailSurrogateWithLeadSurrogate(Handle<JSLinearString*> input,
                                              int32_t index) {
  const bool insideString = index > 0 && size_t(index) < input->length();
  if (!insideString) {
    return false;
  }

  // Dispatch on the string's character encoding.
  if (input->hasLatin1Chars()) {
    return IsTrailSurrogateWithLeadSurrogateImpl<Latin1Char>(input, index);
  }
  return IsTrailSurrogateWithLeadSurrogateImpl<char16_t>(input, index);
}
/*
* ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad 21.2.5.2.2
* steps 3, 9-14, except 12.a.i, 12.c.i.1.
@ -1122,36 +1099,7 @@ static RegExpRunStatus ExecuteRegExp(JSContext* cx, HandleObject regexp,
/* Steps 4-8 performed by the caller. */
/* Step 10. */
if (reobj->unicode() || reobj->unicodeSets()) {
/*
* ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad
* 21.2.2.2 step 2.
* Let listIndex be the index into Input of the character that was
* obtained from element index of str.
*
* In the spec, pattern match is performed with decoded Unicode code
* points, but our implementation performs it with UTF-16 encoded
* string. In step 2, we should decrement lastIndex (index) if it
* points the trail surrogate that has corresponding lead surrogate.
*
* var r = /\uD83D\uDC38/ug;
* r.lastIndex = 1;
* var str = "\uD83D\uDC38";
* var result = r.exec(str); // pattern match starts from index 0
* print(result.index); // prints 0
*
* Note: this doesn't match the current spec text and result in
* different values for `result.index` under certain conditions.
* However, the spec will change to match our implementation's
* behavior. See https://github.com/tc39/ecma262/issues/128.
*/
if (IsTrailSurrogateWithLeadSurrogate(input, lastIndex)) {
lastIndex--;
}
}
/* Steps 3, 11-14, except 12.a.i, 12.c.i.1. */
/* Steps 3, 10-14, except 12.a.i, 12.c.i.1. */
RegExpRunStatus status =
ExecuteRegExpImpl(cx, res, &re, input, lastIndex, matches);
if (status == RegExpRunStatus_Error) {

Просмотреть файл

@ -1814,70 +1814,6 @@ static Address RegExpPairCountAddress(MacroAssembler& masm,
MatchPairs::offsetOfPairCount());
}
// When the unicode flag is set, if lastIndex points to a trail
// surrogate, we should step back to the corresponding lead surrogate.
// See ExecuteRegExp in builtin/RegExp.cpp for more detail.
//
// Emits code through |masm| that performs this adjustment at run time:
//   regexpShared - register used as the base address from which the regexp
//                  flags are read (at RegExpShared::offsetOfFlags()).
//   input        - register holding the input string.
//   lastIndex    - register holding the start index; decremented by one in
//                  place when it points at the trail half of a surrogate pair,
//                  otherwise left unchanged.
//   temp1, temp2 - scratch registers; both are overwritten.
static void StepBackToLeadSurrogate(MacroAssembler& masm, Register regexpShared,
Register input, Register lastIndex,
Register temp1, Register temp2) {
// Every early-out below jumps here, leaving |lastIndex| untouched.
Label done;
// If neither unicode flag is set, there is nothing to do.
masm.branchTest32(
Assembler::Zero, Address(regexpShared, RegExpShared::offsetOfFlags()),
Imm32(int32_t(JS::RegExpFlag::Unicode | JS::RegExpFlag::UnicodeSets)),
&done);
// If the input is latin1, there can't be any surrogates.
masm.branchLatin1String(input, &done);
// Check if |lastIndex > 0 && lastIndex < input->length()|.
// lastIndex should already have no sign here.
masm.branchTest32(Assembler::Zero, lastIndex, lastIndex, &done);
masm.loadStringLength(input, temp1);
masm.branch32(Assembler::AboveOrEqual, lastIndex, temp1, &done);
// For TrailSurrogateMin ≤ x ≤ TrailSurrogateMax and
// LeadSurrogateMin ≤ x ≤ LeadSurrogateMax, the following
// equations hold.
//
// SurrogateMin ≤ x ≤ SurrogateMax
// <=> SurrogateMin ≤ x ≤ SurrogateMin + 2^10 - 1
// <=> ((x - SurrogateMin) >>> 10) = 0 where >>> is an unsigned-shift
// See Hacker's Delight, section 4-1 for details.
//
// ((x - SurrogateMin) >>> 10) = 0
// <=> floor((x - SurrogateMin) / 1024) = 0
// <=> floor((x / 1024) - (SurrogateMin / 1024)) = 0
// <=> floor(x / 1024) = SurrogateMin / 1024
// <=> floor(x / 1024) * 1024 = SurrogateMin
// <=> (x >>> 10) << 10 = SurrogateMin
// <=> x & ~(2^10 - 1) = SurrogateMin
//
// 0xFC00 is ~(2^10 - 1) truncated to 16 bits, i.e. the mask from the last line.
constexpr char16_t SurrogateMask = 0xFC00;
// The string length in temp1 is no longer needed after the bounds check, so
// the register is reused for the character pointer.
Register charsReg = temp1;
masm.loadStringChars(input, charsReg, CharEncoding::TwoByte);
// Check if input[lastIndex] is trail surrogate.
masm.loadChar(charsReg, lastIndex, temp2, CharEncoding::TwoByte);
masm.and32(Imm32(SurrogateMask), temp2);
masm.branch32(Assembler::NotEqual, temp2, Imm32(unicode::TrailSurrogateMin),
&done);
// Check if input[lastIndex-1] is lead surrogate.
// The byte offset of -sizeof(char16_t) selects the preceding code unit; the
// zero check on lastIndex above keeps this read inside the string.
masm.loadChar(charsReg, lastIndex, temp2, CharEncoding::TwoByte,
-int32_t(sizeof(char16_t)));
masm.and32(Imm32(SurrogateMask), temp2);
masm.branch32(Assembler::NotEqual, temp2, Imm32(unicode::LeadSurrogateMin),
&done);
// Move lastIndex back to lead surrogate.
masm.sub32(Imm32(1), lastIndex);
masm.bind(&done);
}
static void UpdateRegExpStatics(MacroAssembler& masm, Register regexp,
Register input, Register lastIndex,
Register staticsReg, Register temp1,
@ -2067,9 +2003,6 @@ static bool PrepareAndExecuteRegExp(MacroAssembler& masm, Register regexp,
masm.branchTestUndefined(Assembler::Equal, sharedSlot, failure);
masm.unboxNonDouble(sharedSlot, regexpReg, JSVAL_TYPE_PRIVATE_GCTHING);
// Update lastIndex if necessary.
StepBackToLeadSurrogate(masm, regexpReg, input, lastIndex, temp2, temp3);
// Handle Atom matches
Label notAtom, checkSuccess;
masm.branchPtr(Assembler::Equal,

Просмотреть файл

@ -23,6 +23,7 @@
#include "js/RegExp.h"
#include "js/RegExpFlags.h" // JS::RegExpFlags
#include "util/StringBuffer.h"
#include "util/Unicode.h"
#include "vm/MatchPairs.h"
#include "vm/PlainObject.h"
#include "vm/RegExpStatics.h"
@ -782,12 +783,57 @@ bool RegExpShared::markedForTierUp() const {
return ticks_ == 0;
}
// Adjusts a match start position for regular expressions that have either
// unicode flag set: when |index| lands on the trail surrogate of a surrogate
// pair, the position of the matching lead surrogate is returned instead. In
// every other case |index| is returned unchanged.
static size_t StepBackToLeadSurrogate(const JSLinearString* input,
                                      size_t index) {
  // Only a position strictly inside a two-byte string can refer to the trail
  // surrogate of a surrogate pair.
  const bool insideString = index > 0 && index < input->length();
  if (!insideString || input->hasLatin1Chars()) {
    return index;
  }

  /*
   * ES 2017 draft rev 6a13789aa9e7c6de4e96b7d3e24d9e6eba6584ad
   * 21.2.2.2 step 2.
   *   Let listIndex be the index into Input of the character that was obtained
   *   from element index of str.
   *
   * The spec performs pattern matching on decoded Unicode code points, whereas
   * this implementation matches on UTF-16 encoded strings. For step 2 we
   * therefore decrement lastIndex (index) whenever it points to a trail
   * surrogate that has a corresponding lead surrogate.
   *
   *   var r = /\uD83D\uDC38/ug;
   *   r.lastIndex = 1;
   *   var str = "\uD83D\uDC38";
   *   var result = r.exec(str); // pattern match starts from index 0
   *   print(result.index);      // prints 0
   *
   * Note: This doesn't match the current spec text and results in different
   * values for `result.index` under certain conditions. However, the spec will
   * change to match our implementation's behavior.
   * See https://github.com/tc39/ecma262/issues/128.
   */
  JS::AutoCheckCannotGC nogc;
  const auto* units = input->twoByteChars(nogc);

  const bool splitsSurrogatePair = unicode::IsTrailSurrogate(units[index]) &&
                                   unicode::IsLeadSurrogate(units[index - 1]);
  return splitsSurrogatePair ? index - 1 : index;
}
static RegExpRunStatus ExecuteAtomImpl(RegExpShared* re, JSLinearString* input,
size_t start, MatchPairs* matches) {
MOZ_ASSERT(re->pairCount() == 1);
size_t length = input->length();
size_t searchLength = re->patternAtom()->length();
if (re->unicode() || re->unicodeSets()) {
start = StepBackToLeadSurrogate(input, start);
}
if (re->sticky()) {
// First part checks size_t overflow.
if (searchLength + start < searchLength || searchLength + start > length) {