Bug 1628835 - Part 3: Add interpreter support

The irregexp compiler takes the AST produced by the parser, compiles it down to
a more efficient internal representation, then uses a 'macroassembler' to
generate code.

The generated code can either be bytecode (which is then fed into the
interpreter) or jitcode (which can be executed directly). This patch
gets the infrastructure set up and handles the former case.

CompilePattern is based heavily on V8's `RegExpImpl::Compile`.

When we import a future change to irregexp, we'll be able to pass
`matches->pairsRaw()` directly into `MatchForCallFromRuntime`, and
the interpreter will fill it in for us.
In the old engine, we could handle interrupts in the middle of the interpreter.
If we hit an urgent interrupt in compiled code, we would generate bytecode
and fall back to the interpreter (This is what all the `ForceByteCode` machinery
in RegExpObject.cpp is about. It was added in bug 1077514.)
That won't work in the new regexp version. V8 _does_ allow interrupts during
regexp execution, but only by jumping through some scary hoops to "manually
relocate unhandlified references" afterwards... that just sounds like a really
bad idea.
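Instead, when execution is cut short by an urgent interrupt, we handle the
interrupt in the caller and re-run the regexp from the start. We retry up to
4 times before giving up and reporting an error, which seems a reasonable amount.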
new-regexp
Moonchild 12 months ago committed by roytam1
parent 81273b24db
commit f99aefaf66
  1. 204
      js/src/new-regexp/RegExpAPI.cpp
  2. 5
      js/src/new-regexp/RegExpAPI.h
  3. 39
      js/src/new-regexp/regexp-shim.h
  4. 59
      js/src/vm/RegExpObject.cpp
  5. 11
      js/src/vm/RegExpObject.h

@ -12,7 +12,9 @@
#include "mozilla/ArrayUtils.h"
#include "mozilla/Casting.h"
#include "new-regexp/regexp-bytecode-generator.h"
#include "new-regexp/regexp-compiler.h"
#include "new-regexp/regexp-interpreter.h"
#include "new-regexp/regexp-macro-assembler-arch.h"
#include "new-regexp/regexp-parser.h"
#include "new-regexp/regexp-shim.h"
@ -21,6 +23,7 @@
#include "vm/ErrorReporting.h"
#include "vm/RegExpObject.h"
#include "vm/StringBuffer.h"
#include "vm/MatchPairs.h"
namespace js {
namespace irregexp {
@ -30,12 +33,21 @@ using namespace mozilla;
using frontend::TokenStream;
using v8::internal::FlatStringReader;
using v8::internal::HandleScope;
using v8::internal::IrregexpInterpreter;
using v8::internal::NativeRegExpMacroAssembler;
using v8::internal::RegExpBytecodeGenerator;
using v8::internal::RegExpCompileData;
using v8::internal::RegExpCompiler;
using v8::internal::RegExpError;
using v8::internal::RegExpMacroAssembler;
using v8::internal::RegExpNode;
using v8::internal::RegExpParser;
using v8::internal::Zone;
using V8HandleString = v8::internal::Handle<v8::internal::String>;
using V8HandleRegExp = v8::internal::Handle<v8::internal::JSRegExp>;
using namespace v8::internal::regexp_compiler_constants;
static uint32_t ErrorNumber(RegExpError err) {
@ -216,7 +228,7 @@ static bool CheckPatternSyntaxImpl(JSContext* cx, FlatStringReader* pattern,
LifoAllocScope allocScope(&cx->tempLifoAlloc());
Zone zone(allocScope.alloc());
v8::internal::HandleScope handleScope(cx->isolate);
HandleScope handleScope(cx->isolate);
v8::internal::JSRegExp::Flags passflags = static_cast<uint8_t>(flags.value());
return RegExpParser::ParseRegExp(cx->isolate, &zone, pattern, passflags, result);
}
@ -284,6 +296,76 @@ static bool UseBoyerMoore(HandleAtom pattern, JS::AutoCheckCannotGC& nogc) {
return HasFewDifferentCharacters(pattern->twoByteChars(nogc), length);
}
// Sample character frequency information for use in Boyer-Moore.
// Feed a window of at most kSampleSize characters, centred on the middle of
// |input|, to the compiler's frequency collator (used for Boyer-Moore).
static void SampleCharacters(HandleLinearString input,
                             RegExpCompiler& compiler) {
  static const int kSampleSize = 128;

  FlatStringReader reader(input);
  const int inputLength = reader.length();

  // Start half a window before the midpoint, clamped to the beginning of the
  // string when the input is shorter than the window.
  const int windowStart = std::max(0, (inputLength - kSampleSize) / 2);
  // Stop after kSampleSize characters or at the end of the input, whichever
  // comes first.
  const int windowEnd = std::min(inputLength, windowStart + kSampleSize);

  for (int pos = windowStart; pos < windowEnd; ++pos) {
    compiler.frequency_collator()->CountCharacter(reader.Get(pos));
  }
}
// Build the node graph that will be handed to the compiler for |re|.
//
// The parsed tree in |data.tree| is wrapped as capture 0, optionally preceded
// by a `.*?`-style loop when the pattern is neither anchored at the start nor
// sticky, and then post-processed depending on |isLatin1| and the regexp's
// unicode/global/sticky flags.
//
// All new nodes are allocated in |zone|. The returned pointer is never
// nullptr: if post-processing leaves no node, an EndNode of kind BACKTRACK is
// returned instead.
static RegExpNode* WrapBody(RegExpShared* re,
RegExpCompiler& compiler, RegExpCompileData& data,
Zone* zone, bool isLatin1) {
using v8::internal::ChoiceNode;
using v8::internal::EndNode;
using v8::internal::GuardedAlternative;
using v8::internal::RegExpCapture;
using v8::internal::RegExpCharacterClass;
using v8::internal::RegExpQuantifier;
using v8::internal::RegExpTree;
using v8::internal::TextNode;
// Wrap the whole pattern as capture index 0, continuing to the compiler's
// accept node on success.
RegExpNode* captured_body =
RegExpCapture::ToNode(data.tree, 0, &compiler, compiler.accept());
RegExpNode* node = captured_body;
if (!data.tree->IsAnchoredAtStart() && !re->sticky()) {
// Add a .*? at the beginning, outside the body capture, unless
// this expression is anchored at the beginning or sticky.
v8::internal::JSRegExp::Flags default_flags;
// Quantifier bounds are 0..kInfinity. NOTE(review): the `false` argument is
// presumably the "greedy" flag (matching the `.*?` described above), and the
// '*' character class presumably matches any character -- confirm against
// the V8 declarations.
RegExpNode* loop_node = RegExpQuantifier::ToNode(
0, RegExpTree::kInfinity, false,
new (zone) RegExpCharacterClass('*', default_flags), &compiler,
captured_body, data.contains_anchor);
if (data.contains_anchor) {
// Unroll loop once, to take care of the case that might start
// at the start of input.
// The choice has two alternatives: try the body right away, or consume one
// character of the '*' class and then continue with the loop.
ChoiceNode* first_step_node = new (zone) ChoiceNode(2, zone);
first_step_node->AddAlternative(GuardedAlternative(captured_body));
first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
new (zone) RegExpCharacterClass('*', default_flags), false,
loop_node)));
node = first_step_node;
} else {
node = loop_node;
}
}
if (isLatin1) {
// FilterOneByte may return nullptr, hence the null checks below.
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
// Do it again to propagate the new nodes to places where they were not
// put because they had not been calculated yet.
if (node != nullptr) {
node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
}
} else if (re->unicode() && (re->global() || re->sticky())) {
// Non-Latin-1 input with a unicode regexp that is global or sticky: let the
// compiler add its optional step-back-to-lead-surrogate handling, passing
// the regexp's own flags.
v8::internal::JSRegExp::Flags passflags = static_cast<uint8_t>(re->getFlags().value());
node = RegExpCompiler::OptionallyStepBackToLeadSurrogate(&compiler, node, passflags);
}
// If no node is left (e.g. FilterOneByte returned nullptr), substitute an
// EndNode of kind BACKTRACK so that callers always get a valid node.
if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
return node;
}
bool CompilePattern(JSContext* cx, RegExpShared* re,
HandleLinearString input) {
RootedAtom pattern(cx, re->getSource());
@ -328,8 +410,126 @@ bool CompilePattern(JSContext* cx, RegExpShared* re,
return true;
}
}
// Add one to account for the whole-match capture
re->useRegExpMatch(data.capture_count + 1);
}
MOZ_CRASH("TODO");
MOZ_ASSERT(re->kind() == RegExpShared::Kind::RegExp);
HandleScope handleScope(cx->isolate);
RegExpCompiler compiler(cx->isolate, &zone, data.capture_count,
input->hasLatin1Chars());
bool isLatin1 = input->hasLatin1Chars();
SampleCharacters(input, compiler);
data.node = WrapBody(re, compiler, data, &zone, isLatin1);
data.error = AnalyzeRegExp(cx->isolate, isLatin1, data.node);
if (data.error != RegExpError::kNone) {
MOZ_ASSERT(data.error == RegExpError::kAnalysisStackOverflow);
JS_ReportErrorASCII(cx, "Stack overflow");
return false;
}
// Note: This code looks weird because in a future patch we will add
// support for native compilation, which will initialize `masm` with
// a different subclass of RegExpMacroAssembler.
UniquePtr<RegExpMacroAssembler> masm;
masm = MakeUnique<RegExpBytecodeGenerator>(cx->isolate, &zone);
if (!masm) {
ReportOutOfMemory(cx);
return false;
}
bool largePattern =
pattern->length() > v8::internal::RegExp::kRegExpTooLargeToOptimize;
masm->set_slow_safe(largePattern);
if (compiler.optimize()) {
compiler.set_optimize(!largePattern);
}
// When matching a regexp with known maximum length that is anchored
// at the end, we may be able to skip the beginning of long input
// strings. This decision is made here because it depends on
// information in the AST that isn't replicated in the Node
// structure used inside the compiler.
bool is_start_anchored = data.tree->IsAnchoredAtStart();
bool is_end_anchored = data.tree->IsAnchoredAtEnd();
int max_length = data.tree->max_match();
static const int kMaxBacksearchLimit = 1024;
if (is_end_anchored && !is_start_anchored && !re->sticky() &&
max_length < kMaxBacksearchLimit) {
masm->SetCurrentPositionFromEnd(max_length);
}
if (re->global()) {
RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
if (data.tree->min_match() > 0) {
mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
} else if (re->unicode()) {
mode = RegExpMacroAssembler::GLOBAL_UNICODE;
}
masm->set_global_mode(mode);
}
// Compile the regexp
V8HandleString wrappedPattern(v8::internal::String(pattern), cx->isolate);
RegExpCompiler::CompilationResult result = compiler.Assemble(
cx->isolate, masm.get(), data.node, data.capture_count, wrappedPattern);
if (!result.Succeeded()) {
MOZ_ASSERT(result.error == RegExpError::kTooLarge);
JS_ReportErrorASCII(cx, "regexp too big");
return false;
}
re->updateMaxRegisters(result.num_registers);
ByteArray bytecode =
v8::internal::ByteArray::cast(result.code).takeOwnership(cx->isolate);
uint32_t length = bytecode->length;
re->setByteCode(bytecode.release(), isLatin1);
//js::AddCellMemory(re, length, MemoryUse::RegExpSharedBytecode); /* malloc tracking */
return true;
}
// Run |re| against |input| with the irregexp bytecode interpreter, starting
// at |startIndex|.
//
// A scratch register file of re->getMaxRegisters() entries is handed to the
// interpreter. On RegExpRunStatus_Success the first pairCount() * 2 registers
// are copied into |matches|; for any other status |matches| is left
// untouched. Returns RegExpRunStatus_Error (after reporting OOM on |cx|) if
// the register file cannot be allocated; otherwise returns the interpreter's
// status.
RegExpRunStatus Interpret(JSContext* cx, RegExpShared* re,
                          HandleLinearString input, size_t startIndex,
                          VectorMatchPairs* matches) {
  HandleScope handleScope(cx->isolate);
  V8HandleRegExp wrappedRegExp(v8::internal::JSRegExp(re), cx->isolate);
  V8HandleString wrappedInput(v8::internal::String(input), cx->isolate);

  const uint32_t registerCount = re->getMaxRegisters();

  // The register storage is deliberately left uninitialized here; the
  // interpreter initializes it (see IrregexpInterpreter::MatchInternal).
  Vector<int32_t, 8, SystemAllocPolicy> registerFile;
  if (!registerFile.growByUninitialized(registerCount)) {
    ReportOutOfMemory(cx);
    return RegExpRunStatus_Error;
  }

  const RegExpRunStatus status = static_cast<RegExpRunStatus>(
      IrregexpInterpreter::MatchForCallFromRuntime(
          cx->isolate, wrappedRegExp, wrappedInput, registerFile.begin(),
          registerCount, startIndex));

  // Only a successful match produces capture positions worth copying.
  if (status != RegExpRunStatus_Success) {
    return status;
  }

  // Each match pair occupies two consecutive registers.
  const uint32_t slotCount = re->pairCount() * 2;
  MOZ_ASSERT(slotCount <= registerFile.length());
  for (uint32_t slot = 0; slot < slotCount; ++slot) {
    matches->pairsRaw()[slot] = registerFile[slot];
  }
  return status;
}
// Execute |re| against |input| starting at |startIndex|, writing capture
// positions into |matches|. At present this always forwards to the bytecode
// interpreter.
RegExpRunStatus Execute(JSContext* cx, RegExpShared* re,
                        HandleLinearString input, size_t startIndex,
                        VectorMatchPairs* matches) {
  const RegExpRunStatus status = Interpret(cx, re, input, startIndex, matches);
  return status;
}
} // namespace irregexp

@ -14,6 +14,7 @@
#include "frontend/TokenStream.h"
#include "jscntxt.h"
#include "vm/MatchPairs.h"
#include "vm/RegExpObject.h"
namespace js {
@ -30,6 +31,10 @@ bool CheckPatternSyntax(JSContext* cx, frontend::TokenStream& ts,
bool CompilePattern(JSContext* cx, RegExpShared* re,
HandleLinearString input);
RegExpRunStatus Execute(JSContext* cx, RegExpShared* re,
HandleLinearString input, size_t start,
VectorMatchPairs* matches);
} // namespace irregexp
} // namespace js

@ -504,12 +504,14 @@ class Object {
constexpr Object(JS::Value value) : value_(value) {}
operator JS::Value() const { return value_; }
// Used in regexp-macro-assembler.cc and regexp-interpreter.cc to
// check the return value of isolate->stack_guard()->HandleInterrupts()
// In V8, this will be either an exception object or undefined.
// In SM, we store the exception in the context, so we can use our normal
// idiom: return false iff we are throwing an exception.
inline bool IsException(Isolate*) const { return !value_.toBoolean(); }
// Used in regexp-interpreter.cc to check the return value of
// isolate->stack_guard()->HandleInterrupts(). We want to handle
// interrupts in the caller, so we always return false from
// HandleInterrupts and true here.
inline bool IsException(Isolate*) const {
MOZ_ASSERT(!value_.toBoolean());
return true;
}
protected:
JS::Value value_;
@ -742,15 +744,17 @@ class DisallowHeapAllocation {
const JS::AutoCheckCannotGC no_gc_;
};
// This is used inside DisallowHeapAllocation regions to enable
// allocation just before throwing an exception, to allocate the
// exception object. Specifically, it only ever guards:
// - isolate->stack_guard()->HandleInterrupts()
// - isolate->StackOverflow()
// Those cases don't allocate in SpiderMonkey, so this can be a no-op.
// V8 uses this inside DisallowHeapAllocation regions to turn
// allocation back on before throwing a stack overflow exception or
// handling interrupts. AutoSuppressGC is sufficient for the former
// case, but not for the latter: handling interrupts can execute
// arbitrary script code, and V8 jumps through some scary hoops to
// "manually relocate unhandlified references" afterwards. To keep
// things sane, we don't try to handle interrupts while regex code is
// still on the stack. Instead, we return EXCEPTION and handle
// interrupts in the caller. (See RegExpShared::execute.)
class AllowHeapAllocation {
public:
// Empty constructor to avoid unused_variable warnings
AllowHeapAllocation() {}
};
@ -1035,9 +1039,12 @@ public:
//********** Stack guard code **********//
inline StackGuard* stack_guard() { return this; }
Object HandleInterrupts() {
return Object(JS::BooleanValue(cx()->handleInterrupt(cx())));
}
// This is called from inside no-GC code. V8 runs the interrupt
// inside the no-GC code and then "manually relocates unhandlified
// references" afterwards. We just return false and let the caller
// handle interrupts.
Object HandleInterrupts() { return Object(JS::BooleanValue(false)); }
JSContext* cx() const { return cx_; }

@ -25,6 +25,7 @@
#endif
#ifdef JS_NEW_REGEXP
#include "new-regexp/regexp-stack.h"
#include "new-regexp/RegExpAPI.h"
#endif
@ -1112,7 +1113,57 @@ RegExpShared::execute(JSContext* cx, HandleLinearString input, size_t start,
return RegExpShared::executeAtom(cx, input, start, matches);
}
MOZ_CRASH("TODO");
// Reset the Irregexp backtrack stack if it grows during execution.
irregexp::RegExpStackScope stackScope(cx->isolate);
/*
* Ensure sufficient memory for output vector.
* No need to initialize it. The RegExp engine fills them in on a match.
*/
if (!matches->allocOrExpandArray(pairCount())) {
ReportOutOfMemory(cx);
return RegExpRunStatus_Error;
}
uint32_t interruptRetries = 0;
const uint32_t maxInterruptRetries = 4;
VectorMatchPairs* vmatches = static_cast<VectorMatchPairs*>(matches);
do {
RegExpRunStatus result = irregexp::Execute(cx, this, input, start, vmatches);
if (result == RegExpRunStatus_Error) {
/* Execute can return RegExpRunStatus_Error:
*
* 1. If the native stack overflowed
* 2. If the backtrack stack overflowed
* 3. If an interrupt was requested during execution.
*
* In the first two cases, we want to throw an error. In the
* third case, we want to handle the interrupt and try again.
* We cap the number of times we will retry.
*/
JSRuntime* rt = cx->runtime();
if (rt->hasPendingInterrupt()) {
if (!CheckForInterrupt(cx)) {
return RegExpRunStatus_Error;
}
if (interruptRetries++ < maxInterruptRetries) {
continue;
}
}
// If we have run out of retries, this regexp takes too long to execute.
ReportOverRecursed(cx);
return RegExpRunStatus_Error;
}
MOZ_ASSERT(result == RegExpRunStatus_Success ||
result == RegExpRunStatus_Success_NotFound);
return result;
} while (true);
MOZ_CRASH("Unreachable");
}
void RegExpShared::useAtomMatch(HandleAtom pattern) {
@ -1122,6 +1173,12 @@ void RegExpShared::useAtomMatch(HandleAtom pattern) {
pairCount_ = 1;
}
void RegExpShared::useRegExpMatch(size_t pairCount) {
MOZ_ASSERT(kind() == RegExpShared::Kind::Unparsed);
kind_ = RegExpShared::Kind::RegExp;
pairCount_ = pairCount;
}
#else
bool

@ -124,6 +124,7 @@ class RegExpShared
#ifdef JS_NEW_REGEXP
RegExpShared::Kind kind_ = Kind::Unparsed;
GCPtrAtom patternAtom_;
uint32_t maxRegisters_ = 0;
#else
bool canStringMatch = false;
#endif
@ -188,13 +189,21 @@ class RegExpShared
// Use simple string matching for this regexp.
void useAtomMatch(HandleAtom pattern);
// Use the regular expression engine for this regexp.
void useRegExpMatch(size_t parenCount);
// Store |code| as the bytecode of the compilation selected by |latin1|.
void setByteCode(ByteCode* code, bool latin1) {
  auto& target = compilation(latin1);
  target.byteCode = code;
}
// Return the bytecode stored for the compilation selected by |latin1|.
ByteCode* getByteCode(bool latin1) const {
  const auto& selected = compilation(latin1);
  return selected.byteCode;
}
#endif
// Return the largest register count recorded so far via updateMaxRegisters()
// (0 if none has been recorded).
uint32_t getMaxRegisters() const { return maxRegisters_; }
// Raise the recorded register count to |numRegisters| if it is larger than
// the current value; the recorded count never decreases.
void updateMaxRegisters(uint32_t numRegisters) {
  if (maxRegisters_ < numRegisters) {
    maxRegisters_ = numRegisters;
  }
}
#endif // JS_NEW_REGEXP
JSAtom* getSource() const { return source; }
#ifdef JS_NEW_REGEXP

Loading…
Cancel
Save