Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 43 additions & 91 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,7 @@ enum class _Rx_unwind_ops {
_Disjunction_eval_alt_always,
_Do_nothing,
_Loop_simple_nongreedy,
_Loop_simple_greedy,
};

template <class _BidIt>
Expand Down Expand Up @@ -1815,7 +1816,6 @@ private:
void _Decrease_stack_usage_count();
void _Increase_complexity_count();

bool _Do_rep0(_Node_rep*);
bool _Do_rep(_Node_rep*, bool, int);
void _Prepare_rep(_Node_rep*);
bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*);
Expand Down Expand Up @@ -3413,72 +3413,6 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun
}
}

// Greedy matching of a simple loop followed by the remainder of the pattern.
// The tail (_Node->_End_rep->_Next) is attempted after each successfully matched repetition count,
// and the target state of the most recent successful tail attempt is kept in _Final.
// Because later attempts overwrite earlier ones, the attempt with the most repetitions wins.
// Returns true if some tail attempt succeeded; in that case _Tgt_state holds the accepted match state.
template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node) {
// apply repetition to loop with no nested if/do
// _Ix counts the repetitions that are considered matched so far; it starts at the loop's minimum.
int _Ix = _Node->_Min;
const size_t _Frame_idx = _Loop_vals[_Node->_Loop_number]._Loop_frame_idx;
// NOTE(review): the stored index is set past _Min, presumably so that the end-of-loop handling reached
// through the nested _Match_pat(_Node->_Next) calls below can tell it was entered from here - confirm.
_Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 2;

// _Final: state of the last successful tail match; _Matched0: whether there is one
_Tgt_state_t<_It> _Final;
bool _Matched0 = false;
// _Saved_pos: input position from which the next repetition (or tail attempt) starts
_It _Saved_pos = _Tgt_state._Cur;
bool _Done = false;

// first try the tail with the repetitions matched so far
if (_Match_pat(_Node->_End_rep->_Next)) {
// record an acceptable match and continue
_Final = _Tgt_state;
_Matched0 = true;
}

// special handling of the first repetition when the loop has no minimum but allows at least one repetition
if (_Ix == 0 && _Node->_Max != 0) {
// rewind to the position before the tail attempt
// and restore the capture group validity stored in the loop's frame
_Tgt_state._Cur = _Saved_pos;
_Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid;

if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done
_Done = true;
} else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions
_Done = true;
// we only potentially accept/try tail for POSIX
if ((_Sflags & regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) {
// _Tgt_state is returned as is here; _Final is not restored
return true; // go with current match
}
} else {
// the repetition consumed input: continue from the new position and try the tail again
_Saved_pos = _Tgt_state._Cur;
if (_Match_pat(_Node->_End_rep->_Next)) {
// record match and continue
_Final = _Tgt_state;
_Matched0 = true;
}
}
_Ix = 1;
}

if (!_Done) {
// a maximum of -1 is treated as "no upper bound" by the loop condition
while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match
// restart from the end of the previous repetition with the capture group validity of the loop's frame
_Tgt_state._Cur = _Saved_pos;
_Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid;
// stop when the repetition fails or consumes no input
if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) {
break; // rep match failed, quit loop
}

// since loop is branchless, empty rep match is not possible at this point
_Saved_pos = _Tgt_state._Cur;
if (_Match_pat(_Node->_End_rep->_Next)) {
// record match and continue
_Final = _Tgt_state;
_Matched0 = true;
}
}
}

// switch back to the last recorded successful state, undoing any later failed attempts
if (_Matched0) { // record final match
_Tgt_state = _Final;
}

return _Matched0;
}

template <class _BidIt, class _Elem, class _RxTraits, class _It, class _Alloc>
bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) {
// apply repetition
Expand Down Expand Up @@ -4117,32 +4051,31 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
break;

case _N_rep:
{
{ // handle start of loop
auto _Node = static_cast<_Node_rep*>(_Nx);
_Prepare_rep(_Node);
bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0;

if (_Node->_Simple_loop == 1) {
auto& _Sav = _Loop_vals[_Node->_Loop_number];
_Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing);
if (_Node->_Min > 0) { // try to match a rep
_Increase_complexity_count();
_Increase_complexity_count();
if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first
_Sav._Loop_idx = 1;
// _Next is already assigned correctly for matching a rep
} else if (!_Greedy || _Longest) { // non-greedy matching
_Increase_complexity_count();

// try tail first
// set up stack unwinding for greedy matching if no rep is allowed
if (_Node->_Min == 0) {
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Node);
}
} else { // try tail first
_Sav._Loop_idx = 0;
_Next = _Node->_End_rep->_Next;

// set up stack unwinding for non-greedy matching if at least one rep is allowed
if (_Node->_Max != 0) {
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node);
}
} else {
_Failed = !_Do_rep0(_Node);
_Next = nullptr;
}
} else {
_Failed = !_Do_rep(_Node, _Greedy, 0);
Expand All @@ -4153,7 +4086,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
break;

case _N_end_rep:
{
{ // handle end of loop
_Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep;
auto& _Sav = _Loop_vals[_Nr->_Loop_number];
bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0;
Expand All @@ -4163,31 +4096,36 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
== _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // initial match empty
// loop is branchless, so it will only ever match empty strings
// -> we only try tail for POSIX or if minimum number of reps is non-zero
if ((_Sflags & regex_constants::_Any_posix) || _Nr->_Min > 0) {
_Increase_complexity_count();
// _Next is already assigned correctly for matching tail
} else {
// _Next is already assigned correctly for matching tail

if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) {
_Failed = true;
}
} else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum
_Increase_complexity_count();

_Next = _Nr->_Next;
// GH-5365: We have to reset the capture groups from the second iteration on.
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
++_Sav._Loop_idx;
} else if (_Longest || !_Greedy) {
_Increase_complexity_count();
} else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next
// set up stack unwinding for greedy matching
_Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Nr);

_Next = _Nr->_Next;
// GH-5365: We have to reset the capture groups from the second iteration on.
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx
++_Sav._Loop_idx;
}
} else { // non-greedy matching or greedy matching with maximum reached
// set up stack unwinding for non-greedy matching if one more rep is allowed
if (_Sav._Loop_idx != _Nr->_Max) {
_Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr);
}
// _Next is already assigned correctly for matching tail
} else if (_Sav._Loop_idx == _Nr->_Min) { // greedy and minimum number of reps reached
_Failed = !_Do_rep0(_Nr);
_Next = nullptr;
} else { // internal _Match_pat(_Node->_Next) call in _Do_rep0()
_Next = nullptr;
}

if (!_Failed) {
_Increase_complexity_count();
}
} else {
_Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx);
Expand Down Expand Up @@ -4297,6 +4235,20 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N
}
break;

case _Rx_unwind_ops::_Loop_simple_greedy:
// try tail if matching one more rep failed
if (_Failed) {
auto _Node = static_cast<_Node_rep*>(_Frame._Node);
auto& _Sav = _Loop_vals[_Node->_Loop_number];

_Increase_complexity_count();
_Nx = _Node->_End_rep->_Next;
_Tgt_state._Cur = _Frame._Match_state._Cur;
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid;
_Failed = false;
}
break;

default:
#if _ITERATOR_DEBUG_LEVEL != 0
_STL_REPORT_ERROR("internal stack of regex matcher corrupted");
Expand Down Expand Up @@ -5299,7 +5251,7 @@ void _Parser2<_FwdIt, _Elem, _RxTraits>::_Calculate_loop_simplicity(
break;
case _N_rep:
// _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once
// because _Matcher3::_Do_rep0() does not reset capture group boundaries when control is returned to it.
// because the matcher does not reset capture group boundaries when handling simple loops.
// If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop.
if (_Outer_rep) {
_Outer_rep->_Simple_loop = 0;
Expand Down
81 changes: 81 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2133,6 +2133,7 @@ void test_gh_5774() {
// GH-5774: Process non-greedy and longest-mode simple loops non-recursively.
// This extends our test coverage on non-greedy simple loops with bounded number of repetitions.
g_regexTester.should_not_match("", "a+?");
g_regexTester.should_match("b", "a{0}?b");
g_regexTester.should_not_match("ab", "a{0}?b");
g_regexTester.should_match("ab", "a{0,1}?b");
g_regexTester.should_not_match("aab", "a{0,1}?b");
Expand All @@ -2143,6 +2144,85 @@ void test_gh_5774() {
g_regexTester.should_match("aaab", "a{1,3}?b");
}

void test_gh_5790() {
// GH-5790: Process greedy simple loops non-recursively.
// This extends our test coverage on (mainly greedy) simple loops.
g_regexTester.should_not_match("", "a+");
g_regexTester.should_match("b", "a{0}b");
g_regexTester.should_not_match("ab", "a{0}b");
g_regexTester.should_match("ab", "a{0,1}b");
g_regexTester.should_not_match("aab", "a{0,1}b");
g_regexTester.should_match("aab", "a{0,2}b");
g_regexTester.should_match("aab", "a{1,2}b");
g_regexTester.should_not_match("aab", "a{1}b");
g_regexTester.should_not_match("aaab", "a{1,2}b");
g_regexTester.should_match("aaab", "a{1,3}b");

// Check that greedy and non-greedy search find the appropriate match.
// For the following regexes, greedy and leftmost-longest search yield the same matches.
for (syntax_option_type options : {ECMAScript, extended}) {
{
test_regex greedy_a_star(&g_regexTester, "a*", options);
greedy_a_star.should_search_match("aaaaaaaaaa", "aaaaaaaaaa");
}

{
test_regex bounded_greedy_a_rep(&g_regexTester, "a{5}", options);
bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
}

{
test_regex upper_bounded_greedy_a_rep(&g_regexTester, "a{0,5}", options);
upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
}

{
test_regex lower_bounded_greedy_a_rep(&g_regexTester, "a{4,1000}", options);
lower_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaaaaaaa");
}

{
test_regex lower_and_upper_bounded_greedy_a_rep(&g_regexTester, "a{2,5}", options);
lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
}

{
test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}", options);
too_large_min_greedy_a_rep.should_search_fail("aaaaaaaaaa");
}
}

{
test_regex nongreedy_a_star(&g_regexTester, "a*?");
nongreedy_a_star.should_search_match("aaaaaaaaaa", "");
}

{
test_regex bounded_nongreedy_a_rep(&g_regexTester, "a{5}?");
bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa");
}

{
test_regex upper_bounded_nongreedy_a_rep(&g_regexTester, "a{0,5}?");
upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "");
}

{
test_regex lower_bounded_nongreedy_a_rep(&g_regexTester, "a{4,1000}?");
lower_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaa");
}

{
test_regex lower_and_upper_bounded_nongreedy_a_rep(&g_regexTester, "a{2,5}?");
lower_and_upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aa");
}

{
test_regex too_large_min_nongreedy_a_rep(&g_regexTester, "a{11,1000}?");
too_large_min_nongreedy_a_rep.should_search_fail("aaaaaaaaaa");
}
}

int main() {
test_dev10_449367_case_insensitivity_should_work();
test_dev11_462743_regex_collate_should_not_disable_regex_icase();
Expand Down Expand Up @@ -2195,6 +2275,7 @@ int main() {
test_gh_5576();
test_gh_5672();
test_gh_5774();
test_gh_5790();

return g_regexTester.result();
}