Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<regex>: Fix depth-first and leftmost-longest matching rules #5218

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 26 additions & 7 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1638,11 +1638,12 @@ public:

if (_Matches) { // copy results to _Matches
_Matches->_Resize(_Get_ncap());
const auto& _Result = _Longest ? _Res : _Tgt_state;
for (unsigned int _Idx = 0; _Idx < _Get_ncap(); ++_Idx) { // copy submatch _Idx
if (_Res._Grp_valid[_Idx]) { // copy successful match
if (_Result._Grp_valid[_Idx]) { // copy successful match
_Matches->_At(_Idx).matched = true;
_Matches->_At(_Idx).first = _Res._Grps[_Idx]._Begin;
_Matches->_At(_Idx).second = _Res._Grps[_Idx]._End;
_Matches->_At(_Idx).first = _Result._Grps[_Idx]._Begin;
_Matches->_At(_Idx).second = _Result._Grps[_Idx]._End;
} else { // copy failed match
_Matches->_At(_Idx).matched = false;
_Matches->_At(_Idx).first = _End;
Expand Down Expand Up @@ -3283,6 +3284,17 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_rep(_Node_rep* _Node, bool _Gr
_Psav->_Loop_iter = _STD addressof(_Cur_iter);
_Matched0 = _Match_pat(_Node->_Next);
}
} else if (_Longest) { // longest, try any number of repetitions

// match with no further repetition
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
// match with at least one more repetition if last repetition made progress
if (_Progress) {
_Tgt_state = _St;
_Psav->_Loop_idx = _Init_idx + 1;
_Psav->_Loop_iter = _STD addressof(_Cur_iter);
_Matched0 |= _Match_pat(_Node->_Next);
}
} else if (!_Greedy) { // not greedy, favor minimum number of reps
_Matched0 = _Match_pat(_Node->_End_rep->_Next);
if (!_Matched0 && _Progress) { // tail failed, try another rep
Expand Down Expand Up @@ -3443,16 +3455,23 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap
}

template <class _BidIt, class _Elem, class _RxTraits, class _It>
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under UNIX rules
bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Better_match() { // check for better match under leftmost-longest rule
for (unsigned int _Ix = 0; _Ix < _Get_ncap(); ++_Ix) { // check each capture group
// any match (even an empty one) is better than no match at all
if (_Res._Grp_valid[_Ix] != _Tgt_state._Grp_valid[_Ix]) {
return _Tgt_state._Grp_valid[_Ix];
}
if (_Res._Grp_valid[_Ix] && _Tgt_state._Grp_valid[_Ix]) {
// if both groups are matched, prefer the leftmost one
if (_Res._Grps[_Ix]._Begin != _Tgt_state._Grps[_Ix]._Begin) {
return _STD distance(_Begin, _Res._Grps[_Ix]._Begin)
< _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
> _STD distance(_Begin, _Tgt_state._Grps[_Ix]._Begin);
}

// if both groups start at the same position, prefer the longer one
if (_Res._Grps[_Ix]._End != _Tgt_state._Grps[_Ix]._End) {
return _STD distance(_Begin, _Res._Grps[_Ix]._End) < _STD distance(_Begin, _Tgt_state._Grps[_Ix]._End);
return _STD distance(_Res._Grps[_Ix]._Begin, _Res._Grps[_Ix]._End)
< _STD distance(_Tgt_state._Grps[_Ix]._Begin, _Tgt_state._Grps[_Ix]._End);
}
}
}
Expand Down Expand Up @@ -3671,7 +3690,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Match_pat(_Node_base* _Nx) { // c
&& _Begin == _Tgt_state._Cur)
|| (_Full && _Tgt_state._Cur != _End)) {
_Failed = true;
} else if (!_Matched || _Better_match()) { // record successful match
} else if (_Longest && (!_Matched || _Better_match())) { // record successful match
_Res = _Tgt_state;
_Matched = true;
}
Expand Down
88 changes: 88 additions & 0 deletions tests/std/include/test_regex_support.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#pragma once
#include <cstdio>
#include <initializer_list>
#include <regex>
#include <string>

Expand Down Expand Up @@ -241,6 +242,93 @@ class test_regex {
fixture->fail_regex();
}
}

void should_search_match_capture_groups(const std::string& subject, const std::string& expected,
const std::regex_constants::match_flag_type match_flags,
std::initializer_list<std::pair<std::ptrdiff_t, std::ptrdiff_t>> capture_groups) const {
std::smatch mr;
try {
const bool search_result = std::regex_search(subject, mr, r, match_flags);
if (!search_result || mr[0] != expected) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find "%s", )", subject.c_str(),
pattern.c_str(), static_cast<unsigned int>(syntax), static_cast<unsigned int>(match_flags),
expected.c_str());
if (search_result) {
printf(R"(but it matched "%s")"
"\n",
mr.str().c_str());
} else {
puts("but it failed to match");
}

fixture->fail_regex();
} else if (capture_groups.size() + 1 != mr.size()) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to match %zu capture groups in "%s", )",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags), capture_groups.size() + 1, expected.c_str());
printf(R"(but it matched %zu groups)"
"\n",
mr.size());
fixture->fail_regex();
} else {
bool submatches_success = true;
for (std::size_t i = 1U; i < mr.size(); ++i) {
const auto& expected_capture = *(capture_groups.begin() + (i - 1));
const auto& actual_capture = mr[i];
if (expected_capture.first == -1) {
if (actual_capture.matched) {
submatches_success = false;
break;
}
} else if (!actual_capture.matched || actual_capture.first != (mr[0].first + expected_capture.first)
|| actual_capture.second != (mr[0].first + expected_capture.second)) {
submatches_success = false;
break;
}
}
if (!submatches_success) {
printf(R"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find capture groups {)",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags));

bool initial = true;
for (const auto& expected_capture : capture_groups) {
std::string capture = "(unmatched)";
if (expected_capture.first != -1) {
capture.assign(mr[0].first + expected_capture.first, mr[0].first + expected_capture.second);
}
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), expected_capture.first,
expected_capture.second);
initial = false;
}
printf(R"(} in "%s", but found {)", expected.c_str());

initial = true;
for (std::size_t i = 1U; i < mr.size(); ++i) {
const auto& actual_capture = mr[i];
std::string capture = "(unmatched)";
std::ptrdiff_t first = -1;
std::ptrdiff_t last = -1;
if (actual_capture.matched) {
capture = actual_capture.str();
first = actual_capture.first - mr[0].first;
last = actual_capture.second - mr[0].first;
}
printf(R"(%s"%s" [%td %td])", initial ? "" : ", ", capture.c_str(), first, last);
initial = false;
}
printf("}\n");
fixture->fail_regex();
}
}
} catch (const std::regex_error& e) {
printf(R"(Failed to regex_search("%s", regex("%s", 0x%X), 0x%X): regex_error: "%s")"
"\n",
subject.c_str(), pattern.c_str(), static_cast<unsigned int>(syntax),
static_cast<unsigned int>(match_flags), e.what());
fixture->fail_regex();
}
}
};

class test_wregex {
Expand Down
84 changes: 84 additions & 0 deletions tests/std/tests/VSO_0000000_regex_use/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,89 @@ void test_construction_from_nullptr_and_zero() {
}
}

void test_gh_731() {
    // GH-731 <regex>: Incorrect behavior for capture groups
    // GH-996: regex_search behaves incorrectly when the regex contains R"(\[)"

    // Several bugs were fixed in ECMAScript (depth-first) and POSIX (leftmost-longest) matching rules.
    // Each pattern below is checked under ECMAScript and, where applicable, under the POSIX grammars
    // (extended, egrep, awk). Capture group offsets are relative to the start of the overall match.

    // Optional group followed by a loop that could consume the same characters.
    {
        const test_regex ecma_regex(&g_regexTester, R"((A+)\s*(B+)?\s*B*)", ECMAScript);
        ecma_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
    }
    for (syntax_option_type option : {extended, egrep, awk}) {
        const test_regex posix_regex(&g_regexTester, R"((A+)[[:space:]]*(B+)?[[:space:]]*B*)", option);
        posix_regex.should_search_match_capture_groups("AAA BBB", "AAA BBB", match_default, {{0, 3}, {4, 7}});
    }

    // Alternation after a greedy wildcard: the two rule sets select different overall matches.
    {
        const test_regex ecma_regex(&g_regexTester, ".*(cat|concatenate)", ECMAScript);
        ecma_regex.should_search_match_capture_groups("WXconcatenateYZ", "WXconcat", match_default, {{5, 8}});
    }
    for (syntax_option_type option : {extended, egrep, awk}) {
        const test_regex posix_regex(&g_regexTester, ".*(cat|concatenate)", option);
        posix_regex.should_search_match_capture_groups(
            "WXconcatenateYZ", "WXconcatenate", match_default, {{2, 13}});
    }

    // Repeated alternation.
    {
        const test_regex ecma_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", ECMAScript);
        ecma_regex.should_search_match_capture_groups("aabaac", "aaba", match_default, {{2, 4}});
    }
    for (syntax_option_type option : {extended, egrep, awk}) {
        const test_regex posix_regex(&g_regexTester, "(aa|aabaac|ba|b|c)*", option);
        posix_regex.should_search_match_capture_groups("aabaac", "aabaac", match_default, {{0, 6}});
    }

    // Greedy and non-greedy wildcard before an alternation.
    {
        const test_regex ecma_regex(&g_regexTester, ".*(a|bacc|baccc)", ECMAScript);
        ecma_regex.should_search_match_capture_groups("ddbacccd", "ddba", match_default, {{3, 4}});
    }
    {
        const test_regex ecma_regex(&g_regexTester, ".*?(a|bacc|baccc)", ECMAScript);
        ecma_regex.should_search_match_capture_groups("ddbacccd", "ddbacc", match_default, {{2, 6}});
    }
    for (syntax_option_type option : {extended, egrep, awk}) {
        const test_regex posix_regex(&g_regexTester, ".*(a|bacc|baccc)", option);
        posix_regex.should_search_match_capture_groups("ddbacccd", "ddbaccc", match_default, {{2, 7}});
    }

    // Preprocessor-directive pattern with line continuations. The overall match is the same for all
    // three grammars, but the expected capture group start differs between ECMAScript and awk/extended.
    // NOTE(review): the offsets depend on the exact whitespace inside the subject string.
    {
        const test_regex ecma_regex(&g_regexTester, "^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*", ECMAScript);
        ecma_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
            "#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{30, 42}});
    }
    {
        const test_regex awk_regex(&g_regexTester, "^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*", awk);
        awk_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
            "#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
    }
    {
        // This pattern embeds a literal newline character in the bracket expressions instead of the
        // two-character \n escape used above (presumably because the extended grammar does not
        // interpret that escape -- confirm).
        const test_regex extended_regex(&g_regexTester, "^[[:blank:]]*#([^\n]*\\\\[[:space:]]+)*[^\n]*", extended);
        extended_regex.should_search_match_capture_groups("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);",
            "#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);", match_default, {{28, 42}});
    }

    // Loop followed by an alternation whose branches overlap with the loop body.
    {
        const test_regex ecma_regex(&g_regexTester, "(ab*)*(ce|bbceef)", ECMAScript);
        ecma_regex.should_search_match_capture_groups("aababbbceef", "aababbbce", match_default, {{3, 7}, {7, 9}});
    }
    for (syntax_option_type option : {extended, egrep, awk}) {
        const test_regex posix_regex(&g_regexTester, "(ab*)*(ce|bbceef)", option);
        posix_regex.should_search_match_capture_groups(
            "aababbbceef", "aababbbceef", match_default, {{3, 5}, {5, 11}});
    }

    {
        // GH-996 test case
        const test_regex ecma_regex(&g_regexTester, R"( *((<<)|(\[)|(.+)))");
        ecma_regex.should_search_match_capture_groups(
            " [<</Category/Export>>]>>", " [", match_default, {{1, 2}, {-1, -1}, {1, 2}, {-1, -1}});
    }
}

void test_gh_993() {
// GH-993 regex::icase is not handled correctly for some input.
{
Expand Down Expand Up @@ -775,6 +858,7 @@ int main() {
test_VSO_225160_match_eol_flag();
test_VSO_226914_word_boundaries();
test_construction_from_nullptr_and_zero();
test_gh_731();
test_gh_993();
test_gh_4995();
test_gh_5058();
Expand Down
6 changes: 4 additions & 2 deletions tests/tr1/tests/regex2/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -659,7 +659,7 @@ static const regex_test tests[] = {
{__LINE__, T("a[a-z]\\{2,4\\}"), T("abcdefghi"), "1 0 5", BASIC | GREP},
{__LINE__, T("a[a-z]{2,4}?"), T("abcdefghi"), "1 0 3", ECMA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 4 2 4", ECMA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 5 6", EEA},
{__LINE__, T("(aa|aabaac|ba|b|c)*"), T("aabaac"), "2 0 6 0 6", EEA},
{__LINE__, T("(z)((a+)?(b+)?(c))*"), T("zaacbbbcac"), "6 0 10 0 1 8 10 8 9 -1 -1 9 10", ECMA},
{__LINE__, T("(a*)b\\1+"), T("baaaac"), "2 0 1 0 0", ECMA},
{__LINE__, T("(?=(a+))"), T("baaabac"), "2 1 1 1 4", ECMA},
Expand Down Expand Up @@ -774,7 +774,9 @@ static const regex_test tests[] = {
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"), T("#define some_symbol(x) #x"), "2 0 25 -1 -1",
ECMA | AWK},
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA | AWK},
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 30 42", ECMA},
{__LINE__, T("^[[:blank:]]*#([^\\n]*\\\\[[:space:]]+)*[^\\n]*"),
T("#define some_symbol(x) \\ \r\n cat();\\\r\n printf(#x);"), "2 0 53 28 42", AWK},
};

static STD string check_matches(
Expand Down