Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<regex>: Revise caret parsing in basic and grep mode #5165

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix miscompilation of double carets and treat carets as anchors at the beginning of alternatives

bonus: eliminates order-dependency between lexer tokenization and NFA additions
  • Loading branch information
muellerj2 committed Dec 15, 2024
commit edada370fb3653ad2d31ab0263b2af35254170a2
33 changes: 13 additions & 20 deletions stl/inc/regex
Original file line number Diff line number Diff line change
@@ -1485,7 +1485,6 @@ public:
using _Difft = typename iterator_traits<_FwdIt>::difference_type;

_Builder(const _RxTraits& _Tr, regex_constants::syntax_option_type);
bool _Beg_expr() const;
void _Setlong();
// _Discard_pattern is an ABI zombie name
void _Tidy() noexcept;
@@ -1521,7 +1520,6 @@ private:
static void _Insert_node(_Node_base*, _Node_base*);
_Node_base* _New_node(_Node_type _Kind);
void _Add_str_node();
bool _Beg_expr(_Node_base*) const;
void _Add_char_to_bitmap(_Elem _Ch);
void _Add_char_to_array(_Elem _Ch);
void _Add_elts(_Node_class<_Elem, _RxTraits>*, _Regex_traits_base::char_class_type, bool);
@@ -2755,17 +2753,6 @@ _Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Getmark() const {
return _Current;
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Builder<_FwdIt, _Elem, _RxTraits>::_Beg_expr(_Node_base* _Nx) const {
    // Reports whether _Nx marks the beginning of an expression or subexpression,
    // i.e. whether its kind is _N_begin, _N_group, or _N_capture.
    switch (_Nx->_Kind) {
    case _N_begin:
    case _N_group:
    case _N_capture:
        return true;

    default:
        return false;
    }
}

template <class _FwdIt, class _Elem, class _RxTraits>
bool _Builder<_FwdIt, _Elem, _RxTraits>::_Beg_expr() const {
    // Reports whether the builder is positioned at the beginning of an expression or subexpression.
    // The current node qualifies directly, or it is a _N_bol node whose predecessor qualifies.
    if (_Beg_expr(_Current)) {
        return true;
    }

    return _Current->_Kind == _N_bol && _Beg_expr(_Current->_Prev);
}

template <class _FwdIt, class _Elem, class _RxTraits>
_Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Link_node(_Node_base* _Nx) { // insert _Nx at current location
_Nx->_Prev = _Current;
@@ -3867,10 +3854,10 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char
break;

case _Meta_star:
if ((_L_flags & _L_star_beg) && _Nfa._Beg_expr()) {
_Mchar = _Meta_chr;
}

// A star can always act as a quantifier outside bracket expressions,
// but _L_star_beg (used by basic/grep) allows its use as an ordinary character
// at the beginning of a (sub-)expression (potentially after an optional caret anchor).
// We'll handle that when we are parsing alternatives in disjunctions.
break;

case _Meta_caret:
@@ -4434,15 +4421,21 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte
_Next();
_Quant = _Wrapped_disjunction();
_Expect(_Meta_rpar, regex_constants::error_paren);
} else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || _Nfa._Beg_expr())) { // add bol node
} else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || !_Found)) { // add bol node
_Nfa._Add_bol();
_Next();
_Quant = false;
if ((_L_flags & _L_star_beg) && _Mchar == _Meta_star) {
_Nfa._Add_char(_Char);
_Next();
} else {
_Quant = false;
}
} else if (_Mchar == _Meta_dlr) { // add eol node
_Nfa._Add_eol();
_Next();
_Quant = false;
} else if (_Mchar == _Meta_star || _Mchar == _Meta_plus || _Mchar == _Meta_query || _Mchar == _Meta_lbr) {
} else if ((_Mchar == _Meta_star && (!(_L_flags & _L_star_beg) || !_Found)) || _Mchar == _Meta_plus
|| _Mchar == _Meta_query || _Mchar == _Meta_lbr) {
_Error(regex_constants::error_badrepeat);
} else if (_Mchar == _Meta_rbr && !(_L_flags & _L_paren_bal)) {
_Error(regex_constants::error_brace);