From f197536ee9cee9eb2f6a6f94bdeaa47a991de7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Wed, 15 Jan 2025 19:19:15 +0100 Subject: [PATCH 1/4] ``: Implement collating ranges --- stl/inc/regex | 85 +++-- .../GH_005204_regex_collating_ranges/env.lst | 4 + .../GH_005204_regex_collating_ranges/test.cpp | 333 ++++++++++++++++++ 3 files changed, 392 insertions(+), 30 deletions(-) create mode 100644 tests/std/tests/GH_005204_regex_collating_ranges/env.lst create mode 100644 tests/std/tests/GH_005204_regex_collating_ranges/test.cpp diff --git a/stl/inc/regex b/stl/inc/regex index e280502115..43e7e7f7f5 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1720,7 +1720,7 @@ public: private: // lexing - void _Error(regex_constants::error_type); + [[noreturn]] void _Error(regex_constants::error_type); bool _Is_esc() const; void _Trans(); @@ -2917,7 +2917,8 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range2(const _Elem _Arg0, const _E _Node->_Small->_Mark(_Ex0); } - if (_Ex1 >= _Ex0) { + + if (_Flags & regex_constants::collate || _Ex1 >= _Ex0) { if (_Ex1 - _Ex0 < _Get_tmax()) { for (; _Ex0 <= _Ex1; ++_Ex0) { _Add_char_to_array(static_cast<_Elem>(_Ex0)); @@ -3355,6 +3356,20 @@ bool _Lookup_range(unsigned int _Ch, const _Buf<_Elem>* _Bufptr) { // check whet return false; } +template +bool _Lookup_collating_range(_Elem _Ch, const _Buf<_Elem>* _Bufptr, const _RxTraits& _Traits) { + typename _RxTraits::string_type _Str = _Traits.transform(_STD addressof(_Ch), _STD addressof(_Ch) + 1); + for (unsigned int _Ix = 0; _Ix < _Bufptr->_Size(); _Ix += 2) { // check current position + const _Elem _Left = _Bufptr->_At(_Ix); + const _Elem _Right = _Bufptr->_At(_Ix + 1); + if (_Traits.transform(_STD addressof(_Left), _STD addressof(_Left) + 1) <= _Str + && _Str <= _Traits.transform(_STD addressof(_Right), _STD addressof(_Right) + 1)) { + return true; + } + } + return false; +} + template bool _Lookup_equiv(typename _RxTraits::_Uelem _Ch, const _Sequence<_Elem>* _Eq, 
const _RxTraits& _Traits) { // check whether _Ch is in _Eq @@ -3398,35 +3413,36 @@ _BidIt _Lookup_coll(_BidIt _First, _BidIt _Last, const _Sequence<_Elem>* _Eq) { template bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // apply bracket expression bool _Found; - auto _Ch = static_cast(*_Tgt_state._Cur); + _Elem _Ch = *_Tgt_state._Cur; if (_Sflags & regex_constants::icase) { - _Ch = static_cast(_Traits.translate_nocase(static_cast<_Elem>(_Ch))); + _Ch = _Traits.translate_nocase(_Ch); + } else if (_Sflags & regex_constants::collate) { + _Ch = _Traits.translate(_Ch); } + auto _UCh = static_cast(_Ch); _It _Res0 = _Tgt_state._Cur; ++_Res0; _It _Resx; _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx); if (_Node->_Coll - && (_Resx = _Lookup_coll(_Tgt_state._Cur, _End, _Node->_Coll)) + && (_Resx = _STD _Lookup_coll(_Tgt_state._Cur, _End, _Node->_Coll)) != _Tgt_state._Cur) { // check for collation element _Res0 = _Resx; _Found = true; } else if (_Node->_Ranges - && (_Lookup_range(static_cast( - _Sflags & regex_constants::collate ? _Traits.translate(static_cast<_Elem>(_Ch)) - : static_cast<_Elem>(_Ch)), - _Node->_Ranges))) { + && (_Sflags & regex_constants::collate ? 
_STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits) + : _STD _Lookup_range(_UCh, _Node->_Ranges))) { _Found = true; - } else if (_Ch < _Bmp_max) { - _Found = _Node->_Small && _Node->_Small->_Find(_Ch); + } else if (_UCh < _Bmp_max) { + _Found = _Node->_Small && _Node->_Small->_Find(_UCh); } else if (_Node->_Large && _STD find(_Node->_Large->_Str(), _Node->_Large->_Str() + _Node->_Large->_Size(), _Ch) != _Node->_Large->_Str() + _Node->_Large->_Size()) { _Found = true; - } else if (_Node->_Classes != 0 && _Traits.isctype(static_cast<_Elem>(_Ch), _Node->_Classes)) { + } else if (_Node->_Classes != typename _RxTraits::char_class_type{} && _Traits.isctype(_Ch, _Node->_Classes)) { _Found = true; - } else if (_Node->_Equiv && _Lookup_equiv(_Ch, _Node->_Equiv, _Traits)) { + } else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) { _Found = true; } else { _Found = false; @@ -3746,34 +3762,36 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt case _N_class: { // check for string match for (; _First_arg != _Last; ++_First_arg) { // look for starting match - using _Uelem = typename _RxTraits::_Uelem; bool _Found; - auto _Ch = static_cast<_Uelem>(*_First_arg); + _Elem _Ch = *_First_arg; + if (_Sflags & regex_constants::icase) { + _Ch = _Traits.translate_nocase(_Ch); + } else if (_Sflags & regex_constants::collate) { + _Ch = _Traits.translate(_Ch); + } + auto _UCh = static_cast(_Ch); + _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx); _It _Next = _First_arg; ++_Next; - if (_Sflags & regex_constants::icase) { - _Ch = static_cast<_Uelem>(_Traits.translate_nocase(static_cast<_Elem>(_Ch))); - } - - if (_Node->_Coll && _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) { + if (_Node->_Coll && _STD _Lookup_coll(_First_arg, _Next, _Node->_Coll) != _First_arg) { _Found = true; } else if (_Node->_Ranges - && (_Lookup_range(static_cast<_Uelem>(_Sflags & regex_constants::collate - 
? _Traits.translate(static_cast<_Elem>(_Ch)) - : static_cast<_Elem>(_Ch)), - _Node->_Ranges))) { + && (_Sflags & regex_constants::collate + ? _STD _Lookup_collating_range(_Ch, _Node->_Ranges, _Traits) + : _STD _Lookup_range(_UCh, _Node->_Ranges))) { _Found = true; - } else if (_Ch < _Bmp_max) { - _Found = _Node->_Small && _Node->_Small->_Find(_Ch); + } else if (_UCh < _Bmp_max) { + _Found = _Node->_Small && _Node->_Small->_Find(_UCh); } else if (_Node->_Large && _STD find(_Node->_Large->_Str(), _Node->_Large->_Str() + _Node->_Large->_Size(), _Ch) != _Node->_Large->_Str() + _Node->_Large->_Size()) { _Found = true; - } else if (_Node->_Classes && _Traits.isctype(static_cast<_Elem>(_Ch), _Node->_Classes)) { + } else if (_Node->_Classes != typename _RxTraits::char_class_type{} + && _Traits.isctype(_Ch, _Node->_Classes)) { _Found = true; - } else if (_Node->_Equiv && _Lookup_equiv(_Ch, _Node->_Equiv, _Traits)) { + } else if (_Node->_Equiv && _STD _Lookup_equiv(_UCh, _Node->_Equiv, _Traits)) { _Found = true; } else { _Found = false; @@ -3842,7 +3860,7 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt } template -void _Parser<_FwdIt, _Elem, _RxTraits>::_Error(regex_constants::error_type _Code) { // handle error +[[noreturn]] void _Parser<_FwdIt, _Elem, _RxTraits>::_Error(regex_constants::error_type _Code) { // handle error _Xregex_error(_Code); } @@ -4156,7 +4174,14 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_ClassRanges() { // check for valid clas _Chr2 = _Traits.translate(_Chr2); } - if (static_cast(_Chr2) < static_cast(_Chr1)) { + if (_Flags & regex_constants::collate) { + const _Elem* const _Chr1_ptr = _STD addressof(_Chr1); + const _Elem* const _Chr2_ptr = _STD addressof(_Chr2); + if (_Traits.transform(_Chr2_ptr, _Chr2_ptr + 1) < _Traits.transform(_Chr1_ptr, _Chr1_ptr + 1)) { + _Error(regex_constants::error_range); + } + } else if (static_cast(_Chr2) + < static_cast(_Chr1)) { _Error(regex_constants::error_range); } diff --git 
a/tests/std/tests/GH_005204_regex_collating_ranges/env.lst b/tests/std/tests/GH_005204_regex_collating_ranges/env.lst new file mode 100644 index 0000000000..19f025bd0e --- /dev/null +++ b/tests/std/tests/GH_005204_regex_collating_ranges/env.lst @@ -0,0 +1,4 @@ +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +RUNALL_INCLUDE ..\usual_matrix.lst diff --git a/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp new file mode 100644 index 0000000000..153d989629 --- /dev/null +++ b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp @@ -0,0 +1,333 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include +#include +#include +#include + +#include + +// skip collation tests when linking to the DLL in case of +// * undefined _NATIVE_WCHAR_T_DEFINED due to GH-5236 +// * _ITERATOR_DEBUG_LEVEL mismatch between code and linked DLL +#ifdef _DLL +#ifndef _NATIVE_WCHAR_T_DEFINED // TRANSITION, GH-212 or GH-5236 +#define SKIP_COLLATE_TESTS +#elif (_ITERATOR_DEBUG_LEVEL != 0) != defined(_DEBUG) +#define SKIP_COLLATE_TESTS +#endif // !defined(_NATIVE_WCHAR_T_DEFINED) || ((_ITERATOR_DEBUG_LEVEL > 0) != defined(_DEBUG)) +#endif // defined(_DLL) + + +using namespace std; +using namespace std::regex_constants; + +class test_wregex_locale { + regex_fixture* const fixture; + const wstring pattern; + const syntax_option_type syntax; + const string locname; + wregex r; + +public: + test_wregex_locale( + regex_fixture* fixture, const wstring& pattern, const string& locname, syntax_option_type syntax = ECMAScript) + : fixture(fixture), pattern(pattern), syntax(syntax), locname(locname), r() { + r.imbue(locale(locname)); + r.assign(pattern, syntax); + } + + test_wregex_locale(const test_wregex_locale&) = delete; + test_wregex_locale& operator=(const test_wregex_locale&) = delete; + + void 
should_search_match( + const wstring& subject, const wstring& expected, const match_flag_type match_flags = match_default) const { + wsmatch mr; + try { + const bool search_result = regex_search(subject, mr, r, match_flags); + if (!search_result || mr[0] != expected) { + wprintf(LR"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to find "%s" for locale "%hs", )", + subject.c_str(), pattern.c_str(), static_cast(syntax), + static_cast(match_flags), expected.c_str(), locname.c_str()); + if (search_result) { + wprintf(LR"(but it matched "%s")" + "\n", + mr.str().c_str()); + } else { + puts("but it failed to match"); + } + + fixture->fail_regex(); + } + } catch (const regex_error& e) { + wprintf(LR"(Failed to regex_search("%s", regex("%s", 0x%X), 0x%X))", subject.c_str(), pattern.c_str(), + static_cast(syntax), static_cast(match_flags)); + printf(" for locale \"%s\": regex_error: \"%s\"\n", locname.c_str(), e.what()); + fixture->fail_regex(); + } + } + + void should_search_fail(const wstring& subject, const match_flag_type match_flags = match_default) const { + wsmatch mr; + try { + if (regex_search(subject, mr, r, match_flags)) { + wprintf( + LR"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to not match for locale "%hs", but it found "%s")" + "\n", + subject.c_str(), pattern.c_str(), static_cast(syntax), + static_cast(match_flags), locname.c_str(), mr.str().c_str()); + fixture->fail_regex(); + } + } catch (const regex_error& e) { + wprintf(LR"(Failed to regex_search("%s", regex("%s", 0x%X), 0x%X))", subject.c_str(), pattern.c_str(), + static_cast(syntax), static_cast(match_flags)); + printf(" for locale \"%s\": regex_error: \"%s\"\n", locname.c_str(), e.what()); + fixture->fail_regex(); + } + } +}; + +regex_fixture g_regexTester; + +void regex_with_locale_should_throw(const wstring& pattern, const string& locname, error_type expected, + syntax_option_type syntax = regex_constants::collate) { + wregex r; + r.imbue(locale(locname)); + try { + 
r.assign(pattern, syntax); + wprintf(LR"(regex r("%s", 0x%X) succeeded for locale "%hs" (which is bad).)" + L"\n", + pattern.c_str(), static_cast(syntax), locname.c_str()); + g_regexTester.fail_regex(); + } catch (const regex_error& e) { + if (e.code() != expected) { + wprintf(LR"(regex r("%s", 0x%X) with locale "%hs" threw 0x%X; expected 0x%X)" + L"\n", + pattern.c_str(), static_cast(syntax), locname.c_str(), + static_cast(e.code()), static_cast(expected)); + g_regexTester.fail_regex(); // a wrong error code is a test failure, not just a diagnostic + } + } +} + +void test_collating_ranges_german() { + + // special characters in German (umlauts and sharp s) + const wchar_t* special_characters[] = { + L"\u00E4", // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + L"\u00C4", // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + L"\u00DF", // U+00DF LATIN SMALL LETTER SHARP S + L"\u1E9E", // U+1E9E LATIN CAPITAL LETTER SHARP S + L"\u00F6", // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS + L"\u00D6", // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS + L"\u00FC", // U+00FC LATIN SMALL LETTER U WITH DIAERESIS + L"\u00DC" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS + }; + + // sanity checks: collation not enabled, with or without imbued locale + { + // [a-z], [A-Z] and [A-z] should not match special German characters + for (const wstring& pattern : {L"[a-z]", L"[A-Z]", L"[A-z]"}) { + { + test_wregex nocollate_nolocale(&g_regexTester, pattern); + for (const wchar_t* s : special_characters) { + nocollate_nolocale.should_search_fail(s); + } + } + + { + test_wregex_locale nocollate_locale(&g_regexTester, pattern, "de-DE"); + for (const wchar_t* s : special_characters) { + nocollate_locale.should_search_fail(s); + } + } + } + } + +#ifndef SKIP_COLLATE_TESTS + // de-DE collation order sorts as follows: + // a, A, + // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS, + // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS, + // b, B, ..., o, O, + // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS, + // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS, + // p, P, ..., s, S, + // U+00DF 
LATIN SMALL LETTER SHARP S, + // U+1E9E LATIN CAPITAL LETTER SHARP S, + // t, T, u, U, + // U+00FC LATIN SMALL LETTER U WITH DIAERESIS, + // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS, + // v, V, ..., z, Z + + const wchar_t* special_characters_without_ae[] = { + L"\u00DF", // U+00DF LATIN SMALL LETTER SHARP S + L"\u1E9E", // U+1E9E LATIN CAPITAL LETTER SHARP S + L"\u00F6", // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS + L"\u00D6", // U+00D6 LATIN CAPITAL LETTER O WITH DIAERESIS + L"\u00FC", // U+00FC LATIN SMALL LETTER U WITH DIAERESIS + L"\u00DC" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS + }; + + { + test_wregex_locale collate_a_to_a_regex(&g_regexTester, L"[a-a]", "de-DE", regex_constants::collate); + collate_a_to_a_regex.should_search_match(L"a", L"a"); + collate_a_to_a_regex.should_search_fail(L"A"); + collate_a_to_a_regex.should_search_fail(L"\u00E4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + collate_a_to_a_regex.should_search_fail(L"\u00C4"); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + collate_a_to_a_regex.should_search_fail(L"b"); + for (const wchar_t* s : special_characters_without_ae) { + collate_a_to_a_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_a_to_A_regex(&g_regexTester, L"[a-A]", "de-DE", regex_constants::collate); + collate_a_to_A_regex.should_search_match(L"a", L"a"); + collate_a_to_A_regex.should_search_match(L"A", L"A"); + collate_a_to_A_regex.should_search_fail(L"\u00E4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + collate_a_to_A_regex.should_search_fail(L"\u00C4"); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + collate_a_to_A_regex.should_search_fail(L"b"); + for (const wchar_t* s : special_characters_without_ae) { + collate_a_to_A_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_a_to_ae_regex(&g_regexTester, + L"[a-\u00E4]", // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + "de-DE", regex_constants::collate); + collate_a_to_ae_regex.should_search_match(L"a", L"a"); + 
collate_a_to_ae_regex.should_search_match(L"A", L"A"); + collate_a_to_ae_regex.should_search_match(L"\u00E4", L"\u00E4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + collate_a_to_ae_regex.should_search_fail(L"\u00C4"); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + collate_a_to_ae_regex.should_search_fail(L"b"); + for (const wchar_t* s : special_characters_without_ae) { + collate_a_to_ae_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_a_to_Ae_regex(&g_regexTester, + L"[a-\u00C4]", // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + "de-DE", regex_constants::collate); + collate_a_to_Ae_regex.should_search_match(L"a", L"a"); + collate_a_to_Ae_regex.should_search_match(L"A", L"A"); + collate_a_to_Ae_regex.should_search_match(L"\u00E4", L"\u00E4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + collate_a_to_Ae_regex.should_search_match(L"\u00C4", L"\u00C4"); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + collate_a_to_Ae_regex.should_search_fail(L"b"); + for (const wchar_t* s : special_characters_without_ae) { + collate_a_to_Ae_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_a_to_b_regex(&g_regexTester, L"[a-b]", "de-DE", regex_constants::collate); + collate_a_to_b_regex.should_search_match(L"a", L"a"); + collate_a_to_b_regex.should_search_match(L"A", L"A"); + collate_a_to_b_regex.should_search_match(L"\u00E4", L"\u00E4"); // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + collate_a_to_b_regex.should_search_match(L"\u00C4", L"\u00C4"); // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + collate_a_to_b_regex.should_search_match(L"b", L"b"); + for (const wchar_t* s : special_characters_without_ae) { + collate_a_to_b_regex.should_search_fail(s); + } + } + + const wchar_t* special_characters_without_sharp_s[] = { + L"\u00E4", // U+00E4 LATIN SMALL LETTER A WITH DIAERESIS + L"\u00C4", // U+00C4 LATIN CAPITAL LETTER A WITH DIAERESIS + L"\u00F6", // U+00F6 LATIN SMALL LETTER O WITH DIAERESIS + L"\u00D6", // U+00D6 LATIN CAPITAL LETTER 
O WITH DIAERESIS + L"\u00FC", // U+00FC LATIN SMALL LETTER U WITH DIAERESIS + L"\u00DC" // U+00DC LATIN CAPITAL LETTER U WITH DIAERESIS + }; + + { + test_wregex_locale collate_s_to_S_regex(&g_regexTester, L"[s-S]", "de-DE", regex_constants::collate); + collate_s_to_S_regex.should_search_fail(L"r"); + collate_s_to_S_regex.should_search_match(L"s", L"s"); + collate_s_to_S_regex.should_search_match(L"S", L"S"); + collate_s_to_S_regex.should_search_fail(L"\u00DF"); // U+00DF LATIN SMALL LETTER SHARP S + collate_s_to_S_regex.should_search_fail(L"\u1E9E"); // U+1E9E LATIN CAPITAL LETTER SHARP S + collate_s_to_S_regex.should_search_fail(L"t"); + collate_s_to_S_regex.should_search_fail(L"u"); + for (const wchar_t* s : special_characters_without_sharp_s) { + collate_s_to_S_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_s_to_sharp_s_regex(&g_regexTester, + L"[s-\u00DF]", // U+00DF LATIN SMALL LETTER SHARP S + "de-DE", regex_constants::collate); + collate_s_to_sharp_s_regex.should_search_fail(L"r"); + collate_s_to_sharp_s_regex.should_search_match(L"s", L"s"); + collate_s_to_sharp_s_regex.should_search_match(L"S", L"S"); + collate_s_to_sharp_s_regex.should_search_match(L"\u00DF", L"\u00DF"); // U+00DF LATIN SMALL LETTER SHARP S + collate_s_to_sharp_s_regex.should_search_fail(L"\u1E9E"); // U+1E9E LATIN CAPITAL LETTER SHARP S + collate_s_to_sharp_s_regex.should_search_fail(L"t"); + collate_s_to_sharp_s_regex.should_search_fail(L"u"); + for (const wchar_t* s : special_characters_without_sharp_s) { + collate_s_to_sharp_s_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_s_to_Sharp_S_regex(&g_regexTester, L"[s-\u1E9E]", // LATIN CAPITAL LETTER SHARP S + "de-DE", regex_constants::collate); + collate_s_to_Sharp_S_regex.should_search_fail(L"r"); + collate_s_to_Sharp_S_regex.should_search_match(L"s", L"s"); + collate_s_to_Sharp_S_regex.should_search_match(L"S", L"S"); + collate_s_to_Sharp_S_regex.should_search_match(L"\u00DF", L"\u00DF"); 
// U+00DF LATIN SMALL LETTER SHARP S + collate_s_to_Sharp_S_regex.should_search_match(L"\u1E9E", L"\u1E9E"); // U+1E9E LATIN CAPITAL LETTER SHARP S + collate_s_to_Sharp_S_regex.should_search_fail(L"t"); + collate_s_to_Sharp_S_regex.should_search_fail(L"u"); + for (const wchar_t* s : special_characters_without_sharp_s) { + collate_s_to_Sharp_S_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_s_to_t_regex(&g_regexTester, L"[s-t]", "de-DE", regex_constants::collate); + collate_s_to_t_regex.should_search_fail(L"r"); + collate_s_to_t_regex.should_search_match(L"s", L"s"); + collate_s_to_t_regex.should_search_match(L"S", L"S"); + collate_s_to_t_regex.should_search_match(L"\u00DF", L"\u00DF"); // U+00DF LATIN SMALL LETTER SHARP S + collate_s_to_t_regex.should_search_match(L"\u1E9E", L"\u1E9E"); // U+1E9E LATIN CAPITAL LETTER SHARP S + collate_s_to_t_regex.should_search_match(L"t", L"t"); + collate_s_to_t_regex.should_search_fail(L"u"); + for (const wchar_t* s : special_characters_without_sharp_s) { + collate_s_to_t_regex.should_search_fail(s); + } + } + + { + test_wregex_locale collate_A_to_z_regex(&g_regexTester, L"[A-z]", "de-DE", regex_constants::collate); + collate_A_to_z_regex.should_search_fail(L"a"); + for (wchar_t ascii_upper = L'A'; ascii_upper < 'Z'; ++ascii_upper) { + collate_A_to_z_regex.should_search_match(wstring(1, ascii_upper), wstring(1, ascii_upper)); + } + for (wchar_t ascii_lower = L'b'; ascii_lower <= 'z'; ++ascii_lower) { + collate_A_to_z_regex.should_search_match(wstring(1, ascii_lower), wstring(1, ascii_lower)); + } + for (const wchar_t* s : special_characters) { + collate_A_to_z_regex.should_search_match(s, s); + } + collate_A_to_z_regex.should_search_fail(L"Z"); + } + + regex_with_locale_should_throw(L"[A-a]", "de-DE", error_range); + regex_with_locale_should_throw(L"[\u00DF-S]", // U+00DF LATIN SMALL LETTER SHARP S + "de-DE", error_range); + regex_with_locale_should_throw( + L"[\u1E9E-\u00DF]", // U+1E9E LATIN CAPITAL 
LETTER SHARP S, U+00DF LATIN SMALL LETTER SHARP S + "de-DE", error_range); +#endif // !defined(SKIP_COLLATE_TESTS) +} + +int main() { + test_collating_ranges_german(); + + return g_regexTester.result(); +} From ac09f8ef298570eb526815976107903bc778385a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Wed, 15 Jan 2025 19:44:55 +0100 Subject: [PATCH 2/4] fix overlong line --- tests/std/tests/GH_005204_regex_collating_ranges/test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp index 153d989629..52272f1742 100644 --- a/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp +++ b/tests/std/tests/GH_005204_regex_collating_ranges/test.cpp @@ -73,9 +73,9 @@ class test_wregex_locale { wsmatch mr; try { if (regex_search(subject, mr, r, match_flags)) { - wprintf( - LR"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to not match for locale "%hs", but it found "%s")" - "\n", + wprintf(LR"(Expected regex_search("%s", regex("%s", 0x%X), 0x%X) to not match )" + LR"(for locale "%hs", but it found "%s")" + "\n", subject.c_str(), pattern.c_str(), static_cast(syntax), static_cast(match_flags), locname.c_str(), mr.str().c_str()); fixture->fail_regex(); From e2bba57b3df14b9c10868011f12a3485d8dca785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Wed, 15 Jan 2025 20:04:31 +0100 Subject: [PATCH 3/4] add some const --- stl/inc/regex | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 43e7e7f7f5..5287dd9010 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3357,8 +3357,8 @@ bool _Lookup_range(unsigned int _Ch, const _Buf<_Elem>* _Bufptr) { // check whet } template -bool _Lookup_collating_range(_Elem _Ch, const _Buf<_Elem>* _Bufptr, const _RxTraits& _Traits) { - typename _RxTraits::string_type _Str = _Traits.transform(_STD addressof(_Ch), 
_STD addressof(_Ch) + 1); +bool _Lookup_collating_range(const _Elem _Ch, const _Buf<_Elem>* const _Bufptr, const _RxTraits& _Traits) { + const typename _RxTraits::string_type _Str = _Traits.transform(_STD addressof(_Ch), _STD addressof(_Ch) + 1); for (unsigned int _Ix = 0; _Ix < _Bufptr->_Size(); _Ix += 2) { // check current position const _Elem _Left = _Bufptr->_At(_Ix); const _Elem _Right = _Bufptr->_At(_Ix + 1); @@ -3419,7 +3419,7 @@ bool _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Do_class(_Node_base* _Nx) { // ap } else if (_Sflags & regex_constants::collate) { _Ch = _Traits.translate(_Ch); } - auto _UCh = static_cast(_Ch); + const auto _UCh = static_cast(_Ch); _It _Res0 = _Tgt_state._Cur; ++_Res0; @@ -3769,7 +3769,7 @@ _BidIt _Matcher<_BidIt, _Elem, _RxTraits, _It>::_Skip(_BidIt _First_arg, _BidIt } else if (_Sflags & regex_constants::collate) { _Ch = _Traits.translate(_Ch); } - auto _UCh = static_cast(_Ch); + const auto _UCh = static_cast(_Ch); _Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Nx); _It _Next = _First_arg; From c3f4d6a95067e63f34ca194d05791da45ac5b813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Wed, 15 Jan 2025 20:05:12 +0100 Subject: [PATCH 4/4] update tests/std/test.lst --- tests/std/test.lst | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/std/test.lst b/tests/std/test.lst index 7093936102..bd3ab3e1e1 100644 --- a/tests/std/test.lst +++ b/tests/std/test.lst @@ -253,6 +253,7 @@ tests\GH_004657_expected_constraints_permissive tests\GH_004845_logical_operator_traits_with_non_bool_constant tests\GH_004929_internal_tag_constructors tests\GH_004930_char_traits_user_specialization +tests\GH_005204_regex_collating_ranges tests\LWG2381_num_get_floating_point tests\LWG2597_complex_branch_cut tests\LWG3018_shared_ptr_function