libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // See below __regex_algo_impl to get what this is talking about. The default 00032 // value 1 indicated a conservative optimization without giving up worst case 00033 // performance. 00034 #ifndef _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 00035 #define _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 1 00036 #endif 00037 00038 namespace std _GLIBCXX_VISIBILITY(default) 00039 { 00040 namespace __detail 00041 { 00042 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00043 00044 // Result of merging regex_match and regex_search. 00045 // 00046 // __policy now can be _S_auto (auto dispatch) and _S_alternate (use 00047 // the other one if possible, for test purpose). 00048 // 00049 // That __match_mode is true means regex_match, else regex_search. 00050 template<typename _BiIter, typename _Alloc, 00051 typename _CharT, typename _TraitsT, 00052 _RegexExecutorPolicy __policy, 00053 bool __match_mode> 00054 bool 00055 __regex_algo_impl(_BiIter __s, 00056 _BiIter __e, 00057 match_results<_BiIter, _Alloc>& __m, 00058 const basic_regex<_CharT, _TraitsT>& __re, 00059 regex_constants::match_flag_type __flags) 00060 { 00061 if (__re._M_automaton == nullptr) 00062 return false; 00063 00064 typename match_results<_BiIter, _Alloc>::_Base_type& __res = __m; 00065 __m._M_begin = __s; 00066 __res.resize(__re._M_automaton->_M_sub_count() + 2); 00067 for (auto& __it : __res) 00068 __it.matched = false; 00069 00070 // This function decide which executor to use under given circumstances. 00071 // The _S_auto policy now is the following: if a NFA has no 00072 // back-references and has more than _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT 00073 // quantifiers (*, +, ?), the BFS executor will be used, other wise 00074 // DFS executor. This is because DFS executor has a exponential upper 00075 // bound, but better best-case performace. Meanwhile, BFS executor can 00076 // effectively prevent from exponential-long time matching (which must 00077 // contains many quantifiers), but it's slower in average. 00078 // 00079 // For simple regex, BFS executor could be 2 or more times slower than 00080 // DFS executor. 00081 // 00082 // Of course, BFS executor cannot handle back-references. 00083 bool __ret; 00084 if (!__re._M_automaton->_M_has_backref 00085 && (__policy == _RegexExecutorPolicy::_S_alternate 00086 || __re._M_automaton->_M_quant_count 00087 > _GLIBCXX_REGEX_DFS_QUANTIFIERS_LIMIT)) 00088 { 00089 _Executor<_BiIter, _Alloc, _TraitsT, false> 00090 __executor(__s, __e, __m, __re, __flags); 00091 if (__match_mode) 00092 __ret = __executor._M_match(); 00093 else 00094 __ret = __executor._M_search(); 00095 } 00096 else 00097 { 00098 _Executor<_BiIter, _Alloc, _TraitsT, true> 00099 __executor(__s, __e, __m, __re, __flags); 00100 if (__match_mode) 00101 __ret = __executor._M_match(); 00102 else 00103 __ret = __executor._M_search(); 00104 } 00105 if (__ret) 00106 { 00107 for (auto __it : __res) 00108 if (!__it.matched) 00109 __it.first = __it.second = __e; 00110 auto& __pre = __res[__res.size()-2]; 00111 auto& __suf = __res[__res.size()-1]; 00112 if (__match_mode) 00113 { 00114 __pre.matched = false; 00115 __pre.first = __s; 00116 __pre.second = __s; 00117 __suf.matched = false; 00118 __suf.first = __e; 00119 __suf.second = __e; 00120 } 00121 else 00122 { 00123 __pre.first = __s; 00124 __pre.second = __res[0].first; 00125 __pre.matched = (__pre.first != __pre.second); 00126 __suf.first = __res[0].second; 00127 __suf.second = __e; 00128 __suf.matched = (__suf.first != __suf.second); 00129 } 00130 } 00131 return __ret; 00132 } 00133 00134 _GLIBCXX_END_NAMESPACE_VERSION 00135 } 00136 00137 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00138 00139 template<typename _Ch_type> 00140 template<typename _Fwd_iter> 00141 typename regex_traits<_Ch_type>::string_type 00142 regex_traits<_Ch_type>:: 00143 lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const 00144 { 00145 typedef std::ctype<char_type> __ctype_type; 00146 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00147 00148 static const char* __collatenames[] = 00149 { 00150 "NUL", 00151 "SOH", 00152 "STX", 00153 "ETX", 00154 "EOT", 00155 "ENQ", 00156 "ACK", 00157 "alert", 00158 "backspace", 00159 "tab", 00160 "newline", 00161 "vertical-tab", 00162 "form-feed", 00163 "carriage-return", 00164 "SO", 00165 "SI", 00166 "DLE", 00167 "DC1", 00168 "DC2", 00169 "DC3", 00170 "DC4", 00171 "NAK", 00172 "SYN", 00173 "ETB", 00174 "CAN", 00175 "EM", 00176 "SUB", 00177 "ESC", 00178 "IS4", 00179 "IS3", 00180 "IS2", 00181 "IS1", 00182 "space", 00183 "exclamation-mark", 00184 "quotation-mark", 00185 "number-sign", 00186 "dollar-sign", 00187 "percent-sign", 00188 "ampersand", 00189 "apostrophe", 00190 "left-parenthesis", 00191 "right-parenthesis", 00192 "asterisk", 00193 "plus-sign", 00194 "comma", 00195 "hyphen", 00196 "period", 00197 "slash", 00198 "zero", 00199 "one", 00200 "two", 00201 "three", 00202 "four", 00203 "five", 00204 "six", 00205 "seven", 00206 "eight", 00207 "nine", 00208 "colon", 00209 "semicolon", 00210 "less-than-sign", 00211 "equals-sign", 00212 "greater-than-sign", 00213 "question-mark", 00214 "commercial-at", 00215 "A", 00216 "B", 00217 "C", 00218 "D", 00219 "E", 00220 "F", 00221 "G", 00222 "H", 00223 "I", 00224 "J", 00225 "K", 00226 "L", 00227 "M", 00228 "N", 00229 "O", 00230 "P", 00231 "Q", 00232 "R", 00233 "S", 00234 "T", 00235 "U", 00236 "V", 00237 "W", 00238 "X", 00239 "Y", 00240 "Z", 00241 "left-square-bracket", 00242 "backslash", 00243 "right-square-bracket", 00244 "circumflex", 00245 "underscore", 00246 "grave-accent", 00247 "a", 00248 "b", 00249 "c", 00250 "d", 00251 "e", 00252 "f", 00253 "g", 00254 "h", 00255 "i", 00256 "j", 00257 "k", 00258 "l", 00259 "m", 00260 "n", 00261 "o", 00262 "p", 00263 "q", 00264 "r", 00265 "s", 00266 "t", 00267 "u", 00268 "v", 00269 "w", 00270 "x", 00271 "y", 00272 "z", 00273 "left-curly-bracket", 00274 "vertical-line", 00275 "right-curly-bracket", 00276 "tilde", 00277 "DEL", 00278 }; 00279 00280 string __s; 00281 for (; __first != __last; ++__first) 00282 __s += __fctyp.narrow(*__first, 0); 00283 00284 for (const auto& __it : __collatenames) 00285 if (__s == __it) 00286 return string_type(1, __fctyp.widen( 00287 static_cast<char>(&__it - __collatenames))); 00288 00289 // TODO Add digraph support: 00290 // http://boost.sourceforge.net/libs/regex/doc/collating_names.html 00291 00292 return string_type(); 00293 } 00294 00295 template<typename _Ch_type> 00296 template<typename _Fwd_iter> 00297 typename regex_traits<_Ch_type>::char_class_type 00298 regex_traits<_Ch_type>:: 00299 lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const 00300 { 00301 typedef std::ctype<char_type> __ctype_type; 00302 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00303 00304 // Mappings from class name to class mask. 00305 static const pair<const char*, char_class_type> __classnames[] = 00306 { 00307 {"d", ctype_base::digit}, 00308 {"w", {ctype_base::alnum, _RegexMask::_S_under}}, 00309 {"s", ctype_base::space}, 00310 {"alnum", ctype_base::alnum}, 00311 {"alpha", ctype_base::alpha}, 00312 {"blank", {0, _RegexMask::_S_blank}}, 00313 {"cntrl", ctype_base::cntrl}, 00314 {"digit", ctype_base::digit}, 00315 {"graph", ctype_base::graph}, 00316 {"lower", ctype_base::lower}, 00317 {"print", ctype_base::print}, 00318 {"punct", ctype_base::punct}, 00319 {"space", ctype_base::space}, 00320 {"upper", ctype_base::upper}, 00321 {"xdigit", ctype_base::xdigit}, 00322 }; 00323 00324 string __s; 00325 for (; __first != __last; ++__first) 00326 __s += __fctyp.narrow(__fctyp.tolower(*__first), 0); 00327 00328 for (const auto& __it : __classnames) 00329 if (__s == __it.first) 00330 { 00331 if (__icase 00332 && ((__it.second 00333 & (ctype_base::lower | ctype_base::upper)) != 0)) 00334 return ctype_base::alpha; 00335 return __it.second; 00336 } 00337 return 0; 00338 } 00339 00340 template<typename _Ch_type> 00341 bool 00342 regex_traits<_Ch_type>:: 00343 isctype(_Ch_type __c, char_class_type __f) const 00344 { 00345 typedef std::ctype<char_type> __ctype_type; 00346 const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); 00347 00348 return __fctyp.is(__f._M_base, __c) 00349 // [[:w:]] 00350 || ((__f._M_extended & _RegexMask::_S_under) 00351 && __c == __fctyp.widen('_')) 00352 // [[:blank:]] 00353 || ((__f._M_extended & _RegexMask::_S_blank) 00354 && (__c == __fctyp.widen(' ') 00355 || __c == __fctyp.widen('\t'))); 00356 } 00357 00358 template<typename _Ch_type> 00359 int 00360 regex_traits<_Ch_type>:: 00361 value(_Ch_type __ch, int __radix) const 00362 { 00363 std::basic_istringstream<char_type> __is(string_type(1, __ch)); 00364 long __v; 00365 if (__radix == 8) 00366 __is >> std::oct; 00367 else if (__radix == 16) 00368 __is >> std::hex; 00369 __is >> __v; 00370 return __is.fail() ? -1 : __v; 00371 } 00372 00373 template<typename _Bi_iter, typename _Alloc> 00374 template<typename _Out_iter> 00375 _Out_iter match_results<_Bi_iter, _Alloc>:: 00376 format(_Out_iter __out, 00377 const match_results<_Bi_iter, _Alloc>::char_type* __fmt_first, 00378 const match_results<_Bi_iter, _Alloc>::char_type* __fmt_last, 00379 match_flag_type __flags) const 00380 { 00381 _GLIBCXX_DEBUG_ASSERT( ready() ); 00382 regex_traits<char_type> __traits; 00383 typedef std::ctype<char_type> __ctype_type; 00384 const __ctype_type& 00385 __fctyp(use_facet<__ctype_type>(__traits.getloc())); 00386 00387 auto __output = [&](size_t __idx) 00388 { 00389 auto& __sub = _Base_type::operator[](__idx); 00390 if (__sub.matched) 00391 __out = std::copy(__sub.first, __sub.second, __out); 00392 }; 00393 00394 if (__flags & regex_constants::format_sed) 00395 { 00396 for (; __fmt_first != __fmt_last;) 00397 if (*__fmt_first == '&') 00398 { 00399 __output(0); 00400 ++__fmt_first; 00401 } 00402 else if (*__fmt_first == '\\') 00403 { 00404 if (++__fmt_first != __fmt_last 00405 && __fctyp.is(__ctype_type::digit, *__fmt_first)) 00406 __output(__traits.value(*__fmt_first++, 10)); 00407 else 00408 *__out++ = '\\'; 00409 } 00410 else 00411 *__out++ = *__fmt_first++; 00412 } 00413 else 00414 { 00415 while (1) 00416 { 00417 auto __next = std::find(__fmt_first, __fmt_last, '$'); 00418 if (__next == __fmt_last) 00419 break; 00420 00421 __out = std::copy(__fmt_first, __next, __out); 00422 00423 auto __eat = [&](char __ch) -> bool 00424 { 00425 if (*__next == __ch) 00426 { 00427 ++__next; 00428 return true; 00429 } 00430 return false; 00431 }; 00432 00433 if (++__next == __fmt_last) 00434 *__out++ = '$'; 00435 else if (__eat('$')) 00436 *__out++ = '$'; 00437 else if (__eat('&')) 00438 __output(0); 00439 else if (__eat('`')) 00440 __output(_Base_type::size()-2); 00441 else if (__eat('\'')) 00442 __output(_Base_type::size()-1); 00443 else if (__fctyp.is(__ctype_type::digit, *__next)) 00444 { 00445 long __num = __traits.value(*__next, 10); 00446 if (++__next != __fmt_last 00447 && __fctyp.is(__ctype_type::digit, *__next)) 00448 { 00449 __num *= 10; 00450 __num += __traits.value(*__next++, 10); 00451 } 00452 if (0 <= __num && __num < this->size()) 00453 __output(__num); 00454 } 00455 else 00456 *__out++ = '$'; 00457 __fmt_first = __next; 00458 } 00459 __out = std::copy(__fmt_first, __fmt_last, __out); 00460 } 00461 return __out; 00462 } 00463 00464 template<typename _Out_iter, typename _Bi_iter, 00465 typename _Rx_traits, typename _Ch_type> 00466 _Out_iter 00467 regex_replace(_Out_iter __out, _Bi_iter __first, _Bi_iter __last, 00468 const basic_regex<_Ch_type, _Rx_traits>& __e, 00469 const _Ch_type* __fmt, 00470 regex_constants::match_flag_type __flags) 00471 { 00472 typedef regex_iterator<_Bi_iter, _Ch_type, _Rx_traits> _IterT; 00473 _IterT __i(__first, __last, __e, __flags); 00474 _IterT __end; 00475 if (__i == __end) 00476 { 00477 if (!(__flags & regex_constants::format_no_copy)) 00478 __out = std::copy(__first, __last, __out); 00479 } 00480 else 00481 { 00482 sub_match<_Bi_iter> __last; 00483 auto __len = char_traits<_Ch_type>::length(__fmt); 00484 for (; __i != __end; ++__i) 00485 { 00486 if (!(__flags & regex_constants::format_no_copy)) 00487 __out = std::copy(__i->prefix().first, __i->prefix().second, 00488 __out); 00489 __out = __i->format(__out, __fmt, __fmt + __len, __flags); 00490 __last = __i->suffix(); 00491 if (__flags & regex_constants::format_first_only) 00492 break; 00493 } 00494 if (!(__flags & regex_constants::format_no_copy)) 00495 __out = std::copy(__last.first, __last.second, __out); 00496 } 00497 return __out; 00498 } 00499 00500 template<typename _Bi_iter, 00501 typename _Ch_type, 00502 typename _Rx_traits> 00503 bool 00504 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00505 operator==(const regex_iterator& __rhs) const 00506 { 00507 return (_M_match.empty() && __rhs._M_match.empty()) 00508 || (_M_begin == __rhs._M_begin 00509 && _M_end == __rhs._M_end 00510 && _M_pregex == __rhs._M_pregex 00511 && _M_flags == __rhs._M_flags 00512 && _M_match[0] == __rhs._M_match[0]); 00513 } 00514 00515 template<typename _Bi_iter, 00516 typename _Ch_type, 00517 typename _Rx_traits> 00518 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00519 regex_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00520 operator++() 00521 { 00522 // In all cases in which the call to regex_search returns true, 00523 // match.prefix().first shall be equal to the previous value of 00524 // match[0].second, and for each index i in the half-open range 00525 // [0, match.size()) for which match[i].matched is true, 00526 // match[i].position() shall return distance(begin, match[i].first). 00527 // [28.12.1.4.5] 00528 if (_M_match[0].matched) 00529 { 00530 auto __start = _M_match[0].second; 00531 auto __prefix_first = _M_match[0].second; 00532 if (_M_match[0].first == _M_match[0].second) 00533 { 00534 if (__start == _M_end) 00535 { 00536 _M_match = value_type(); 00537 return *this; 00538 } 00539 else 00540 { 00541 if (regex_search(__start, _M_end, _M_match, *_M_pregex, 00542 _M_flags 00543 | regex_constants::match_not_null 00544 | regex_constants::match_continuous)) 00545 { 00546 _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched); 00547 auto& __prefix = _M_match.at(_M_match.size()); 00548 __prefix.first = __prefix_first; 00549 __prefix.matched = __prefix.first != __prefix.second; 00550 // [28.12.1.4.5] 00551 _M_match._M_begin = _M_begin; 00552 return *this; 00553 } 00554 else 00555 ++__start; 00556 } 00557 } 00558 _M_flags |= regex_constants::match_prev_avail; 00559 if (regex_search(__start, _M_end, _M_match, *_M_pregex, _M_flags)) 00560 { 00561 _GLIBCXX_DEBUG_ASSERT(_M_match[0].matched); 00562 auto& __prefix = _M_match.at(_M_match.size()); 00563 __prefix.first = __prefix_first; 00564 __prefix.matched = __prefix.first != __prefix.second; 00565 // [28.12.1.4.5] 00566 _M_match._M_begin = _M_begin; 00567 } 00568 else 00569 _M_match = value_type(); 00570 } 00571 return *this; 00572 } 00573 00574 template<typename _Bi_iter, 00575 typename _Ch_type, 00576 typename _Rx_traits> 00577 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00578 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00579 operator=(const regex_token_iterator& __rhs) 00580 { 00581 _M_position = __rhs._M_position; 00582 _M_subs = __rhs._M_subs; 00583 _M_n = __rhs._M_n; 00584 _M_suffix = __rhs._M_suffix; 00585 _M_has_m1 = __rhs._M_has_m1; 00586 _M_normalize_result(); 00587 return *this; 00588 } 00589 00590 template<typename _Bi_iter, 00591 typename _Ch_type, 00592 typename _Rx_traits> 00593 bool 00594 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00595 operator==(const regex_token_iterator& __rhs) const 00596 { 00597 if (_M_end_of_seq() && __rhs._M_end_of_seq()) 00598 return true; 00599 if (_M_suffix.matched && __rhs._M_suffix.matched 00600 && _M_suffix == __rhs._M_suffix) 00601 return true; 00602 if (_M_end_of_seq() || _M_suffix.matched 00603 || __rhs._M_end_of_seq() || __rhs._M_suffix.matched) 00604 return false; 00605 return _M_position == __rhs._M_position 00606 && _M_n == __rhs._M_n 00607 && _M_subs == __rhs._M_subs; 00608 } 00609 00610 template<typename _Bi_iter, 00611 typename _Ch_type, 00612 typename _Rx_traits> 00613 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>& 00614 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00615 operator++() 00616 { 00617 _Position __prev = _M_position; 00618 if (_M_suffix.matched) 00619 *this = regex_token_iterator(); 00620 else if (_M_n + 1 < _M_subs.size()) 00621 { 00622 _M_n++; 00623 _M_result = &_M_current_match(); 00624 } 00625 else 00626 { 00627 _M_n = 0; 00628 ++_M_position; 00629 if (_M_position != _Position()) 00630 _M_result = &_M_current_match(); 00631 else if (_M_has_m1 && __prev->suffix().length() != 0) 00632 { 00633 _M_suffix.matched = true; 00634 _M_suffix.first = __prev->suffix().first; 00635 _M_suffix.second = __prev->suffix().second; 00636 _M_result = &_M_suffix; 00637 } 00638 else 00639 *this = regex_token_iterator(); 00640 } 00641 return *this; 00642 } 00643 00644 template<typename _Bi_iter, 00645 typename _Ch_type, 00646 typename _Rx_traits> 00647 void 00648 regex_token_iterator<_Bi_iter, _Ch_type, _Rx_traits>:: 00649 _M_init(_Bi_iter __a, _Bi_iter __b) 00650 { 00651 _M_has_m1 = false; 00652 for (auto __it : _M_subs) 00653 if (__it == -1) 00654 { 00655 _M_has_m1 = true; 00656 break; 00657 } 00658 if (_M_position != _Position()) 00659 _M_result = &_M_current_match(); 00660 else if (_M_has_m1) 00661 { 00662 _M_suffix.matched = true; 00663 _M_suffix.first = __a; 00664 _M_suffix.second = __b; 00665 _M_result = &_M_suffix; 00666 } 00667 else 00668 _M_result = nullptr; 00669 } 00670 00671 _GLIBCXX_END_NAMESPACE_VERSION 00672 } // namespace 00673