libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2016 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_compiler.tcc 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // FIXME make comments doxygen format. 00032 00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast" 00034 // (http://swtch.com/~rsc/regexp/regexp1.html"), 00035 // but doesn't strictly follow it. 00036 // 00037 // When compiling, states are *chained* instead of tree- or graph-constructed. 00038 // It's more like structured programs: there's if statement and loop statement. 00039 // 00040 // For alternative structure (say "a|b"), aka "if statement", two branches 00041 // should be constructed. However, these two shall merge to an "end_tag" at 00042 // the end of this operator: 00043 // 00044 // branch1 00045 // / \ 00046 // => begin_tag end_tag => 00047 // \ / 00048 // branch2 00049 // 00050 // This is the difference between this implementation and that in Russ's 00051 // article. 00052 // 00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node. 00054 // All dummy node will be eliminated at the end of compiling process. 00055 00056 namespace std _GLIBCXX_VISIBILITY(default) 00057 { 00058 namespace __detail 00059 { 00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00061 00062 template<typename _TraitsT> 00063 _Compiler<_TraitsT>:: 00064 _Compiler(_IterT __b, _IterT __e, 00065 const typename _TraitsT::locale_type& __loc, _FlagT __flags) 00066 : _M_flags((__flags 00067 & (regex_constants::ECMAScript 00068 | regex_constants::basic 00069 | regex_constants::extended 00070 | regex_constants::grep 00071 | regex_constants::egrep 00072 | regex_constants::awk)) 00073 ? __flags 00074 : __flags | regex_constants::ECMAScript), 00075 _M_scanner(__b, __e, _M_flags, __loc), 00076 _M_nfa(make_shared<_RegexT>(__loc, _M_flags)), 00077 _M_traits(_M_nfa->_M_traits), 00078 _M_ctype(std::use_facet<_CtypeT>(__loc)) 00079 { 00080 _StateSeqT __r(*_M_nfa, _M_nfa->_M_start()); 00081 __r._M_append(_M_nfa->_M_insert_subexpr_begin()); 00082 this->_M_disjunction(); 00083 if (!_M_match_token(_ScannerT::_S_token_eof)) 00084 __throw_regex_error(regex_constants::error_paren); 00085 __r._M_append(_M_pop()); 00086 __glibcxx_assert(_M_stack.empty()); 00087 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 00088 __r._M_append(_M_nfa->_M_insert_accept()); 00089 _M_nfa->_M_eliminate_dummy(); 00090 } 00091 00092 template<typename _TraitsT> 00093 void 00094 _Compiler<_TraitsT>:: 00095 _M_disjunction() 00096 { 00097 this->_M_alternative(); 00098 while (_M_match_token(_ScannerT::_S_token_or)) 00099 { 00100 _StateSeqT __alt1 = _M_pop(); 00101 this->_M_alternative(); 00102 _StateSeqT __alt2 = _M_pop(); 00103 auto __end = _M_nfa->_M_insert_dummy(); 00104 __alt1._M_append(__end); 00105 __alt2._M_append(__end); 00106 // __alt2 is state._M_next, __alt1 is state._M_alt. The executor 00107 // executes _M_alt before _M_next, as well as executing left 00108 // alternative before right one. 00109 _M_stack.push(_StateSeqT(*_M_nfa, 00110 _M_nfa->_M_insert_alt( 00111 __alt2._M_start, __alt1._M_start, false), 00112 __end)); 00113 } 00114 } 00115 00116 template<typename _TraitsT> 00117 void 00118 _Compiler<_TraitsT>:: 00119 _M_alternative() 00120 { 00121 if (this->_M_term()) 00122 { 00123 _StateSeqT __re = _M_pop(); 00124 this->_M_alternative(); 00125 __re._M_append(_M_pop()); 00126 _M_stack.push(__re); 00127 } 00128 else 00129 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_dummy())); 00130 } 00131 00132 template<typename _TraitsT> 00133 bool 00134 _Compiler<_TraitsT>:: 00135 _M_term() 00136 { 00137 if (this->_M_assertion()) 00138 return true; 00139 if (this->_M_atom()) 00140 { 00141 while (this->_M_quantifier()); 00142 return true; 00143 } 00144 return false; 00145 } 00146 00147 template<typename _TraitsT> 00148 bool 00149 _Compiler<_TraitsT>:: 00150 _M_assertion() 00151 { 00152 if (_M_match_token(_ScannerT::_S_token_line_begin)) 00153 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_begin())); 00154 else if (_M_match_token(_ScannerT::_S_token_line_end)) 00155 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa->_M_insert_line_end())); 00156 else if (_M_match_token(_ScannerT::_S_token_word_bound)) 00157 // _M_value[0] == 'n' means it's negative, say "not word boundary". 00158 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 00159 _M_insert_word_bound(_M_value[0] == 'n'))); 00160 else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin)) 00161 { 00162 auto __neg = _M_value[0] == 'n'; 00163 this->_M_disjunction(); 00164 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00165 __throw_regex_error(regex_constants::error_paren, 00166 "Parenthesis is not closed."); 00167 auto __tmp = _M_pop(); 00168 __tmp._M_append(_M_nfa->_M_insert_accept()); 00169 _M_stack.push( 00170 _StateSeqT( 00171 *_M_nfa, 00172 _M_nfa->_M_insert_lookahead(__tmp._M_start, __neg))); 00173 } 00174 else 00175 return false; 00176 return true; 00177 } 00178 00179 template<typename _TraitsT> 00180 bool 00181 _Compiler<_TraitsT>:: 00182 _M_quantifier() 00183 { 00184 bool __neg = (_M_flags & regex_constants::ECMAScript); 00185 auto __init = [this, &__neg]() 00186 { 00187 if (_M_stack.empty()) 00188 __throw_regex_error(regex_constants::error_badrepeat, 00189 "Nothing to repeat before a quantifier."); 00190 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00191 }; 00192 if (_M_match_token(_ScannerT::_S_token_closure0)) 00193 { 00194 __init(); 00195 auto __e = _M_pop(); 00196 _StateSeqT __r(*_M_nfa, 00197 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00198 __e._M_start, __neg)); 00199 __e._M_append(__r); 00200 _M_stack.push(__r); 00201 } 00202 else if (_M_match_token(_ScannerT::_S_token_closure1)) 00203 { 00204 __init(); 00205 auto __e = _M_pop(); 00206 __e._M_append(_M_nfa->_M_insert_repeat(_S_invalid_state_id, 00207 __e._M_start, __neg)); 00208 _M_stack.push(__e); 00209 } 00210 else if (_M_match_token(_ScannerT::_S_token_opt)) 00211 { 00212 __init(); 00213 auto __e = _M_pop(); 00214 auto __end = _M_nfa->_M_insert_dummy(); 00215 _StateSeqT __r(*_M_nfa, 00216 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00217 __e._M_start, __neg)); 00218 __e._M_append(__end); 00219 __r._M_append(__end); 00220 _M_stack.push(__r); 00221 } 00222 else if (_M_match_token(_ScannerT::_S_token_interval_begin)) 00223 { 00224 if (_M_stack.empty()) 00225 __throw_regex_error(regex_constants::error_badrepeat, 00226 "Nothing to repeat before a quantifier."); 00227 if (!_M_match_token(_ScannerT::_S_token_dup_count)) 00228 __throw_regex_error(regex_constants::error_badbrace, 00229 "Unexpected token in brace expression."); 00230 _StateSeqT __r(_M_pop()); 00231 _StateSeqT __e(*_M_nfa, _M_nfa->_M_insert_dummy()); 00232 long __min_rep = _M_cur_int_value(10); 00233 bool __infi = false; 00234 long __n; 00235 00236 // {3 00237 if (_M_match_token(_ScannerT::_S_token_comma)) 00238 if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7} 00239 __n = _M_cur_int_value(10) - __min_rep; 00240 else 00241 __infi = true; 00242 else 00243 __n = 0; 00244 if (!_M_match_token(_ScannerT::_S_token_interval_end)) 00245 __throw_regex_error(regex_constants::error_brace, 00246 "Unexpected end of brace expression."); 00247 00248 __neg = __neg && _M_match_token(_ScannerT::_S_token_opt); 00249 00250 for (long __i = 0; __i < __min_rep; ++__i) 00251 __e._M_append(__r._M_clone()); 00252 00253 if (__infi) 00254 { 00255 auto __tmp = __r._M_clone(); 00256 _StateSeqT __s(*_M_nfa, 00257 _M_nfa->_M_insert_repeat(_S_invalid_state_id, 00258 __tmp._M_start, __neg)); 00259 __tmp._M_append(__s); 00260 __e._M_append(__s); 00261 } 00262 else 00263 { 00264 if (__n < 0) 00265 __throw_regex_error(regex_constants::error_badbrace, 00266 "Invalid range in brace expression."); 00267 auto __end = _M_nfa->_M_insert_dummy(); 00268 // _M_alt is the "match more" branch, and _M_next is the 00269 // "match less" one. Switch _M_alt and _M_next of all created 00270 // nodes. This is a hack but IMO works well. 00271 std::stack<_StateIdT> __stack; 00272 for (long __i = 0; __i < __n; ++__i) 00273 { 00274 auto __tmp = __r._M_clone(); 00275 auto __alt = _M_nfa->_M_insert_repeat(__tmp._M_start, 00276 __end, __neg); 00277 __stack.push(__alt); 00278 __e._M_append(_StateSeqT(*_M_nfa, __alt, __tmp._M_end)); 00279 } 00280 __e._M_append(__end); 00281 while (!__stack.empty()) 00282 { 00283 auto& __tmp = (*_M_nfa)[__stack.top()]; 00284 __stack.pop(); 00285 std::swap(__tmp._M_next, __tmp._M_alt); 00286 } 00287 } 00288 _M_stack.push(__e); 00289 } 00290 else 00291 return false; 00292 return true; 00293 } 00294 00295 #define __INSERT_REGEX_MATCHER(__func, args...)\ 00296 do\ 00297 if (!(_M_flags & regex_constants::icase))\ 00298 if (!(_M_flags & regex_constants::collate))\ 00299 __func<false, false>(args);\ 00300 else\ 00301 __func<false, true>(args);\ 00302 else\ 00303 if (!(_M_flags & regex_constants::collate))\ 00304 __func<true, false>(args);\ 00305 else\ 00306 __func<true, true>(args);\ 00307 while (false) 00308 00309 template<typename _TraitsT> 00310 bool 00311 _Compiler<_TraitsT>:: 00312 _M_atom() 00313 { 00314 if (_M_match_token(_ScannerT::_S_token_anychar)) 00315 { 00316 if (!(_M_flags & regex_constants::ECMAScript)) 00317 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix); 00318 else 00319 __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma); 00320 } 00321 else if (_M_try_char()) 00322 __INSERT_REGEX_MATCHER(_M_insert_char_matcher); 00323 else if (_M_match_token(_ScannerT::_S_token_backref)) 00324 _M_stack.push(_StateSeqT(*_M_nfa, _M_nfa-> 00325 _M_insert_backref(_M_cur_int_value(10)))); 00326 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00327 __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher); 00328 else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin)) 00329 { 00330 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_dummy()); 00331 this->_M_disjunction(); 00332 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00333 __throw_regex_error(regex_constants::error_paren, 00334 "Parenthesis is not closed."); 00335 __r._M_append(_M_pop()); 00336 _M_stack.push(__r); 00337 } 00338 else if (_M_match_token(_ScannerT::_S_token_subexpr_begin)) 00339 { 00340 _StateSeqT __r(*_M_nfa, _M_nfa->_M_insert_subexpr_begin()); 00341 this->_M_disjunction(); 00342 if (!_M_match_token(_ScannerT::_S_token_subexpr_end)) 00343 __throw_regex_error(regex_constants::error_paren, 00344 "Parenthesis is not closed."); 00345 __r._M_append(_M_pop()); 00346 __r._M_append(_M_nfa->_M_insert_subexpr_end()); 00347 _M_stack.push(__r); 00348 } 00349 else if (!_M_bracket_expression()) 00350 return false; 00351 return true; 00352 } 00353 00354 template<typename _TraitsT> 00355 bool 00356 _Compiler<_TraitsT>:: 00357 _M_bracket_expression() 00358 { 00359 bool __neg = 00360 _M_match_token(_ScannerT::_S_token_bracket_neg_begin); 00361 if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin))) 00362 return false; 00363 __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg); 00364 return true; 00365 } 00366 #undef __INSERT_REGEX_MATCHER 00367 00368 template<typename _TraitsT> 00369 template<bool __icase, bool __collate> 00370 void 00371 _Compiler<_TraitsT>:: 00372 _M_insert_any_matcher_ecma() 00373 { 00374 _M_stack.push(_StateSeqT(*_M_nfa, 00375 _M_nfa->_M_insert_matcher 00376 (_AnyMatcher<_TraitsT, true, __icase, __collate> 00377 (_M_traits)))); 00378 } 00379 00380 template<typename _TraitsT> 00381 template<bool __icase, bool __collate> 00382 void 00383 _Compiler<_TraitsT>:: 00384 _M_insert_any_matcher_posix() 00385 { 00386 _M_stack.push(_StateSeqT(*_M_nfa, 00387 _M_nfa->_M_insert_matcher 00388 (_AnyMatcher<_TraitsT, false, __icase, __collate> 00389 (_M_traits)))); 00390 } 00391 00392 template<typename _TraitsT> 00393 template<bool __icase, bool __collate> 00394 void 00395 _Compiler<_TraitsT>:: 00396 _M_insert_char_matcher() 00397 { 00398 _M_stack.push(_StateSeqT(*_M_nfa, 00399 _M_nfa->_M_insert_matcher 00400 (_CharMatcher<_TraitsT, __icase, __collate> 00401 (_M_value[0], _M_traits)))); 00402 } 00403 00404 template<typename _TraitsT> 00405 template<bool __icase, bool __collate> 00406 void 00407 _Compiler<_TraitsT>:: 00408 _M_insert_character_class_matcher() 00409 { 00410 __glibcxx_assert(_M_value.size() == 1); 00411 _BracketMatcher<_TraitsT, __icase, __collate> __matcher 00412 (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits); 00413 __matcher._M_add_character_class(_M_value, false); 00414 __matcher._M_ready(); 00415 _M_stack.push(_StateSeqT(*_M_nfa, 00416 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 00417 } 00418 00419 template<typename _TraitsT> 00420 template<bool __icase, bool __collate> 00421 void 00422 _Compiler<_TraitsT>:: 00423 _M_insert_bracket_matcher(bool __neg) 00424 { 00425 _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits); 00426 pair<bool, _CharT> __last_char; // Optional<_CharT> 00427 __last_char.first = false; 00428 if (!(_M_flags & regex_constants::ECMAScript)) 00429 if (_M_try_char()) 00430 { 00431 __matcher._M_add_char(_M_value[0]); 00432 __last_char.first = true; 00433 __last_char.second = _M_value[0]; 00434 } 00435 while (_M_expression_term(__last_char, __matcher)); 00436 __matcher._M_ready(); 00437 _M_stack.push(_StateSeqT( 00438 *_M_nfa, 00439 _M_nfa->_M_insert_matcher(std::move(__matcher)))); 00440 } 00441 00442 template<typename _TraitsT> 00443 template<bool __icase, bool __collate> 00444 bool 00445 _Compiler<_TraitsT>:: 00446 _M_expression_term(pair<bool, _CharT>& __last_char, 00447 _BracketMatcher<_TraitsT, __icase, __collate>& __matcher) 00448 { 00449 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00450 return false; 00451 00452 if (_M_match_token(_ScannerT::_S_token_collsymbol)) 00453 { 00454 auto __symbol = __matcher._M_add_collate_element(_M_value); 00455 if (__symbol.size() == 1) 00456 { 00457 __last_char.first = true; 00458 __last_char.second = __symbol[0]; 00459 } 00460 } 00461 else if (_M_match_token(_ScannerT::_S_token_equiv_class_name)) 00462 __matcher._M_add_equivalence_class(_M_value); 00463 else if (_M_match_token(_ScannerT::_S_token_char_class_name)) 00464 __matcher._M_add_character_class(_M_value, false); 00465 // POSIX doesn't allow '-' as a start-range char (say [a-z--0]), 00466 // except when the '-' is the first or last character in the bracket 00467 // expression ([--0]). ECMAScript treats all '-' after a range as a 00468 // normal character. Also see above, where _M_expression_term gets called. 00469 // 00470 // As a result, POSIX rejects [-----], but ECMAScript doesn't. 00471 // Boost (1.57.0) always uses POSIX style even in its ECMAScript syntax. 00472 // Clang (3.5) always uses ECMAScript style even in its POSIX syntax. 00473 // 00474 // It turns out that no one reads BNFs ;) 00475 else if (_M_try_char()) 00476 { 00477 if (!__last_char.first) 00478 { 00479 __matcher._M_add_char(_M_value[0]); 00480 if (_M_value[0] == '-' 00481 && !(_M_flags & regex_constants::ECMAScript)) 00482 { 00483 if (_M_match_token(_ScannerT::_S_token_bracket_end)) 00484 return false; 00485 __throw_regex_error( 00486 regex_constants::error_range, 00487 "Unexpected dash in bracket expression. For POSIX syntax, " 00488 "a dash is not treated literally only when it is at " 00489 "beginning or end."); 00490 } 00491 __last_char.first = true; 00492 __last_char.second = _M_value[0]; 00493 } 00494 else 00495 { 00496 if (_M_value[0] == '-') 00497 { 00498 if (_M_try_char()) 00499 { 00500 __matcher._M_make_range(__last_char.second , _M_value[0]); 00501 __last_char.first = false; 00502 } 00503 else 00504 { 00505 if (_M_scanner._M_get_token() 00506 != _ScannerT::_S_token_bracket_end) 00507 __throw_regex_error( 00508 regex_constants::error_range, 00509 "Unexpected end of bracket expression."); 00510 __matcher._M_add_char(_M_value[0]); 00511 } 00512 } 00513 else 00514 { 00515 __matcher._M_add_char(_M_value[0]); 00516 __last_char.second = _M_value[0]; 00517 } 00518 } 00519 } 00520 else if (_M_match_token(_ScannerT::_S_token_quoted_class)) 00521 __matcher._M_add_character_class(_M_value, 00522 _M_ctype.is(_CtypeT::upper, 00523 _M_value[0])); 00524 else 00525 __throw_regex_error(regex_constants::error_brack, 00526 "Unexpected character in bracket expression."); 00527 00528 return true; 00529 } 00530 00531 template<typename _TraitsT> 00532 bool 00533 _Compiler<_TraitsT>:: 00534 _M_try_char() 00535 { 00536 bool __is_char = false; 00537 if (_M_match_token(_ScannerT::_S_token_oct_num)) 00538 { 00539 __is_char = true; 00540 _M_value.assign(1, _M_cur_int_value(8)); 00541 } 00542 else if (_M_match_token(_ScannerT::_S_token_hex_num)) 00543 { 00544 __is_char = true; 00545 _M_value.assign(1, _M_cur_int_value(16)); 00546 } 00547 else if (_M_match_token(_ScannerT::_S_token_ord_char)) 00548 __is_char = true; 00549 return __is_char; 00550 } 00551 00552 template<typename _TraitsT> 00553 bool 00554 _Compiler<_TraitsT>:: 00555 _M_match_token(_TokenT token) 00556 { 00557 if (token == _M_scanner._M_get_token()) 00558 { 00559 _M_value = _M_scanner._M_get_value(); 00560 _M_scanner._M_advance(); 00561 return true; 00562 } 00563 return false; 00564 } 00565 00566 template<typename _TraitsT> 00567 int 00568 _Compiler<_TraitsT>:: 00569 _M_cur_int_value(int __radix) 00570 { 00571 long __v = 0; 00572 for (typename _StringT::size_type __i = 0; 00573 __i < _M_value.length(); ++__i) 00574 __v =__v * __radix + _M_traits.value(_M_value[__i], __radix); 00575 return __v; 00576 } 00577 00578 template<typename _TraitsT, bool __icase, bool __collate> 00579 bool 00580 _BracketMatcher<_TraitsT, __icase, __collate>:: 00581 _M_apply(_CharT __ch, false_type) const 00582 { 00583 bool __ret = std::binary_search(_M_char_set.begin(), _M_char_set.end(), 00584 _M_translator._M_translate(__ch)); 00585 if (!__ret) 00586 { 00587 auto __s = _M_translator._M_transform(__ch); 00588 for (auto& __it : _M_range_set) 00589 if (__it.first <= __s && __s <= __it.second) 00590 { 00591 __ret = true; 00592 break; 00593 } 00594 if (_M_traits.isctype(__ch, _M_class_set)) 00595 __ret = true; 00596 else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(), 00597 _M_traits.transform_primary(&__ch, &__ch+1)) 00598 != _M_equiv_set.end()) 00599 __ret = true; 00600 else 00601 { 00602 for (auto& __it : _M_neg_class_set) 00603 if (!_M_traits.isctype(__ch, __it)) 00604 { 00605 __ret = true; 00606 break; 00607 } 00608 } 00609 } 00610 if (_M_is_non_matching) 00611 return !__ret; 00612 else 00613 return __ret; 00614 } 00615 00616 _GLIBCXX_END_NAMESPACE_VERSION 00617 } // namespace __detail 00618 } // namespace