libstdc++
regex_compiler.tcc
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.tcc
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 // FIXME make comments doxygen format.
00032 
00033 // This compiler refers to "Regular Expression Matching Can Be Simple And Fast"
00034 // (http://swtch.com/~rsc/regexp/regexp1.html"),
00035 // but doesn't strictly follow it.
00036 //
00037 // When compiling, states are *chained* instead of tree- or graph-constructed.
00038 // It's more like structured programs: there's if statement and loop statement.
00039 //
00040 // For alternative structure(say "a|b"), aka "if statement", two branchs should
00041 // be constructed. However, these two shall merge to an "end_tag" at the end of
00042 // this operator:
00043 //
00044 //                branch1
00045 //              /        \
00046 // => begin_tag            end_tag =>
00047 //              \        /
00048 //                branch2
00049 //
00050 // This is the difference between this implementation and that in Russ's
00051 // article.
00052 //
00053 // That's why we introduced dummy node here ------ "end_tag" is a dummy node.
00054 // All dummy node will be eliminated at the end of compiling process.
00055 
00056 namespace std _GLIBCXX_VISIBILITY(default)
00057 {
00058 namespace __detail
00059 {
00060 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00061 
00062   template<typename _TraitsT>
00063     _Compiler<_TraitsT>::
00064     _Compiler(_IterT __b, _IterT __e,
00065           const _TraitsT& __traits, _FlagT __flags)
00066     : _M_flags((__flags
00067         & (regex_constants::ECMAScript
00068            | regex_constants::basic
00069            | regex_constants::extended
00070            | regex_constants::grep
00071            | regex_constants::egrep
00072            | regex_constants::awk))
00073            ? __flags
00074            : __flags | regex_constants::ECMAScript),
00075     _M_traits(__traits),
00076     _M_ctype(std::use_facet<_CtypeT>(_M_traits.getloc())),
00077     _M_scanner(__b, __e, _M_flags, _M_traits.getloc()),
00078     _M_nfa(_M_flags)
00079     {
00080       _StateSeqT __r(_M_nfa, _M_nfa._M_start());
00081       __r._M_append(_M_nfa._M_insert_subexpr_begin());
00082       this->_M_disjunction();
00083       if (!_M_match_token(_ScannerT::_S_token_eof))
00084     __throw_regex_error(regex_constants::error_paren);
00085       __r._M_append(_M_pop());
00086       _GLIBCXX_DEBUG_ASSERT(_M_stack.empty());
00087       __r._M_append(_M_nfa._M_insert_subexpr_end());
00088       __r._M_append(_M_nfa._M_insert_accept());
00089       _M_nfa._M_eliminate_dummy();
00090     }
00091 
00092   template<typename _TraitsT>
00093     void
00094     _Compiler<_TraitsT>::
00095     _M_disjunction()
00096     {
00097       this->_M_alternative();
00098       while (_M_match_token(_ScannerT::_S_token_or))
00099     {
00100       _StateSeqT __alt1 = _M_pop();
00101       this->_M_alternative();
00102       _StateSeqT __alt2 = _M_pop();
00103       auto __end = _M_nfa._M_insert_dummy();
00104       __alt1._M_append(__end);
00105       __alt2._M_append(__end);
00106       _M_stack.push(_StateSeqT(_M_nfa,
00107                    _M_nfa._M_insert_alt(__alt1._M_start,
00108                             __alt2._M_start, false),
00109                    __end));
00110     }
00111     }
00112 
00113   template<typename _TraitsT>
00114     void
00115     _Compiler<_TraitsT>::
00116     _M_alternative()
00117     {
00118       if (this->_M_term())
00119     {
00120       _StateSeqT __re = _M_pop();
00121       this->_M_alternative();
00122       __re._M_append(_M_pop());
00123       _M_stack.push(__re);
00124     }
00125       else
00126     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_dummy()));
00127     }
00128 
00129   template<typename _TraitsT>
00130     bool
00131     _Compiler<_TraitsT>::
00132     _M_term()
00133     {
00134       if (this->_M_assertion())
00135     return true;
00136       if (this->_M_atom())
00137     {
00138       while (this->_M_quantifier());
00139       return true;
00140     }
00141       return false;
00142     }
00143 
00144   template<typename _TraitsT>
00145     bool
00146     _Compiler<_TraitsT>::
00147     _M_assertion()
00148     {
00149       if (_M_match_token(_ScannerT::_S_token_line_begin))
00150     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_begin()));
00151       else if (_M_match_token(_ScannerT::_S_token_line_end))
00152     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa._M_insert_line_end()));
00153       else if (_M_match_token(_ScannerT::_S_token_word_bound))
00154     // _M_value[0] == 'n' means it's negtive, say "not word boundary".
00155     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
00156           _M_insert_word_bound(_M_value[0] == 'n')));
00157       else if (_M_match_token(_ScannerT::_S_token_subexpr_lookahead_begin))
00158     {
00159       auto __neg = _M_value[0] == 'n';
00160       this->_M_disjunction();
00161       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00162         __throw_regex_error(regex_constants::error_paren);
00163       auto __tmp = _M_pop();
00164       __tmp._M_append(_M_nfa._M_insert_accept());
00165       _M_stack.push(
00166           _StateSeqT(
00167         _M_nfa,
00168         _M_nfa._M_insert_lookahead(__tmp._M_start, __neg)));
00169     }
00170       else
00171     return false;
00172       return true;
00173     }
00174 
00175   template<typename _TraitsT>
00176     bool
00177     _Compiler<_TraitsT>::
00178     _M_quantifier()
00179     {
00180       bool __neg = (_M_flags & regex_constants::ECMAScript);
00181       auto __init = [this, &__neg]()
00182     {
00183       if (_M_stack.empty())
00184         __throw_regex_error(regex_constants::error_badrepeat);
00185       __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00186     };
00187       if (_M_match_token(_ScannerT::_S_token_closure0))
00188     {
00189       __init();
00190       auto __e = _M_pop();
00191       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
00192                               __e._M_start, __neg));
00193       __e._M_append(__r);
00194       _M_stack.push(__r);
00195     }
00196       else if (_M_match_token(_ScannerT::_S_token_closure1))
00197     {
00198       __init();
00199       auto __e = _M_pop();
00200       __e._M_append(_M_nfa._M_insert_alt(_S_invalid_state_id, __e._M_start,
00201                          __neg));
00202       _M_stack.push(__e);
00203     }
00204       else if (_M_match_token(_ScannerT::_S_token_opt))
00205     {
00206       __init();
00207       auto __e = _M_pop();
00208       auto __end = _M_nfa._M_insert_dummy();
00209       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_alt(_S_invalid_state_id,
00210                               __e._M_start, __neg));
00211       __e._M_append(__end);
00212       __r._M_append(__end);
00213       _M_stack.push(__r);
00214     }
00215       else if (_M_match_token(_ScannerT::_S_token_interval_begin))
00216     {
00217       if (_M_stack.empty())
00218         __throw_regex_error(regex_constants::error_badrepeat);
00219       if (!_M_match_token(_ScannerT::_S_token_dup_count))
00220         __throw_regex_error(regex_constants::error_badbrace);
00221       _StateSeqT __r(_M_pop());
00222       _StateSeqT __e(_M_nfa, _M_nfa._M_insert_dummy());
00223       long __min_rep = _M_cur_int_value(10);
00224       bool __infi = false;
00225       long __n;
00226 
00227       // {3
00228       if (_M_match_token(_ScannerT::_S_token_comma))
00229         if (_M_match_token(_ScannerT::_S_token_dup_count)) // {3,7}
00230           __n = _M_cur_int_value(10) - __min_rep;
00231         else
00232           __infi = true;
00233       else
00234         __n = 0;
00235       if (!_M_match_token(_ScannerT::_S_token_interval_end))
00236         __throw_regex_error(regex_constants::error_brace);
00237 
00238       __neg = __neg && _M_match_token(_ScannerT::_S_token_opt);
00239 
00240       for (long __i = 0; __i < __min_rep; ++__i)
00241         __e._M_append(__r._M_clone());
00242 
00243       if (__infi)
00244         {
00245           auto __tmp = __r._M_clone();
00246           _StateSeqT __s(_M_nfa,
00247                  _M_nfa._M_insert_alt(_S_invalid_state_id,
00248                           __tmp._M_start, __neg));
00249           __tmp._M_append(__s);
00250           __e._M_append(__s);
00251         }
00252       else
00253         {
00254           if (__n < 0)
00255         __throw_regex_error(regex_constants::error_badbrace);
00256           auto __end = _M_nfa._M_insert_dummy();
00257           // _M_alt is the "match more" branch, and _M_next is the
00258           // "match less" one. Switch _M_alt and _M_next of all created
00259           // nodes. This is a hacking but IMO works well.
00260           std::stack<_StateIdT> __stack;
00261           for (long __i = 0; __i < __n; ++__i)
00262         {
00263           auto __tmp = __r._M_clone();
00264           auto __alt = _M_nfa._M_insert_alt(__tmp._M_start,
00265                             __end, __neg);
00266           __stack.push(__alt);
00267           __e._M_append(_StateSeqT(_M_nfa, __alt, __tmp._M_end));
00268         }
00269           __e._M_append(__end);
00270           while (!__stack.empty())
00271         {
00272           auto& __tmp = _M_nfa[__stack.top()];
00273           __stack.pop();
00274           std::swap(__tmp._M_next, __tmp._M_alt);
00275         }
00276         }
00277       _M_stack.push(__e);
00278     }
00279       else
00280     return false;
00281       return true;
00282     }
00283 
00284 #define __INSERT_REGEX_MATCHER(__func, args...)\
00285     do\
00286       if (!(_M_flags & regex_constants::icase))\
00287         if (!(_M_flags & regex_constants::collate))\
00288           __func<false, false>(args);\
00289         else\
00290           __func<false, true>(args);\
00291       else\
00292         if (!(_M_flags & regex_constants::collate))\
00293           __func<true, false>(args);\
00294         else\
00295           __func<true, true>(args);\
00296     while (false)
00297 
00298   template<typename _TraitsT>
00299     bool
00300     _Compiler<_TraitsT>::
00301     _M_atom()
00302     {
00303       if (_M_match_token(_ScannerT::_S_token_anychar))
00304     {
00305       if (!(_M_flags & regex_constants::ECMAScript))
00306         __INSERT_REGEX_MATCHER(_M_insert_any_matcher_posix);
00307       else
00308         __INSERT_REGEX_MATCHER(_M_insert_any_matcher_ecma);
00309     }
00310       else if (_M_try_char())
00311     __INSERT_REGEX_MATCHER(_M_insert_char_matcher);
00312       else if (_M_match_token(_ScannerT::_S_token_backref))
00313     _M_stack.push(_StateSeqT(_M_nfa, _M_nfa.
00314                  _M_insert_backref(_M_cur_int_value(10))));
00315       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00316     __INSERT_REGEX_MATCHER(_M_insert_character_class_matcher);
00317       else if (_M_match_token(_ScannerT::_S_token_subexpr_no_group_begin))
00318     {
00319       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_dummy());
00320       this->_M_disjunction();
00321       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00322         __throw_regex_error(regex_constants::error_paren);
00323       __r._M_append(_M_pop());
00324       _M_stack.push(__r);
00325     }
00326       else if (_M_match_token(_ScannerT::_S_token_subexpr_begin))
00327     {
00328       _StateSeqT __r(_M_nfa, _M_nfa._M_insert_subexpr_begin());
00329       this->_M_disjunction();
00330       if (!_M_match_token(_ScannerT::_S_token_subexpr_end))
00331         __throw_regex_error(regex_constants::error_paren);
00332       __r._M_append(_M_pop());
00333       __r._M_append(_M_nfa._M_insert_subexpr_end());
00334       _M_stack.push(__r);
00335     }
00336       else if (!_M_bracket_expression())
00337     return false;
00338       return true;
00339     }
00340 
00341   template<typename _TraitsT>
00342     bool
00343     _Compiler<_TraitsT>::
00344     _M_bracket_expression()
00345     {
00346       bool __neg =
00347     _M_match_token(_ScannerT::_S_token_bracket_neg_begin);
00348       if (!(__neg || _M_match_token(_ScannerT::_S_token_bracket_begin)))
00349     return false;
00350       __INSERT_REGEX_MATCHER(_M_insert_bracket_matcher, __neg);
00351       return true;
00352     }
00353 #undef __INSERT_REGEX_MATCHER
00354 
00355   template<typename _TraitsT>
00356   template<bool __icase, bool __collate>
00357     void
00358     _Compiler<_TraitsT>::
00359     _M_insert_any_matcher_ecma()
00360     {
00361       _M_stack.push(_StateSeqT(_M_nfa,
00362     _M_nfa._M_insert_matcher
00363       (_AnyMatcher<_TraitsT, true, __icase, __collate>
00364         (_M_traits))));
00365     }
00366 
00367   template<typename _TraitsT>
00368   template<bool __icase, bool __collate>
00369     void
00370     _Compiler<_TraitsT>::
00371     _M_insert_any_matcher_posix()
00372     {
00373       _M_stack.push(_StateSeqT(_M_nfa,
00374     _M_nfa._M_insert_matcher
00375       (_AnyMatcher<_TraitsT, false, __icase, __collate>
00376         (_M_traits))));
00377     }
00378 
00379   template<typename _TraitsT>
00380   template<bool __icase, bool __collate>
00381     void
00382     _Compiler<_TraitsT>::
00383     _M_insert_char_matcher()
00384     {
00385       _M_stack.push(_StateSeqT(_M_nfa,
00386     _M_nfa._M_insert_matcher
00387       (_CharMatcher<_TraitsT, __icase, __collate>
00388         (_M_value[0], _M_traits))));
00389     }
00390 
00391   template<typename _TraitsT>
00392   template<bool __icase, bool __collate>
00393     void
00394     _Compiler<_TraitsT>::
00395     _M_insert_character_class_matcher()
00396     {
00397       _GLIBCXX_DEBUG_ASSERT(_M_value.size() == 1);
00398       _BracketMatcher<_TraitsT, __icase, __collate> __matcher
00399     (_M_ctype.is(_CtypeT::upper, _M_value[0]), _M_traits);
00400       __matcher._M_add_character_class(_M_value, false);
00401       __matcher._M_ready();
00402       _M_stack.push(_StateSeqT(_M_nfa,
00403     _M_nfa._M_insert_matcher(std::move(__matcher))));
00404     }
00405 
00406   template<typename _TraitsT>
00407   template<bool __icase, bool __collate>
00408     void
00409     _Compiler<_TraitsT>::
00410     _M_insert_bracket_matcher(bool __neg)
00411     {
00412       _BracketMatcher<_TraitsT, __icase, __collate> __matcher(__neg, _M_traits);
00413       while (!_M_match_token(_ScannerT::_S_token_bracket_end))
00414     _M_expression_term(__matcher);
00415       __matcher._M_ready();
00416       _M_stack.push(_StateSeqT(_M_nfa,
00417                    _M_nfa._M_insert_matcher(std::move(__matcher))));
00418     }
00419 
00420   template<typename _TraitsT>
00421   template<bool __icase, bool __collate>
00422     void
00423     _Compiler<_TraitsT>::
00424     _M_expression_term(_BracketMatcher<_TraitsT, __icase, __collate>& __matcher)
00425     {
00426       if (_M_match_token(_ScannerT::_S_token_collsymbol))
00427     __matcher._M_add_collating_element(_M_value);
00428       else if (_M_match_token(_ScannerT::_S_token_equiv_class_name))
00429     __matcher._M_add_equivalence_class(_M_value);
00430       else if (_M_match_token(_ScannerT::_S_token_char_class_name))
00431     __matcher._M_add_character_class(_M_value, false);
00432       else if (_M_try_char()) // [a
00433     {
00434       auto __ch = _M_value[0];
00435       if (_M_try_char())
00436         {
00437           if (_M_value[0] == '-') // [a-
00438         {
00439           if (_M_try_char()) // [a-z]
00440             {
00441               __matcher._M_make_range(__ch, _M_value[0]);
00442               return;
00443             }
00444           // If the dash is the last character in the bracket
00445           // expression, it is not special.
00446           if (_M_scanner._M_get_token()
00447               != _ScannerT::_S_token_bracket_end)
00448             __throw_regex_error(regex_constants::error_range);
00449         }
00450           __matcher._M_add_char(_M_value[0]);
00451         }
00452       __matcher._M_add_char(__ch);
00453     }
00454       else if (_M_match_token(_ScannerT::_S_token_quoted_class))
00455     __matcher._M_add_character_class(_M_value,
00456                      _M_ctype.is(_CtypeT::upper,
00457                              _M_value[0]));
00458       else
00459     __throw_regex_error(regex_constants::error_brack);
00460     }
00461 
00462   template<typename _TraitsT>
00463     bool
00464     _Compiler<_TraitsT>::
00465     _M_try_char()
00466     {
00467       bool __is_char = false;
00468       if (_M_match_token(_ScannerT::_S_token_oct_num))
00469     {
00470       __is_char = true;
00471       _M_value.assign(1, _M_cur_int_value(8));
00472     }
00473       else if (_M_match_token(_ScannerT::_S_token_hex_num))
00474     {
00475       __is_char = true;
00476       _M_value.assign(1, _M_cur_int_value(16));
00477     }
00478       else if (_M_match_token(_ScannerT::_S_token_ord_char))
00479     __is_char = true;
00480       return __is_char;
00481     }
00482 
00483   template<typename _TraitsT>
00484     bool
00485     _Compiler<_TraitsT>::
00486     _M_match_token(_TokenT token)
00487     {
00488       if (token == _M_scanner._M_get_token())
00489     {
00490       _M_value = _M_scanner._M_get_value();
00491       _M_scanner._M_advance();
00492       return true;
00493     }
00494       return false;
00495     }
00496 
00497   template<typename _TraitsT>
00498     int
00499     _Compiler<_TraitsT>::
00500     _M_cur_int_value(int __radix)
00501     {
00502       long __v = 0;
00503       for (typename _StringT::size_type __i = 0;
00504        __i < _M_value.length(); ++__i)
00505     __v =__v * __radix + _M_traits.value(_M_value[__i], __radix);
00506       return __v;
00507     }
00508 
00509   template<typename _TraitsT, bool __icase, bool __collate>
00510     bool
00511     _BracketMatcher<_TraitsT, __icase, __collate>::
00512     _M_apply(_CharT __ch, false_type) const
00513     {
00514       bool __ret = false;
00515       if (std::find(_M_char_set.begin(), _M_char_set.end(),
00516             _M_translator._M_translate(__ch))
00517       != _M_char_set.end())
00518     __ret = true;
00519       else
00520     {
00521       auto __s = _M_translator._M_transform(__ch);
00522       for (auto& __it : _M_range_set)
00523         if (__it.first <= __s && __s <= __it.second)
00524           {
00525         __ret = true;
00526         break;
00527           }
00528       if (_M_traits.isctype(__ch, _M_class_set))
00529         __ret = true;
00530       else if (std::find(_M_equiv_set.begin(), _M_equiv_set.end(),
00531                  _M_traits.transform_primary(&__ch, &__ch+1))
00532            != _M_equiv_set.end())
00533         __ret = true;
00534       else
00535         {
00536           for (auto& __it : _M_neg_class_set)
00537         if (!_M_traits.isctype(__ch, __it))
00538           {
00539             __ret = true;
00540             break;
00541           }
00542         }
00543     }
00544       if (_M_is_non_matching)
00545     return !__ret;
00546       else
00547     return __ret;
00548     }
00549 
00550 _GLIBCXX_END_NAMESPACE_VERSION
00551 } // namespace __detail
00552 } // namespace