libstdc++
regex_nfa.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010-2013 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_nfa.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 namespace __detail
00034 {
00035 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00036 
00037   /**
00038    * @addtogroup regex-detail
00039    * @{
00040    */
00041 
00042   /// Base class for, um, automata.  Could be an NFA or a DFA.  Your choice.
00043   class _Automaton
00044   {
00045   public:
00046     typedef unsigned int _SizeT;
00047 
00048   public:
00049     virtual
00050     ~_Automaton() { }
00051 
00052     virtual _SizeT
00053     _M_sub_count() const = 0;
00054 
00055 #ifdef _GLIBCXX_DEBUG
00056     virtual std::ostream&
00057     _M_dot(std::ostream& __ostr) const = 0;
00058 #endif
00059   };
00060 
00061   /// Generic shared pointer to an automaton.  
00062   typedef std::shared_ptr<_Automaton> _AutomatonPtr;
00063 
00064   /// Operation codes that define the type of transitions within the base NFA
00065   /// that represents the regular expression.
00066   enum _Opcode
00067   {
00068       _S_opcode_unknown       =   0,
00069       _S_opcode_alternative   =   1,
00070       _S_opcode_subexpr_begin =   4,
00071       _S_opcode_subexpr_end   =   5,
00072       _S_opcode_match         = 100,
00073       _S_opcode_accept        = 255
00074   };
00075 
00076   /// Provides a generic facade for a templated match_results.
00077   struct _Results
00078   {
00079     virtual void _M_set_pos(int __i, int __j, const _PatternCursor& __p) = 0;
00080     virtual void _M_set_matched(int __i, bool __is_matched) = 0;
00081   };
00082 
00083   /// Tags current state (for subexpr begin/end).
00084   typedef std::function<void (const _PatternCursor&, _Results&)> _Tagger;
00085 
00086   /// Start state tag.
00087   template<typename _FwdIterT, typename _TraitsT>
00088     struct _StartTagger
00089     {
00090       explicit
00091       _StartTagger(int __i)
00092       : _M_index(__i)
00093       { }
00094 
00095       void
00096       operator()(const _PatternCursor& __pc, _Results& __r)
00097       { __r._M_set_pos(_M_index, 0, __pc); }
00098 
00099       int       _M_index;
00100     };
00101 
00102   /// End state tag.
00103   template<typename _FwdIterT, typename _TraitsT>
00104     struct _EndTagger
00105     {
00106       explicit
00107       _EndTagger(int __i)
00108       : _M_index(__i)
00109       { }
00110 
00111       void
00112       operator()(const _PatternCursor& __pc, _Results& __r)
00113       { __r._M_set_pos(_M_index, 1, __pc); }
00114 
00115       int       _M_index;
00116       _FwdIterT _M_pos;
00117     };
00118 
00119   /// Indicates if current state matches cursor current.
00120   typedef std::function<bool (const _PatternCursor&)> _Matcher;
00121 
00122   /// Matches any character
00123   inline bool
00124   _AnyMatcher(const _PatternCursor&)
00125   { return true; }
00126 
00127   /// Matches a single character
00128   template<typename _InIterT, typename _TraitsT>
00129     struct _CharMatcher
00130     {
00131       typedef typename _TraitsT::char_type char_type;
00132 
00133       explicit
00134       _CharMatcher(char_type __c, const _TraitsT& __t = _TraitsT())
00135       : _M_traits(__t), _M_c(_M_traits.translate(__c))
00136       { }
00137 
00138       bool
00139       operator()(const _PatternCursor& __pc) const
00140       {
00141     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00142     _CursorT __c = static_cast<_CursorT>(__pc);
00143     return _M_traits.translate(__c._M_current()) == _M_c;
00144       }
00145 
00146       const _TraitsT& _M_traits;
00147       char_type       _M_c;
00148     };
00149 
00150   /// Matches a character range (bracket expression)
00151   template<typename _InIterT, typename _TraitsT>
00152     struct _RangeMatcher
00153     {
00154       typedef typename _TraitsT::char_type _CharT;
00155       typedef std::basic_string<_CharT>    _StringT;
00156 
00157       explicit
00158       _RangeMatcher(bool __is_non_matching, const _TraitsT& __t = _TraitsT())
00159       : _M_traits(__t), _M_is_non_matching(__is_non_matching)
00160       { }
00161 
00162       bool
00163       operator()(const _PatternCursor& __pc) const
00164       {
00165     typedef const _SpecializedCursor<_InIterT>& _CursorT;
00166     _CursorT __c = static_cast<_CursorT>(__pc);
00167     return true;
00168       }
00169 
00170       void
00171       _M_add_char(_CharT __c)
00172       { }
00173 
00174       void
00175       _M_add_collating_element(const _StringT& __s)
00176       { }
00177 
00178       void
00179       _M_add_equivalence_class(const _StringT& __s)
00180       { }
00181 
00182       void
00183       _M_add_character_class(const _StringT& __s)
00184       { }
00185 
00186       void
00187       _M_make_range()
00188       { }
00189 
00190       const _TraitsT& _M_traits;
00191       bool            _M_is_non_matching;
00192     };
00193 
00194   /// Identifies a state in the NFA.
00195   typedef int _StateIdT;
00196 
00197   /// The special case in which a state identifier is not an index.
00198   static const _StateIdT _S_invalid_state_id  = -1;
00199 
00200 
00201   /**
00202    * @brief struct _State
00203    *
00204    * An individual state in an NFA
00205    *
00206    * In this case a "state" is an entry in the NFA definition coupled
00207    * with its outgoing transition(s).  All states have a single outgoing
00208    * transition, except for accepting states (which have no outgoing
00209    * transitions) and alt states, which have two outgoing transitions.
00210    */
00211   struct _State
00212   {
00213     typedef int  _OpcodeT;
00214 
00215     _OpcodeT     _M_opcode;    // type of outgoing transition
00216     _StateIdT    _M_next;      // outgoing transition
00217     _StateIdT    _M_alt;       // for _S_opcode_alternative
00218     unsigned int _M_subexpr;   // for _S_opcode_subexpr_*
00219     _Tagger      _M_tagger;    // for _S_opcode_subexpr_*
00220     _Matcher     _M_matches;   // for _S_opcode_match
00221 
00222     explicit _State(_OpcodeT __opcode)
00223     : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
00224     { }
00225 
00226     _State(const _Matcher& __m)
00227     : _M_opcode(_S_opcode_match), _M_next(_S_invalid_state_id), _M_matches(__m)
00228     { }
00229 
00230     _State(_OpcodeT __opcode, unsigned int __s, const _Tagger& __t)
00231     : _M_opcode(__opcode), _M_next(_S_invalid_state_id), _M_subexpr(__s),
00232       _M_tagger(__t)
00233     { }
00234 
00235     _State(_StateIdT __next, _StateIdT __alt)
00236     : _M_opcode(_S_opcode_alternative), _M_next(__next), _M_alt(__alt)
00237     { }
00238 
00239 #ifdef _GLIBCXX_DEBUG
00240     std::ostream&
00241     _M_print(std::ostream& ostr) const;
00242 
00243     // Prints graphviz dot commands for state.
00244     std::ostream&
00245     _M_dot(std::ostream& __ostr, _StateIdT __id) const;
00246 #endif
00247   };
00248 
00249   
00250   /// The Grep Matcher works on sets of states.  Here are sets of states.
00251   typedef std::set<_StateIdT> _StateSet;
00252 
00253   /**
00254    * @brief struct _Nfa
00255    *
00256    * A collection of all states making up an NFA.
00257    *
00258    * An NFA is a 4-tuple M = (K, S, s, F), where
00259    *    K is a finite set of states,
00260    *    S is the alphabet of the NFA,
00261    *    s is the initial state,
00262    *    F is a set of final (accepting) states.
00263    *
00264    * This NFA class is templated on S, a type that will hold values of the
00265    * underlying alphabet (without regard to semantics of that alphabet).  The
00266    * other elements of the tuple are generated during construction of the NFA
00267    * and are available through accessor member functions.
00268    */
00269   class _Nfa
00270   : public _Automaton, public std::vector<_State>
00271   {
00272   public:
00273     typedef _State                              _StateT;
00274     typedef unsigned int                        _SizeT;
00275     typedef regex_constants::syntax_option_type _FlagT;
00276 
00277     _Nfa(_FlagT __f)
00278     : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0)
00279     { }
00280 
00281     ~_Nfa()
00282     { }
00283 
00284     _FlagT
00285     _M_options() const
00286     { return _M_flags; }
00287 
00288     _StateIdT
00289     _M_start() const
00290     { return _M_start_state; }
00291 
00292     const _StateSet&
00293     _M_final_states() const
00294     { return _M_accepting_states; }
00295 
00296     _SizeT
00297     _M_sub_count() const
00298     { return _M_subexpr_count; }
00299 
00300     _StateIdT
00301     _M_insert_accept()
00302     {
00303       this->push_back(_StateT(_S_opcode_accept));
00304       _M_accepting_states.insert(this->size()-1);
00305       return this->size()-1;
00306     }
00307 
00308     _StateIdT
00309     _M_insert_alt(_StateIdT __next, _StateIdT __alt)
00310     {
00311       this->push_back(_StateT(__next, __alt));
00312       return this->size()-1;
00313     }
00314 
00315     _StateIdT
00316     _M_insert_matcher(_Matcher __m)
00317     {
00318       this->push_back(_StateT(__m));
00319       return this->size()-1;
00320     }
00321 
00322     _StateIdT
00323     _M_insert_subexpr_begin(const _Tagger& __t)
00324     {
00325       this->push_back(_StateT(_S_opcode_subexpr_begin, _M_subexpr_count++,
00326                   __t));
00327       return this->size()-1;
00328     }
00329 
00330     _StateIdT 
00331     _M_insert_subexpr_end(unsigned int __i, const _Tagger& __t)
00332     {
00333       this->push_back(_StateT(_S_opcode_subexpr_end, __i, __t));
00334       return this->size()-1;
00335     }
00336 
00337 #ifdef _GLIBCXX_DEBUG
00338     std::ostream&
00339     _M_dot(std::ostream& __ostr) const;
00340 #endif
00341 
00342   private:
00343     _FlagT     _M_flags;
00344     _StateIdT  _M_start_state;
00345     _StateSet  _M_accepting_states;
00346     _SizeT     _M_subexpr_count;
00347   };
00348 
00349   /// Describes a sequence of one or more %_State, its current start
00350   /// and end(s).  This structure contains fragments of an NFA during
00351   /// construction.
00352   class _StateSeq
00353   {
00354   public:
00355     // Constructs a single-node sequence
00356     _StateSeq(_Nfa& __ss, _StateIdT __s, _StateIdT __e = _S_invalid_state_id)
00357     : _M_nfa(__ss), _M_start(__s), _M_end1(__s), _M_end2(__e)
00358     { }
00359     // Constructs a split sequence from two other sequencces
00360     _StateSeq(const _StateSeq& __e1, const _StateSeq& __e2)
00361     : _M_nfa(__e1._M_nfa),
00362       _M_start(_M_nfa._M_insert_alt(__e1._M_start, __e2._M_start)),
00363       _M_end1(__e1._M_end1), _M_end2(__e2._M_end1)
00364     { }
00365 
00366     // Constructs a split sequence from a single sequence
00367     _StateSeq(const _StateSeq& __e, _StateIdT __id)
00368     : _M_nfa(__e._M_nfa),
00369       _M_start(_M_nfa._M_insert_alt(__id, __e._M_start)),
00370       _M_end1(__id), _M_end2(__e._M_end1)
00371     { }
00372 
00373     // Constructs a copy of a %_StateSeq
00374     _StateSeq(const _StateSeq& __rhs)
00375     : _M_nfa(__rhs._M_nfa), _M_start(__rhs._M_start),
00376       _M_end1(__rhs._M_end1), _M_end2(__rhs._M_end2)
00377     { }
00378 
00379 
00380     _StateSeq& operator=(const _StateSeq& __rhs);
00381 
00382     _StateIdT
00383     _M_front() const
00384     { return _M_start; }
00385 
00386     // Extends a sequence by one.
00387     void
00388     _M_push_back(_StateIdT __id);
00389 
00390     // Extends and maybe joins a sequence.
00391     void
00392     _M_append(_StateIdT __id);
00393 
00394     void
00395     _M_append(_StateSeq& __rhs);
00396 
00397     // Clones an entire sequence.
00398     _StateIdT
00399     _M_clone();
00400 
00401   private:
00402     _Nfa&     _M_nfa;
00403     _StateIdT _M_start;
00404     _StateIdT _M_end1;
00405     _StateIdT _M_end2;
00406 
00407   };
00408 
00409  //@} regex-detail
00410 _GLIBCXX_END_NAMESPACE_VERSION
00411 } // namespace __detail
00412 } // namespace std
00413 
00414 #include <bits/regex_nfa.tcc>
00415