1  // class template regex -*- C++ -*-
       2  
       3  // Copyright (C) 2013-2023 Free Software Foundation, Inc.
       4  //
       5  // This file is part of the GNU ISO C++ Library.  This library is free
       6  // software; you can redistribute it and/or modify it under the
       7  // terms of the GNU General Public License as published by the
       8  // Free Software Foundation; either version 3, or (at your option)
       9  // any later version.
      10  
      11  // This library is distributed in the hope that it will be useful,
      12  // but WITHOUT ANY WARRANTY; without even the implied warranty of
      13  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      14  // GNU General Public License for more details.
      15  
      16  // Under Section 7 of GPL version 3, you are granted additional
      17  // permissions described in the GCC Runtime Library Exception, version
      18  // 3.1, as published by the Free Software Foundation.
      19  
      20  // You should have received a copy of the GNU General Public License and
      21  // a copy of the GCC Runtime Library Exception along with this program;
      22  // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
      23  // <http://www.gnu.org/licenses/>.
      24  
      25  /**
      26   *  @file bits/regex_automaton.h
      27   *  This is an internal header file, included by other library headers.
      28   *  Do not attempt to use it directly. @headername{regex}
      29   */
      30  
      31  // This macro defines the maximal state number a NFA can have.
      32  #ifndef _GLIBCXX_REGEX_STATE_LIMIT
      33  #define _GLIBCXX_REGEX_STATE_LIMIT 100000
      34  #endif
      35  
      36  namespace std _GLIBCXX_VISIBILITY(default)
      37  {
      38  _GLIBCXX_BEGIN_NAMESPACE_VERSION
      39  
      40  namespace __detail
      41  {
      42    /**
      43     *  @defgroup regex-detail Base and Implementation Classes
      44     *  @ingroup regex
      45     *  @{
      46     */
      47  
      48    typedef long _StateIdT;
      49    _GLIBCXX17_INLINE constexpr _StateIdT _S_invalid_state_id  = -1;
      50  
      51    template<typename _CharT>
      52      using _Matcher = std::function<bool (_CharT)>;
      53  
      54    /// Operation codes that define the type of transitions within the base NFA
      55    /// that represents the regular expression.
      56    enum _Opcode : int
      57    {
      58        _S_opcode_unknown,
      59        _S_opcode_alternative,
      60        _S_opcode_repeat,
      61        _S_opcode_backref,
      62        _S_opcode_line_begin_assertion,
      63        _S_opcode_line_end_assertion,
      64        _S_opcode_word_boundary,
      65        _S_opcode_subexpr_lookahead,
      66        _S_opcode_subexpr_begin,
      67        _S_opcode_subexpr_end,
      68        _S_opcode_dummy,
      69        _S_opcode_match,
      70        _S_opcode_accept,
      71    };
      72  
      73    struct _State_base
      74    {
      75    protected:
      76      _Opcode      _M_opcode;           // type of outgoing transition
      77  
      78    public:
      79      _StateIdT    _M_next;             // outgoing transition
      80      union // Since they are mutually exclusive.
      81      {
      82        size_t _M_subexpr;        // for _S_opcode_subexpr_*
      83        size_t _M_backref_index;  // for _S_opcode_backref
      84        struct
      85        {
      86  	// for _S_opcode_alternative, _S_opcode_repeat and
      87  	// _S_opcode_subexpr_lookahead
      88  	_StateIdT  _M_alt;
      89  	// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
      90  	// quantifiers (ungreedy if set true)
      91  	bool       _M_neg;
      92        };
      93        // For _S_opcode_match
      94        __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage;
      95      };
      96  
      97    protected:
      98      explicit _State_base(_Opcode __opcode) noexcept
      99      : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
     100      { }
     101  
     102    public:
     103      bool
     104      _M_has_alt() const noexcept
     105      {
     106        return _M_opcode == _S_opcode_alternative
     107  	|| _M_opcode == _S_opcode_repeat
     108  	|| _M_opcode == _S_opcode_subexpr_lookahead;
     109      }
     110  
     111  #ifdef _GLIBCXX_DEBUG
     112      std::ostream&
     113      _M_print(std::ostream& __ostr) const;
     114  
     115      // Prints graphviz dot commands for state.
     116      std::ostream&
     117      _M_dot(std::ostream& __ostr, _StateIdT __id) const;
     118  #endif
     119    };
     120  
     121    template<typename _Char_type>
     122      struct _State : _State_base
     123      {
     124        typedef _Matcher<_Char_type> _MatcherT;
     125        static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>),
     126  		    "std::function<bool(T)> has the same size as "
     127  		    "std::function<bool(char)>");
     128        static_assert(alignof(_MatcherT) == alignof(_Matcher<char>),
     129  		    "std::function<bool(T)> has the same alignment as "
     130  		    "std::function<bool(char)>");
     131  
     132        explicit
     133        _State(_Opcode __opcode) noexcept : _State_base(__opcode)
     134        {
     135  	if (_M_opcode() == _S_opcode_match)
     136  	  new (this->_M_matcher_storage._M_addr()) _MatcherT();
     137        }
     138  
     139        _State(const _State& __rhs) : _State_base(__rhs)
     140        {
     141  	if (__rhs._M_opcode() == _S_opcode_match)
     142  	  new (this->_M_matcher_storage._M_addr())
     143  	    _MatcherT(__rhs._M_get_matcher());
     144        }
     145  
     146        _State(_State&& __rhs) noexcept : _State_base(__rhs)
     147        {
     148  	if (__rhs._M_opcode() == _S_opcode_match)
     149  	  new (this->_M_matcher_storage._M_addr())
     150  	    _MatcherT(std::move(__rhs._M_get_matcher()));
     151        }
     152  
     153        _State&
     154        operator=(const _State&) = delete;
     155  
     156        ~_State()
     157        {
     158  	if (_M_opcode() == _S_opcode_match)
     159  	  _M_get_matcher().~_MatcherT();
     160        }
     161  
     162        // Since correct ctor and dtor rely on _M_opcode, it's better not to
     163        // change it over time.
     164        _Opcode
     165        _M_opcode() const noexcept
     166        { return _State_base::_M_opcode; }
     167  
     168        bool
     169        _M_matches(_Char_type __char) const
     170        { return _M_get_matcher()(__char); }
     171  
     172        _MatcherT&
     173        _M_get_matcher() noexcept
     174        { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); }
     175  
     176        const _MatcherT&
     177        _M_get_matcher() const noexcept
     178        {
     179  	return *static_cast<const _MatcherT*>(
     180  	    this->_M_matcher_storage._M_addr());
     181        }
     182      };
     183  
     184    struct _NFA_base
     185    {
     186      typedef regex_constants::syntax_option_type _FlagT;
     187  
     188      explicit
     189      _NFA_base(_FlagT __f) noexcept
     190      : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0),
     191      _M_has_backref(false)
     192      { }
     193  
     194      _NFA_base(_NFA_base&&) = default;
     195  
     196    protected:
     197      ~_NFA_base() = default;
     198  
     199    public:
     200      _FlagT
     201      _M_options() const noexcept
     202      { return _M_flags; }
     203  
     204      _StateIdT
     205      _M_start() const noexcept
     206      { return _M_start_state; }
     207  
     208      size_t
     209      _M_sub_count() const noexcept
     210      { return _M_subexpr_count; }
     211  
     212      _GLIBCXX_STD_C::vector<size_t> _M_paren_stack;
     213      _FlagT                    _M_flags;
     214      _StateIdT                 _M_start_state;
     215      size_t                    _M_subexpr_count;
     216      bool                      _M_has_backref;
     217    };
     218  
     219    template<typename _TraitsT>
     220      struct _NFA
     221      : _NFA_base, _GLIBCXX_STD_C::vector<_State<typename _TraitsT::char_type>>
     222      {
     223        typedef typename _TraitsT::char_type	_Char_type;
     224        typedef _State<_Char_type>		_StateT;
     225        typedef _Matcher<_Char_type>		_MatcherT;
     226  
     227        _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
     228        : _NFA_base(__flags)
     229        { _M_traits.imbue(__loc); }
     230  
     231        // for performance reasons _NFA objects should only be moved not copied
     232        _NFA(const _NFA&) = delete;
     233        _NFA(_NFA&&) = default;
     234  
     235        _StateIdT
     236        _M_insert_accept()
     237        {
     238  	auto __ret = _M_insert_state(_StateT(_S_opcode_accept));
     239  	return __ret;
     240        }
     241  
     242        _StateIdT
     243        _M_insert_alt(_StateIdT __next, _StateIdT __alt,
     244  		    bool __neg __attribute__((__unused__)))
     245        {
     246  	_StateT __tmp(_S_opcode_alternative);
     247  	// It labels every quantifier to make greedy comparison easier in BFS
     248  	// approach.
     249  	__tmp._M_next = __next;
     250  	__tmp._M_alt = __alt;
     251  	return _M_insert_state(std::move(__tmp));
     252        }
     253  
     254        _StateIdT
     255        _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg)
     256        {
     257  	_StateT __tmp(_S_opcode_repeat);
     258  	// It labels every quantifier to make greedy comparison easier in BFS
     259  	// approach.
     260  	__tmp._M_next = __next;
     261  	__tmp._M_alt = __alt;
     262  	__tmp._M_neg = __neg;
     263  	return _M_insert_state(std::move(__tmp));
     264        }
     265  
     266        _StateIdT
     267        _M_insert_matcher(_MatcherT __m)
     268        {
     269  	_StateT __tmp(_S_opcode_match);
     270  	__tmp._M_get_matcher() = std::move(__m);
     271  	return _M_insert_state(std::move(__tmp));
     272        }
     273  
     274        _StateIdT
     275        _M_insert_subexpr_begin()
     276        {
     277  	auto __id = this->_M_subexpr_count++;
     278  	this->_M_paren_stack.push_back(__id);
     279  	_StateT __tmp(_S_opcode_subexpr_begin);
     280  	__tmp._M_subexpr = __id;
     281  	return _M_insert_state(std::move(__tmp));
     282        }
     283  
     284        _StateIdT
     285        _M_insert_subexpr_end()
     286        {
     287  	_StateT __tmp(_S_opcode_subexpr_end);
     288  	__tmp._M_subexpr = this->_M_paren_stack.back();
     289  	this->_M_paren_stack.pop_back();
     290  	return _M_insert_state(std::move(__tmp));
     291        }
     292  
     293        _StateIdT
     294        _M_insert_backref(size_t __index);
     295  
     296        _StateIdT
     297        _M_insert_line_begin()
     298        { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); }
     299  
     300        _StateIdT
     301        _M_insert_line_end()
     302        { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); }
     303  
     304        _StateIdT
     305        _M_insert_word_bound(bool __neg)
     306        {
     307  	_StateT __tmp(_S_opcode_word_boundary);
     308  	__tmp._M_neg = __neg;
     309  	return _M_insert_state(std::move(__tmp));
     310        }
     311  
     312        _StateIdT
     313        _M_insert_lookahead(_StateIdT __alt, bool __neg)
     314        {
     315  	_StateT __tmp(_S_opcode_subexpr_lookahead);
     316  	__tmp._M_alt = __alt;
     317  	__tmp._M_neg = __neg;
     318  	return _M_insert_state(std::move(__tmp));
     319        }
     320  
     321        _StateIdT
     322        _M_insert_dummy()
     323        { return _M_insert_state(_StateT(_S_opcode_dummy)); }
     324  
     325        _StateIdT
     326        _M_insert_state(_StateT __s)
     327        {
     328  	this->push_back(std::move(__s));
     329  	if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT)
     330  	  __throw_regex_error(
     331  	    regex_constants::error_space,
     332  	    "Number of NFA states exceeds limit. Please use shorter regex "
     333  	    "string, or use smaller brace expression, or make "
     334  	    "_GLIBCXX_REGEX_STATE_LIMIT larger.");
     335  	return this->size() - 1;
     336        }
     337  
     338        // Eliminate dummy node in this NFA to make it compact.
     339        void
     340        _M_eliminate_dummy();
     341  
     342  #ifdef _GLIBCXX_DEBUG
     343        std::ostream&
     344        _M_dot(std::ostream& __ostr) const;
     345  #endif
     346      public:
     347        _TraitsT                  _M_traits;
     348      };
     349  
     350    /// Describes a sequence of one or more %_State, its current start
     351    /// and end(s).  This structure contains fragments of an NFA during
     352    /// construction.
     353    template<typename _TraitsT>
     354      class _StateSeq
     355      {
     356      public:
     357        typedef _NFA<_TraitsT> _RegexT;
     358  
     359      public:
     360        _StateSeq(_RegexT& __nfa, _StateIdT __s)
     361        : _M_nfa(__nfa), _M_start(__s), _M_end(__s)
     362        { }
     363  
     364        _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end)
     365        : _M_nfa(__nfa), _M_start(__s), _M_end(__end)
     366        { }
     367  
     368        // Append a state on *this and change *this to the new sequence.
     369        void
     370        _M_append(_StateIdT __id)
     371        {
     372  	_M_nfa[_M_end]._M_next = __id;
     373  	_M_end = __id;
     374        }
     375  
     376        // Append a sequence on *this and change *this to the new sequence.
     377        void
     378        _M_append(const _StateSeq& __s)
     379        {
     380  	_M_nfa[_M_end]._M_next = __s._M_start;
     381  	_M_end = __s._M_end;
     382        }
     383  
     384        // Clones an entire sequence.
     385        _StateSeq
     386        _M_clone();
     387  
     388      public:
     389        _RegexT&  _M_nfa;
     390        _StateIdT _M_start;
     391        _StateIdT _M_end;
     392      };
     393  
     394   ///@} regex-detail
     395  } // namespace __detail
     396  
     397  _GLIBCXX_END_NAMESPACE_VERSION
     398  } // namespace std
     399  
     400  #include <bits/regex_automaton.tcc>