| /*============================================================================= |
| Boost.Wave: A Standard compliant C++ preprocessor library |
| |
| Spirit based lexer |
| |
| http://www.boost.org/ |
| |
| Copyright (c) 2001, Daniel C. Nuffer. |
| Copyright (c) 2001-2010 Hartmut Kaiser. |
| Distributed under the Boost Software License, Version 1.0. (See accompanying |
| file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| |
| TODO List: |
| X callback objects (called when a match is made.) |
| X callback passed first & last iterator, and |
| a reference to a lexercontrol object that supports some |
| operations on the lexer. |
| set state |
| terminate |
| state stack (push, pop, top) |
| set new token return value |
| ignore the current token |
| yymore |
| get length of matched token |
| get current lexer state |
| X DFA serialization to save recomputing the DFA. |
| |
| lexer states. |
| organize the file into hpp and ipp. arrange the classes in a logical order. |
| use buffering - iterator only needs be an input iterator, |
| lexer & callback are not templatized on iterator type, but char type. |
| next_token is templatized on iterator type. |
| beginning/ending contexts. |
| ^ and $ |
| DFA minimization. |
| DFA table compression. |
| |
| =============================================================================*/ |
| #ifndef BOOST_SPIRIT_LEXER_HPP |
| #define BOOST_SPIRIT_LEXER_HPP |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| #include <boost/throw_exception.hpp> |
| |
| #include <boost/spirit/include/classic_core.hpp> |
| #include <boost/spirit/include/classic_symbols.hpp> |
| #include <boost/spirit/include/classic_chset.hpp> |
| #include <boost/spirit/include/classic_escape_char.hpp> |
| |
| #include <set> |
| #include <map> |
| #include <vector> |
| #include <stack> |
| #include <utility> // for pair |
| #include <iostream> |
| #include <fstream> |
| #include <boost/assert.hpp> |
| #include <boost/limits.hpp> |
| |
| #if defined(BOOST_NO_STD_ITERATOR_TRAITS) |
| #define BOOST_SPIRIT_IT_NS impl |
| #else |
| #define BOOST_SPIRIT_IT_NS std |
| #endif |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| namespace boost { |
| namespace spirit { |
| namespace classic { |
| |
| typedef unsigned char uchar; |
| typedef unsigned int node_id_t; |
| const node_id_t invalid_node = node_id_t(-1); |
| typedef std::set<node_id_t> node_set; |
| typedef std::vector<uchar> uchar_vector; |
| typedef std::map<node_id_t, node_set> followpos_t; |
| typedef std::vector<uchar_vector> state_match_t; |
| |
| template <typename TokenT> |
| class lexer_control; |
| |
| class bad_regex : public std::exception |
| { |
| }; |
| |
| namespace lexerimpl |
| { |
| |
| class node |
| { |
| public: |
| |
| virtual ~node() {} |
| |
| virtual node* clone() const = 0; |
| virtual bool nullable() const = 0; |
| virtual node_set firstpos() const = 0; |
| virtual node_set lastpos() const = 0; |
| virtual void compute_followpos(followpos_t& followpos) const = 0; |
| virtual void compute_state_match(state_match_t& state_match) const = 0; |
| virtual void get_eof_ids(node_set& eof_set) const = 0; |
| virtual void assign_node_ids(node_id_t& node_count) = 0; |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const = 0; |
| #endif |
| |
| }; |
| |
| class char_node : public node |
| { |
| |
| public: |
| |
| char_node(const uchar c); |
| char_node(const char_node& x); |
| virtual ~char_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| uchar m_char; |
| node_id_t m_node_num; |
| }; |
| |
| inline |
| char_node::char_node(const uchar c) |
| : node() |
| , m_char(c) |
| , m_node_num(0) |
| { |
| } |
| |
| inline |
| char_node::char_node(const char_node& x) |
| : node(x) |
| , m_char(x.m_char) |
| , m_node_num(x.m_node_num) |
| { |
| } |
| |
| inline node * |
| char_node::clone() const |
| { |
| return new char_node(*this); |
| } |
| |
| inline bool |
| char_node::nullable() const |
| { |
| return false; |
| } |
| |
| inline node_set |
| char_node::firstpos() const |
| { |
| node_set rval; |
| rval.insert(m_node_num); |
| return rval; |
| } |
| |
| inline node_set |
| char_node::lastpos() const |
| { |
| return firstpos(); |
| } |
| |
| inline void |
| char_node::compute_followpos(followpos_t&) const |
| { |
| return; |
| } |
| |
| inline void |
| char_node::compute_state_match(state_match_t& state_match) const |
| { |
| if (state_match.size() < m_node_num + 1) |
| state_match.resize(m_node_num + 1); |
| state_match[m_node_num].resize(256); |
| state_match[m_node_num][m_char] = 1; |
| } |
| |
| inline void |
| char_node::get_eof_ids(node_set&) const |
| { |
| return; |
| } |
| |
| inline void |
| char_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_node_num = node_count++; |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| char_node::dump(std::ostream& out) const |
| { |
| out << "\nchar_node m_char = " << m_char; |
| out << " m_node_num = " << m_node_num; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| } |
| #endif |
| |
| |
| class epsilon_node : public node |
| { |
| |
| public: |
| |
| epsilon_node(); |
| epsilon_node(const epsilon_node& x); |
| virtual ~epsilon_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| node_id_t m_node_num; |
| }; |
| |
| inline |
| epsilon_node::epsilon_node() |
| : node() |
| , m_node_num(0) |
| { |
| } |
| |
| inline |
| epsilon_node::epsilon_node(const epsilon_node& x) |
| : node(x) |
| , m_node_num(x.m_node_num) |
| { |
| } |
| |
| inline node * |
| epsilon_node::clone() const |
| { |
| return new epsilon_node(*this); |
| } |
| |
| inline bool |
| epsilon_node::nullable() const |
| { |
| return true; |
| } |
| |
| inline node_set |
| epsilon_node::firstpos() const |
| { |
| return node_set(); |
| } |
| |
| inline node_set |
| epsilon_node::lastpos() const |
| { |
| return node_set(); |
| } |
| |
| inline void |
| epsilon_node::compute_followpos(followpos_t&) const |
| { |
| return; |
| } |
| |
| inline void |
| epsilon_node::compute_state_match(state_match_t& state_match) const |
| { |
| if (state_match.size() < m_node_num + 1) |
| state_match.resize(m_node_num + 1); |
| state_match[m_node_num].resize(256, 1); |
| } |
| |
| inline void |
| epsilon_node::get_eof_ids(node_set&) const |
| { |
| return; |
| } |
| |
| inline void |
| epsilon_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_node_num = node_count++; |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| epsilon_node::dump(std::ostream& out) const |
| { |
| out << "\nepsilon_node"; |
| out << " m_node_num = " << m_node_num; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| } |
| #endif |
| |
| |
| class or_node : public node |
| { |
| |
| public: |
| |
| or_node(node* left, node* right); |
| or_node(const or_node& x); |
| virtual ~or_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| std::auto_ptr<node> m_left; |
| std::auto_ptr<node> m_right; |
| }; |
| |
| inline |
| or_node::or_node(node* left, node* right) |
| : node() |
| , m_left(left) |
| , m_right(right) |
| { |
| } |
| |
| inline |
| or_node::or_node(const or_node& x) |
| : node(x) |
| , m_left(x.m_left->clone()) |
| , m_right(x.m_right->clone()) |
| { |
| } |
| |
| inline node * |
| or_node::clone() const |
| { |
| return new or_node(m_left->clone(), m_right->clone()); |
| } |
| |
| inline bool |
| or_node::nullable() const |
| { |
| return m_left->nullable() || m_right->nullable(); |
| } |
| |
| inline node_set |
| or_node::firstpos() const |
| { |
| node_set rval; |
| node_set l = m_left->firstpos(); |
| node_set r = m_right->firstpos(); |
| std::set_union(l.begin(), l.end(), r.begin(), r.end(), |
| std::inserter(rval, rval.begin())); |
| return rval; |
| } |
| |
| inline node_set |
| or_node::lastpos() const |
| { |
| node_set rval; |
| node_set l = m_left->lastpos(); |
| node_set r = m_right->lastpos(); |
| std::set_union(l.begin(), l.end(), r.begin(), r.end(), |
| std::inserter(rval, rval.begin())); |
| return rval; |
| } |
| |
| inline void |
| or_node::compute_followpos(followpos_t& followpos) const |
| { |
| m_left->compute_followpos(followpos); |
| m_right->compute_followpos(followpos); |
| } |
| |
| inline void |
| or_node::compute_state_match(state_match_t& state_match) const |
| { |
| m_left->compute_state_match(state_match); |
| m_right->compute_state_match(state_match); |
| } |
| |
| inline void |
| or_node::get_eof_ids(node_set& eof_nodes) const |
| { |
| m_left->get_eof_ids(eof_nodes); |
| m_right->get_eof_ids(eof_nodes); |
| } |
| |
| inline void |
| or_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_left->assign_node_ids(node_count); |
| m_right->assign_node_ids(node_count); |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| or_node::dump(std::ostream& out) const |
| { |
| m_left->dump(out); |
| |
| out << "\nor_node"; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| m_right->dump(out); |
| } |
| #endif |
| |
| |
| class cat_node : public node |
| { |
| |
| public: |
| |
| cat_node(node* left, node* right); |
| cat_node(const cat_node& x); |
| virtual ~cat_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| std::auto_ptr<node> m_left; |
| std::auto_ptr<node> m_right; |
| }; |
| |
| inline |
| cat_node::cat_node(node* left, node* right) |
| : node() |
| , m_left(left) |
| , m_right(right) |
| { |
| } |
| |
| inline |
| cat_node::cat_node(const cat_node& x) |
| : node(x) |
| , m_left(x.m_left->clone()) |
| , m_right(x.m_right->clone()) |
| { |
| } |
| |
| inline node * |
| cat_node::clone() const |
| { |
| return new cat_node(m_left->clone(), m_right->clone()); |
| } |
| |
| inline bool |
| cat_node::nullable() const |
| { |
| return m_left->nullable() && m_right->nullable(); |
| } |
| |
| inline node_set |
| cat_node::firstpos() const |
| { |
| if (m_left->nullable()) |
| { |
| node_set rval; |
| node_set l = m_left->firstpos(); |
| node_set r = m_right->firstpos(); |
| std::set_union(l.begin(), l.end(), r.begin(), r.end(), |
| std::inserter(rval, rval.begin())); |
| return rval; |
| } |
| else |
| { |
| return m_left->firstpos(); |
| } |
| } |
| |
| inline node_set |
| cat_node::lastpos() const |
| { |
| if (m_right->nullable()) |
| { |
| node_set rval; |
| node_set l = m_left->lastpos(); |
| node_set r = m_right->lastpos(); |
| std::set_union(l.begin(), l.end(), r.begin(), r.end(), |
| std::inserter(rval, rval.begin())); |
| return rval; |
| } |
| else |
| { |
| return m_right->lastpos(); |
| } |
| } |
| |
| inline void |
| cat_node::compute_followpos(followpos_t& followpos) const |
| { |
| node_set l = m_left->lastpos(); |
| for (node_set::iterator i = l.begin(); |
| i != l.end(); |
| ++i) |
| { |
| node_set rf = m_right->firstpos(); |
| followpos[*i].insert(rf.begin(), rf.end()); |
| } |
| |
| m_left->compute_followpos(followpos); |
| m_right->compute_followpos(followpos); |
| } |
| |
| inline void |
| cat_node::compute_state_match(state_match_t& state_match) const |
| { |
| m_left->compute_state_match(state_match); |
| m_right->compute_state_match(state_match); |
| } |
| |
| inline void |
| cat_node::get_eof_ids(node_set& eof_nodes) const |
| { |
| m_left->get_eof_ids(eof_nodes); |
| m_right->get_eof_ids(eof_nodes); |
| } |
| |
| inline void |
| cat_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_left->assign_node_ids(node_count); |
| m_right->assign_node_ids(node_count); |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| cat_node::dump(std::ostream& out) const |
| { |
| m_left->dump(out); |
| |
| out << "\ncat_node"; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| m_right->dump(out); |
| } |
| #endif |
| |
| |
| class star_node : public node |
| { |
| |
| public: |
| |
| star_node(node* left); |
| star_node(const star_node& x); |
| virtual ~star_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| std::auto_ptr<node> m_left; |
| }; |
| |
| inline |
| star_node::star_node(node* left) |
| : node() |
| , m_left(left) |
| { |
| } |
| |
| inline |
| star_node::star_node(const star_node& x) |
| : node(x) |
| , m_left(x.m_left->clone()) |
| { |
| } |
| |
| inline node * |
| star_node::clone() const |
| { |
| return new star_node(m_left->clone()); |
| } |
| |
| inline bool |
| star_node::nullable() const |
| { |
| return true; |
| } |
| |
| inline node_set |
| star_node::firstpos() const |
| { |
| return m_left->firstpos(); |
| } |
| |
| inline node_set |
| star_node::lastpos() const |
| { |
| return m_left->lastpos(); |
| } |
| |
| inline void |
| star_node::compute_followpos(followpos_t& followpos) const |
| { |
| node_set lp = this->lastpos(); |
| for (node_set::iterator i = lp.begin(); |
| i != lp.end(); |
| ++i) |
| { |
| node_set fp = this->firstpos(); |
| followpos[*i].insert(fp.begin(), fp.end()); |
| } |
| |
| m_left->compute_followpos(followpos); |
| } |
| |
| inline void |
| star_node::compute_state_match(state_match_t& state_match) const |
| { |
| m_left->compute_state_match(state_match); |
| } |
| |
| inline void |
| star_node::get_eof_ids(node_set& eof_nodes) const |
| { |
| m_left->get_eof_ids(eof_nodes); |
| } |
| |
| inline void |
| star_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_left->assign_node_ids(node_count); |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| star_node::dump(std::ostream& out) const |
| { |
| m_left->dump(out); |
| |
| out << "\nstar_node"; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| } |
| #endif |
| |
| |
| class eof_node : public node |
| { |
| |
| public: |
| |
| eof_node(); |
| eof_node(const eof_node& x); |
| virtual ~eof_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| node_id_t m_node_num; |
| }; |
| |
| inline |
| eof_node::eof_node() |
| : node() |
| , m_node_num(0) |
| { |
| } |
| |
| inline |
| eof_node::eof_node(const eof_node& x) |
| : node(x) |
| , m_node_num(x.m_node_num) |
| { |
| } |
| |
| inline node * |
| eof_node::clone() const |
| { |
| return new eof_node(*this); |
| } |
| |
| inline bool |
| eof_node::nullable() const |
| { |
| return false; |
| } |
| |
| inline node_set |
| eof_node::firstpos() const |
| { |
| node_set rval; |
| rval.insert(m_node_num); |
| return rval; |
| } |
| |
| inline node_set |
| eof_node::lastpos() const |
| { |
| node_set rval; |
| rval.insert(m_node_num); |
| return rval; |
| } |
| |
| inline void |
| eof_node::compute_followpos(followpos_t&) const |
| { |
| return; |
| } |
| |
| inline void |
| eof_node::compute_state_match(state_match_t& state_match) const |
| { |
| if (state_match.size() < m_node_num + 1) |
| state_match.resize(m_node_num + 1); |
| state_match[m_node_num].resize(256, 0); |
| } |
| |
| inline void |
| eof_node::get_eof_ids(node_set& eof_nodes) const |
| { |
| eof_nodes.insert(m_node_num); |
| } |
| |
| inline void |
| eof_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_node_num = node_count++; |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| eof_node::dump(std::ostream& out) const |
| { |
| out << "\neof_node"; |
| out << " m_node_num = " << m_node_num; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| } |
| #endif |
| |
| class ccl_node : public node |
| { |
| |
| public: |
| |
| ccl_node(const std::vector<uchar>& v); |
| ccl_node(const uchar c1, const uchar c2); |
| ccl_node(const ccl_node& x); |
| virtual ~ccl_node(){} |
| |
| virtual node* clone() const; |
| virtual bool nullable() const; |
| virtual node_set firstpos() const; |
| virtual node_set lastpos() const; |
| virtual void compute_followpos(followpos_t& followpos) const; |
| virtual void compute_state_match(state_match_t& state_match ) const; |
| virtual void get_eof_ids(node_set& eof_set) const; |
| virtual void assign_node_ids(node_id_t& node_count); |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| virtual void dump(std::ostream& out) const; |
| #endif |
| |
| private: |
| |
| std::vector<uchar> m_match; |
| node_id_t m_node_num; |
| }; |
| |
| inline |
| ccl_node::ccl_node(const std::vector<uchar>& v) |
| : node() |
| , m_match(v) |
| , m_node_num(0) |
| { |
| m_match.resize(256); // make sure it's the right size |
| } |
| |
| inline |
| ccl_node::ccl_node(const uchar c1, const uchar c2) |
| : node() |
| , m_match(256, uchar(0)) |
| , m_node_num(0) |
| { |
| BOOST_ASSERT(c1 < c2); |
| for (std::size_t i = c1; i <= std::size_t(c2); ++i) |
| { |
| m_match[i] = 1; |
| } |
| } |
| |
| inline |
| ccl_node::ccl_node(const ccl_node& x) |
| : node(x) |
| , m_match(x.m_match) |
| , m_node_num(x.m_node_num) |
| { |
| } |
| |
| inline node * |
| ccl_node::clone() const |
| { |
| return new ccl_node(*this); |
| } |
| |
| inline bool |
| ccl_node::nullable() const |
| { |
| return false; |
| } |
| |
| inline node_set |
| ccl_node::firstpos() const |
| { |
| node_set rval; |
| rval.insert(m_node_num); |
| return rval; |
| } |
| |
| inline node_set |
| ccl_node::lastpos() const |
| { |
| return firstpos(); |
| } |
| |
| inline void |
| ccl_node::compute_followpos(followpos_t&) const |
| { |
| return; |
| } |
| |
| inline void |
| ccl_node::compute_state_match(state_match_t& state_match) const |
| { |
| if (state_match.size() < m_node_num + 1) |
| state_match.resize(m_node_num + 1); |
| state_match[m_node_num] = m_match; |
| } |
| |
| inline void |
| ccl_node::get_eof_ids(node_set&) const |
| { |
| return; |
| } |
| |
| inline void |
| ccl_node::assign_node_ids(node_id_t& node_count) |
| { |
| m_node_num = node_count++; |
| } |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| inline void |
| ccl_node::dump(std::ostream& out) const |
| { |
| out << "\nccl_node m_match = "; |
| for (std::size_t i = 0; i < m_match.size(); ++i) |
| { |
| if (m_match[i]) |
| out << i << ", "; |
| } |
| out << " m_node_num = " << m_node_num; |
| out << " nullable() = " << (nullable() ? "true" : "false"); |
| out << " firstpos() = "; |
| node_set fp = firstpos(); |
| std::copy(fp.begin(), fp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| |
| out << " lastpos() = "; |
| node_set lp = lastpos(); |
| std::copy(lp.begin(), lp.end(), |
| std::ostream_iterator<node_id_t>(out, ",")); |
| } |
| #endif |
| |
| template <typename ScannerT> |
| class make_concat |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| |
| make_concat(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const &, iterator_type const &) const |
| { |
| node* right = m_stack.top(); |
| m_stack.pop(); |
| node* left = m_stack.top(); |
| m_stack.pop(); |
| node* newnode = new cat_node(left, right); |
| m_stack.push(newnode); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| template <int CharTSize> |
| struct get_byte_aux; |
| |
| template<> |
| struct get_byte_aux<1> |
| { |
| template <typename CharT> |
| unsigned char operator()(CharT c, unsigned int byte) |
| { |
| BOOST_ASSERT(byte == 0); |
| return c; |
| } |
| }; |
| |
| template<> |
| struct get_byte_aux<2> |
| { |
| template <typename CharT> |
| unsigned char operator()(CharT c, unsigned int byte) |
| { |
| static unsigned long mask[] = |
| { |
| 0xFF00, |
| 0x00FF |
| }; |
| |
| BOOST_ASSERT(byte < 2); |
| return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); |
| } |
| }; |
| |
| template<> |
| struct get_byte_aux<4> |
| { |
| template <typename CharT> |
| unsigned char operator()(CharT c, unsigned int byte) |
| { |
| static unsigned long mask[] = |
| { |
| 0xFF000000, |
| 0x00FF0000, |
| 0x0000FF00, |
| 0x000000FF |
| }; |
| |
| BOOST_ASSERT(byte < 4); |
| return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8); |
| } |
| }; |
| |
| template <typename CharT> |
| inline unsigned char |
| get_byte(CharT c, unsigned int byte) |
| { |
| return get_byte_aux<sizeof(c)>()(c, byte); |
| } |
| |
| template <typename ScannerT> |
| class make_star |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| make_star(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(const char_t) const |
| { |
| node* left = m_stack.top(); |
| m_stack.pop(); |
| node* newnode = new star_node(left); |
| m_stack.push(newnode); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| template <typename ScannerT> |
| class make_or |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| |
| make_or(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const&, iterator_type const&) const |
| { |
| node* right = m_stack.top(); |
| m_stack.pop(); |
| node* left = m_stack.top(); |
| m_stack.pop(); |
| node* newnode = new or_node(left, right); |
| m_stack.push(newnode); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| template <typename ScannerT> |
| class make_plus |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| make_plus(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(const char_t) const |
| { |
| node* left = m_stack.top(); |
| m_stack.pop(); |
| |
| node* copy = left->clone(); |
| |
| node* new_star = new star_node(copy); |
| node* new_cat = new cat_node(left, new_star); |
| m_stack.push(new_cat); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| template <typename ScannerT> |
| class make_optional |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| make_optional(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(const char_t) const |
| { |
| node* left = m_stack.top(); |
| m_stack.pop(); |
| |
| node* new_or = new or_node(left, new epsilon_node()); |
| m_stack.push(new_or); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // utility function |
| template <typename CharT> |
| inline utility::impl::range<CharT> const& |
| full_range() |
| { |
| static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(), |
| (std::numeric_limits<CharT>::max)()); |
| return full; |
| } |
| |
| namespace ccl_utils |
| { |
| template <typename char_t> |
| inline utility::impl::range_run<char_t> |
| negate_range_run( |
| const utility::impl::range_run<char_t>& rr) |
| { |
| utility::impl::range_run<char_t> newrr; |
| newrr.set(full_range<char_t>()); |
| for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); |
| iter != rr.end(); ++iter) |
| newrr.clear(*iter); |
| return newrr; |
| } |
| |
| template <typename char_t> |
| inline node* |
| create_mb_node_seq(char_t c) |
| { |
| node* newnode = new char_node(get_byte(c, 0)); |
| for (unsigned int i = 1; i < sizeof(c); ++i) |
| { |
| node* cnode = new char_node(get_byte(c, i)); |
| node* top_node = new cat_node(newnode, cnode); |
| newnode = top_node; |
| } |
| return newnode; |
| } |
| |
| template <typename char_t> |
| inline void |
| handle_mb_char(char_t c, bool first_time, |
| std::stack<node*>& stack) |
| { |
| node* newnode = create_mb_node_seq(c); |
| |
| if (first_time) |
| { |
| stack.push(newnode); |
| } |
| else |
| { |
| node* top = stack.top(); |
| stack.pop(); |
| |
| node* newtop = new or_node(top, newnode); |
| stack.push(newtop); |
| } |
| } |
| |
| // forward decl only |
| template <typename char_t> |
| inline void |
| handle_mb_range(char_t c1, char_t c2, bool first_time, |
| std::stack<node*>& stack); |
| |
| template <typename char_t> |
| inline void |
| create_nodes(const utility::impl::range_run<char_t>& rr, |
| std::stack<node*>& stack) |
| { |
| |
| if (sizeof(char_t) == 1) |
| { |
| std::vector<uchar> ccl; |
| ccl.resize(256); |
| for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); |
| iter != rr.end(); ++iter) |
| { |
| for (int i = iter->first; i <= iter->last; ++i) |
| { |
| // this is always true because of the limited datatype |
| // BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256); |
| ccl[uchar(i)] = 1; |
| } |
| } |
| |
| node* new_ccl = new ccl_node(ccl); |
| stack.push(new_ccl); |
| } |
| else |
| { |
| bool mb_first_time = true; |
| for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin(); |
| iter != rr.end(); ++iter) |
| { |
| if (iter->first == iter->last) |
| { |
| handle_mb_char(iter->first, mb_first_time, stack); |
| } |
| else |
| { |
| handle_mb_range(iter->first, iter->last, mb_first_time, stack); |
| } |
| mb_first_time = false; |
| } |
| } |
| } |
| |
| template <typename char_t> |
| inline std::size_t |
| compute_differing_byte(char_t c1, char_t c2) |
| { |
| std::size_t rval = 0; |
| while (rval < sizeof(c1) && |
| get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval)) |
| { |
| ++rval; |
| } |
| return rval; |
| } |
| |
| template <typename char_t> |
| inline node* |
| create_mb_node_type1(std::size_t j, char_t c1, char_t c2) |
| { |
| std::size_t diff = get_byte(c2, (unsigned int)j) - |
| get_byte(c1, (unsigned int)j); |
| if (diff == 1) { |
| return 0; |
| } |
| else if (diff == 2) { |
| return new char_node(get_byte(c1, (unsigned int)j)+1); |
| } |
| else { |
| return new ccl_node(get_byte(c1, (unsigned int)j)+1, |
| get_byte(c2, (unsigned int)j)-1); |
| } |
| } |
| |
| template <typename char_t> |
| inline node * |
| create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1, |
| std::size_t differing_byte, char_t c1, char_t c2, node* newnode) |
| { |
| node* cnode; |
| if (i == sizem1 && j == differing_byte && j != sizem1) |
| { |
| node* tmp = create_mb_node_type1(j, c1, c2); |
| if (tmp == 0) |
| { |
| delete newnode; |
| return 0; |
| } |
| else |
| cnode = tmp; |
| } |
| else if (i == differing_byte && j == sizem1) |
| { |
| if (i != sizem1) { |
| cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF); |
| } |
| else { |
| cnode = new ccl_node(get_byte(c1, (unsigned int)j), |
| get_byte(c2, (unsigned int)j)); |
| } |
| } |
| else if (i != differing_byte && i != sizem1 && |
| j == (sizem1 - i + differing_byte)) |
| { |
| cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF); |
| } |
| else if (i + j - differing_byte > sizem1) { |
| cnode = new ccl_node(0, 0xFF); |
| } |
| else {//if (is plain) |
| cnode = new char_node(get_byte(c1, (unsigned int)j)); |
| } |
| |
| node* top_node = new cat_node(newnode, cnode); |
| return top_node; |
| } |
| |
| // On platforms, where wchar_t is a typedef for unsigned short, the |
| // comparision for a negative value is pointless |
| template <bool is_signed> |
| struct correct_char_aux { |
| }; |
| |
| template <> |
| struct correct_char_aux<true> { |
| |
| template <typename char_t> |
| static char_t correct(char_t c) { if (c < 0) c = 0; return c; } |
| }; |
| |
| template <> |
| struct correct_char_aux<false> { |
| |
| template <typename char_t> |
| static char_t correct(char_t c) { return c; } |
| }; |
| |
| template <typename char_t> |
| struct correct_char |
| { |
| static char_t correct(char_t c) |
| { |
| return correct_char_aux<std::numeric_limits<char_t>::is_signed >:: |
| correct(c); |
| } |
| }; |
| |
| template <typename char_t> |
| inline void |
| handle_mb_range(char_t c1, char_t c2, bool first_time, |
| std::stack<node*>& stack) |
| { |
| // The algorithm can't handle negative value chars, which don't make |
| // much sense anyway. This comparision is pointless for wchar_t's on |
| // platforms, where wchar_t is a typedef for unsigned short |
| |
| c1 = correct_char<char_t>::correct(c1); |
| //if (c1 < 0) |
| // c1 = 0; |
| |
| BOOST_ASSERT(c1 < c2); |
| node* newnode = 0; |
| node* savednode = 0; |
| const std::size_t differing_byte = compute_differing_byte(c1, c2); |
| const std::size_t sizem1 = sizeof(c1) - 1; |
| const std::size_t ndb = sizem1 - differing_byte; |
| for (std::size_t i = differing_byte; i < sizeof(c1); ++i) |
| { |
| // generate node for the first byte |
| if (differing_byte == 0 && i == ndb) |
| { |
| node* tmp = create_mb_node_type1(0, c1, c2); |
| if (tmp == 0) |
| continue; |
| else |
| newnode = tmp; |
| } |
| else |
| { |
| newnode = new char_node(get_byte(c1, 0)); |
| } |
| for (std::size_t j = 1; j < sizeof(c1); ++j) |
| { |
| newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte, |
| c1, c2, newnode); |
| if (newnode == 0) |
| goto end_outer_for; |
| } |
| |
| // or together the various parts |
| if (savednode) |
| { |
| node* top_node = new or_node(savednode, newnode); |
| savednode = top_node; |
| } |
| else |
| { |
| savednode = newnode; |
| } |
| end_outer_for: |
| continue; |
| } |
| |
| for (std::size_t k = 0; k < ndb; ++k) |
| { |
| newnode = new char_node(get_byte(c2, 0)); |
| for (std::size_t j = 1; j < sizeof(c2); ++j) |
| { |
| node* cnode; |
| if (k == differing_byte && j == sizem1 && k != sizem1) |
| cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)); |
| |
| else if (k != differing_byte && k != sizem1 && |
| j == (sizem1 - k + differing_byte)) |
| cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1); |
| |
| else if (k + j - differing_byte > sizem1) |
| cnode = new ccl_node(0, 0xFF); |
| |
| else //if (is plain) |
| cnode = new char_node(get_byte(c2, (unsigned int)j)); |
| |
| |
| node* top_node = new cat_node(newnode, cnode); |
| newnode = top_node; |
| } |
| |
| // or together the various parts |
| if (savednode) |
| { |
| node* top_node = new or_node(savednode, newnode); |
| savednode = top_node; |
| } |
| else |
| { |
| savednode = newnode; |
| } |
| } |
| |
| |
| if (first_time) |
| { |
| stack.push(savednode); |
| } |
| else |
| { |
| node* top = stack.top(); |
| stack.pop(); |
| |
| node* newtop = new or_node(top, savednode); |
| stack.push(newtop); |
| } |
| } |
| } // namespace ccl_utils |
| |
| template <typename ScannerT> |
| class make_char |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| make_char(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const& first, iterator_type const& last) const |
| { |
| const escape_char_parser<lex_escapes, char_t> lex_escape_ch = |
| escape_char_parser<lex_escapes, char_t>(); |
| char_t the_char; |
| iterator_type first_ = first; |
| ScannerT scan(first_, last); |
| lex_escape_ch[assign(the_char)].parse(scan); |
| node* newnode = ccl_utils::create_mb_node_seq(the_char); |
| m_stack.push(newnode); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| |
| template <typename ScannerT> |
| class make_ccl |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| make_ccl(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| static bool is_equal_to_string(iterator_type first, |
| iterator_type const & last, const char* str) |
| { |
| while (first != last &&*str &&*first ==*str) |
| { |
| ++first; |
| ++str; |
| } |
| return*str == 0; |
| } |
| |
| template <typename ParserT> |
| static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser) |
| { |
| for (int i = 0; i < 256; ++i) |
| { |
| if (parser.test(static_cast<char_t>(uchar(i)))) |
| rr.set(utility::impl::range<char_t>(char_t(i), char_t(i))); |
| } |
| } |
| |
| void operator()(iterator_type const& first_, iterator_type const& last) const |
| { |
| BOOST_ASSERT(*first_ == '['); |
| |
| iterator_type first = first_; |
| ++first; // skip over '[' |
| bool negated_ccl = false; |
| if (*first == '^') |
| { |
| negated_ccl = true; |
| ++first; |
| } |
| |
| utility::impl::range_run<char_t> rr; |
| while (first != last &&*first != ']') |
| { |
| if (*first == '[') // it's a ccl_expr like [:space:] |
| { |
| // check for [:space:], etc. |
| if (is_equal_to_string(first, last, "[:alnum:]")) |
| { |
| fill_ccl(rr, alnum_p); |
| } |
| else if (is_equal_to_string(first, last, "[:alpha:]")) |
| { |
| fill_ccl(rr, alpha_p); |
| } |
| else if (is_equal_to_string(first, last, "[:blank:]")) |
| { |
| fill_ccl(rr, blank_p); |
| } |
| else if (is_equal_to_string(first, last, "[:cntrl:]")) |
| { |
| fill_ccl(rr, cntrl_p); |
| } |
| else if (is_equal_to_string(first, last, "[:digit:]")) |
| { |
| fill_ccl(rr, digit_p); |
| } |
| else if (is_equal_to_string(first, last, "[:graph:]")) |
| { |
| fill_ccl(rr, graph_p); |
| } |
| else if (is_equal_to_string(first, last, "[:lower:]")) |
| { |
| fill_ccl(rr, lower_p); |
| } |
| else if (is_equal_to_string(first, last, "[:print:]")) |
| { |
| fill_ccl(rr, print_p); |
| } |
| else if (is_equal_to_string(first, last, "[:punct:]")) |
| { |
| fill_ccl(rr, punct_p); |
| } |
| else if (is_equal_to_string(first, last, "[:space:]")) |
| { |
| fill_ccl(rr, space_p); |
| } |
| else if (is_equal_to_string(first, last, "[:upper:]")) |
| { |
| fill_ccl(rr, upper_p); |
| } |
| else if (is_equal_to_string(first, last, "[:xdigit:]")) |
| { |
| fill_ccl(rr, xdigit_p); |
| } |
| // this can't happen, because it's parsed before we get here. |
| //else |
| // throw bad_regex(); |
| |
| // Advance past the character class expression |
| while (first != last &&*first != ']') |
| ++first; |
| BOOST_ASSERT(*first == ']'); |
| ++first; |
| } |
| else { |
| const escape_char_parser<lex_escapes, char_t> lex_escape_ch = |
| escape_char_parser<lex_escapes, char_t>(); |
| |
| char_t c1; |
| ScannerT scan(first, last); |
| lex_escape_ch[assign(c1)].parse(scan); |
| if (*scan.first == '-') // insert a range |
| { |
| ++scan.first; |
| char_t c2; |
| lex_escape_ch[assign(c2)].parse(scan); |
| BOOST_ASSERT(c1 < c2); // Throw exception? |
| rr.set(utility::impl::range<char_t>(c1, c2)); |
| } |
| else // insert 1 char |
| { |
| rr.set(utility::impl::range<char_t>(c1, c1)); |
| } |
| } |
| } |
| |
| if (negated_ccl) |
| { |
| rr = ccl_utils::negate_range_run(rr); |
| } |
| |
| ccl_utils::create_nodes(rr, m_stack); |
| } |
| |
| std::stack<node*>& m_stack; |
| }; |
| |
| template <typename ScannerT> |
| class make_any_char |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| std::stack<node*>& m_stack; |
| |
| make_any_char(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(const char_t c) const |
| { |
| BOOST_ASSERT(c == '.'); |
| do_any_char(); |
| } |
| |
| void do_any_char() const |
| { |
| static utility::impl::range_run<char_t> rr; |
| rr.set(full_range<char_t>()); |
| char_t newline = '\n'; |
| rr.clear(utility::impl::range<char_t>(newline, newline)); |
| |
| ccl_utils::create_nodes(rr, m_stack); |
| } |
| }; |
| |
| template <typename ScannerT> |
| class make_string |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| std::stack<node*>& m_stack; |
| |
| make_string(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const& first, iterator_type const& last) const |
| { |
| BOOST_ASSERT(*first == '"'); |
| |
| iterator_type first_ = first; |
| ScannerT scan(first_, last); |
| ++scan.first; // skip over '"' |
| |
| // empty string not allowed |
| if (*scan.first == '"') |
| { |
| boost::throw_exception(bad_regex()); |
| } |
| |
| const escape_char_parser<lex_escapes, char_t> lex_escape_ch = |
| escape_char_parser<lex_escapes, char_t>(); |
| |
| char_t c; |
| lex_escape_ch[assign(c)].parse(scan); |
| node* top_node = ccl_utils::create_mb_node_seq(c); |
| |
| while (*scan.first != '"' && scan.first != scan.last) |
| { |
| lex_escape_ch[assign(c)].parse(scan); |
| node* cur_node = ccl_utils::create_mb_node_seq(c); |
| top_node = new cat_node(top_node, cur_node); |
| } |
| m_stack.push(top_node); |
| } |
| }; |
| |
| inline |
| node* repeat_node(node* n, int num) |
| { |
| node* list_of_nodes = n; |
| for (int i = 1; i < num; ++i) |
| { |
| list_of_nodes = new cat_node(list_of_nodes, n->clone()); |
| } |
| return list_of_nodes; |
| } |
| |
| inline |
| node* optional_node(node* n) |
| { |
| return new or_node(n, new epsilon_node()); |
| } |
| |
| template <typename ScannerT> |
| class make_rep1 |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| std::stack<node*>& m_stack; |
| |
| make_rep1(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const& first, iterator_type const& last) const |
| { |
| BOOST_ASSERT(*first == '{'); |
| |
| iterator_type first_ = first; |
| ScannerT scan(first_, last); |
| ++scan.first; // skip over '{' |
| |
| unsigned int count; |
| uint_p[assign(count)].parse(scan); |
| if (count == 0) |
| boost::throw_exception(bad_regex()); |
| |
| node* top_node = m_stack.top(); |
| m_stack.pop(); |
| top_node = repeat_node(top_node, count); |
| m_stack.push(top_node); |
| } |
| }; |
| |
| template <typename ScannerT> |
| class make_rep2 |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| std::stack<node*>& m_stack; |
| |
| make_rep2(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const& first, iterator_type const& last) const |
| { |
| BOOST_ASSERT(*first == '{'); |
| |
| iterator_type first_ = first; |
| ScannerT scan (first_, last); |
| ++scan.first; // skip over '{' |
| |
| unsigned int count; |
| uint_p[assign(count)].parse(scan); |
| if (count == 0) |
| boost::throw_exception(bad_regex()); |
| |
| node* top_node = m_stack.top(); |
| m_stack.pop(); |
| top_node = new cat_node(repeat_node(top_node, count), |
| new star_node(top_node->clone())); |
| m_stack.push(top_node); |
| |
| } |
| }; |
| |
| template <typename ScannerT> |
| class make_rep3 |
| { |
| typedef typename ScannerT::iterator_t iterator_type; |
| |
| public: |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| std::stack<node*>& m_stack; |
| |
| make_rep3(std::stack<node*>& the_stack) |
| : m_stack(the_stack) |
| {} |
| |
| void operator()(iterator_type const& first, iterator_type const& last) const |
| { |
| BOOST_ASSERT(*first == '{'); |
| |
| iterator_type first_ = first; |
| ScannerT scan(first_, last); |
| ++scan.first; // skip over '{' |
| |
| unsigned int count1, count2; |
| uint_p[assign(count1)].parse(scan); |
| if (count1 == 0) |
| boost::throw_exception(bad_regex()); |
| |
| ++scan.first; // skip over ',' |
| |
| uint_p[assign(count2)].parse(scan); |
| if (count2 <= count1) |
| boost::throw_exception(bad_regex()); |
| |
| node* top_node = m_stack.top(); |
| m_stack.pop(); |
| node* repeats = repeat_node(top_node, count1); |
| top_node = new cat_node(repeats, |
| repeat_node(optional_node(top_node->clone()), |
| count2 - count1)); |
| |
| m_stack.push(top_node); |
| } |
| }; |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // |
| // Lexer grammar |
| // |
| // Defines the grammar, which mandates the syntax of the understood |
| // lexeme definitions passed to lexer::register_regex. |
| // |
| /////////////////////////////////////////////////////////////////////////////// |
| class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar> |
| { |
| public: |
| lexer_grammar(std::stack<node*> &node_stack_) |
| : node_stack(node_stack_) {} |
| |
| template <typename ScannerT> |
| struct definition |
| { |
| typedef rule<ScannerT> rule_t; |
| typedef typename ScannerT::iterator_t iterator_type; |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type |
| char_t; |
| |
| rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string, |
| escseq, ccl_char; |
| symbols<> ccl_expr; |
| |
| definition(lexer_grammar const &self) |
| { |
| regex = |
| re >> !('/' >> re) >> !ch_p('$') |
| ; |
| |
| re = |
| series |
| >>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] ) |
| ; |
| |
| series = |
| singleton |
| >>*( singleton[make_concat<ScannerT>(self.node_stack)] ) |
| ; |
| |
| singleton = |
| ch_p('.')[make_any_char<ScannerT>(self.node_stack)] |
| >> singleton2 |
| | fullccl |
| >> singleton2 |
| | ('"' >> string >> '"') |
| [ |
| make_string<ScannerT>(self.node_stack) |
| ] |
| >> singleton2 |
| | '(' >> re >> ')' |
| >> singleton2 |
| | ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq) |
| [ |
| make_char<ScannerT>(self.node_stack) |
| ] |
| >> singleton2 |
| ; |
| |
| singleton2 = |
| ch_p('*')[make_star<ScannerT>(self.node_stack)] |
| >> singleton2 |
| | ch_p('+')[make_plus<ScannerT>(self.node_stack)] |
| >> singleton2 |
| | ch_p('?')[make_optional<ScannerT>(self.node_stack)] |
| >> singleton2 |
| | ('{' >> uint_p >> '}') |
| [ |
| make_rep1<ScannerT>(self.node_stack) |
| ] |
| >> singleton2 |
| | ('{' >> uint_p >> ',' >> '}') |
| [ |
| make_rep2<ScannerT>(self.node_stack) |
| ] |
| >> singleton2 |
| | ('{' >> uint_p >> ',' >> uint_p >> '}') |
| [ |
| make_rep3<ScannerT>(self.node_stack) |
| ] |
| >> singleton2 |
| | epsilon_p |
| ; |
| |
| fullccl = |
| ('[' >> !ch_p('^') >> ccl >> ']') |
| [ |
| make_ccl<ScannerT>(self.node_stack) |
| ] |
| ; |
| |
| ccl = |
| *(ccl_expr | (ccl_char >> !('-' >> ccl_char))) |
| ; |
| |
| ccl_char = |
| ( (anychar_p - chset<>("\\\n]")) | escseq ) |
| ; |
| |
| ccl_expr = |
| "[:alnum:]", |
| "[:alpha:]", |
| "[:blank:]", |
| "[:cntrl:]", |
| "[:digit:]", |
| "[:graph:]", |
| "[:lower:]", |
| "[:print:]", |
| "[:punct:]", |
| "[:space:]", |
| "[:upper:]", |
| "[:xdigit:]" |
| ; |
| |
| string = |
| +( (anychar_p - chset<>("\"\\")) | escseq ) |
| ; |
| |
| typedef |
| uint_parser<char_t, 8, 1, |
| std::numeric_limits<char_t>::digits / 3 + 1 |
| > oct_parser_t; |
| typedef |
| uint_parser<char_t, 16, 1, |
| std::numeric_limits<char_t>::digits / 4 + 1 |
| > hex_parser_t; |
| |
| escseq = |
| ch_p('\\') |
| >> ( |
| oct_parser_t() |
| | as_lower_d['x'] >> hex_parser_t() |
| | (anychar_p - chset<>('\n')) |
| ) |
| ; |
| |
| #define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG); |
| BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG); |
| |
| #undef BOOST_SLEX_DEBUG |
| } |
| |
| rule<ScannerT> const& |
| start() const { return regex; } |
| }; |
| |
| std::stack<node*> &node_stack; |
| |
| }; // class lexer_grammar |
| |
| template <typename StringT> |
| inline node * |
| parse(lexer_grammar& g, StringT const& str) |
| { |
| typedef |
| scanner<typename StringT::const_iterator, scanner_policies<> > |
| scanner_t; |
| typedef typename rule<scanner_t>::template result<scanner_t>::type |
| result_t; |
| |
| typename StringT::const_iterator first = str.begin(); |
| typename StringT::const_iterator last = str.end(); |
| |
| scanner_t scan(first, last); |
| // typename rule<scanner_t>::result_t hit = g.parse(scan); |
| result_t hit = g.parse(scan); |
| if (!hit || !scan.at_end()) |
| { |
| while (g.node_stack.size()) |
| { |
| delete g.node_stack.top(); |
| g.node_stack.pop(); |
| } |
| return 0; |
| } |
| |
| BOOST_ASSERT(g.node_stack.size() == 1); |
| node* rval = g.node_stack.top(); |
| g.node_stack.pop(); |
| node* an_eof_node = new eof_node(); |
| rval = new cat_node(rval, an_eof_node); |
| return rval; |
| } |
| |
| inline |
| void make_case_insensitive(state_match_t& state_match) |
| { |
| // TODO: Fix this. |
| // This doesn't take into account foreign languages, figure out how to |
| // do that. Also this approach is broken for this implementation of |
| // wide chars. |
| for (state_match_t::iterator iter = state_match.begin(); |
| iter != state_match.end(); ++iter) |
| { |
| int i, j; |
| for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j) |
| { |
| // if either is set, turn them both on |
| (*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]); |
| } |
| } |
| } |
| |
| template<bool wide_char> |
| struct regex_match_helper; |
| |
| template<> |
| struct regex_match_helper<false> // single byte char |
| { |
| template <typename DfaT, typename IteratorT> |
| static bool |
| do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, |
| int& regex_index, |
| std::basic_string< |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| > *token) |
| { |
| typedef std::basic_string< |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| > string_type; |
| typedef typename string_type::size_type size_type; |
| |
| node_id_t s = 0; |
| node_id_t last_accepting_index = invalid_node; |
| IteratorT p = first; |
| IteratorT last_accepting_cpos = first; |
| while (p != last) |
| { |
| s = dfa.transition_table[s][(uchar)*p]; |
| if (s == invalid_node) |
| break; |
| if (token) token->append((size_type)1, *p); |
| ++p; |
| if (dfa.acceptance_index[s] != invalid_node) |
| { |
| last_accepting_index = s; |
| last_accepting_cpos = p; |
| } |
| } |
| if (last_accepting_index != invalid_node) |
| { |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << |
| dfa.acceptance_index[last_accepting_index] << '\n'; |
| #endif |
| |
| first = last_accepting_cpos; |
| regex_index = dfa.acceptance_index[last_accepting_index]; |
| return true; |
| } |
| else |
| return false; |
| } |
| }; |
| |
| template<> |
| struct regex_match_helper<true> // wide char |
| { |
| template <typename DfaT, typename IteratorT> |
| static bool |
| do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, |
| int& regex_index, |
| std::basic_string< |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| > *token) |
| { |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| char_t; |
| typedef std::basic_string<char_t> string_type; |
| typedef typename string_type::size_type size_type; |
| |
| node_id_t s = 0; |
| node_id_t last_accepting_index = invalid_node; |
| IteratorT wp = first; |
| IteratorT last_accepting_cpos = first; |
| |
| while (wp != last) |
| { |
| for (unsigned int i = 0; i < sizeof(char_t); ++i) |
| { |
| s = dfa.transition_table[s][get_byte(*wp,i)]; |
| if (s == invalid_node) |
| { |
| goto break_while; |
| } |
| } |
| if (token) token->append((size_type)1, *wp); |
| ++wp; |
| if (dfa.acceptance_index[s] != invalid_node) |
| { |
| last_accepting_index = s; |
| last_accepting_cpos = wp; |
| } |
| |
| } |
| |
| break_while: |
| if (last_accepting_index != invalid_node) |
| { |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " << |
| dfa.acceptance_index[last_accepting_index] << '\n'; |
| #endif |
| first = last_accepting_cpos; |
| regex_index = dfa.acceptance_index[last_accepting_index]; |
| |
| return true; |
| } |
| else |
| return false; |
| } |
| }; |
| |
| template <typename DfaT, typename IteratorT, bool wide_char> |
| struct regex_match |
| { |
| static bool |
| do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last, |
| int& regex_index, |
| std::basic_string< |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| > *token) |
| { |
| return regex_match_helper<wide_char>::do_match( |
| dfa, first, last, regex_index, token); |
| } |
| }; |
| |
| } // namespace lexerimpl |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // |
| template <typename IteratorT = char const*, typename TokenT = int, |
| typename CallbackT = void(*)(IteratorT const &, |
| IteratorT &, |
| IteratorT const&, |
| TokenT const&, |
| lexer_control<TokenT> &)> |
| class lexer |
| { |
| public: |
| typedef CallbackT callback_t; |
| typedef |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| char_t; |
| |
| struct regex_info |
| { |
| std::basic_string<char_t> str; |
| TokenT token; |
| CallbackT callback; |
| |
| regex_info(const std::basic_string<char_t>& _str, |
| const TokenT& _token, |
| const CallbackT& _callback) |
| : str(_str) |
| , token(_token) |
| , callback(_callback) |
| {} |
| |
| }; |
| |
| struct dfa_table |
| { |
| std::vector<std::vector<node_id_t> > transition_table; |
| std::vector<node_id_t> acceptance_index; |
| }; |
| typedef std::vector<node_id_t> node_table_t; |
| typedef std::vector<node_table_t> transition_table_t; |
| typedef std::vector<dfa_table> dfa_t; |
| |
| |
| lexer(unsigned int states = 1); |
| |
| void register_regex(const std::basic_string<char_t>& regex, |
| const TokenT& id, const CallbackT& cb = CallbackT(), |
| unsigned int state = 0); |
| |
| TokenT next_token(IteratorT &first, IteratorT const &last, |
| std::basic_string<char_t> *token = 0); |
| |
| void create_dfa(); |
| bool has_compiled_dfa() { return m_compiled_dfa; } |
| |
| void set_case_insensitive(bool insensitive); |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| void dump(std::ostream& out); |
| #endif |
| typedef std::vector<std::vector<regex_info> > regex_list_t; |
| |
| bool load (std::ifstream &in, long unique_id = 0); |
| bool save (std::ofstream &out, long unique_id = 0); |
| enum { |
| SLEX_SIGNATURE = 0x58454C53, // "SLEX" |
| SLEX_VERSION_100 = 0x0100, // persistance version |
| SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100, |
| SLEX_MINOR_VERSION_MASK = 0xFF |
| }; |
| |
| private: |
| |
| void create_dfa_for_state(int state); |
| |
| static bool regex_match(const dfa_t& dfa, IteratorT& first, |
| IteratorT& last, int& regex_index); |
| |
| mutable std::stack<lexerimpl::node*> node_stack; |
| lexerimpl::lexer_grammar g; |
| |
| mutable bool m_compiled_dfa; |
| mutable dfa_t m_dfa; |
| |
| regex_list_t m_regex_list; |
| bool m_case_insensitive; |
| |
| unsigned int m_state; |
| std::stack<unsigned int> m_state_stack; |
| unsigned int m_num_states; |
| }; |
| |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline |
| lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states) |
| : g(node_stack) |
| , m_compiled_dfa(false) |
| , m_regex_list(states) |
| , m_case_insensitive(false) |
| , m_state(0) |
| , m_state_stack() |
| , m_num_states(states) |
| { |
| BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer", |
| BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX); |
| } |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline void |
| lexer<IteratorT, TokenT, CallbackT>::register_regex( |
| const std::basic_string<char_t>& regex, const TokenT& id, |
| const CallbackT& callback, unsigned int state) |
| { |
| if (state > m_num_states) { |
| m_regex_list.resize(state); |
| m_num_states = state; |
| } |
| m_regex_list[state].push_back(regex_info(regex, id, callback)); |
| } |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline TokenT |
| lexer<IteratorT, TokenT, CallbackT>::next_token( |
| IteratorT &first, IteratorT const& last, |
| std::basic_string< |
| typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type |
| > *token) |
| { |
| if (!m_compiled_dfa) |
| { |
| create_dfa(); |
| } |
| |
| IteratorT saved = first; |
| int regex_index; |
| if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>:: |
| do_match(m_dfa[m_state], first, last, regex_index, token)) |
| return -1; // TODO: can't return -1, need to return some invalid token. |
| // how to figure this out? We can use traits I guess. |
| else |
| { |
| regex_info regex = m_regex_list[m_state][regex_index]; |
| TokenT rval = regex.token; |
| if (regex.callback) |
| { |
| // execute corresponding callback |
| lexer_control<TokenT> controller(rval, m_state, m_state_stack); |
| regex.callback(saved, first, last, regex.token, controller); |
| if (controller.ignore_current_token_set()) { |
| if (token) |
| token->erase(); |
| return next_token(first, last, token); |
| } |
| } |
| return rval; |
| } |
| } |
| |
| namespace lexerimpl |
| { |
| |
| inline |
| bool find_acceptance_state(const node_set& eof_node_ids, |
| const node_set& current_set, |
| node_id_t& acceptance_node_id) |
| { |
| for(node_set::const_iterator nsi = eof_node_ids.begin(); |
| nsi != eof_node_ids.end(); ++nsi) |
| { |
| node_id_t eof_node_id =*nsi; |
| if (current_set.end() != current_set.find(eof_node_id)) |
| { |
| // store the first one we come to as the |
| // matched pattern |
| acceptance_node_id = eof_node_id; |
| // don't bother searching for more |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| template <typename RegexListT, typename GrammarT> |
| inline std::auto_ptr<node> |
| parse_regexes(const RegexListT& regex_list, GrammarT& g) |
| { |
| // parse the expressions into a tree |
| if (regex_list.begin() == regex_list.end()) |
| boost::throw_exception(bad_regex()); |
| |
| typename RegexListT::const_iterator ri = regex_list.begin(); |
| std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str)); |
| if (tree.get() == 0) |
| boost::throw_exception(bad_regex()); |
| |
| ++ri; |
| for (/**/; ri != regex_list.end(); ++ri) |
| { |
| std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str)); |
| if (next_tree.get() == 0) |
| boost::throw_exception(bad_regex()); |
| std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release())); |
| tree = newnode; |
| } |
| return tree; |
| } |
| |
| } //namespace lexerimpl |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline void |
| lexer<IteratorT, TokenT, CallbackT>::create_dfa() |
| { |
| m_dfa.resize(m_num_states); |
| for (unsigned int i = 0; i < m_num_states; ++i) |
| create_dfa_for_state(i); |
| } |
| |
| // Algorithm from Compilers: Principles, Techniques, and Tools p. 141 |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline void |
| lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state) |
| { |
| using lexerimpl::node; |
| std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g); |
| node_id_t dummy = 0; |
| tree->assign_node_ids(dummy); |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| tree->dump(std::cout); |
| #endif |
| |
| // compute followpos(root) |
| followpos_t followpos; |
| tree->compute_followpos(followpos); |
| |
| // the dfa states <-> nfa state groups |
| std::map<node_set, node_id_t> dstates1; |
| std::map<node_id_t, node_set> dstates2; |
| |
| // the dfa transitions |
| m_dfa[state].transition_table.push_back( |
| std::vector<node_id_t>(256, invalid_node)); |
| m_dfa[state].acceptance_index.push_back(invalid_node); |
| |
| // whether the dfa state has been processed yet |
| std::vector<node_id_t> marked; |
| |
| // used to give a unique id to each dfa state |
| node_id_t num_states = 0; |
| |
| // initially, the only unmarked state in Dstates is firstpos(root). |
| marked.push_back(0); |
| node_set fpr = tree->firstpos(); |
| dstates1[fpr] = 0; |
| dstates2[0] = fpr; |
| state_match_t state_match; |
| tree->compute_state_match(state_match); |
| |
| if (m_case_insensitive) |
| lexerimpl::make_case_insensitive(state_match); |
| |
| node_set eof_node_ids; |
| tree->get_eof_ids(eof_node_ids); |
| // translate the eof_node_ids into a 0-based index |
| std::map<node_id_t, node_id_t> eof_node_id_map; |
| unsigned int x = 0; |
| for (node_set::iterator node_id_it = eof_node_ids.begin(); |
| node_id_it != eof_node_ids.end(); |
| ++node_id_it) |
| { |
| eof_node_id_map[*node_id_it] = x++; |
| } |
| |
| // figure out if this is an acceptance state |
| node_id_t eof_node_id; |
| if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id)) |
| { |
| m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id]; |
| } |
| |
| std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(), |
| node_id_t(0)); |
| while (marked.end() != i) |
| { |
| *i = 1; |
| node_id_t T = node_id_t(std::distance(marked.begin(), i)); |
| BOOST_ASSERT(T < dstates2.size()); |
| node_set Tstates = dstates2[T]; |
| for (node_id_t j = 0; j < 256; ++j) |
| { |
| node_set U; |
| for (node_set::iterator k = Tstates.begin(); |
| k != Tstates.end(); ++k) |
| { |
| node_id_t p =*k; |
| BOOST_ASSERT(p < state_match.size()); |
| BOOST_ASSERT(j < state_match[p].size()); |
| if (state_match[p][j]) |
| { |
| node_set fpp = followpos[p]; |
| U.insert(fpp.begin(), fpp.end()); |
| } |
| } |
| if (U.size() > 0) |
| { |
| std::map<node_set, node_id_t>::iterator l = dstates1.find(U); |
| node_id_t target_state; |
| if (l == dstates1.end()) // not in the states yet |
| { |
| ++num_states; |
| dstates1[U] = target_state = num_states; |
| dstates2[target_state] = U; |
| marked.push_back(0); |
| m_dfa[state].transition_table.push_back( |
| std::vector<node_id_t>(256, invalid_node)); |
| m_dfa[state].acceptance_index.push_back(invalid_node); |
| // figure out if this is an acceptance state |
| node_id_t eof_node_id; |
| if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id)) |
| { |
| m_dfa[state].acceptance_index[target_state] = |
| eof_node_id_map[eof_node_id]; |
| } |
| } |
| else |
| { |
| target_state = dstates1[U]; |
| } |
| |
| BOOST_ASSERT(T < m_dfa[state].transition_table.size()); |
| BOOST_ASSERT(j < m_dfa[state].transition_table[T].size()); |
| m_dfa[state].transition_table[T][j] = target_state; |
| } |
| |
| } |
| |
| i = std::find(marked.begin(), marked.end(), node_id_t(0)); |
| } |
| m_compiled_dfa = true; |
| } |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline void |
| lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive) |
| { |
| m_case_insensitive = insensitive; |
| } |
| |
| |
| #if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX) |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline void |
| lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out) |
| { |
| for (unsigned x = 0; x < m_dfa.size(); ++x) |
| { |
| out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n"; |
| for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i) |
| { |
| out << "state " << i << ":"; |
| for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j) |
| { |
| if (m_dfa[x].transition_table[i][j] != invalid_node) |
| out << j << "->" << m_dfa[x].transition_table[i][j] << " "; |
| } |
| out << "\n"; |
| } |
| out << "acceptance states: "; |
| for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k) |
| { |
| if (m_dfa[x].acceptance_index[k] != invalid_node) |
| out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> "; |
| } |
| out << endl; |
| } |
| } |
| #endif |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // load the lexer tables |
| #define slex_in(strm, val) \ |
| strm.read((char*)&val, sizeof(val)); \ |
| if (std::ios::goodbit != strm.rdstate()) return false |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline bool |
| lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id) |
| { |
| // ensure correct signature and version |
| long signature = 0; |
| |
| slex_in (in, signature); |
| if (signature != SLEX_SIGNATURE) |
| return false; // not for us |
| |
| long version = 0; |
| |
| slex_in (in, version); |
| if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION) |
| return false; // to new for us |
| |
| long uid = 0; |
| |
| slex_in (in, uid); |
| if (uid != unique_id) |
| return false; // not saved by us |
| |
| // load auxiliary members |
| int num_states = 0; |
| |
| slex_in (in, num_states); |
| |
| // load the dfa tables |
| dfa_t in_dfa; |
| std::size_t dfa_size = 0; |
| |
| slex_in (in, dfa_size); |
| in_dfa.resize(dfa_size); |
| for (std::size_t dfa = 0; dfa < dfa_size; ++dfa) |
| { |
| // load the transition tables |
| std::size_t tt_size = 0; |
| transition_table_t &tt_table = in_dfa[dfa].transition_table; |
| |
| slex_in (in, tt_size); |
| tt_table.resize(tt_size); |
| for (std::size_t tt = 0; tt < tt_size; ++tt) |
| { |
| std::size_t nt_size = 0; |
| node_table_t &nt_table = tt_table[tt]; |
| |
| slex_in (in, nt_size); |
| nt_table.resize(nt_size); |
| for (std::size_t nt = 0; nt < nt_size; ++nt) |
| { |
| slex_in (in, nt_table[nt]); |
| } |
| } |
| |
| // load the acceptance index table |
| std::size_t ai_size = 0; |
| node_table_t &ai_table = in_dfa[dfa].acceptance_index; |
| |
| slex_in (in, ai_size); |
| ai_table.resize(ai_size); |
| for (std::size_t ai = 0; ai < ai_size; ++ai) |
| { |
| slex_in (in, ai_table[ai]); |
| } |
| } |
| |
| m_dfa.swap(in_dfa); // success, swap in the read values |
| m_num_states = num_states; |
| |
| m_compiled_dfa = true; |
| return true; |
| } |
| |
| #undef slex_in |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // save the lexer tables |
| #define slex_out(strm, val) \ |
| strm.write((char*)&val, sizeof(val)); \ |
| if (std::ios::goodbit != strm.rdstate()) return false |
| |
| template <typename IteratorT, typename TokenT, typename CallbackT> |
| inline bool |
| lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id) |
| { |
| // save signature and version information |
| long out_long = SLEX_SIGNATURE; |
| |
| slex_out(out, out_long); |
| out_long = SLEX_VERSION_100; |
| slex_out(out, out_long); |
| slex_out(out, unique_id); |
| |
| // save auxiliary members |
| slex_out(out, m_num_states); |
| |
| // save the dfa tables |
| typedef typename dfa_t::const_iterator dfa_iter_t; |
| typedef transition_table_t::const_iterator transition_table_iter_t; |
| typedef node_table_t::const_iterator node_table_iter_t; |
| |
| std::size_t out_size_t = m_dfa.size(); |
| slex_out(out, out_size_t); |
| |
| dfa_iter_t end = m_dfa.end(); |
| for (dfa_iter_t it = m_dfa.begin(); it != end; ++it) |
| { |
| // save the transition table |
| out_size_t = (*it).transition_table.size(); |
| slex_out(out, out_size_t); |
| |
| transition_table_iter_t tt_end = (*it).transition_table.end(); |
| for (transition_table_iter_t tt_it = (*it).transition_table.begin(); |
| tt_it != tt_end; |
| ++tt_it) |
| { |
| out_size_t = (*tt_it).size(); |
| slex_out(out, out_size_t); |
| |
| node_table_iter_t nt_end = (*tt_it).end(); |
| for (node_table_iter_t nt_it = (*tt_it).begin(); |
| nt_it != nt_end; |
| ++nt_it) |
| { |
| slex_out(out, (*nt_it)); |
| } |
| } |
| |
| // save the acceptance index table |
| out_size_t = (*it).acceptance_index.size(); |
| slex_out(out, out_size_t); |
| |
| node_table_iter_t nt_end = (*it).acceptance_index.end(); |
| for (node_table_iter_t nt_it = (*it).acceptance_index.begin(); |
| nt_it != nt_end; |
| ++nt_it) |
| { |
| slex_out(out, (*nt_it)); |
| } |
| } |
| return true; |
| } |
| |
| #undef slex_out |
| |
| /* |
| a lexer_control object supports some operations on the lexer. |
| get current lexer state |
| set state |
| terminate |
| state stack (push, pop, top) |
| set new token return value |
| ignore the current token |
| yymore |
| get length of matched token |
| */ |
| template <typename TokenT> |
| class lexer_control |
| { |
| public: |
| |
| lexer_control(TokenT& token, unsigned int& current_state, |
| std::stack<unsigned int>& state_stack); |
| // functions dealing with the lexer state |
| |
| // set the state to state |
| void set_state(unsigned int state); |
| |
| // get the current state |
| unsigned int get_state(); |
| |
| // pushes the current state onto the top of the state stack and |
| // switches to new_state |
| void push_state(unsigned int new_state); |
| |
| // pops the top of the state stack and switches to it. |
| void pop_state(); |
| |
| // returns the top of the stack without altering the stack's contents. |
| unsigned int top_state(); |
| |
| // functions dealing with the token returned. |
| |
| // set a new TokenT return value, overriding that one that was |
| // registered via. register_regex() |
| void set_token(const TokenT& token); |
| |
| // tell the lexer to return an invalid token, signifying termination. |
| void terminate(); |
| |
| // ignore the current token, and move on to the next one. The current |
| // token will NOT be returned from next_token() |
| void ignore_current_token(); |
| |
| // returns true if ignore_current_token() has been called, |
| // false otherwise. |
| bool ignore_current_token_set(); |
| |
| private: |
| TokenT& m_token; |
| bool m_ignore_current_token; |
| unsigned int& m_current_state; |
| std::stack<unsigned int>& m_state_stack; |
| }; |
| |
| template <typename TokenT> |
| inline |
| lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state, |
| std::stack<unsigned int>& state_stack) |
| : m_token(token) |
| , m_ignore_current_token(false) |
| , m_current_state(current_state) |
| , m_state_stack(state_stack) |
| { |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::set_state(unsigned int state) |
| { |
| m_current_state = state; |
| } |
| |
| template <typename TokenT> |
| inline unsigned int |
| lexer_control<TokenT>::get_state() |
| { |
| return m_current_state; |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::push_state(unsigned int new_state) |
| { |
| m_state_stack.push(m_current_state); |
| m_current_state = new_state; |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::pop_state() |
| { |
| m_current_state = m_state_stack.top(); |
| m_state_stack.pop(); |
| } |
| |
| template <typename TokenT> |
| inline unsigned int |
| lexer_control<TokenT>::top_state() |
| { |
| return m_state_stack.top(); |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::set_token(const TokenT& token) |
| { |
| m_token = token; |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::terminate() |
| { |
| m_token = -1; // TOOD: fix this, need to be determined by traits |
| } |
| |
| template <typename TokenT> |
| inline void |
| lexer_control<TokenT>::ignore_current_token() |
| { |
| m_ignore_current_token = true; |
| } |
| |
| template <typename TokenT> |
| inline bool |
| lexer_control<TokenT>::ignore_current_token_set() |
| { |
| return m_ignore_current_token; |
| } |
| |
| } // namespace classic |
| } // namespace spirit |
| } // namespace boost |
| |
| #undef BOOST_SPIRIT_IT_NS |
| |
| #endif |
| |