blob: 0f95d4e151f901572fc4d1bacfcf470383cc732e [file] [log] [blame]
/*=============================================================================
Boost.Wave: A Standard compliant C++ preprocessor library
Spirit based lexer
http://www.boost.org/
Copyright (c) 2001, Daniel C. Nuffer.
Copyright (c) 2001-2010 Hartmut Kaiser.
Distributed under the Boost Software License, Version 1.0. (See accompanying
file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
TODO List:
X callback objects (called when a match is made.)
X callback passed first & last iterator, and
a reference to a lexercontrol object that supports some
operations on the lexer.
set state
terminate
state stack (push, pop, top)
set new token return value
ignore the current token
yymore
get length of matched token
get current lexer state
X DFA serialization to save recomputing the DFA.
lexer states.
organize the file into hpp and ipp. arrange the classes in a logical order.
use buffering - iterator only needs be an input iterator,
lexer & callback are not templatized on iterator type, but char type.
next_token is templatized on iterator type.
beginning/ending contexts.
^ and $
DFA minimization.
DFA table compression.
=============================================================================*/
#ifndef BOOST_SPIRIT_LEXER_HPP
#define BOOST_SPIRIT_LEXER_HPP
///////////////////////////////////////////////////////////////////////////////
#include <boost/throw_exception.hpp>
#include <boost/spirit/include/classic_core.hpp>
#include <boost/spirit/include/classic_symbols.hpp>
#include <boost/spirit/include/classic_chset.hpp>
#include <boost/spirit/include/classic_escape_char.hpp>
#include <set>
#include <map>
#include <vector>
#include <stack>
#include <utility> // for pair
#include <iostream>
#include <fstream>
#include <boost/assert.hpp>
#include <boost/limits.hpp>
#if defined(BOOST_NO_STD_ITERATOR_TRAITS)
#define BOOST_SPIRIT_IT_NS impl
#else
#define BOOST_SPIRIT_IT_NS std
#endif
///////////////////////////////////////////////////////////////////////////////
namespace boost {
namespace spirit {
namespace classic {
typedef unsigned char uchar;
typedef unsigned int node_id_t;
const node_id_t invalid_node = node_id_t(-1);
typedef std::set<node_id_t> node_set;
typedef std::vector<uchar> uchar_vector;
typedef std::map<node_id_t, node_set> followpos_t;
typedef std::vector<uchar_vector> state_match_t;
template <typename TokenT>
class lexer_control;
class bad_regex : public std::exception
{
};
namespace lexerimpl
{
class node
{
public:
virtual ~node() {}
virtual node* clone() const = 0;
virtual bool nullable() const = 0;
virtual node_set firstpos() const = 0;
virtual node_set lastpos() const = 0;
virtual void compute_followpos(followpos_t& followpos) const = 0;
virtual void compute_state_match(state_match_t& state_match) const = 0;
virtual void get_eof_ids(node_set& eof_set) const = 0;
virtual void assign_node_ids(node_id_t& node_count) = 0;
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const = 0;
#endif
};
class char_node : public node
{
public:
char_node(const uchar c);
char_node(const char_node& x);
virtual ~char_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
uchar m_char;
node_id_t m_node_num;
};
inline
char_node::char_node(const uchar c)
: node()
, m_char(c)
, m_node_num(0)
{
}
inline
char_node::char_node(const char_node& x)
: node(x)
, m_char(x.m_char)
, m_node_num(x.m_node_num)
{
}
inline node *
char_node::clone() const
{
return new char_node(*this);
}
inline bool
char_node::nullable() const
{
return false;
}
inline node_set
char_node::firstpos() const
{
node_set rval;
rval.insert(m_node_num);
return rval;
}
inline node_set
char_node::lastpos() const
{
return firstpos();
}
inline void
char_node::compute_followpos(followpos_t&) const
{
return;
}
inline void
char_node::compute_state_match(state_match_t& state_match) const
{
if (state_match.size() < m_node_num + 1)
state_match.resize(m_node_num + 1);
state_match[m_node_num].resize(256);
state_match[m_node_num][m_char] = 1;
}
inline void
char_node::get_eof_ids(node_set&) const
{
return;
}
inline void
char_node::assign_node_ids(node_id_t& node_count)
{
m_node_num = node_count++;
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
char_node::dump(std::ostream& out) const
{
out << "\nchar_node m_char = " << m_char;
out << " m_node_num = " << m_node_num;
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
}
#endif
class epsilon_node : public node
{
public:
epsilon_node();
epsilon_node(const epsilon_node& x);
virtual ~epsilon_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
node_id_t m_node_num;
};
inline
epsilon_node::epsilon_node()
: node()
, m_node_num(0)
{
}
inline
epsilon_node::epsilon_node(const epsilon_node& x)
: node(x)
, m_node_num(x.m_node_num)
{
}
inline node *
epsilon_node::clone() const
{
return new epsilon_node(*this);
}
inline bool
epsilon_node::nullable() const
{
return true;
}
inline node_set
epsilon_node::firstpos() const
{
return node_set();
}
inline node_set
epsilon_node::lastpos() const
{
return node_set();
}
inline void
epsilon_node::compute_followpos(followpos_t&) const
{
return;
}
inline void
epsilon_node::compute_state_match(state_match_t& state_match) const
{
if (state_match.size() < m_node_num + 1)
state_match.resize(m_node_num + 1);
state_match[m_node_num].resize(256, 1);
}
inline void
epsilon_node::get_eof_ids(node_set&) const
{
return;
}
inline void
epsilon_node::assign_node_ids(node_id_t& node_count)
{
m_node_num = node_count++;
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
epsilon_node::dump(std::ostream& out) const
{
out << "\nepsilon_node";
out << " m_node_num = " << m_node_num;
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
}
#endif
class or_node : public node
{
public:
or_node(node* left, node* right);
or_node(const or_node& x);
virtual ~or_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
std::auto_ptr<node> m_left;
std::auto_ptr<node> m_right;
};
inline
or_node::or_node(node* left, node* right)
: node()
, m_left(left)
, m_right(right)
{
}
inline
or_node::or_node(const or_node& x)
: node(x)
, m_left(x.m_left->clone())
, m_right(x.m_right->clone())
{
}
inline node *
or_node::clone() const
{
return new or_node(m_left->clone(), m_right->clone());
}
inline bool
or_node::nullable() const
{
return m_left->nullable() || m_right->nullable();
}
inline node_set
or_node::firstpos() const
{
node_set rval;
node_set l = m_left->firstpos();
node_set r = m_right->firstpos();
std::set_union(l.begin(), l.end(), r.begin(), r.end(),
std::inserter(rval, rval.begin()));
return rval;
}
inline node_set
or_node::lastpos() const
{
node_set rval;
node_set l = m_left->lastpos();
node_set r = m_right->lastpos();
std::set_union(l.begin(), l.end(), r.begin(), r.end(),
std::inserter(rval, rval.begin()));
return rval;
}
inline void
or_node::compute_followpos(followpos_t& followpos) const
{
m_left->compute_followpos(followpos);
m_right->compute_followpos(followpos);
}
inline void
or_node::compute_state_match(state_match_t& state_match) const
{
m_left->compute_state_match(state_match);
m_right->compute_state_match(state_match);
}
inline void
or_node::get_eof_ids(node_set& eof_nodes) const
{
m_left->get_eof_ids(eof_nodes);
m_right->get_eof_ids(eof_nodes);
}
inline void
or_node::assign_node_ids(node_id_t& node_count)
{
m_left->assign_node_ids(node_count);
m_right->assign_node_ids(node_count);
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
or_node::dump(std::ostream& out) const
{
m_left->dump(out);
out << "\nor_node";
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
m_right->dump(out);
}
#endif
class cat_node : public node
{
public:
cat_node(node* left, node* right);
cat_node(const cat_node& x);
virtual ~cat_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
std::auto_ptr<node> m_left;
std::auto_ptr<node> m_right;
};
inline
cat_node::cat_node(node* left, node* right)
: node()
, m_left(left)
, m_right(right)
{
}
inline
cat_node::cat_node(const cat_node& x)
: node(x)
, m_left(x.m_left->clone())
, m_right(x.m_right->clone())
{
}
inline node *
cat_node::clone() const
{
return new cat_node(m_left->clone(), m_right->clone());
}
inline bool
cat_node::nullable() const
{
return m_left->nullable() && m_right->nullable();
}
inline node_set
cat_node::firstpos() const
{
if (m_left->nullable())
{
node_set rval;
node_set l = m_left->firstpos();
node_set r = m_right->firstpos();
std::set_union(l.begin(), l.end(), r.begin(), r.end(),
std::inserter(rval, rval.begin()));
return rval;
}
else
{
return m_left->firstpos();
}
}
inline node_set
cat_node::lastpos() const
{
if (m_right->nullable())
{
node_set rval;
node_set l = m_left->lastpos();
node_set r = m_right->lastpos();
std::set_union(l.begin(), l.end(), r.begin(), r.end(),
std::inserter(rval, rval.begin()));
return rval;
}
else
{
return m_right->lastpos();
}
}
inline void
cat_node::compute_followpos(followpos_t& followpos) const
{
node_set l = m_left->lastpos();
for (node_set::iterator i = l.begin();
i != l.end();
++i)
{
node_set rf = m_right->firstpos();
followpos[*i].insert(rf.begin(), rf.end());
}
m_left->compute_followpos(followpos);
m_right->compute_followpos(followpos);
}
inline void
cat_node::compute_state_match(state_match_t& state_match) const
{
m_left->compute_state_match(state_match);
m_right->compute_state_match(state_match);
}
inline void
cat_node::get_eof_ids(node_set& eof_nodes) const
{
m_left->get_eof_ids(eof_nodes);
m_right->get_eof_ids(eof_nodes);
}
inline void
cat_node::assign_node_ids(node_id_t& node_count)
{
m_left->assign_node_ids(node_count);
m_right->assign_node_ids(node_count);
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
cat_node::dump(std::ostream& out) const
{
m_left->dump(out);
out << "\ncat_node";
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
m_right->dump(out);
}
#endif
class star_node : public node
{
public:
star_node(node* left);
star_node(const star_node& x);
virtual ~star_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
std::auto_ptr<node> m_left;
};
inline
star_node::star_node(node* left)
: node()
, m_left(left)
{
}
inline
star_node::star_node(const star_node& x)
: node(x)
, m_left(x.m_left->clone())
{
}
inline node *
star_node::clone() const
{
return new star_node(m_left->clone());
}
inline bool
star_node::nullable() const
{
return true;
}
inline node_set
star_node::firstpos() const
{
return m_left->firstpos();
}
inline node_set
star_node::lastpos() const
{
return m_left->lastpos();
}
inline void
star_node::compute_followpos(followpos_t& followpos) const
{
node_set lp = this->lastpos();
for (node_set::iterator i = lp.begin();
i != lp.end();
++i)
{
node_set fp = this->firstpos();
followpos[*i].insert(fp.begin(), fp.end());
}
m_left->compute_followpos(followpos);
}
inline void
star_node::compute_state_match(state_match_t& state_match) const
{
m_left->compute_state_match(state_match);
}
inline void
star_node::get_eof_ids(node_set& eof_nodes) const
{
m_left->get_eof_ids(eof_nodes);
}
inline void
star_node::assign_node_ids(node_id_t& node_count)
{
m_left->assign_node_ids(node_count);
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
star_node::dump(std::ostream& out) const
{
m_left->dump(out);
out << "\nstar_node";
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
}
#endif
class eof_node : public node
{
public:
eof_node();
eof_node(const eof_node& x);
virtual ~eof_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
node_id_t m_node_num;
};
inline
eof_node::eof_node()
: node()
, m_node_num(0)
{
}
inline
eof_node::eof_node(const eof_node& x)
: node(x)
, m_node_num(x.m_node_num)
{
}
inline node *
eof_node::clone() const
{
return new eof_node(*this);
}
inline bool
eof_node::nullable() const
{
return false;
}
inline node_set
eof_node::firstpos() const
{
node_set rval;
rval.insert(m_node_num);
return rval;
}
inline node_set
eof_node::lastpos() const
{
node_set rval;
rval.insert(m_node_num);
return rval;
}
inline void
eof_node::compute_followpos(followpos_t&) const
{
return;
}
inline void
eof_node::compute_state_match(state_match_t& state_match) const
{
if (state_match.size() < m_node_num + 1)
state_match.resize(m_node_num + 1);
state_match[m_node_num].resize(256, 0);
}
inline void
eof_node::get_eof_ids(node_set& eof_nodes) const
{
eof_nodes.insert(m_node_num);
}
inline void
eof_node::assign_node_ids(node_id_t& node_count)
{
m_node_num = node_count++;
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
eof_node::dump(std::ostream& out) const
{
out << "\neof_node";
out << " m_node_num = " << m_node_num;
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
}
#endif
class ccl_node : public node
{
public:
ccl_node(const std::vector<uchar>& v);
ccl_node(const uchar c1, const uchar c2);
ccl_node(const ccl_node& x);
virtual ~ccl_node(){}
virtual node* clone() const;
virtual bool nullable() const;
virtual node_set firstpos() const;
virtual node_set lastpos() const;
virtual void compute_followpos(followpos_t& followpos) const;
virtual void compute_state_match(state_match_t& state_match ) const;
virtual void get_eof_ids(node_set& eof_set) const;
virtual void assign_node_ids(node_id_t& node_count);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
virtual void dump(std::ostream& out) const;
#endif
private:
std::vector<uchar> m_match;
node_id_t m_node_num;
};
inline
ccl_node::ccl_node(const std::vector<uchar>& v)
: node()
, m_match(v)
, m_node_num(0)
{
m_match.resize(256); // make sure it's the right size
}
inline
ccl_node::ccl_node(const uchar c1, const uchar c2)
: node()
, m_match(256, uchar(0))
, m_node_num(0)
{
BOOST_ASSERT(c1 < c2);
for (std::size_t i = c1; i <= std::size_t(c2); ++i)
{
m_match[i] = 1;
}
}
inline
ccl_node::ccl_node(const ccl_node& x)
: node(x)
, m_match(x.m_match)
, m_node_num(x.m_node_num)
{
}
inline node *
ccl_node::clone() const
{
return new ccl_node(*this);
}
inline bool
ccl_node::nullable() const
{
return false;
}
inline node_set
ccl_node::firstpos() const
{
node_set rval;
rval.insert(m_node_num);
return rval;
}
inline node_set
ccl_node::lastpos() const
{
return firstpos();
}
inline void
ccl_node::compute_followpos(followpos_t&) const
{
return;
}
inline void
ccl_node::compute_state_match(state_match_t& state_match) const
{
if (state_match.size() < m_node_num + 1)
state_match.resize(m_node_num + 1);
state_match[m_node_num] = m_match;
}
inline void
ccl_node::get_eof_ids(node_set&) const
{
return;
}
inline void
ccl_node::assign_node_ids(node_id_t& node_count)
{
m_node_num = node_count++;
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
inline void
ccl_node::dump(std::ostream& out) const
{
out << "\nccl_node m_match = ";
for (std::size_t i = 0; i < m_match.size(); ++i)
{
if (m_match[i])
out << i << ", ";
}
out << " m_node_num = " << m_node_num;
out << " nullable() = " << (nullable() ? "true" : "false");
out << " firstpos() = ";
node_set fp = firstpos();
std::copy(fp.begin(), fp.end(),
std::ostream_iterator<node_id_t>(out, ","));
out << " lastpos() = ";
node_set lp = lastpos();
std::copy(lp.begin(), lp.end(),
std::ostream_iterator<node_id_t>(out, ","));
}
#endif
template <typename ScannerT>
class make_concat
{
typedef typename ScannerT::iterator_t iterator_type;
public:
make_concat(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const &, iterator_type const &) const
{
node* right = m_stack.top();
m_stack.pop();
node* left = m_stack.top();
m_stack.pop();
node* newnode = new cat_node(left, right);
m_stack.push(newnode);
}
std::stack<node*>& m_stack;
};
template <int CharTSize>
struct get_byte_aux;
template<>
struct get_byte_aux<1>
{
template <typename CharT>
unsigned char operator()(CharT c, unsigned int byte)
{
BOOST_ASSERT(byte == 0);
return c;
}
};
template<>
struct get_byte_aux<2>
{
template <typename CharT>
unsigned char operator()(CharT c, unsigned int byte)
{
static unsigned long mask[] =
{
0xFF00,
0x00FF
};
BOOST_ASSERT(byte < 2);
return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8);
}
};
template<>
struct get_byte_aux<4>
{
template <typename CharT>
unsigned char operator()(CharT c, unsigned int byte)
{
static unsigned long mask[] =
{
0xFF000000,
0x00FF0000,
0x0000FF00,
0x000000FF
};
BOOST_ASSERT(byte < 4);
return (c & mask[byte]) >> ((sizeof(c) - 1 - byte) * 8);
}
};
template <typename CharT>
inline unsigned char
get_byte(CharT c, unsigned int byte)
{
return get_byte_aux<sizeof(c)>()(c, byte);
}
template <typename ScannerT>
class make_star
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
make_star(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(const char_t) const
{
node* left = m_stack.top();
m_stack.pop();
node* newnode = new star_node(left);
m_stack.push(newnode);
}
std::stack<node*>& m_stack;
};
template <typename ScannerT>
class make_or
{
typedef typename ScannerT::iterator_t iterator_type;
public:
make_or(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const&, iterator_type const&) const
{
node* right = m_stack.top();
m_stack.pop();
node* left = m_stack.top();
m_stack.pop();
node* newnode = new or_node(left, right);
m_stack.push(newnode);
}
std::stack<node*>& m_stack;
};
template <typename ScannerT>
class make_plus
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
make_plus(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(const char_t) const
{
node* left = m_stack.top();
m_stack.pop();
node* copy = left->clone();
node* new_star = new star_node(copy);
node* new_cat = new cat_node(left, new_star);
m_stack.push(new_cat);
}
std::stack<node*>& m_stack;
};
template <typename ScannerT>
class make_optional
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
make_optional(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(const char_t) const
{
node* left = m_stack.top();
m_stack.pop();
node* new_or = new or_node(left, new epsilon_node());
m_stack.push(new_or);
}
std::stack<node*>& m_stack;
};
///////////////////////////////////////////////////////////////////////////////
// utility function
template <typename CharT>
inline utility::impl::range<CharT> const&
full_range()
{
static utility::impl::range<CharT> full((std::numeric_limits<CharT>::min)(),
(std::numeric_limits<CharT>::max)());
return full;
}
namespace ccl_utils
{
template <typename char_t>
inline utility::impl::range_run<char_t>
negate_range_run(
const utility::impl::range_run<char_t>& rr)
{
utility::impl::range_run<char_t> newrr;
newrr.set(full_range<char_t>());
for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
iter != rr.end(); ++iter)
newrr.clear(*iter);
return newrr;
}
template <typename char_t>
inline node*
create_mb_node_seq(char_t c)
{
node* newnode = new char_node(get_byte(c, 0));
for (unsigned int i = 1; i < sizeof(c); ++i)
{
node* cnode = new char_node(get_byte(c, i));
node* top_node = new cat_node(newnode, cnode);
newnode = top_node;
}
return newnode;
}
template <typename char_t>
inline void
handle_mb_char(char_t c, bool first_time,
std::stack<node*>& stack)
{
node* newnode = create_mb_node_seq(c);
if (first_time)
{
stack.push(newnode);
}
else
{
node* top = stack.top();
stack.pop();
node* newtop = new or_node(top, newnode);
stack.push(newtop);
}
}
// forward decl only
template <typename char_t>
inline void
handle_mb_range(char_t c1, char_t c2, bool first_time,
std::stack<node*>& stack);
template <typename char_t>
inline void
create_nodes(const utility::impl::range_run<char_t>& rr,
std::stack<node*>& stack)
{
if (sizeof(char_t) == 1)
{
std::vector<uchar> ccl;
ccl.resize(256);
for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
iter != rr.end(); ++iter)
{
for (int i = iter->first; i <= iter->last; ++i)
{
// this is always true because of the limited datatype
// BOOST_ASSERT(uchar(i) < 256 && ccl.size() == 256);
ccl[uchar(i)] = 1;
}
}
node* new_ccl = new ccl_node(ccl);
stack.push(new_ccl);
}
else
{
bool mb_first_time = true;
for (typename utility::impl::range_run<char_t>::const_iterator iter = rr.begin();
iter != rr.end(); ++iter)
{
if (iter->first == iter->last)
{
handle_mb_char(iter->first, mb_first_time, stack);
}
else
{
handle_mb_range(iter->first, iter->last, mb_first_time, stack);
}
mb_first_time = false;
}
}
}
template <typename char_t>
inline std::size_t
compute_differing_byte(char_t c1, char_t c2)
{
std::size_t rval = 0;
while (rval < sizeof(c1) &&
get_byte(c1, (unsigned int)rval) == get_byte(c2, (unsigned int)rval))
{
++rval;
}
return rval;
}
template <typename char_t>
inline node*
create_mb_node_type1(std::size_t j, char_t c1, char_t c2)
{
std::size_t diff = get_byte(c2, (unsigned int)j) -
get_byte(c1, (unsigned int)j);
if (diff == 1) {
return 0;
}
else if (diff == 2) {
return new char_node(get_byte(c1, (unsigned int)j)+1);
}
else {
return new ccl_node(get_byte(c1, (unsigned int)j)+1,
get_byte(c2, (unsigned int)j)-1);
}
}
template <typename char_t>
inline node *
create_mb_node_for_byte(std::size_t i, std::size_t j, std::size_t sizem1,
std::size_t differing_byte, char_t c1, char_t c2, node* newnode)
{
node* cnode;
if (i == sizem1 && j == differing_byte && j != sizem1)
{
node* tmp = create_mb_node_type1(j, c1, c2);
if (tmp == 0)
{
delete newnode;
return 0;
}
else
cnode = tmp;
}
else if (i == differing_byte && j == sizem1)
{
if (i != sizem1) {
cnode = new ccl_node(get_byte(c1, (unsigned int)j), 0xFF);
}
else {
cnode = new ccl_node(get_byte(c1, (unsigned int)j),
get_byte(c2, (unsigned int)j));
}
}
else if (i != differing_byte && i != sizem1 &&
j == (sizem1 - i + differing_byte))
{
cnode = new ccl_node(get_byte(c1, (unsigned int)j)+1, 0xFF);
}
else if (i + j - differing_byte > sizem1) {
cnode = new ccl_node(0, 0xFF);
}
else {//if (is plain)
cnode = new char_node(get_byte(c1, (unsigned int)j));
}
node* top_node = new cat_node(newnode, cnode);
return top_node;
}
// On platforms, where wchar_t is a typedef for unsigned short, the
// comparision for a negative value is pointless
template <bool is_signed>
struct correct_char_aux {
};
template <>
struct correct_char_aux<true> {
template <typename char_t>
static char_t correct(char_t c) { if (c < 0) c = 0; return c; }
};
template <>
struct correct_char_aux<false> {
template <typename char_t>
static char_t correct(char_t c) { return c; }
};
template <typename char_t>
struct correct_char
{
static char_t correct(char_t c)
{
return correct_char_aux<std::numeric_limits<char_t>::is_signed >::
correct(c);
}
};
template <typename char_t>
inline void
handle_mb_range(char_t c1, char_t c2, bool first_time,
std::stack<node*>& stack)
{
// The algorithm can't handle negative value chars, which don't make
// much sense anyway. This comparision is pointless for wchar_t's on
// platforms, where wchar_t is a typedef for unsigned short
c1 = correct_char<char_t>::correct(c1);
//if (c1 < 0)
// c1 = 0;
BOOST_ASSERT(c1 < c2);
node* newnode = 0;
node* savednode = 0;
const std::size_t differing_byte = compute_differing_byte(c1, c2);
const std::size_t sizem1 = sizeof(c1) - 1;
const std::size_t ndb = sizem1 - differing_byte;
for (std::size_t i = differing_byte; i < sizeof(c1); ++i)
{
// generate node for the first byte
if (differing_byte == 0 && i == ndb)
{
node* tmp = create_mb_node_type1(0, c1, c2);
if (tmp == 0)
continue;
else
newnode = tmp;
}
else
{
newnode = new char_node(get_byte(c1, 0));
}
for (std::size_t j = 1; j < sizeof(c1); ++j)
{
newnode = create_mb_node_for_byte(i, j, sizem1, differing_byte,
c1, c2, newnode);
if (newnode == 0)
goto end_outer_for;
}
// or together the various parts
if (savednode)
{
node* top_node = new or_node(savednode, newnode);
savednode = top_node;
}
else
{
savednode = newnode;
}
end_outer_for:
continue;
}
for (std::size_t k = 0; k < ndb; ++k)
{
newnode = new char_node(get_byte(c2, 0));
for (std::size_t j = 1; j < sizeof(c2); ++j)
{
node* cnode;
if (k == differing_byte && j == sizem1 && k != sizem1)
cnode = new ccl_node(0, get_byte(c2, (unsigned int)j));
else if (k != differing_byte && k != sizem1 &&
j == (sizem1 - k + differing_byte))
cnode = new ccl_node(0, get_byte(c2, (unsigned int)j)-1);
else if (k + j - differing_byte > sizem1)
cnode = new ccl_node(0, 0xFF);
else //if (is plain)
cnode = new char_node(get_byte(c2, (unsigned int)j));
node* top_node = new cat_node(newnode, cnode);
newnode = top_node;
}
// or together the various parts
if (savednode)
{
node* top_node = new or_node(savednode, newnode);
savednode = top_node;
}
else
{
savednode = newnode;
}
}
if (first_time)
{
stack.push(savednode);
}
else
{
node* top = stack.top();
stack.pop();
node* newtop = new or_node(top, savednode);
stack.push(newtop);
}
}
} // namespace ccl_utils
template <typename ScannerT>
class make_char
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
make_char(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const& first, iterator_type const& last) const
{
const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
escape_char_parser<lex_escapes, char_t>();
char_t the_char;
iterator_type first_ = first;
ScannerT scan(first_, last);
lex_escape_ch[assign(the_char)].parse(scan);
node* newnode = ccl_utils::create_mb_node_seq(the_char);
m_stack.push(newnode);
}
std::stack<node*>& m_stack;
};
template <typename ScannerT>
class make_ccl
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
make_ccl(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
static bool is_equal_to_string(iterator_type first,
iterator_type const & last, const char* str)
{
while (first != last &&*str &&*first ==*str)
{
++first;
++str;
}
return*str == 0;
}
template <typename ParserT>
static void fill_ccl(utility::impl::range_run<char_t>& rr, const ParserT& parser)
{
for (int i = 0; i < 256; ++i)
{
if (parser.test(static_cast<char_t>(uchar(i))))
rr.set(utility::impl::range<char_t>(char_t(i), char_t(i)));
}
}
void operator()(iterator_type const& first_, iterator_type const& last) const
{
BOOST_ASSERT(*first_ == '[');
iterator_type first = first_;
++first; // skip over '['
bool negated_ccl = false;
if (*first == '^')
{
negated_ccl = true;
++first;
}
utility::impl::range_run<char_t> rr;
while (first != last &&*first != ']')
{
if (*first == '[') // it's a ccl_expr like [:space:]
{
// check for [:space:], etc.
if (is_equal_to_string(first, last, "[:alnum:]"))
{
fill_ccl(rr, alnum_p);
}
else if (is_equal_to_string(first, last, "[:alpha:]"))
{
fill_ccl(rr, alpha_p);
}
else if (is_equal_to_string(first, last, "[:blank:]"))
{
fill_ccl(rr, blank_p);
}
else if (is_equal_to_string(first, last, "[:cntrl:]"))
{
fill_ccl(rr, cntrl_p);
}
else if (is_equal_to_string(first, last, "[:digit:]"))
{
fill_ccl(rr, digit_p);
}
else if (is_equal_to_string(first, last, "[:graph:]"))
{
fill_ccl(rr, graph_p);
}
else if (is_equal_to_string(first, last, "[:lower:]"))
{
fill_ccl(rr, lower_p);
}
else if (is_equal_to_string(first, last, "[:print:]"))
{
fill_ccl(rr, print_p);
}
else if (is_equal_to_string(first, last, "[:punct:]"))
{
fill_ccl(rr, punct_p);
}
else if (is_equal_to_string(first, last, "[:space:]"))
{
fill_ccl(rr, space_p);
}
else if (is_equal_to_string(first, last, "[:upper:]"))
{
fill_ccl(rr, upper_p);
}
else if (is_equal_to_string(first, last, "[:xdigit:]"))
{
fill_ccl(rr, xdigit_p);
}
// this can't happen, because it's parsed before we get here.
//else
// throw bad_regex();
// Advance past the character class expression
while (first != last &&*first != ']')
++first;
BOOST_ASSERT(*first == ']');
++first;
}
else {
const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
escape_char_parser<lex_escapes, char_t>();
char_t c1;
ScannerT scan(first, last);
lex_escape_ch[assign(c1)].parse(scan);
if (*scan.first == '-') // insert a range
{
++scan.first;
char_t c2;
lex_escape_ch[assign(c2)].parse(scan);
BOOST_ASSERT(c1 < c2); // Throw exception?
rr.set(utility::impl::range<char_t>(c1, c2));
}
else // insert 1 char
{
rr.set(utility::impl::range<char_t>(c1, c1));
}
}
}
if (negated_ccl)
{
rr = ccl_utils::negate_range_run(rr);
}
ccl_utils::create_nodes(rr, m_stack);
}
std::stack<node*>& m_stack;
};
template <typename ScannerT>
class make_any_char
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
std::stack<node*>& m_stack;
make_any_char(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(const char_t c) const
{
BOOST_ASSERT(c == '.');
do_any_char();
}
void do_any_char() const
{
static utility::impl::range_run<char_t> rr;
rr.set(full_range<char_t>());
char_t newline = '\n';
rr.clear(utility::impl::range<char_t>(newline, newline));
ccl_utils::create_nodes(rr, m_stack);
}
};
template <typename ScannerT>
class make_string
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
std::stack<node*>& m_stack;
make_string(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const& first, iterator_type const& last) const
{
BOOST_ASSERT(*first == '"');
iterator_type first_ = first;
ScannerT scan(first_, last);
++scan.first; // skip over '"'
// empty string not allowed
if (*scan.first == '"')
{
boost::throw_exception(bad_regex());
}
const escape_char_parser<lex_escapes, char_t> lex_escape_ch =
escape_char_parser<lex_escapes, char_t>();
char_t c;
lex_escape_ch[assign(c)].parse(scan);
node* top_node = ccl_utils::create_mb_node_seq(c);
while (*scan.first != '"' && scan.first != scan.last)
{
lex_escape_ch[assign(c)].parse(scan);
node* cur_node = ccl_utils::create_mb_node_seq(c);
top_node = new cat_node(top_node, cur_node);
}
m_stack.push(top_node);
}
};
inline
node* repeat_node(node* n, int num)
{
node* list_of_nodes = n;
for (int i = 1; i < num; ++i)
{
list_of_nodes = new cat_node(list_of_nodes, n->clone());
}
return list_of_nodes;
}
inline
node* optional_node(node* n)
{
return new or_node(n, new epsilon_node());
}
template <typename ScannerT>
class make_rep1
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
std::stack<node*>& m_stack;
make_rep1(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const& first, iterator_type const& last) const
{
BOOST_ASSERT(*first == '{');
iterator_type first_ = first;
ScannerT scan(first_, last);
++scan.first; // skip over '{'
unsigned int count;
uint_p[assign(count)].parse(scan);
if (count == 0)
boost::throw_exception(bad_regex());
node* top_node = m_stack.top();
m_stack.pop();
top_node = repeat_node(top_node, count);
m_stack.push(top_node);
}
};
template <typename ScannerT>
class make_rep2
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
std::stack<node*>& m_stack;
make_rep2(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const& first, iterator_type const& last) const
{
BOOST_ASSERT(*first == '{');
iterator_type first_ = first;
ScannerT scan (first_, last);
++scan.first; // skip over '{'
unsigned int count;
uint_p[assign(count)].parse(scan);
if (count == 0)
boost::throw_exception(bad_regex());
node* top_node = m_stack.top();
m_stack.pop();
top_node = new cat_node(repeat_node(top_node, count),
new star_node(top_node->clone()));
m_stack.push(top_node);
}
};
template <typename ScannerT>
class make_rep3
{
typedef typename ScannerT::iterator_t iterator_type;
public:
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
std::stack<node*>& m_stack;
make_rep3(std::stack<node*>& the_stack)
: m_stack(the_stack)
{}
void operator()(iterator_type const& first, iterator_type const& last) const
{
BOOST_ASSERT(*first == '{');
iterator_type first_ = first;
ScannerT scan(first_, last);
++scan.first; // skip over '{'
unsigned int count1, count2;
uint_p[assign(count1)].parse(scan);
if (count1 == 0)
boost::throw_exception(bad_regex());
++scan.first; // skip over ','
uint_p[assign(count2)].parse(scan);
if (count2 <= count1)
boost::throw_exception(bad_regex());
node* top_node = m_stack.top();
m_stack.pop();
node* repeats = repeat_node(top_node, count1);
top_node = new cat_node(repeats,
repeat_node(optional_node(top_node->clone()),
count2 - count1));
m_stack.push(top_node);
}
};
///////////////////////////////////////////////////////////////////////////////
//
// Lexer grammar
//
// Defines the grammar, which mandates the syntax of the understood
// lexeme definitions passed to lexer::register_regex.
//
///////////////////////////////////////////////////////////////////////////////
class lexer_grammar : public boost::spirit::classic::grammar<lexer_grammar>
{
public:
lexer_grammar(std::stack<node*> &node_stack_)
: node_stack(node_stack_) {}
template <typename ScannerT>
struct definition
{
typedef rule<ScannerT> rule_t;
typedef typename ScannerT::iterator_t iterator_type;
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<iterator_type>::value_type
char_t;
rule_t regex, re, series, singleton, singleton2, fullccl, ccl, string,
escseq, ccl_char;
symbols<> ccl_expr;
definition(lexer_grammar const &self)
{
regex =
re >> !('/' >> re) >> !ch_p('$')
;
re =
series
>>*( ('|' >> series)[make_or<ScannerT>(self.node_stack)] )
;
series =
singleton
>>*( singleton[make_concat<ScannerT>(self.node_stack)] )
;
singleton =
ch_p('.')[make_any_char<ScannerT>(self.node_stack)]
>> singleton2
| fullccl
>> singleton2
| ('"' >> string >> '"')
[
make_string<ScannerT>(self.node_stack)
]
>> singleton2
| '(' >> re >> ')'
>> singleton2
| ((anychar_p - chset<>("/|*+?.(){}\\")) | escseq)
[
make_char<ScannerT>(self.node_stack)
]
>> singleton2
;
singleton2 =
ch_p('*')[make_star<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('+')[make_plus<ScannerT>(self.node_stack)]
>> singleton2
| ch_p('?')[make_optional<ScannerT>(self.node_stack)]
>> singleton2
| ('{' >> uint_p >> '}')
[
make_rep1<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> '}')
[
make_rep2<ScannerT>(self.node_stack)
]
>> singleton2
| ('{' >> uint_p >> ',' >> uint_p >> '}')
[
make_rep3<ScannerT>(self.node_stack)
]
>> singleton2
| epsilon_p
;
fullccl =
('[' >> !ch_p('^') >> ccl >> ']')
[
make_ccl<ScannerT>(self.node_stack)
]
;
ccl =
*(ccl_expr | (ccl_char >> !('-' >> ccl_char)))
;
ccl_char =
( (anychar_p - chset<>("\\\n]")) | escseq )
;
ccl_expr =
"[:alnum:]",
"[:alpha:]",
"[:blank:]",
"[:cntrl:]",
"[:digit:]",
"[:graph:]",
"[:lower:]",
"[:print:]",
"[:punct:]",
"[:space:]",
"[:upper:]",
"[:xdigit:]"
;
string =
+( (anychar_p - chset<>("\"\\")) | escseq )
;
typedef
uint_parser<char_t, 8, 1,
std::numeric_limits<char_t>::digits / 3 + 1
> oct_parser_t;
typedef
uint_parser<char_t, 16, 1,
std::numeric_limits<char_t>::digits / 4 + 1
> hex_parser_t;
escseq =
ch_p('\\')
>> (
oct_parser_t()
| as_lower_d['x'] >> hex_parser_t()
| (anychar_p - chset<>('\n'))
)
;
#define BOOST_SLEX_DEBUG (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
BOOST_SPIRIT_DEBUG_TRACE_RULE(regex, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(re, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(series, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(singleton2, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(fullccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(string, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(escseq, BOOST_SLEX_DEBUG);
BOOST_SPIRIT_DEBUG_TRACE_RULE(ccl_char, BOOST_SLEX_DEBUG);
#undef BOOST_SLEX_DEBUG
}
rule<ScannerT> const&
start() const { return regex; }
};
std::stack<node*> &node_stack;
}; // class lexer_grammar
template <typename StringT>
inline node *
parse(lexer_grammar& g, StringT const& str)
{
typedef
scanner<typename StringT::const_iterator, scanner_policies<> >
scanner_t;
typedef typename rule<scanner_t>::template result<scanner_t>::type
result_t;
typename StringT::const_iterator first = str.begin();
typename StringT::const_iterator last = str.end();
scanner_t scan(first, last);
// typename rule<scanner_t>::result_t hit = g.parse(scan);
result_t hit = g.parse(scan);
if (!hit || !scan.at_end())
{
while (g.node_stack.size())
{
delete g.node_stack.top();
g.node_stack.pop();
}
return 0;
}
BOOST_ASSERT(g.node_stack.size() == 1);
node* rval = g.node_stack.top();
g.node_stack.pop();
node* an_eof_node = new eof_node();
rval = new cat_node(rval, an_eof_node);
return rval;
}
inline
void make_case_insensitive(state_match_t& state_match)
{
// TODO: Fix this.
// This doesn't take into account foreign languages, figure out how to
// do that. Also this approach is broken for this implementation of
// wide chars.
for (state_match_t::iterator iter = state_match.begin();
iter != state_match.end(); ++iter)
{
int i, j;
for (i = 'A', j = 'a'; i <= 'Z'; ++i, ++j)
{
// if either is set, turn them both on
(*iter)[i] = (*iter)[j] = uchar((*iter)[i] | (*iter)[j]);
}
}
}
template<bool wide_char>
struct regex_match_helper;
template<>
struct regex_match_helper<false> // single byte char
{
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> string_type;
typedef typename string_type::size_type size_type;
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT p = first;
IteratorT last_accepting_cpos = first;
while (p != last)
{
s = dfa.transition_table[s][(uchar)*p];
if (s == invalid_node)
break;
if (token) token->append((size_type)1, *p);
++p;
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = p;
}
}
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
template<>
struct regex_match_helper<true> // wide char
{
template <typename DfaT, typename IteratorT>
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
typedef std::basic_string<char_t> string_type;
typedef typename string_type::size_type size_type;
node_id_t s = 0;
node_id_t last_accepting_index = invalid_node;
IteratorT wp = first;
IteratorT last_accepting_cpos = first;
while (wp != last)
{
for (unsigned int i = 0; i < sizeof(char_t); ++i)
{
s = dfa.transition_table[s][get_byte(*wp,i)];
if (s == invalid_node)
{
goto break_while;
}
}
if (token) token->append((size_type)1, *wp);
++wp;
if (dfa.acceptance_index[s] != invalid_node)
{
last_accepting_index = s;
last_accepting_cpos = wp;
}
}
break_while:
if (last_accepting_index != invalid_node)
{
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
std::cout << "dfa.acceptance_index[" << last_accepting_index << "] = " <<
dfa.acceptance_index[last_accepting_index] << '\n';
#endif
first = last_accepting_cpos;
regex_index = dfa.acceptance_index[last_accepting_index];
return true;
}
else
return false;
}
};
template <typename DfaT, typename IteratorT, bool wide_char>
struct regex_match
{
static bool
do_match(DfaT const& dfa, IteratorT &first, IteratorT const& last,
int& regex_index,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
return regex_match_helper<wide_char>::do_match(
dfa, first, last, regex_index, token);
}
};
} // namespace lexerimpl
///////////////////////////////////////////////////////////////////////////////
//
template <typename IteratorT = char const*, typename TokenT = int,
typename CallbackT = void(*)(IteratorT const &,
IteratorT &,
IteratorT const&,
TokenT const&,
lexer_control<TokenT> &)>
class lexer
{
public:
typedef CallbackT callback_t;
typedef
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
char_t;
struct regex_info
{
std::basic_string<char_t> str;
TokenT token;
CallbackT callback;
regex_info(const std::basic_string<char_t>& _str,
const TokenT& _token,
const CallbackT& _callback)
: str(_str)
, token(_token)
, callback(_callback)
{}
};
struct dfa_table
{
std::vector<std::vector<node_id_t> > transition_table;
std::vector<node_id_t> acceptance_index;
};
typedef std::vector<node_id_t> node_table_t;
typedef std::vector<node_table_t> transition_table_t;
typedef std::vector<dfa_table> dfa_t;
lexer(unsigned int states = 1);
void register_regex(const std::basic_string<char_t>& regex,
const TokenT& id, const CallbackT& cb = CallbackT(),
unsigned int state = 0);
TokenT next_token(IteratorT &first, IteratorT const &last,
std::basic_string<char_t> *token = 0);
void create_dfa();
bool has_compiled_dfa() { return m_compiled_dfa; }
void set_case_insensitive(bool insensitive);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
void dump(std::ostream& out);
#endif
typedef std::vector<std::vector<regex_info> > regex_list_t;
bool load (std::ifstream &in, long unique_id = 0);
bool save (std::ofstream &out, long unique_id = 0);
enum {
SLEX_SIGNATURE = 0x58454C53, // "SLEX"
SLEX_VERSION_100 = 0x0100, // persistance version
SLEX_LAST_KNOWN_VERSION = SLEX_VERSION_100,
SLEX_MINOR_VERSION_MASK = 0xFF
};
private:
void create_dfa_for_state(int state);
static bool regex_match(const dfa_t& dfa, IteratorT& first,
IteratorT& last, int& regex_index);
mutable std::stack<lexerimpl::node*> node_stack;
lexerimpl::lexer_grammar g;
mutable bool m_compiled_dfa;
mutable dfa_t m_dfa;
regex_list_t m_regex_list;
bool m_case_insensitive;
unsigned int m_state;
std::stack<unsigned int> m_state_stack;
unsigned int m_num_states;
};
template <typename IteratorT, typename TokenT, typename CallbackT>
inline
lexer<IteratorT, TokenT, CallbackT>::lexer(unsigned int states)
: g(node_stack)
, m_compiled_dfa(false)
, m_regex_list(states)
, m_case_insensitive(false)
, m_state(0)
, m_state_stack()
, m_num_states(states)
{
BOOST_SPIRIT_DEBUG_TRACE_NODE_NAME(g, "slex::lexer",
BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX);
}
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::register_regex(
const std::basic_string<char_t>& regex, const TokenT& id,
const CallbackT& callback, unsigned int state)
{
if (state > m_num_states) {
m_regex_list.resize(state);
m_num_states = state;
}
m_regex_list[state].push_back(regex_info(regex, id, callback));
}
template <typename IteratorT, typename TokenT, typename CallbackT>
inline TokenT
lexer<IteratorT, TokenT, CallbackT>::next_token(
IteratorT &first, IteratorT const& last,
std::basic_string<
typename BOOST_SPIRIT_IT_NS::iterator_traits<IteratorT>::value_type
> *token)
{
if (!m_compiled_dfa)
{
create_dfa();
}
IteratorT saved = first;
int regex_index;
if (!lexerimpl::regex_match<dfa_table, IteratorT, (sizeof(char_t) > 1)>::
do_match(m_dfa[m_state], first, last, regex_index, token))
return -1; // TODO: can't return -1, need to return some invalid token.
// how to figure this out? We can use traits I guess.
else
{
regex_info regex = m_regex_list[m_state][regex_index];
TokenT rval = regex.token;
if (regex.callback)
{
// execute corresponding callback
lexer_control<TokenT> controller(rval, m_state, m_state_stack);
regex.callback(saved, first, last, regex.token, controller);
if (controller.ignore_current_token_set()) {
if (token)
token->erase();
return next_token(first, last, token);
}
}
return rval;
}
}
namespace lexerimpl
{
inline
bool find_acceptance_state(const node_set& eof_node_ids,
const node_set& current_set,
node_id_t& acceptance_node_id)
{
for(node_set::const_iterator nsi = eof_node_ids.begin();
nsi != eof_node_ids.end(); ++nsi)
{
node_id_t eof_node_id =*nsi;
if (current_set.end() != current_set.find(eof_node_id))
{
// store the first one we come to as the
// matched pattern
acceptance_node_id = eof_node_id;
// don't bother searching for more
return true;
}
}
return false;
}
template <typename RegexListT, typename GrammarT>
inline std::auto_ptr<node>
parse_regexes(const RegexListT& regex_list, GrammarT& g)
{
// parse the expressions into a tree
if (regex_list.begin() == regex_list.end())
boost::throw_exception(bad_regex());
typename RegexListT::const_iterator ri = regex_list.begin();
std::auto_ptr<node> tree(lexerimpl::parse(g, (*ri).str));
if (tree.get() == 0)
boost::throw_exception(bad_regex());
++ri;
for (/**/; ri != regex_list.end(); ++ri)
{
std::auto_ptr<node> next_tree(lexerimpl::parse(g, (*ri).str));
if (next_tree.get() == 0)
boost::throw_exception(bad_regex());
std::auto_ptr<node> newnode(new or_node(tree.release(), next_tree.release()));
tree = newnode;
}
return tree;
}
} //namespace lexerimpl
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::create_dfa()
{
m_dfa.resize(m_num_states);
for (unsigned int i = 0; i < m_num_states; ++i)
create_dfa_for_state(i);
}
// Algorithm from Compilers: Principles, Techniques, and Tools p. 141
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::create_dfa_for_state(int state)
{
using lexerimpl::node;
std::auto_ptr<node> tree = lexerimpl::parse_regexes(m_regex_list[state], g);
node_id_t dummy = 0;
tree->assign_node_ids(dummy);
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
tree->dump(std::cout);
#endif
// compute followpos(root)
followpos_t followpos;
tree->compute_followpos(followpos);
// the dfa states <-> nfa state groups
std::map<node_set, node_id_t> dstates1;
std::map<node_id_t, node_set> dstates2;
// the dfa transitions
m_dfa[state].transition_table.push_back(
std::vector<node_id_t>(256, invalid_node));
m_dfa[state].acceptance_index.push_back(invalid_node);
// whether the dfa state has been processed yet
std::vector<node_id_t> marked;
// used to give a unique id to each dfa state
node_id_t num_states = 0;
// initially, the only unmarked state in Dstates is firstpos(root).
marked.push_back(0);
node_set fpr = tree->firstpos();
dstates1[fpr] = 0;
dstates2[0] = fpr;
state_match_t state_match;
tree->compute_state_match(state_match);
if (m_case_insensitive)
lexerimpl::make_case_insensitive(state_match);
node_set eof_node_ids;
tree->get_eof_ids(eof_node_ids);
// translate the eof_node_ids into a 0-based index
std::map<node_id_t, node_id_t> eof_node_id_map;
unsigned int x = 0;
for (node_set::iterator node_id_it = eof_node_ids.begin();
node_id_it != eof_node_ids.end();
++node_id_it)
{
eof_node_id_map[*node_id_it] = x++;
}
// figure out if this is an acceptance state
node_id_t eof_node_id;
if (lexerimpl::find_acceptance_state(eof_node_ids, fpr, eof_node_id))
{
m_dfa[state].acceptance_index[0] = eof_node_id_map[eof_node_id];
}
std::vector<node_id_t>::iterator i = std::find(marked.begin(), marked.end(),
node_id_t(0));
while (marked.end() != i)
{
*i = 1;
node_id_t T = node_id_t(std::distance(marked.begin(), i));
BOOST_ASSERT(T < dstates2.size());
node_set Tstates = dstates2[T];
for (node_id_t j = 0; j < 256; ++j)
{
node_set U;
for (node_set::iterator k = Tstates.begin();
k != Tstates.end(); ++k)
{
node_id_t p =*k;
BOOST_ASSERT(p < state_match.size());
BOOST_ASSERT(j < state_match[p].size());
if (state_match[p][j])
{
node_set fpp = followpos[p];
U.insert(fpp.begin(), fpp.end());
}
}
if (U.size() > 0)
{
std::map<node_set, node_id_t>::iterator l = dstates1.find(U);
node_id_t target_state;
if (l == dstates1.end()) // not in the states yet
{
++num_states;
dstates1[U] = target_state = num_states;
dstates2[target_state] = U;
marked.push_back(0);
m_dfa[state].transition_table.push_back(
std::vector<node_id_t>(256, invalid_node));
m_dfa[state].acceptance_index.push_back(invalid_node);
// figure out if this is an acceptance state
node_id_t eof_node_id;
if (lexerimpl::find_acceptance_state(eof_node_ids, U, eof_node_id))
{
m_dfa[state].acceptance_index[target_state] =
eof_node_id_map[eof_node_id];
}
}
else
{
target_state = dstates1[U];
}
BOOST_ASSERT(T < m_dfa[state].transition_table.size());
BOOST_ASSERT(j < m_dfa[state].transition_table[T].size());
m_dfa[state].transition_table[T][j] = target_state;
}
}
i = std::find(marked.begin(), marked.end(), node_id_t(0));
}
m_compiled_dfa = true;
}
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::set_case_insensitive(bool insensitive)
{
m_case_insensitive = insensitive;
}
#if defined(BOOST_SPIRIT_DEBUG) && (BOOST_SPIRIT_DEBUG_FLAGS & BOOST_SPIRIT_DEBUG_FLAGS_SLEX)
template <typename IteratorT, typename TokenT, typename CallbackT>
inline void
lexer<IteratorT, TokenT, CallbackT>::dump(std::ostream& out)
{
for (unsigned x = 0; x < m_dfa.size(); ++x)
{
out << "\nm_dfa[" << x << "] has " << m_dfa[x].transition_table.size() << " states\n";
for (node_id_t i = 0; i < m_dfa[x].transition_table.size(); ++i)
{
out << "state " << i << ":";
for (node_id_t j = 0; j < m_dfa[x].transition_table[i].size(); ++j)
{
if (m_dfa[x].transition_table[i][j] != invalid_node)
out << j << "->" << m_dfa[x].transition_table[i][j] << " ";
}
out << "\n";
}
out << "acceptance states: ";
for(unsigned int k = 0; k < m_dfa[x].acceptance_index.size(); ++k)
{
if (m_dfa[x].acceptance_index[k] != invalid_node)
out << '<' << k << ',' << m_dfa[x].acceptance_index[k] << "> ";
}
out << endl;
}
}
#endif
///////////////////////////////////////////////////////////////////////////////
// load the lexer tables
#define slex_in(strm, val) \
strm.read((char*)&val, sizeof(val)); \
if (std::ios::goodbit != strm.rdstate()) return false
template <typename IteratorT, typename TokenT, typename CallbackT>
inline bool
lexer<IteratorT, TokenT, CallbackT>::load (std::ifstream &in, long unique_id)
{
// ensure correct signature and version
long signature = 0;
slex_in (in, signature);
if (signature != SLEX_SIGNATURE)
return false; // not for us
long version = 0;
slex_in (in, version);
if ((version & ~SLEX_MINOR_VERSION_MASK) > SLEX_LAST_KNOWN_VERSION)
return false; // to new for us
long uid = 0;
slex_in (in, uid);
if (uid != unique_id)
return false; // not saved by us
// load auxiliary members
int num_states = 0;
slex_in (in, num_states);
// load the dfa tables
dfa_t in_dfa;
std::size_t dfa_size = 0;
slex_in (in, dfa_size);
in_dfa.resize(dfa_size);
for (std::size_t dfa = 0; dfa < dfa_size; ++dfa)
{
// load the transition tables
std::size_t tt_size = 0;
transition_table_t &tt_table = in_dfa[dfa].transition_table;
slex_in (in, tt_size);
tt_table.resize(tt_size);
for (std::size_t tt = 0; tt < tt_size; ++tt)
{
std::size_t nt_size = 0;
node_table_t &nt_table = tt_table[tt];
slex_in (in, nt_size);
nt_table.resize(nt_size);
for (std::size_t nt = 0; nt < nt_size; ++nt)
{
slex_in (in, nt_table[nt]);
}
}
// load the acceptance index table
std::size_t ai_size = 0;
node_table_t &ai_table = in_dfa[dfa].acceptance_index;
slex_in (in, ai_size);
ai_table.resize(ai_size);
for (std::size_t ai = 0; ai < ai_size; ++ai)
{
slex_in (in, ai_table[ai]);
}
}
m_dfa.swap(in_dfa); // success, swap in the read values
m_num_states = num_states;
m_compiled_dfa = true;
return true;
}
#undef slex_in
///////////////////////////////////////////////////////////////////////////////
// save the lexer tables
#define slex_out(strm, val) \
strm.write((char*)&val, sizeof(val)); \
if (std::ios::goodbit != strm.rdstate()) return false
template <typename IteratorT, typename TokenT, typename CallbackT>
inline bool
lexer<IteratorT, TokenT, CallbackT>::save (std::ofstream &out, long unique_id)
{
// save signature and version information
long out_long = SLEX_SIGNATURE;
slex_out(out, out_long);
out_long = SLEX_VERSION_100;
slex_out(out, out_long);
slex_out(out, unique_id);
// save auxiliary members
slex_out(out, m_num_states);
// save the dfa tables
typedef typename dfa_t::const_iterator dfa_iter_t;
typedef transition_table_t::const_iterator transition_table_iter_t;
typedef node_table_t::const_iterator node_table_iter_t;
std::size_t out_size_t = m_dfa.size();
slex_out(out, out_size_t);
dfa_iter_t end = m_dfa.end();
for (dfa_iter_t it = m_dfa.begin(); it != end; ++it)
{
// save the transition table
out_size_t = (*it).transition_table.size();
slex_out(out, out_size_t);
transition_table_iter_t tt_end = (*it).transition_table.end();
for (transition_table_iter_t tt_it = (*it).transition_table.begin();
tt_it != tt_end;
++tt_it)
{
out_size_t = (*tt_it).size();
slex_out(out, out_size_t);
node_table_iter_t nt_end = (*tt_it).end();
for (node_table_iter_t nt_it = (*tt_it).begin();
nt_it != nt_end;
++nt_it)
{
slex_out(out, (*nt_it));
}
}
// save the acceptance index table
out_size_t = (*it).acceptance_index.size();
slex_out(out, out_size_t);
node_table_iter_t nt_end = (*it).acceptance_index.end();
for (node_table_iter_t nt_it = (*it).acceptance_index.begin();
nt_it != nt_end;
++nt_it)
{
slex_out(out, (*nt_it));
}
}
return true;
}
#undef slex_out
/*
a lexer_control object supports some operations on the lexer.
get current lexer state
set state
terminate
state stack (push, pop, top)
set new token return value
ignore the current token
yymore
get length of matched token
*/
template <typename TokenT>
class lexer_control
{
public:
lexer_control(TokenT& token, unsigned int& current_state,
std::stack<unsigned int>& state_stack);
// functions dealing with the lexer state
// set the state to state
void set_state(unsigned int state);
// get the current state
unsigned int get_state();
// pushes the current state onto the top of the state stack and
// switches to new_state
void push_state(unsigned int new_state);
// pops the top of the state stack and switches to it.
void pop_state();
// returns the top of the stack without altering the stack's contents.
unsigned int top_state();
// functions dealing with the token returned.
// set a new TokenT return value, overriding that one that was
// registered via. register_regex()
void set_token(const TokenT& token);
// tell the lexer to return an invalid token, signifying termination.
void terminate();
// ignore the current token, and move on to the next one. The current
// token will NOT be returned from next_token()
void ignore_current_token();
// returns true if ignore_current_token() has been called,
// false otherwise.
bool ignore_current_token_set();
private:
TokenT& m_token;
bool m_ignore_current_token;
unsigned int& m_current_state;
std::stack<unsigned int>& m_state_stack;
};
template <typename TokenT>
inline
lexer_control<TokenT>::lexer_control(TokenT& token, unsigned int& current_state,
std::stack<unsigned int>& state_stack)
: m_token(token)
, m_ignore_current_token(false)
, m_current_state(current_state)
, m_state_stack(state_stack)
{
}
template <typename TokenT>
inline void
lexer_control<TokenT>::set_state(unsigned int state)
{
m_current_state = state;
}
template <typename TokenT>
inline unsigned int
lexer_control<TokenT>::get_state()
{
return m_current_state;
}
template <typename TokenT>
inline void
lexer_control<TokenT>::push_state(unsigned int new_state)
{
m_state_stack.push(m_current_state);
m_current_state = new_state;
}
template <typename TokenT>
inline void
lexer_control<TokenT>::pop_state()
{
m_current_state = m_state_stack.top();
m_state_stack.pop();
}
template <typename TokenT>
inline unsigned int
lexer_control<TokenT>::top_state()
{
return m_state_stack.top();
}
template <typename TokenT>
inline void
lexer_control<TokenT>::set_token(const TokenT& token)
{
m_token = token;
}
template <typename TokenT>
inline void
lexer_control<TokenT>::terminate()
{
m_token = -1; // TOOD: fix this, need to be determined by traits
}
template <typename TokenT>
inline void
lexer_control<TokenT>::ignore_current_token()
{
m_ignore_current_token = true;
}
template <typename TokenT>
inline bool
lexer_control<TokenT>::ignore_current_token_set()
{
return m_ignore_current_token;
}
} // namespace classic
} // namespace spirit
} // namespace boost
#undef BOOST_SPIRIT_IT_NS
#endif