blob: 130f5362c6f2093e29d676ebbe056f39f40c2b2c [file] [log] [blame]
// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
// This example shows how to create a simple lexer recognizing a couple of
// different tokens and how to use this with a grammar. This example has a
// heavily backtracking grammar which makes it a candidate for lexer based
// parsing (all tokens are scanned and generated only once, even if
// backtracking is required) which speeds up the overall parsing process
// considerably, out-weighting the overhead needed for setting up the lexer.
// Additionally it demonstrates how to use one of the defined tokens as a
// parser component in the grammar.
//
// The grammar recognizes a simple input structure: any number of English
// simple sentences (statements, questions and commands) are recognized and
// are being counted separately.
// #define BOOST_SPIRIT_DEBUG
// #define BOOST_SPIRIT_LEXERTL_DEBUG
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <iostream>
#include <fstream>
#include <string>
#include "example.hpp"
using namespace boost::spirit;
using namespace boost::spirit::ascii;
using boost::phoenix::ref;
///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example2_tokens : lex::lexer<Lexer>
{
example2_tokens()
{
// A 'word' is comprised of one or more letters and an optional
// apostrophe. If it contains an apostrophe, there may only be one and
// the apostrophe must be preceded and succeeded by at least 1 letter.
// For example, "I'm" and "doesn't" meet the definition of 'word' we
// define below.
word = "[a-zA-Z]+('[a-zA-Z]+)?";
// Associate the tokens and the token set with the lexer. Note that
// single character token definitions as used below always get
// interpreted literally and never as special regex characters. This is
// done to be able to assign single characters the id of their character
// code value, allowing to reference those as literals in Qi grammars.
this->self = lex::token_def<>(',') | '!' | '.' | '?' | ' ' | '\n' | word;
}
lex::token_def<> word;
};
///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator>
struct example2_grammar : qi::grammar<Iterator>
{
template <typename TokenDef>
example2_grammar(TokenDef const& tok)
: example2_grammar::base_type(story)
, paragraphs(0), commands(0), questions(0), statements(0)
{
story
= +paragraph
;
paragraph
= ( +( command [ ++ref(commands) ]
| question [ ++ref(questions) ]
| statement [ ++ref(statements) ]
)
>> *char_(' ') >> +char_('\n')
)
[ ++ref(paragraphs) ]
;
command
= +(tok.word | ' ' | ',') >> '!'
;
question
= +(tok.word | ' ' | ',') >> '?'
;
statement
= +(tok.word | ' ' | ',') >> '.'
;
BOOST_SPIRIT_DEBUG_NODE(story);
BOOST_SPIRIT_DEBUG_NODE(paragraph);
BOOST_SPIRIT_DEBUG_NODE(command);
BOOST_SPIRIT_DEBUG_NODE(question);
BOOST_SPIRIT_DEBUG_NODE(statement);
}
qi::rule<Iterator> story, paragraph, command, question, statement;
int paragraphs, commands, questions, statements;
};
///////////////////////////////////////////////////////////////////////////////
int main()
{
// iterator type used to expose the underlying input stream
typedef std::string::iterator base_iterator_type;
// This is the token type to return from the lexer iterator
typedef lex::lexertl::token<base_iterator_type> token_type;
// This is the lexer type to use to tokenize the input.
// Here we use the lexertl based lexer engine.
typedef lex::lexertl::lexer<token_type> lexer_type;
// This is the token definition type (derived from the given lexer type).
typedef example2_tokens<lexer_type> example2_tokens;
// this is the iterator type exposed by the lexer
typedef example2_tokens::iterator_type iterator_type;
// this is the type of the grammar to parse
typedef example2_grammar<iterator_type> example2_grammar;
// now we use the types defined above to create the lexer and grammar
// object instances needed to invoke the parsing process
example2_tokens tokens; // Our lexer
example2_grammar calc(tokens); // Our parser
std::string str (read_from_file("example2.input"));
// At this point we generate the iterator pair used to expose the
// tokenized input stream.
std::string::iterator it = str.begin();
iterator_type iter = tokens.begin(it, str.end());
iterator_type end = tokens.end();
// Parsing is done based on the the token stream, not the character
// stream read from the input.
bool r = qi::parse(iter, end, calc);
if (r && iter == end)
{
std::cout << "-------------------------\n";
std::cout << "Parsing succeeded\n";
std::cout << "There were "
<< calc.commands << " commands, "
<< calc.questions << " questions, and "
<< calc.statements << " statements.\n";
std::cout << "-------------------------\n";
}
else
{
std::cout << "-------------------------\n";
std::cout << "Parsing failed\n";
std::cout << "-------------------------\n";
}
std::cout << "Bye... :-) \n\n";
return 0;
}