| // Copyright (c) 2001-2010 Hartmut Kaiser |
| // |
| // Distributed under the Boost Software License, Version 1.0. (See accompanying |
| // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) |
| |
| // This example is equivalent to the following flex program: |
| /* |
| //[wcf_flex_version |
| %{ |
| #define ID_WORD 1000 |
| #define ID_EOL 1001 |
| #define ID_CHAR 1002 |
| int c = 0, w = 0, l = 0; |
| %} |
| %% |
| [^ \t\n]+ { return ID_WORD; } |
| \n { return ID_EOL; } |
| . { return ID_CHAR; } |
| %% |
| bool count(int tok) |
| { |
| switch (tok) { |
| case ID_WORD: ++w; c += yyleng; break; |
| case ID_EOL: ++l; ++c; break; |
| case ID_CHAR: ++c; break; |
| default: |
| return false; |
| } |
| return true; |
| } |
| void main() |
| { |
| int tok = EOF; |
| do { |
| tok = yylex(); |
| if (!count(tok)) |
| break; |
| } while (EOF != tok); |
| printf("%d %d %d\n", l, w, c); |
| } |
| //] |
| */ |
| // Its purpose is to do the word count function of the wc command in UNIX. It |
| // prints the number of lines, words and characters in a file. |
| // |
| // This example shows how to use the tokenize() function together with a |
| // simple functor, which gets executed whenever a token is matched in the |
| // input sequence. |
| |
| // #define BOOST_SPIRIT_LEXERTL_DEBUG |
| |
| #include <boost/config/warning_disable.hpp> |
| //[wcf_includes |
| #include <boost/spirit/include/lex_lexertl.hpp> |
| #include <boost/bind.hpp> |
| #include <boost/ref.hpp> |
| //] |
| |
| #include <iostream> |
| #include <string> |
| |
| #include "example.hpp" |
| |
| //[wcf_namespaces |
| namespace lex = boost::spirit::lex; |
| //] |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| // Token id definitions |
| /////////////////////////////////////////////////////////////////////////////// |
| //[wcf_token_ids |
// Numeric ids attached to the tokens recognized by the word count lexer.
// The values mirror the ID_* macros of the flex program shown at the top of
// this file.
enum token_ids
{
    ID_WORD = 1000,     // a run of characters other than ' ', '\t' and '\n'
    ID_EOL  = 1001,     // a single newline character
    ID_CHAR = 1002      // any other single character
};
| //] |
| |
| //[wcf_token_definition |
| /*` The template `word_count_tokens` defines three different tokens: |
| `ID_WORD`, `ID_EOL`, and `ID_CHAR`, representing a word (anything except |
| a whitespace or a newline), a newline character, and any other character |
| (`ID_WORD`, `ID_EOL`, and `ID_CHAR` are enum values representing the token |
| ids, but could be anything else convertible to an integer as well). |
| The direct base class of any token definition class needs to be the |
| template `lex::lexer<>`, where the corresponding template parameter (here: |
| `lex::lexertl::lexer<BaseIterator>`) defines which underlying lexer engine has |
| to be used to provide the required state machine functionality. In this |
| example we use the Lexertl based lexer engine as the underlying lexer type. |
| */ |
| template <typename Lexer> |
| struct word_count_tokens : lex::lexer<Lexer> |
| { |
| word_count_tokens() |
| { |
| // define tokens (the regular expression to match and the corresponding |
| // token id) and add them to the lexer |
| this->self.add |
| ("[^ \t\n]+", ID_WORD) // words (anything except ' ', '\t' or '\n') |
| ("\n", ID_EOL) // newline characters |
| (".", ID_CHAR) // anything else is a plain character |
| ; |
| } |
| }; |
| //] |
| |
| //[wcf_functor |
| /*` In this example the struct 'counter' is used as a functor counting the |
| characters, words and lines in the analyzed input sequence by identifying |
| the matched tokens as passed from the /Spirit.Lex/ library. |
| */ |
| struct counter |
| { |
| //<- this is an implementation detail specific to boost::bind and doesn't show |
| // up in the documentation |
| typedef bool result_type; |
| //-> |
| // the function operator gets called for each of the matched tokens |
| // c, l, w are references to the counters used to keep track of the numbers |
| template <typename Token> |
| bool operator()(Token const& t, std::size_t& c, std::size_t& w, std::size_t& l) const |
| { |
| switch (t.id()) { |
| case ID_WORD: // matched a word |
| // since we're using a default token type in this example, every |
| // token instance contains a `iterator_range<BaseIterator>` as its token |
| // attribute pointing to the matched character sequence in the input |
| ++w; c += t.value().size(); |
| break; |
| case ID_EOL: // matched a newline character |
| ++l; ++c; |
| break; |
| case ID_CHAR: // matched something else |
| ++c; |
| break; |
| } |
| return true; // always continue to tokenize |
| } |
| }; |
| //] |
| |
| /////////////////////////////////////////////////////////////////////////////// |
| //[wcf_main |
| /*` The main function simply loads the given file into memory (as a |
| `std::string`), instantiates the token definition template using the |
| Lexertl based lexer type (`word_count_tokens<lex::lexertl::lexer<> >`), |
| and finally calls `lex::tokenize`, passing an instance of the counter function |
| object. The return value of `lex::tokenize()` will be `true` if the |
| whole input sequence has been successfully tokenized, and `false` otherwise. |
| */ |
| int main(int argc, char* argv[]) |
| { |
| // these variables are used to count characters, words and lines |
| std::size_t c = 0, w = 0, l = 0; |
| |
| // read input from the given file |
| std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1])); |
| |
| // create the token definition instance needed to invoke the lexical analyzer |
| word_count_tokens<lex::lexertl::lexer<> > word_count_functor; |
| |
| // tokenize the given string, the bound functor gets invoked for each of |
| // the matched tokens |
| char const* first = str.c_str(); |
| char const* last = &first[str.size()]; |
| bool r = lex::tokenize(first, last, word_count_functor, |
| boost::bind(counter(), _1, boost::ref(c), boost::ref(w), boost::ref(l))); |
| |
| // print results |
| if (r) { |
| std::cout << "lines: " << l << ", words: " << w |
| << ", characters: " << c << "\n"; |
| } |
| else { |
| std::string rest(first, last); |
| std::cout << "Lexical analysis failed\n" << "stopped at: \"" |
| << rest << "\"\n"; |
| } |
| return 0; |
| } |
| //] |
| |