// blob: 42bb25c916526a9dea64b8912b2fec2d63ec03ff [file] [log] [blame]
/*
* parser classes for MySpell
*
* implemented: text, HTML, TeX
*
* Copyright (C) 2002, Laszlo Nemeth
*
*/
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Hunspell, based on MySpell.
*
* The Initial Developers of the Original Code are
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
* Portions created by the Initial Developers are Copyright (C) 2002-2005
* the Initial Developers. All Rights Reserved.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef TEXTPARSER_HXX_
#define TEXTPARSER_HXX_
// MAXPREVLINE: number of line buffers kept by the parser, i.e. the actual
// (current) line plus the previous ones; it sizes TextParser::line[].
#define MAXPREVLINE 4
// MAXLNLEN: fallback value, defined only when the macro is not already
// defined by an earlier header; it is not referenced in this header.
// NOTE(review): presumably the maximum line length - confirm in the users.
#ifndef MAXLNLEN
#define MAXLNLEN 8192
#endif
#include <cstddef>
#include <string>
#include <vector>

#include "../hunspell/w_char.hxx"
/*
 * Base Text Parser
 *
 * Keeps a small window of lines (MAXPREVLINE entries) together with the
 * position of the current token. next_token() and change_token() are
 * virtual, so derived parsers can override them.
 *
 * NOTE(review): the member functions are defined in the implementation
 * file; the per-function remarks below describe only what the declarations
 * show, anything beyond that is marked as an assumption.
 */
class TextParser {
 protected:
  int wordcharacters[256];        // for detection of the word boundaries
                                  // (presumably indexed by byte value)
  std::string line[MAXPREVLINE];  // parsed and previous lines
  std::vector<bool> urlline;      // mask for url detection
  int checkurl;                   // NOTE(review): presumably the flag set by
                                  // set_url_checking() - confirm
  int actual;                     // actual (current) line
  size_t head;                    // head position
  size_t token;                   // begin of token
  int state;                      // state of automata
  int utf8;                       // UTF-8 character encoding
                                  // (value returned by is_utf8())

  // Note: the parameter `line` shadows the member of the same name.
  int next_char(const char* line, size_t* pos);

  const w_char* wordchars_utf16;  // NOTE(review): presumably the word
                                  // characters passed to the w_char
                                  // constructor; ownership is not visible
                                  // in this header
  int wclen;                      // NOTE(review): presumably the length of
                                  // wordchars_utf16 - confirm

 public:
  // Construct from `len` word characters given as w_char values.
  TextParser(const w_char* wordchars, int len);
  // Construct from a C string of word characters.
  explicit TextParser(const char* wc);
  virtual ~TextParser();

  // NOTE(review): presumably stores `line` as the new line to parse.
  void put_line(const char* line);
  // Return a line by value (presumably the actual one).
  std::string get_line() const;
  // Return a line by value, selected by `n` (presumably counted back from
  // the actual line - confirm the valid range in the implementation).
  std::string get_prevline(int n) const;

  // Overridable tokenizer step; the token text is written to the
  // std::string argument.
  // NOTE(review): presumably returns false when no token is left.
  virtual bool next_token(std::string&);
  // Overridable; NOTE(review): presumably replaces the current token with
  // `word` - confirm the meaning of the int result.
  virtual int change_token(const char* word);

  void set_url_checking(int check);
  size_t get_tokenpos();
  int is_wordchar(const char* w);

  // Return the `utf8` member. Declared const because it only reads state,
  // so it can also be called on a const TextParser.
  int is_utf8() const { return utf8; }

  const char* get_latin1(const char* s);
  char* next_char();
  int tokenize_urls();
  void check_urls();
  // Note: the parameter `head` shadows the member of the same name.
  int get_url(size_t token_pos, size_t* head);
  // Note: the parameters `token` and `head` shadow the members of the same
  // names; the result text is written to `out`.
  bool alloc_token(size_t token, size_t* head, std::string& out);

 private:
  // Shared set-up helpers, one per constructor signature.
  void init(const char*);
  void init(const w_char* wordchars, int len);
};
#endif