| /* ***** BEGIN LICENSE BLOCK ***** |
| * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
| * |
| * The contents of this file are subject to the Mozilla Public License Version |
| * 1.1 (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * http://www.mozilla.org/MPL/ |
| * |
| * Software distributed under the License is distributed on an "AS IS" basis, |
| * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
| * for the specific language governing rights and limitations under the |
| * License. |
| * |
| * The Original Code is Hunspell, based on MySpell. |
| * |
| * The Initial Developers of the Original Code are |
| * Kevin Hendricks (MySpell) and Németh László (Hunspell). |
| * Portions created by the Initial Developers are Copyright (C) 2002-2005 |
| * the Initial Developers. All Rights Reserved. |
| * |
| * Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno, |
| * Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád, |
| * Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter, |
| * Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls, |
| * Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen |
| * |
| * Alternatively, the contents of this file may be used under the terms of |
| * either the GNU General Public License Version 2 or later (the "GPL"), or |
| * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
| * in which case the provisions of the GPL or the LGPL are applicable instead |
| * of those above. If you wish to allow use of your version of this file only |
| * under the terms of either the GPL or the LGPL, and not to allow others to |
| * use your version of this file under the terms of the MPL, indicate your |
| * decision by deleting the provisions above and replace them with the notice |
| * and other provisions required by the GPL or the LGPL. If you do not delete |
| * the provisions above, a recipient may use your version of this file under |
| * the terms of any one of the MPL, the GPL or the LGPL. |
| * |
| * ***** END LICENSE BLOCK ***** */ |
| |
| #include <cstdlib> |
| #include <cstring> |
| #include <cstdio> |
| #include <ctype.h> |
| |
| #include "../hunspell/csutil.hxx" |
| #include "xmlparser.hxx" |
| |
| #ifndef W32 |
| using namespace std; |
| #endif |
| |
// States of the tokenizer state machine run by XMLParser::next_token().
enum {
  ST_NON_WORD,     // between words, outside of any markup
  ST_WORD,         // inside a word
  ST_TAG,          // inside a matched pattern (comment, tag, ...)
  ST_CHAR_ENTITY,  // inside a character entity, after '&'
  ST_OTHER_TAG,    // not referenced by the code in this part of the file
  ST_ATTRIB        // inside a quoted attribute value of a tag
};
| |
// Start/end delimiter pairs of the markup regions recognised by next_token().
// look_pattern() lower-cases the input before comparing, so the literals here
// must be lower case. look_pattern() returns the first matching row, so the
// more specific "<!--" and "<![cdata[" entries must precede the generic "<".
static const char* __PATTERN__[][2] = {{"<!--", "-->"},       // XML comment
                                       {"<![cdata[", "]]>"},  // CDATA section
                                       {"<", ">"}};           // any other tag

// Number of rows in __PATTERN__.
#define __PATTERN_LEN__ (sizeof(__PATTERN__) / sizeof(__PATTERN__[0]))
| |
// Secondary pattern table that the one-argument next_token() passes to the
// five-argument overload. It is empty here: a null table of length zero.
static const char* (*__PATTERN2__)[2] = NULL;

// Number of rows in __PATTERN2__.
#define __PATTERN_LEN2__ 0
| |
// XML character entity that stands for the ASCII apostrophe.
#define ENTITY_APOS "&apos;"
// UTF-8 encoding of U+2019 (right single quotation mark).
#define UTF8_APOS "\xe2\x80\x99"
// Plain ASCII apostrophe.
#define APOSTROPHE "'"
| |
| XMLParser::XMLParser(const char* wordchars) |
| : TextParser(wordchars) |
| , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) { |
| } |
| |
| XMLParser::XMLParser(const w_char* wordchars, int len) |
| : TextParser(wordchars, len) |
| , pattern_num(0), pattern2_num(0), prevstate(0), checkattr(0), quotmark(0) { |
| } |
| |
| XMLParser::~XMLParser() {} |
| |
| int XMLParser::look_pattern(const char* p[][2], unsigned int len, int column) { |
| for (unsigned int i = 0; i < len; i++) { |
| const char* j = line[actual].c_str() + head; |
| const char* k = p[i][column]; |
| while ((*k != '\0') && (tolower(*j) == *k)) { |
| j++; |
| k++; |
| } |
| if (*k == '\0') |
| return i; |
| } |
| return -1; |
| } |
| |
| /* |
| * XML parser |
| * |
| */ |
| |
| bool XMLParser::next_token(const char* PATTERN[][2], |
| unsigned int PATTERN_LEN, |
| const char* PATTERN2[][2], |
| unsigned int PATTERN_LEN2, |
| std::string& t) { |
| t.clear(); |
| const char* latin1; |
| |
| for (;;) { |
| switch (state) { |
| case ST_NON_WORD: // non word chars |
| prevstate = ST_NON_WORD; |
| if ((pattern_num = look_pattern(PATTERN, PATTERN_LEN, 0)) != -1) { |
| checkattr = 0; |
| if ((pattern2_num = look_pattern(PATTERN2, PATTERN_LEN2, 0)) != -1) { |
| checkattr = 1; |
| } |
| state = ST_TAG; |
| } else if (is_wordchar(line[actual].c_str() + head)) { |
| state = ST_WORD; |
| token = head; |
| } else if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| state = ST_WORD; |
| token = head; |
| head += strlen(latin1); |
| } else if (line[actual][head] == '&') { |
| state = ST_CHAR_ENTITY; |
| } |
| break; |
| case ST_WORD: // wordchar |
| if ((latin1 = get_latin1(line[actual].c_str() + head))) { |
| head += strlen(latin1); |
| } else if ((is_wordchar((char*)APOSTROPHE) || |
| (is_utf8() && is_wordchar((char*)UTF8_APOS))) && |
| strncmp(line[actual].c_str() + head, ENTITY_APOS, |
| strlen(ENTITY_APOS)) == 0 && |
| is_wordchar(line[actual].c_str() + head + strlen(ENTITY_APOS))) { |
| head += strlen(ENTITY_APOS) - 1; |
| } else if (is_utf8() && |
| is_wordchar((char*)APOSTROPHE) && // add Unicode apostrophe |
| // to the WORDCHARS, if |
| // needed |
| strncmp(line[actual].c_str() + head, UTF8_APOS, strlen(UTF8_APOS)) == |
| 0 && |
| is_wordchar(line[actual].c_str() + head + strlen(UTF8_APOS))) { |
| head += strlen(UTF8_APOS) - 1; |
| } else if (!is_wordchar(line[actual].c_str() + head)) { |
| state = prevstate; |
| if (alloc_token(token, &head, t)) |
| return true; |
| } |
| break; |
| case ST_TAG: // comment, labels, etc |
| int i; |
| if ((checkattr == 1) && |
| ((i = look_pattern(PATTERN2, PATTERN_LEN2, 1)) != -1) && |
| (strcmp(PATTERN2[i][0], PATTERN2[pattern2_num][0]) == 0)) { |
| checkattr = 2; |
| } else if ((checkattr > 0) && (line[actual][head] == '>')) { |
| state = ST_NON_WORD; |
| } else if (((i = look_pattern(PATTERN, PATTERN_LEN, 1)) != -1) && |
| (strcmp(PATTERN[i][1], PATTERN[pattern_num][1]) == 0)) { |
| state = ST_NON_WORD; |
| head += strlen(PATTERN[pattern_num][1]) - 1; |
| } else if ((strcmp(PATTERN[pattern_num][0], "<") == 0) && |
| ((line[actual][head] == '"') || |
| (line[actual][head] == '\''))) { |
| quotmark = line[actual][head]; |
| state = ST_ATTRIB; |
| } |
| break; |
| case ST_ATTRIB: // non word chars |
| prevstate = ST_ATTRIB; |
| if (line[actual][head] == quotmark) { |
| state = ST_TAG; |
| if (checkattr == 2) |
| checkattr = 1; |
| // for IMG ALT |
| } else if (is_wordchar(line[actual].c_str() + head) && (checkattr == 2)) { |
| state = ST_WORD; |
| token = head; |
| } else if (line[actual][head] == '&') { |
| state = ST_CHAR_ENTITY; |
| } |
| break; |
| case ST_CHAR_ENTITY: // SGML element |
| if ((tolower(line[actual][head]) == ';')) { |
| state = prevstate; |
| head--; |
| } |
| } |
| if (next_char(line[actual].c_str(), &head)) |
| return false; |
| } |
| } |
| |
| bool XMLParser::next_token(std::string& t) { |
| return next_token(__PATTERN__, __PATTERN_LEN__, __PATTERN2__, |
| __PATTERN_LEN2__, t); |
| } |
| |
| int XMLParser::change_token(const char* word) { |
| if (strstr(word, APOSTROPHE) != NULL || strchr(word, '"') != NULL || |
| strchr(word, '&') != NULL || strchr(word, '<') != NULL || |
| strchr(word, '>') != NULL) { |
| std::string r(word); |
| mystrrep(r, "&", "__namp;__"); |
| mystrrep(r, "__namp;__", "&"); |
| mystrrep(r, APOSTROPHE, ENTITY_APOS); |
| mystrrep(r, "\"", """); |
| mystrrep(r, ">", ">"); |
| mystrrep(r, "<", "<"); |
| return TextParser::change_token(r.c_str()); |
| } |
| return TextParser::change_token(word); |
| } |