blob: df851c54d0ae348d8ced1be64ac11c4dcace9d04 [file] [log] [blame]
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Hunspell, based on MySpell.
*
* The Initial Developers of the Original Code are
* Kevin Hendricks (MySpell) and Németh László (Hunspell).
* Portions created by the Initial Developers are Copyright (C) 2002-2005
* the Initial Developers. All Rights Reserved.
*
* Contributor(s): David Einstein, Davide Prina, Giuseppe Modugno,
* Gianluca Turconi, Simon Brouwer, Noll János, Bíró Árpád,
* Goldman Eleonóra, Sarlós Tamás, Bencsáth Boldizsár, Halácsy Péter,
* Dvornik László, Gefferth András, Nagy Viktor, Varga Dániel, Chris Halls,
* Rene Engelhard, Bram Moolenaar, Dafydd Jones, Harri Pitkänen
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Copyright 2002 Kevin B. Hendricks, Stratford, Ontario, Canada
* And Contributors. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* 3. All modifications to the source code must be clearly marked as
* such. Binary redistributions based on modified source code
* must be clearly marked as modified versions in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "affixmgr.hxx"
#include "hunspell.hxx"
#include "suggestmgr.hxx"
#include "hunspell.h"
#ifndef HUNSPELL_CHROME_CLIENT
# include "config.h"
#endif
#include "csutil.hxx"
#include <limits>
#include <string>
#define MAXWORDUTF8LEN (MAXWORDLEN * 2)
// Private implementation object behind the public Hunspell class.
// Hunspell's constructor allocates exactly one HunspellImpl and the public
// Hunspell methods defined in this file forward to the method of the same
// name declared here.
class HunspellImpl
{
public:
#ifdef HUNSPELL_CHROME_CLIENT
// Chrome build: the dictionary is supplied as an in-memory BDict blob.
HunspellImpl(const unsigned char* bdict_data, size_t bdict_length);
#else
// Stock build: the dictionary is loaded from an affix file and a word list.
HunspellImpl(const char* affpath, const char* dpath, const char* key);
#endif
~HunspellImpl();
#ifndef HUNSPELL_CHROME_CLIENT
// Loads one more word list, reusing the affix path stored by the constructor.
// Returns 0 on success and 1 when no affix path is stored.
int add_dic(const char* dpath, const char* key);
#endif
std::vector<std::string> suffix_suggest(const std::string& root_word);
std::vector<std::string> generate(const std::string& word, const std::vector<std::string>& pl);
std::vector<std::string> generate(const std::string& word, const std::string& pattern);
// stem(word) is implemented as stem(analyze(word)).
std::vector<std::string> stem(const std::string& word);
std::vector<std::string> stem(const std::vector<std::string>& morph);
std::vector<std::string> analyze(const std::string& word);
int get_langnum() const;
bool input_conv(const std::string& word, std::string& dest);
// Spell check. `info` (optional) receives SPELL_* bit flags and `root`
// (optional) may receive the root word of the accepted form.
bool spell(const std::string& word, int* info = NULL, std::string* root = NULL);
std::vector<std::string> suggest(const std::string& word);
const std::string& get_wordchars() const;
const std::vector<w_char>& get_wordchars_utf16() const;
const std::string& get_dict_encoding() const;
int add(const std::string& word);
int add_with_affix(const std::string& word, const std::string& example);
int remove(const std::string& word);
const std::string& get_version() const;
struct cs_info* get_csconv();
// NUL-terminated copy of `encoding`, filled in by the constructor.
std::vector<char> dic_encoding_vec;
private:
// Affix manager; created in the constructor, deleted in the destructor.
AffixMgr* pAMgr;
// Hash managers (one per loaded word list); each element is deleted in the
// destructor.
std::vector<HashMgr*> m_HMgrs;
// Suggestion manager; created in the constructor, deleted in the destructor.
SuggestMgr* pSMgr;
#ifndef HUNSPELL_CHROME_CLIENT // We are using BDict instead.
// Copy of the affix path made with mystrdup(); released with free().
char* affixpath;
#endif
// Dictionary encoding name as reported by the affix manager.
std::string encoding;
// Character-set conversion table; only looked up when the dictionary is
// not UTF-8.
struct cs_info* csconv;
int langnum;
// Non-zero when the dictionary is UTF-8 encoded.
int utf8;
// Non-zero when words must be reversed before lookup (complex prefixes).
int complexprefixes;
// Break patterns used by spell() for recursive word breaking.
std::vector<std::string> wordbreak;
#ifdef HUNSPELL_CHROME_CLIENT
// Allocated by the HunspellImpl constructor and deleted by its destructor.
hunspell::BDictReader* bdict_reader;
#endif
private:
void cleanword(std::string& dest, const std::string&, int* pcaptype, int* pabbrev);
// Strips leading blanks / trailing periods and classifies capitalization;
// returns the byte length of the cleaned word.
size_t cleanword2(std::string& dest,
std::vector<w_char>& dest_u,
const std::string& src,
int* pcaptype,
size_t* pabbrev);
void mkinitcap(std::string& u8);
int mkinitcap2(std::string& u8, std::vector<w_char>& u16);
int mkinitsmall2(std::string& u8, std::vector<w_char>& u16);
void mkallcap(std::string& u8);
int mkallsmall2(std::string& u8, std::vector<w_char>& u16);
struct hentry* checkword(const std::string& source, int* info, std::string* root);
std::string sharps_u8_l1(const std::string& source);
hentry*
spellsharps(std::string& base, size_t start_pos, int, int, int* info, std::string* root);
int is_keepcase(const hentry* rv);
void insert_sug(std::vector<std::string>& slst, const std::string& word);
void cat_result(std::string& result, const std::string& st);
std::vector<std::string> spellml(const std::string& word);
std::string get_xml_par(const char* par);
const char* get_xml_pos(const char* s, const char* attr);
std::vector<std::string> get_xml_list(const char* list, const char* tag);
int check_xml_par(const char* q, const char* attr, const char* value);
private:
// Declared private to forbid copying (no definition is visible in this
// file section -- presumably intentionally left undefined).
HunspellImpl(const HunspellImpl&);
HunspellImpl& operator=(const HunspellImpl&);
};
// Public constructor: all state lives in a freshly allocated HunspellImpl.
// Which overload exists depends on HUNSPELL_CHROME_CLIENT.
#ifdef HUNSPELL_CHROME_CLIENT
Hunspell::Hunspell(const unsigned char* bdict_data, size_t bdict_length)
    : m_Impl(new HunspellImpl(bdict_data, bdict_length)) {}
#else
Hunspell::Hunspell(const char* affpath, const char* dpath, const char* key)
    : m_Impl(new HunspellImpl(affpath, dpath, key)) {}
#endif
// Builds the checker state in dependency order: hash manager first, then the
// affix manager (which needs the hash managers), then the suggestion manager
// (which needs the affix manager). Dictionary-wide settings reported by the
// affix manager are cached in members along the way.
#ifdef HUNSPELL_CHROME_CLIENT
HunspellImpl::HunspellImpl(const unsigned char* bdict_data, size_t bdict_length) {
#else
HunspellImpl::HunspellImpl(const char* affpath, const char* dpath, const char* key) {
#endif
  // Defaults that hold until the affix manager has been consulted.
  csconv = NULL;
  utf8 = 0;
  complexprefixes = 0;

#ifdef HUNSPELL_CHROME_CLIENT
  // One BDict reader, initialised from the caller's buffer, feeds both the
  // hash manager and the affix manager.
  bdict_reader = new hunspell::BDictReader;
  bdict_reader->Init(bdict_data, bdict_length);

  m_HMgrs.push_back(new HashMgr(bdict_reader));
  pAMgr = new AffixMgr(bdict_reader, m_HMgrs);  // TODO: 'key' ?
#else
  // Keep a private copy of the affix path; add_dic() needs it later.
  affixpath = mystrdup(affpath);

  m_HMgrs.push_back(new HashMgr(dpath, affpath, key));
  pAMgr = new AffixMgr(affpath, m_HMgrs, key);
#endif

  // Pull the preferred "try" characters and the dictionary settings out of
  // the affix manager.
  char* const try_chars = pAMgr->get_try_string();
  encoding = pAMgr->get_encoding();
  langnum = pAMgr->get_langnum();
  utf8 = pAMgr->get_utf8();
  if (utf8 == 0)
    csconv = get_current_cs(encoding);
  complexprefixes = pAMgr->get_complexprefixes();
  wordbreak = pAMgr->get_breaktable();

  // Publish the encoding name as a NUL-terminated char buffer as well.
  dic_encoding_vec.resize(encoding.size() + 1);
  strcpy(&dic_encoding_vec[0], encoding.c_str());

  // Finally the suggestion manager.
#ifdef HUNSPELL_CHROME_CLIENT
  pSMgr = new SuggestMgr(bdict_reader, try_chars, MAXSUGGESTION, pAMgr);
#else
  pSMgr = new SuggestMgr(try_chars, MAXSUGGESTION, pAMgr);
#endif

  // free(NULL) is a no-op, so no explicit null check is required.
  free(try_chars);
}
// Destroys the implementation object that the constructor allocated with new.
Hunspell::~Hunspell() {
delete m_Impl;
}
// Tears down everything the constructor (and add_dic) created: suggestion
// manager, affix manager, every hash manager, and the build-specific extras.
HunspellImpl::~HunspellImpl() {
  delete pSMgr;
  delete pAMgr;

  size_t idx = 0;
  while (idx < m_HMgrs.size()) {
    delete m_HMgrs[idx];
    ++idx;
  }

  pSMgr = NULL;
  pAMgr = NULL;

#ifdef MOZILLA_CLIENT
  delete[] csconv;
#endif
  csconv = NULL;

  // delete / free on a null pointer are no-ops, so no guards are needed.
#ifdef HUNSPELL_CHROME_CLIENT
  delete bdict_reader;
  bdict_reader = NULL;
#else
  free(affixpath);
  affixpath = NULL;
#endif
}
#ifndef HUNSPELL_CHROME_CLIENT
// Load an extra dictionary (word list); forwards to the implementation and
// hands back its status code unchanged.
int Hunspell::add_dic(const char* dpath, const char* key) {
  const int status = m_Impl->add_dic(dpath, key);
  return status;
}
// Load an extra dictionary (word list) that shares the affix file given at
// construction time. Returns 0 when a new hash manager was appended and 1
// when no affix path is stored.
int HunspellImpl::add_dic(const char* dpath, const char* key) {
  if (affixpath) {
    m_HMgrs.push_back(new HashMgr(dpath, affixpath, key));
    return 0;
  }
  return 1;
}
#endif
// Copies src into dest with leading blanks dropped and trailing periods
// removed; the number of removed periods is stored in *pabbrev (the
// "abbreviation" marker). The capitalization type of the cleaned word is
// stored in *pcaptype; for UTF-8 dictionaries dest_utf additionally receives
// the UTF-16 form of the cleaned word.
// Returns the byte length of the cleaned word (0 when nothing is left, in
// which case *pcaptype is NOCAP).
size_t HunspellImpl::cleanword2(std::string& dest,
                                std::vector<w_char>& dest_utf,
                                const std::string& src,
                                int* pcaptype,
                                size_t* pabbrev) {
  dest.clear();
  dest_utf.clear();

  // Step over leading blanks (the terminating NUL is never a blank).
  const char* start = src.c_str();
  while (*start == ' ')
    ++start;

  // Peel off trailing periods, counting them.
  *pabbrev = 0;
  int kept = strlen(start);
  for (; (kept > 0) && (start[kept - 1] == '.'); --kept)
    ++(*pabbrev);

  // Nothing left: there is nothing that could be capitalized.
  if (kept <= 0) {
    *pcaptype = NOCAP;
    return 0;
  }

  dest.append(start, kept);
  kept = dest.size();

  if (!utf8) {
    *pcaptype = get_captype(dest, csconv);
  } else {
    u8_u16(dest_utf, dest);
    *pcaptype = get_captype_utf8(dest_utf, langnum);
  }
  return kept;
}
// Copies src into dest with leading blanks dropped and trailing periods
// removed, and classifies the capitalization of the cleaned word.
//
//   dest      receives the cleaned word (left empty when nothing remains)
//   src       the raw input word
//   pcaptype  receives NOCAP, INITCAP, ALLCAP, HUHINITCAP or HUHCAP
//   pabbrev   receives the number of trailing periods that were removed
//
// Fixes relative to the previous revision:
//   * In the UTF-8 branch the character count `nc` was never set, so the
//     ALLCAP test below could never succeed for UTF-8 dictionaries.
//   * The UTF-8 branch converted the raw `src` instead of the cleaned span,
//     so leading blanks / trailing periods leaked into `dest`, unlike in the
//     8-bit branch.
void HunspellImpl::cleanword(std::string& dest,
                             const std::string& src,
                             int* pcaptype,
                             int* pabbrev) {
  dest.clear();
  const unsigned char* q = (const unsigned char*)src.c_str();
  int firstcap = 0;

  // first skip over any leading blanks
  while ((*q != '\0') && (*q == ' '))
    q++;

  // now strip off any trailing periods (recording their presence)
  *pabbrev = 0;
  int nl = strlen((const char*)q);
  while ((nl > 0) && (*(q + nl - 1) == '.')) {
    nl--;
    (*pabbrev)++;
  }

  // if no characters are left it can't be capitalized
  if (nl <= 0) {
    *pcaptype = NOCAP;
    return;
  }

  // now determine the capitalization type of the first nl letters
  int ncap = 0;      // characters that differ from their lower-case form
  int nneutral = 0;  // characters without distinct upper/lower forms
  int nc = 0;        // total number of characters examined

  if (!utf8) {
    while (nl > 0) {
      nc++;
      if (csconv[(*q)].ccase)
        ncap++;
      if (csconv[(*q)].cupper == csconv[(*q)].clower)
        nneutral++;
      dest.push_back(*q++);
      nl--;
    }
    // nl was > 0 on entry to the loop, so dest has at least one character
    firstcap = csconv[static_cast<unsigned char>(dest[0])].ccase;
  } else {
    // Convert only the cleaned span [q, q + nl), not the raw input.
    std::vector<w_char> t;
    u8_u16(t, std::string((const char*)q, nl));
    nc = static_cast<int>(t.size());
    for (size_t i = 0; i < t.size(); ++i) {
      unsigned short idx = (t[i].h << 8) + t[i].l;
      unsigned short low = unicodetolower(idx, langnum);
      if (idx != low)
        ncap++;
      if (unicodetoupper(idx, langnum) == low)
        nneutral++;
    }
    u16_u8(dest, t);
    // ncap > 0 implies t is non-empty, so t[0] is valid here
    if (ncap) {
      unsigned short idx = (t[0].h << 8) + t[0].l;
      firstcap = (idx != unicodetolower(idx, langnum));
    }
  }

  // now finally set the captype
  if (ncap == 0) {
    *pcaptype = NOCAP;
  } else if ((ncap == 1) && firstcap) {
    *pcaptype = INITCAP;
  } else if ((ncap == nc) || ((ncap + nneutral) == nc)) {
    *pcaptype = ALLCAP;
  } else if ((ncap > 1) && firstcap) {
    *pcaptype = HUHINITCAP;
  } else {
    *pcaptype = HUHCAP;
  }
}
// Upper-cases u8 in place, using the 8-bit conversion table for non-UTF-8
// dictionaries and a round trip through UTF-16 otherwise.
void HunspellImpl::mkallcap(std::string& u8) {
  if (!utf8) {
    ::mkallcap(u8, csconv);
    return;
  }

  std::vector<w_char> wide;
  u8_u16(wide, u8);
  ::mkallcap_utf(wide, langnum);
  u16_u8(u8, wide);
}
// Lower-cases a word in place. For UTF-8 dictionaries the UTF-16 form u16 is
// lower-cased and u8 is regenerated from it; otherwise only u8 is converted
// through the 8-bit table. Returns the resulting byte length of u8.
int HunspellImpl::mkallsmall2(std::string& u8, std::vector<w_char>& u16) {
  if (!utf8) {
    ::mkallsmall(u8, csconv);
  } else {
    ::mkallsmall_utf(u16, langnum);
    u16_u8(u8, u16);
  }
  return u8.size();
}
// Returns a copy of source in which every UTF-8 encoded sharp s
// ("\xC3\x9F") has been replaced by its Latin-1 byte ("\xDF").
std::string HunspellImpl::sharps_u8_l1(const std::string& source) {
  std::string latin1 = source;
  mystrrep(latin1, "\xC3\x9F", "\xDF");
  return latin1;
}
// Recursive search over the "ss" <-> sharp s permutations of base.
//
// Starting at n_pos, each "ss" is first tried replaced by the UTF-8 sharp s
// and then left untouched; n counts the "ss" occurrences handled so far
// (bounded by MAXSHARPS) and repnum counts the replacements actually made.
// Once no further "ss" can be handled, the candidate is looked up with
// checkword() -- but only if at least one replacement was made. For
// non-UTF-8 dictionaries the candidate is converted to Latin-1 first.
// Returns the first dictionary entry found, or NULL.
hentry* HunspellImpl::spellsharps(std::string& base,
                                  size_t n_pos,
                                  int n,
                                  int repnum,
                                  int* info,
                                  std::string* root) {
  const size_t hit = base.find("ss", n_pos);
  const bool can_branch = (hit != std::string::npos) && (n < MAXSHARPS);

  if (!can_branch) {
    // Leaf of the search: test the candidate if anything was replaced.
    if (repnum <= 0)
      return NULL;
    if (utf8)
      return checkword(base, info, root);
    std::string latin1(sharps_u8_l1(base));
    return checkword(latin1, info, root);
  }

  // Branch 1: replace this "ss" by sharp s.
  base[hit] = '\xC3';
  base[hit + 1] = '\x9F';
  hentry* found = spellsharps(base, hit + 2, n + 1, repnum + 1, info, root);
  if (found)
    return found;

  // Branch 2: restore "ss" and continue behind it.
  base[hit] = 's';
  base[hit + 1] = 's';
  return spellsharps(base, hit + 2, n + 1, repnum, info, root);
}
// Returns 1 when the entry rv carries the dictionary's KEEPCASE flag and 0
// otherwise (including when there is no affix manager, the entry has no
// flags, or no KEEPCASE flag is configured).
int HunspellImpl::is_keepcase(const hentry* rv) {
  if (!pAMgr)
    return 0;
  if (!rv->astr)
    return 0;
  if (!pAMgr->get_keepcase())
    return 0;
  return TESTAFF(rv->astr, pAMgr->get_keepcase(), rv->alen) ? 1 : 0;
}
/* put a word in front of all suggestions collected so far */
void HunspellImpl::insert_sug(std::vector<std::string>& slst, const std::string& word) {
  std::vector<std::string>::iterator front = slst.begin();
  slst.insert(front, word);
}
// Public spell-check entry point; forwards to the implementation object.
bool Hunspell::spell(const std::string& word, int* info, std::string* root) {
  const bool accepted = m_Impl->spell(word, info, root);
  return accepted;
}
bool HunspellImpl::spell(const std::string& word, int* info, std::string* root) {
#ifdef HUNSPELL_CHROME_CLIENT
if (m_HMgrs[0]) m_HMgrs[0]->EmptyHentryCache();
#endif
struct hentry* rv = NULL;
int info2 = 0;
if (!info)
info = &info2;
else
*info = 0;
// Hunspell supports XML input of the simplified API (see manual)
if (word == SPELL_XML)
return true;
if (utf8) {
if (word.size() >= MAXWORDUTF8LEN)
return false;
} else {
if (word.size() >= MAXWORDLEN)
return false;
}
int captype = NOCAP;
size_t abbv = 0;
size_t wl = 0;
std::string scw;
std::vector<w_char> sunicw;
// input conversion
RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL;
{
std::string wspace;
bool convstatus = rl ? rl->conv(word, wspace) : false;
if (convstatus)
wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
else
wl = cleanword2(scw, sunicw, word, &captype, &abbv);
}
#ifdef MOZILLA_CLIENT
// accept the abbreviated words without dots
// workaround for the incomplete tokenization of Mozilla
abbv = 1;
#endif
if (wl == 0 || m_HMgrs.empty())
return true;
if (root)
root->clear();
// allow numbers with dots, dashes and commas (but forbid double separators:
// "..", "--" etc.)
enum { NBEGIN, NNUM, NSEP };
int nstate = NBEGIN;
size_t i;
for (i = 0; (i < wl); i++) {
if ((scw[i] <= '9') && (scw[i] >= '0')) {
nstate = NNUM;
} else if ((scw[i] == ',') || (scw[i] == '.') || (scw[i] == '-')) {
if ((nstate == NSEP) || (i == 0))
break;
nstate = NSEP;
} else
break;
}
if ((i == wl) && (nstate == NNUM))
return true;
switch (captype) {
case HUHCAP:
/* FALLTHROUGH */
case HUHINITCAP:
*info += SPELL_ORIGCAP;
/* FALLTHROUGH */
case NOCAP:
rv = checkword(scw, info, root);
if ((abbv) && !(rv)) {
std::string u8buffer(scw);
u8buffer.push_back('.');
rv = checkword(u8buffer, info, root);
}
break;
case ALLCAP: {
*info += SPELL_ORIGCAP;
rv = checkword(scw, info, root);
if (rv)
break;
if (abbv) {
std::string u8buffer(scw);
u8buffer.push_back('.');
rv = checkword(u8buffer, info, root);
if (rv)
break;
}
// Spec. prefix handling for Catalan, French, Italian:
// prefixes separated by apostrophe (SANT'ELIA -> Sant'+Elia).
size_t apos = pAMgr ? scw.find('\'') : std::string::npos;
if (apos != std::string::npos) {
mkallsmall2(scw, sunicw);
//conversion may result in string with different len to pre-mkallsmall2
//so re-scan
if (apos != std::string::npos && apos < scw.size() - 1) {
std::string part1 = scw.substr(0, apos+1);
std::string part2 = scw.substr(apos+1);
if (utf8) {
std::vector<w_char> part1u, part2u;
u8_u16(part1u, part1);
u8_u16(part2u, part2);
mkinitcap2(part2, part2u);
scw = part1 + part2;
sunicw = part1u;
sunicw.insert(sunicw.end(), part2u.begin(), part2u.end());
rv = checkword(scw, info, root);
if (rv)
break;
} else {
mkinitcap2(part2, sunicw);
scw = part1 + part2;
rv = checkword(scw, info, root);
if (rv)
break;
}
mkinitcap2(scw, sunicw);
rv = checkword(scw, info, root);
if (rv)
break;
}
}
if (pAMgr && pAMgr->get_checksharps() && scw.find("SS") != std::string::npos) {
mkallsmall2(scw, sunicw);
std::string u8buffer(scw);
rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
mkinitcap2(scw, sunicw);
rv = spellsharps(scw, 0, 0, 0, info, root);
}
if ((abbv) && !(rv)) {
u8buffer.push_back('.');
rv = spellsharps(u8buffer, 0, 0, 0, info, root);
if (!rv) {
u8buffer = std::string(scw);
u8buffer.push_back('.');
rv = spellsharps(u8buffer, 0, 0, 0, info, root);
}
}
if (rv)
break;
}
}
case INITCAP: {
*info += SPELL_ORIGCAP;
mkallsmall2(scw, sunicw);
std::string u8buffer(scw);
mkinitcap2(scw, sunicw);
if (captype == INITCAP)
*info += SPELL_INITCAP;
rv = checkword(scw, info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
// forbid bad capitalization
// (for example, ijs -> Ijs instead of IJs in Dutch)
// use explicit forms in dic: Ijs/F (F = FORBIDDENWORD flag)
if (*info & SPELL_FORBIDDEN) {
rv = NULL;
break;
}
if (rv && is_keepcase(rv) && (captype == ALLCAP))
rv = NULL;
if (rv)
break;
rv = checkword(u8buffer, info, root);
if (abbv && !rv) {
u8buffer.push_back('.');
rv = checkword(u8buffer, info, root);
if (!rv) {
u8buffer = scw;
u8buffer.push_back('.');
if (captype == INITCAP)
*info += SPELL_INITCAP;
rv = checkword(u8buffer, info, root);
if (captype == INITCAP)
*info -= SPELL_INITCAP;
if (rv && is_keepcase(rv) && (captype == ALLCAP))
rv = NULL;
break;
}
}
if (rv && is_keepcase(rv) &&
((captype == ALLCAP) ||
// if CHECKSHARPS: KEEPCASE words with \xDF are allowed
// in INITCAP form, too.
!(pAMgr->get_checksharps() &&
((utf8 && u8buffer.find("\xC3\x9F") != std::string::npos) ||
(!utf8 && u8buffer.find('\xDF') != std::string::npos)))))
rv = NULL;
break;
}
}
if (rv) {
if (pAMgr && pAMgr->get_warn() && rv->astr &&
TESTAFF(rv->astr, pAMgr->get_warn(), rv->alen)) {
*info += SPELL_WARN;
if (pAMgr->get_forbidwarn())
return false;
return true;
}
return true;
}
// recursive breaking at break points
if (!wordbreak.empty()) {
int nbr = 0;
wl = scw.size();
// calculate break points for recursion limit
for (size_t j = 0; j < wordbreak.size(); ++j) {
size_t pos = 0;
while ((pos = scw.find(wordbreak[j], pos)) != std::string::npos) {
++nbr;
pos += wordbreak[j].size();
}
}
if (nbr >= 10)
return false;
// check boundary patterns (^begin and end$)
for (size_t j = 0; j < wordbreak.size(); ++j) {
size_t plen = wordbreak[j].size();
if (plen == 1 || plen > wl)
continue;
if (wordbreak[j][0] == '^' &&
scw.compare(0, plen - 1, wordbreak[j], 1, plen -1) == 0 && spell(scw.substr(plen - 1)))
return true;
if (wordbreak[j][plen - 1] == '$' &&
scw.compare(wl - plen + 1, plen - 1, wordbreak[j], 0, plen - 1) == 0) {
std::string suffix(scw.substr(wl - plen + 1));
scw.resize(wl - plen + 1);
if (spell(scw))
return true;
scw.append(suffix);
}
}
// other patterns
for (size_t j = 0; j < wordbreak.size(); ++j) {
size_t plen = wordbreak[j].size();
size_t found = scw.find(wordbreak[j]);
if ((found > 0) && (found < wl - plen)) {
if (!spell(scw.substr(found + plen)))
continue;
std::string suffix(scw.substr(found));
scw.resize(found);
// examine 2 sides of the break point
if (spell(scw))
return true;
scw.append(suffix);
// LANG_hu: spec. dash rule
if (langnum == LANG_hu && wordbreak[j] == "-") {
suffix = scw.substr(found + 1);
scw.resize(found + 1);
if (spell(scw))
return true; // check the first part with dash
scw.append(suffix);
}
// end of LANG specific region
}
}
}
return false;
}
// Looks up a single, already cleaned word form.
//
// The word is first stripped of the dictionary's ignored characters and, for
// complex-prefix dictionaries, reversed. It is then searched
//   1. directly in the hash managers,
//   2. by affix stripping (AffixMgr::affix_check),
//   3. as a compound word (AffixMgr::compound_check), if compounding is on.
//
//   w     the word form to look up
//   info  optional bit flags; SPELL_FORBIDDEN and SPELL_COMPOUND may be added,
//         and a set SPELL_INITCAP bit makes ONLYUPCASEFLAG entries unacceptable
//   root  optional; assigned the entry's word when the match comes from affix
//         stripping or compounding (it is not assigned on a direct hash hit)
//
// Returns the matching dictionary entry, or NULL when the word is empty,
// forbidden, or not found.
struct hentry* HunspellImpl::checkword(const std::string& w, int* info, std::string* root) {
// usebuffer is true once w2 (a modified private copy) must be used instead
// of w.
bool usebuffer = false;
std::string w2;
const char* word;
int len;
const char* ignoredchars = pAMgr ? pAMgr->get_ignore() : NULL;
if (ignoredchars != NULL) {
w2.assign(w);
if (utf8) {
const std::vector<w_char>& ignoredchars_utf16 =
pAMgr->get_ignore_utf16();
remove_ignored_chars_utf(w2, ignoredchars_utf16);
} else {
remove_ignored_chars(w2, ignoredchars);
}
word = w2.c_str();
len = w2.size();
usebuffer = true;
} else {
word = w.c_str();
len = w.size();
}
if (!len)
return NULL;
#ifdef HUNSPELL_CHROME_CLIENT
// We need to check if the word length is valid to make coverity (Event
// fixed_size_dest: Possible overrun of N byte fixed size buffer) happy.
if ((utf8 && strlen(word) >= MAXWORDUTF8LEN) || (!utf8 && strlen(word) >= MAXWORDLEN))
return NULL;
#endif
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (!usebuffer) {
w2.assign(word);
usebuffer = true;
}
if (utf8)
reverseword_utf(w2);
else
reverseword(w2);
}
// w2 may have been (re)assigned above, so refresh the raw pointer
if (usebuffer) {
word = w2.c_str();
}
// look word in hash table
struct hentry* he = NULL;
for (size_t i = 0; (i < m_HMgrs.size()) && !he; ++i) {
he = m_HMgrs[i]->lookup(word);
// check forbidden and onlyincompound words
if ((he) && (he->astr) && (pAMgr) &&
TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info)
*info += SPELL_FORBIDDEN;
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
if (pAMgr->get_compoundflag() &&
TESTAFF(he->astr, pAMgr->get_compoundflag(), he->alen)) {
if (info)
*info += SPELL_COMPOUND;
}
}
return NULL;
}
// he = next not needaffix, onlyincompound homonym or onlyupcase word
while (he && (he->astr) && pAMgr &&
((pAMgr->get_needaffix() &&
TESTAFF(he->astr, pAMgr->get_needaffix(), he->alen)) ||
(pAMgr->get_onlyincompound() &&
TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
(info && (*info & SPELL_INITCAP) &&
TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen))))
he = he->next_homonym;
}
// check with affixes
if (!he && pAMgr) {
// try stripping off affixes
he = pAMgr->affix_check(word, len, 0);
// check compound restriction and onlyupcase
if (he && he->astr &&
((pAMgr->get_onlyincompound() &&
TESTAFF(he->astr, pAMgr->get_onlyincompound(), he->alen)) ||
(info && (*info & SPELL_INITCAP) &&
TESTAFF(he->astr, ONLYUPCASEFLAG, he->alen)))) {
he = NULL;
}
if (he) {
if ((he->astr) && (pAMgr) &&
TESTAFF(he->astr, pAMgr->get_forbiddenword(), he->alen)) {
if (info)
*info += SPELL_FORBIDDEN;
return NULL;
}
// report the root; undo the complex-prefix reversal for the caller
if (root) {
root->assign(he->word);
if (complexprefixes) {
if (utf8)
reverseword_utf(*root);
else
reverseword(*root);
}
}
// try check compound word
} else if (pAMgr->get_compound()) {
struct hentry* rwords[100]; // buffer for COMPOUND pattern checking
he = pAMgr->compound_check(word, 0, 0, 100, 0, NULL, (hentry**)&rwords, 0, 0, info);
// LANG_hu section: `moving rule' with last dash
// (len is non-zero here, so word[len - 1] is a valid index)
if ((!he) && (langnum == LANG_hu) && (word[len - 1] == '-')) {
std::string dup(word, len - 1);
he = pAMgr->compound_check(dup, -5, 0, 100, 0, NULL, (hentry**)&rwords, 1, 0, info);
}
// end of LANG specific region
if (he) {
if (root) {
root->assign(he->word);
if (complexprefixes) {
if (utf8)
reverseword_utf(*root);
else
reverseword(*root);
}
}
if (info)
*info += SPELL_COMPOUND;
}
}
}
return he;
}
// Public suggestion entry point; forwards to the implementation object.
std::vector<std::string> Hunspell::suggest(const std::string& word) {
  std::vector<std::string> suggestions = m_Impl->suggest(word);
  return suggestions;
}
// Builds the list of spelling suggestions for `word`.
//
// Outline of the steps below:
//   1. XML requests of the simplified API are delegated to spellml().
//   2. The word is input-converted and cleaned (cleanword2).
//   3. Suggestions are collected from the suggestion manager for the
//      capitalization variants that fit the word's capitalization type.
//   4. If nothing (or only compound suggestions) was found, n-gram
//      suggestions are added.
//   5. For words containing '-', the first misspelled dash-separated chunk is
//      corrected on its own.
//   6. Post-processing: complex-prefix reversal, capitalization, trailing
//      dots, removal of badly capitalized / forbidden forms, de-duplication
//      and output conversion.
//
// Returns an empty list when there is no suggestion manager, the word is too
// long (MAXWORDLEN / MAXWORDUTF8LEN bytes or more) or nothing is left after
// cleaning.
std::vector<std::string> HunspellImpl::suggest(const std::string& word) {
#ifdef HUNSPELL_CHROME_CLIENT
if (m_HMgrs[0]) m_HMgrs[0]->EmptyHentryCache();
#endif
std::vector<std::string> slst;
// passed to the suggestion manager on every suggest() call; a non-zero value
// afterwards triggers the n-gram fallback further down
int onlycmpdsug = 0;
if (!pSMgr || m_HMgrs.empty())
return slst;
// process XML input of the simplified API (see manual)
if (word.compare(0, sizeof(SPELL_XML) - 3, SPELL_XML, sizeof(SPELL_XML) - 3) == 0) {
return spellml(word);
}
if (utf8) {
if (word.size() >= MAXWORDUTF8LEN)
return slst;
} else {
if (word.size() >= MAXWORDLEN)
return slst;
}
int captype = NOCAP;
size_t abbv = 0;
size_t wl = 0;
std::string scw;
std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
{
std::string wspace;
bool convstatus = rl ? rl->conv(word, wspace) : false;
if (convstatus)
wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
else
wl = cleanword2(scw, sunicw, word, &captype, &abbv);
if (wl == 0)
return slst;
}
// non-zero: every suggestion gets an initial capital at the end
int capwords = 0;
// check capitalized form for FORCEUCASE
if (pAMgr && captype == NOCAP && pAMgr->get_forceucase()) {
int info = SPELL_ORIGCAP;
if (checkword(scw, &info, NULL)) {
std::string form(scw);
mkinitcap(form);
slst.push_back(form);
return slst;
}
}
switch (captype) {
case NOCAP: {
pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
break;
}
case INITCAP: {
capwords = 1;
pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
break;
}
case HUHINITCAP:
capwords = 1;
// no break: HUHINITCAP continues with the HUHCAP handling
case HUHCAP: {
pSMgr->suggest(slst, scw.c_str(), &onlycmpdsug);
// something.The -> something. The
size_t dot_pos = scw.find('.');
if (dot_pos != std::string::npos) {
std::string postdot = scw.substr(dot_pos + 1);
int captype_;
if (utf8) {
std::vector<w_char> postdotu;
u8_u16(postdotu, postdot);
captype_ = get_captype_utf8(postdotu, langnum);
} else {
captype_ = get_captype(postdot, csconv);
}
if (captype_ == INITCAP) {
std::string str(scw);
str.insert(dot_pos + 1, 1, ' ');
insert_sug(slst, str);
}
}
std::string wspace;
if (captype == HUHINITCAP) {
// TheOpenOffice.org -> The OpenOffice.org
wspace = scw;
mkinitsmall2(wspace, sunicw);
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
}
wspace = scw;
mkallsmall2(wspace, sunicw);
if (spell(wspace.c_str()))
insert_sug(slst, wspace);
// suggestions appended from here on are candidates for the
// "aNew -> a New" fix-up below
size_t prevns = slst.size();
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
if (captype == HUHINITCAP) {
mkinitcap2(wspace, sunicw);
if (spell(wspace.c_str()))
insert_sug(slst, wspace);
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
}
// aNew -> "a New" (instead of "a new")
for (size_t j = prevns; j < slst.size(); ++j) {
const char* space = strchr(slst[j].c_str(), ' ');
if (space) {
size_t slen = strlen(space + 1);
// different case after space (need capitalisation)
if ((slen < wl) && strcmp(scw.c_str() + wl - slen, space + 1)) {
std::string first(slst[j].c_str(), space + 1);
std::string second(space + 1);
std::vector<w_char> w;
if (utf8)
u8_u16(w, second);
mkinitcap2(second, w);
// set as first suggestion
slst.erase(slst.begin() + j);
slst.insert(slst.begin(), first + second);
}
}
}
break;
}
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
if (pAMgr && pAMgr->get_keepcase() && spell(wspace.c_str()))
insert_sug(slst, wspace);
mkinitcap2(wspace, sunicw);
pSMgr->suggest(slst, wspace.c_str(), &onlycmpdsug);
// the input was all capitals, so upper-case every suggestion as well
for (size_t j = 0; j < slst.size(); ++j) {
mkallcap(slst[j]);
if (pAMgr && pAMgr->get_checksharps()) {
if (utf8) {
mystrrep(slst[j], "\xC3\x9F", "SS");
} else {
mystrrep(slst[j], "\xDF", "SS");
}
}
}
break;
}
}
// LANG_hu section: replace '-' with ' ' in Hungarian
if (langnum == LANG_hu) {
for (size_t j = 0; j < slst.size(); ++j) {
size_t pos = slst[j].find('-');
if (pos != std::string::npos) {
// spell() resets *info to 0 before use, so no initializer is needed
int info;
std::string w(slst[j].substr(0, pos));
w.append(slst[j].substr(pos + 1));
(void)spell(w, &info, NULL);
if ((info & SPELL_COMPOUND) && (info & SPELL_FORBIDDEN)) {
slst[j][pos] = ' ';
} else
slst[j][pos] = '-';
}
}
}
// END OF LANG_hu section
// try ngram approach since found nothing or only compound words
if (pAMgr && (slst.empty() || onlycmpdsug) && (pAMgr->get_maxngramsugs() != 0)) {
switch (captype) {
case NOCAP: {
pSMgr->ngsuggest(slst, scw.c_str(), m_HMgrs);
break;
}
case HUHINITCAP:
capwords = 1;
// no break: HUHINITCAP continues with the HUHCAP handling
case HUHCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
break;
}
case INITCAP: {
capwords = 1;
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
break;
}
case ALLCAP: {
std::string wspace(scw);
mkallsmall2(wspace, sunicw);
size_t oldns = slst.size();
pSMgr->ngsuggest(slst, wspace.c_str(), m_HMgrs);
// only the newly added n-gram suggestions still need upper-casing
for (size_t j = oldns; j < slst.size(); ++j) {
mkallcap(slst[j]);
}
break;
}
}
}
// try dash suggestion (Afo-American -> Afro-American)
size_t dash_pos = scw.find('-');
if (dash_pos != std::string::npos) {
// skipped entirely when a suggestion containing a dash already exists
int nodashsug = 1;
for (size_t j = 0; j < slst.size() && nodashsug == 1; ++j) {
if (slst[j].find('-') != std::string::npos)
nodashsug = 0;
}
size_t prev_pos = 0;
bool last = false;
// walk the dash-separated chunks; stop after the first misspelled one
while (nodashsug && !last) {
if (dash_pos == scw.size())
last = 1;
std::string chunk = scw.substr(prev_pos, dash_pos - prev_pos);
if (!spell(chunk.c_str())) {
std::vector<std::string> nlst = suggest(chunk.c_str());
// reverse iteration + insertion at the front keeps nlst's order
for (std::vector<std::string>::reverse_iterator j = nlst.rbegin(); j != nlst.rend(); ++j) {
std::string wspace = scw.substr(0, prev_pos);
wspace.append(*j);
if (!last) {
wspace.append("-");
wspace.append(scw.substr(dash_pos + 1));
}
insert_sug(slst, wspace);
}
nodashsug = 0;
}
if (!last) {
prev_pos = dash_pos + 1;
dash_pos = scw.find('-', prev_pos);
}
// no further dash: the final chunk runs to the end of the word
if (dash_pos == std::string::npos)
dash_pos = scw.size();
}
}
// word reversing wrapper for complex prefixes
if (complexprefixes) {
for (size_t j = 0; j < slst.size(); ++j) {
if (utf8)
reverseword_utf(slst[j]);
else
reverseword(slst[j]);
}
}
// capitalize
if (capwords)
for (size_t j = 0; j < slst.size(); ++j) {
mkinitcap(slst[j]);
}
// expand suggestions with dot(s)
if (abbv && pAMgr && pAMgr->get_sugswithdots()) {
for (size_t j = 0; j < slst.size(); ++j) {
slst[j].append(word.substr(word.size() - abbv));
}
}
// remove bad capitalized and forbidden forms
if (pAMgr && (pAMgr->get_keepcase() || pAMgr->get_forbiddenword())) {
switch (captype) {
case INITCAP:
case ALLCAP: {
// l is the write index: accepted suggestions are compacted to the front
size_t l = 0;
for (size_t j = 0; j < slst.size(); ++j) {
if (slst[j].find(' ') == std::string::npos && !spell(slst[j])) {
std::string s;
std::vector<w_char> w;
if (utf8) {
u8_u16(w, slst[j]);
} else {
s = slst[j];
}
mkallsmall2(s, w);
if (spell(s)) {
slst[l] = s;
++l;
} else {
mkinitcap2(s, w);
if (spell(s)) {
slst[l] = s;
++l;
}
}
} else {
slst[l] = slst[j];
++l;
}
}
slst.resize(l);
}
}
}
// remove duplications
// (in-place compaction: an element is kept only if it does not equal an
// element already kept in front of it)
size_t l = 0;
for (size_t j = 0; j < slst.size(); ++j) {
slst[l] = slst[j];
for (size_t k = 0; k < l; ++k) {
if (slst[k] == slst[j]) {
--l;
break;
}
}
++l;
}
slst.resize(l);
// output conversion
rl = (pAMgr) ? pAMgr->get_oconvtable() : NULL;
for (size_t j = 0; rl && j < slst.size(); ++j) {
std::string wspace;
if (rl->conv(slst[j], wspace)) {
slst[j] = wspace;
}
}
return slst;
}
// Returns the dictionary encoding name held by the implementation object.
const std::string& Hunspell::get_dict_encoding() const {
  const std::string& enc = m_Impl->get_dict_encoding();
  return enc;
}
// Returns the dictionary encoding name that the constructor obtained from the
// affix manager. The reference stays valid as long as this object lives.
const std::string& HunspellImpl::get_dict_encoding() const {
return encoding;
}
// Stemming from morphological analysis lines; forwards to the implementation.
std::vector<std::string> Hunspell::stem(const std::vector<std::string>& desc) {
  std::vector<std::string> stems = m_Impl->stem(desc);
  return stems;
}
// Derives stems from morphological analysis lines (as produced by analyze()).
//
// For every line the leading compound parts (all MORPH_PART fields except the
// last one) are collected as a common prefix. The remainder is split into its
// " | "-separated alternatives; an alternative with a derivational suffix is
// re-generated through the suggestion manager after its inflectional suffixes
// are cut off, any other alternative contributes its surface prefix (if any)
// plus its stem field. The collected results are split into a list and
// de-duplicated with uniqlist().
std::vector<std::string> HunspellImpl::stem(const std::vector<std::string>& desc) {
  std::vector<std::string> slst;
  std::string collected;  // MSEP_REC separated stems of all lines

  if (desc.empty())
    return slst;

  for (std::vector<std::string>::const_iterator line = desc.begin(); line != desc.end(); ++line) {
    std::string compound_prefix;

    // add compound word parts (except the last one)
    const char* rest = line->c_str();
    const char* cur = strstr(rest, MORPH_PART);
    if (cur) {
      for (const char* nxt = strstr(cur + 1, MORPH_PART); nxt;
           nxt = strstr(cur + 1, MORPH_PART)) {
        std::string field;
        copy_field(field, cur, MORPH_PART);
        compound_prefix.append(field);
        cur = nxt;
      }
      rest = cur;
    }

    // turn every " | " into a single MSEP_ALT separator character
    std::string tok(rest);
    for (size_t alt = tok.find(" | ", 0); alt != std::string::npos;
         alt = tok.find(" | ", alt)) {
      tok[alt + 1] = MSEP_ALT;
    }

    std::vector<std::string> alternatives = line_tok(tok, MSEP_ALT);
    for (size_t k = 0; k < alternatives.size(); ++k) {
      std::string& entry = alternatives[k];

      if (entry.find(MORPH_DERI_SFX) == std::string::npos) {
        // no derivational suffix: surface prefix (if present) + stem
        collected.push_back(MSEP_REC);
        collected.append(compound_prefix);
        if (entry.find(MORPH_SURF_PFX) != std::string::npos) {
          std::string surface;
          copy_field(surface, entry, MORPH_SURF_PFX);
          collected.append(surface);
        }
        std::string stem_field;
        copy_field(stem_field, entry, MORPH_STEM);
        collected.append(stem_field);
        continue;
      }

      // derivational suffix present: remove inflectional suffixes ...
      const size_t infl = entry.find(MORPH_INFL_SFX);
      if (infl != std::string::npos)
        entry.resize(infl);

      // ... and let the suggestion manager generate the derived form(s)
      std::vector<std::string> single(1, entry);
      std::string generated = pSMgr->suggest_gen(single, entry);
      if (generated.empty())
        continue;

      std::vector<std::string> forms = line_tok(generated, MSEP_REC);
      for (size_t j = 0; j < forms.size(); ++j) {
        collected.push_back(MSEP_REC);
        collected.append(compound_prefix);
        collected.append(forms[j]);
      }
    }
  }

  slst = line_tok(collected, MSEP_REC);
  uniqlist(slst);
  return slst;
}
// Public entry point: forwards the word to the implementation object's
// single-word stem() overload.
std::vector<std::string> Hunspell::stem(const std::string& word) {
  std::vector<std::string> stems = m_Impl->stem(word);
  return stems;
}
std::vector<std::string> HunspellImpl::stem(const std::string& word) {
return stem(analyze(word));
}
// C-string view of the implementation's word-character string. The pointer
// comes from c_str() on the string the implementation returns by reference.
const char* Hunspell::get_wordchars() const {
  const std::string& chars = m_Impl->get_wordchars();
  return chars.c_str();
}
// std::string flavour of get_wordchars(): forwards to the implementation
// object and returns its string by reference.
const std::string& Hunspell::get_wordchars_cpp() const {
  const std::string& chars = m_Impl->get_wordchars();
  return chars;
}
// Returns the word-character string held by the affix manager (pAMgr).
// pAMgr is dereferenced without a null check.
const std::string& HunspellImpl::get_wordchars() const {
  const std::string& chars = pAMgr->get_wordchars();
  return chars;
}
// Forwards to the implementation object and returns its w_char vector of
// word characters by reference.
const std::vector<w_char>& Hunspell::get_wordchars_utf16() const {
  const std::vector<w_char>& chars = m_Impl->get_wordchars_utf16();
  return chars;
}
// Returns the w_char vector of word characters held by the affix manager
// (pAMgr). pAMgr is dereferenced without a null check.
const std::vector<w_char>& HunspellImpl::get_wordchars_utf16() const {
  const std::vector<w_char>& chars = pAMgr->get_wordchars_utf16();
  return chars;
}
// Applies the initial-capital conversion to `u8` in place.
// With a UTF-8 dictionary the text is converted to w_char form, processed by
// ::mkinitcap_utf() and converted back; otherwise ::mkinitcap() is applied
// directly using the csconv table.
void HunspellImpl::mkinitcap(std::string& u8) {
  if (!utf8) {
    ::mkinitcap(u8, csconv);
    return;
  }
  std::vector<w_char> wide;
  u8_u16(wide, u8);
  ::mkinitcap_utf(wide, langnum);
  u16_u8(u8, wide);
}
// Initial-capital conversion for a word held in two forms.
// UTF-8 mode: `u16` is modified by ::mkinitcap_utf() and `u8` is regenerated
// from it. Otherwise only `u8` is modified via ::mkinitcap() and csconv.
// Returns the resulting size of `u8`.
int HunspellImpl::mkinitcap2(std::string& u8, std::vector<w_char>& u16) {
  if (!utf8) {
    ::mkinitcap(u8, csconv);
    return u8.size();
  }
  ::mkinitcap_utf(u16, langnum);
  u16_u8(u8, u16);
  return u8.size();
}
// Initial-lowercase conversion for a word held in two forms.
// UTF-8 mode: `u16` is modified by ::mkinitsmall_utf() and `u8` is
// regenerated from it. Otherwise only `u8` is modified via ::mkinitsmall()
// and csconv. Returns the resulting size of `u8`.
int HunspellImpl::mkinitsmall2(std::string& u8, std::vector<w_char>& u16) {
  if (!utf8) {
    ::mkinitsmall(u8, csconv);
    return u8.size();
  }
  ::mkinitsmall_utf(u16, langnum);
  u16_u8(u8, u16);
  return u8.size();
}
// Public entry point: forwards the word to the implementation object's add()
// and returns its status code unchanged.
int Hunspell::add(const std::string& word) {
  const int status = m_Impl->add(word);
  return status;
}
// Adds `word` through the first hash manager (m_HMgrs[0]) and returns that
// call's result. Returns 0 when no hash manager is loaded.
int HunspellImpl::add(const std::string& word) {
  if (m_HMgrs.empty())
    return 0;
  return m_HMgrs[0]->add(word);
}
// Public entry point: forwards `word` and `example` to the implementation
// object's add_with_affix() and returns its status code unchanged.
int Hunspell::add_with_affix(const std::string& word, const std::string& example) {
  const int status = m_Impl->add_with_affix(word, example);
  return status;
}
// Adds `word` together with `example` through the first hash manager
// (m_HMgrs[0]) and returns that call's result. Returns 0 when no hash manager
// is loaded.
int HunspellImpl::add_with_affix(const std::string& word, const std::string& example) {
  if (m_HMgrs.empty())
    return 0;
  return m_HMgrs[0]->add_with_affix(word, example);
}
// Public entry point: forwards the word to the implementation object's
// remove() and returns its status code unchanged.
int Hunspell::remove(const std::string& word) {
  const int status = m_Impl->remove(word);
  return status;
}
// Removes `word` through the first hash manager (m_HMgrs[0]) and returns that
// call's result. Returns 0 when no hash manager is loaded.
int HunspellImpl::remove(const std::string& word) {
  if (m_HMgrs.empty())
    return 0;
  return m_HMgrs[0]->remove(word);
}
// C-string view of the implementation's version string. The pointer comes
// from c_str() on the string the implementation returns by reference.
const char* Hunspell::get_version() const {
  const std::string& version = m_Impl->get_version();
  return version.c_str();
}
// std::string flavour of get_version(): forwards to the implementation
// object and returns its string by reference.
const std::string& Hunspell::get_version_cpp() const {
  const std::string& version = m_Impl->get_version();
  return version;
}
// Returns the version string held by the affix manager (pAMgr).
// pAMgr is dereferenced without a null check.
const std::string& HunspellImpl::get_version() const {
  const std::string& version = pAMgr->get_version();
  return version;
}
struct cs_info* HunspellImpl::get_csconv() {
return csconv;
}
// Public accessor: forwards to the implementation object and returns its
// cs_info pointer.
struct cs_info* Hunspell::get_csconv() {
  struct cs_info* table = m_Impl->get_csconv();
  return table;
}
// Appends `st` to `result`, inserting a newline first when `result` already
// has content. An empty `st` leaves `result` untouched.
void HunspellImpl::cat_result(std::string& result, const std::string& st) {
  if (st.empty())
    return;
  if (!result.empty())
    result.append("\n");
  result.append(st);
}
// Public entry point: forwards the word to the implementation object's
// analyze() and returns its result.
std::vector<std::string> Hunspell::analyze(const std::string& word) {
  std::vector<std::string> analyses = m_Impl->analyze(word);
  return analyses;
}
// Morphological analysis of `word`.
//
// The word is run through the input conversion table (if any) and
// cleanword2(); the cleaned word and its case variants are then passed to
// pSMgr->suggest_morph(). The collected text is split on MSEP_REC and
// returned as a list. An empty list is returned when there is no suggestion
// manager or hash manager, when the word is too long, when cleaning leaves
// nothing to analyze, or when no analysis is found.
//
// For langnum == LANG_hu there are two extra paths: one for numbers
// (optionally followed by '%' or '\xB0') before the generic analysis, and one
// for words containing a dash after the generic analysis produced nothing.
std::vector<std::string> HunspellImpl::analyze(const std::string& word) {
std::vector<std::string> slst;
// nothing can be analyzed without a suggestion manager and a dictionary
if (!pSMgr || m_HMgrs.empty())
return slst;
// reject over-long input; the limit depends on the dictionary encoding
if (utf8) {
if (word.size() >= MAXWORDUTF8LEN)
return slst;
} else {
if (word.size() >= MAXWORDLEN)
return slst;
}
int captype = NOCAP;
size_t abbv = 0;
size_t wl = 0;
std::string scw;
std::vector<w_char> sunicw;
// input conversion
RepList* rl = (pAMgr) ? pAMgr->get_iconvtable() : NULL;
{
std::string wspace;
// clean the converted text when the conversion reports success,
// otherwise clean the original word
bool convstatus = rl ? rl->conv(word, wspace) : false;
if (convstatus)
wl = cleanword2(scw, sunicw, wspace, &captype, &abbv);
else
wl = cleanword2(scw, sunicw, word, &captype, &abbv);
}
if (wl == 0) {
if (abbv) {
// cleaned word is empty but abbv is set: analyze a word made of
// `abbv` dots instead (presumably abbv counts trailing dots removed
// by cleanword2 -- TODO confirm)
scw.clear();
for (wl = 0; wl < abbv; wl++)
scw.push_back('.');
abbv = 0;
} else
return slst;
}
std::string result;
size_t n = 0;
// test numbers
// LANG_hu section: set dash information for suggestions
if (langnum == LANG_hu) {
// n  : scan position in scw
// n2 : number of '.'/',' separators counted so far
// n3 : value of n when the last separator was counted
size_t n2 = 0;
size_t n3 = 0;
// scan a leading run of digits; '.' and ',' are accepted after the first
// character
while ((n < wl) && (((scw[n] <= '9') && (scw[n] >= '0')) ||
(((scw[n] == '.') || (scw[n] == ',')) && (n > 0)))) {
n++;
// NOTE(review): scw[n] is read after the increment, so this looks at the
// character following the one just accepted (index wl on the last pass)
if ((scw[n] == '.') || (scw[n] == ',')) {
if (((n2 == 0) && (n > 3)) ||
((n2 > 0) && ((scw[n - 1] == '.') || (scw[n - 1] == ','))))
break;
n2++;
n3 = n;
}
}
// whole word scanned, a separator was seen, and more than 3 positions
// follow the last counted separator: no analysis
if ((n == wl) && (n3 > 0) && (n - n3 > 3))
return slst;
// either the whole word was scanned, or the scanned prefix is followed by
// '%' or '\xB0' and the remainder passes checkword()
if ((n == wl) || ((n > 0) && ((scw[n] == '%') || (scw[n] == '\xB0')) &&
checkword(scw.substr(n), NULL, NULL))) {
// start the result with the first n - 1 characters of the word
result.append(scw);
result.resize(n - 1);
if (n == wl)
cat_result(result, pSMgr->suggest_morph(scw.substr(n - 1)));
else {
// analyze the last scanned character and the remainder separately,
// joined by '+'
std::string chunk = scw.substr(n - 1, 1);
cat_result(result, pSMgr->suggest_morph(chunk));
result.push_back('+'); // XXX SPEC. MORPHCODE
cat_result(result, pSMgr->suggest_morph(scw.substr(n)));
}
return line_tok(result, MSEP_REC);
}
}
// END OF LANG_hu section
// generic analysis: which case variants are tried depends on captype
switch (captype) {
case HUHCAP:
case HUHINITCAP:
case NOCAP: {
// the word as cleaned, plus the same word with a trailing '.' when abbv
// is set
cat_result(result, pSMgr->suggest_morph(scw));
if (abbv) {
std::string u8buffer(scw);
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
}
break;
}
case INITCAP: {
// u8buffer holds the all-lowercase form, scw the initial-capital form;
// both are analyzed, and again with a trailing '.' when abbv is set
mkallsmall2(scw, sunicw);
std::string u8buffer(scw);
mkinitcap2(scw, sunicw);
cat_result(result, pSMgr->suggest_morph(u8buffer));
cat_result(result, pSMgr->suggest_morph(scw));
if (abbv) {
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
u8buffer = scw;
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
}
break;
}
case ALLCAP: {
// first the word as cleaned (with and without trailing '.'), then the
// same lowercase / initial-capital sequence as in the INITCAP case
cat_result(result, pSMgr->suggest_morph(scw));
if (abbv) {
std::string u8buffer(scw);
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
}
mkallsmall2(scw, sunicw);
std::string u8buffer(scw);
mkinitcap2(scw, sunicw);
cat_result(result, pSMgr->suggest_morph(u8buffer));
cat_result(result, pSMgr->suggest_morph(scw));
if (abbv) {
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
u8buffer = scw;
u8buffer.push_back('.');
cat_result(result, pSMgr->suggest_morph(u8buffer));
}
break;
}
}
if (!result.empty()) {
// word reversing wrapper for complex prefixes
if (complexprefixes) {
if (utf8)
reverseword_utf(result);
else
reverseword(result);
}
return line_tok(result, MSEP_REC);
}
// compound word with dash (HU) I18n
// LANG_hu section: set dash information for suggestions
// `result` is still empty here; the dash is only searched for LANG_hu
size_t dash_pos = langnum == LANG_hu ? scw.find('-') : std::string::npos;
if (dash_pos != std::string::npos) {
int nresult = 0;
// part1: text before the first dash, part2: text after it
std::string part1 = scw.substr(0, dash_pos);
std::string part2 = scw.substr(dash_pos+1);
// examine 2 sides of the dash
if (part2.empty()) { // base word ending with dash
if (spell(part1)) {
std::string p = pSMgr->suggest_morph(part1);
if (!p.empty()) {
slst = line_tok(p, MSEP_REC);
return slst;
}
}
} else if (part2.size() == 1 && part2[0] == 'e') { // XXX (HU) -e hat.
if (spell(part1) && (spell("-e"))) {
// analysis of part1, then '+', then analysis of "-e"
std::string st = pSMgr->suggest_morph(part1);
if (!st.empty()) {
result.append(st);
}
result.push_back('+'); // XXX spec. separator in MORPHCODE
st = pSMgr->suggest_morph("-e");
if (!st.empty()) {
result.append(st);
}
return line_tok(result, MSEP_REC);
}
} else {
// first word ending with dash: word- XXX ???
// part1 is spell-checked with a trailing space appended, which is removed
// again afterwards
part1.push_back(' ');
nresult = spell(part1);
part1.erase(part1.size() - 1);
// NOTE(review): the single-character test below accepts only '1'..'8'
// (strict comparisons against '0' and '9') -- confirm this is intended
if (nresult && spell(part2) &&
((part2.size() > 1) || ((part2[0] > '0') && (part2[0] < '9')))) {
std::string st = pSMgr->suggest_morph(part1);
if (!st.empty()) {
result.append(st);
result.push_back('+'); // XXX spec. separator in MORPHCODE
}
st = pSMgr->suggest_morph(part2);
if (!st.empty()) {
result.append(st);
}
return line_tok(result, MSEP_REC);
}
}
// affixed number in correct word
// only reachable with nresult set by the last branch above, and only when
// the character before the dash is a digit or '.'
if (nresult && (dash_pos > 0) &&
(((scw[dash_pos - 1] <= '9') && (scw[dash_pos - 1] >= '0')) ||
(scw[dash_pos - 1] == '.'))) {
// n counts characters to the left of the dash
n = 1;
if (scw[dash_pos - n] == '.')
n++;
// search first not a number character to left from dash
while ((dash_pos >= n) && ((scw[dash_pos - n] == '0') || (n < 3)) &&
(n < 6)) {
n++;
}
// do not step past the beginning of the word
if (dash_pos < n)
n--;
// numbers: valami1000000-hoz
// examine 100000-hoz, 10000-hoz 1000-hoz, 10-hoz,
// 56-hoz, 6-hoz
// try progressively shorter tails that begin with a digit; the first one
// accepted by checkword() yields the result
for (; n >= 1; n--) {
if (scw[dash_pos - n] < '0' || scw[dash_pos - n] > '9') {
continue;
}
std::string chunk = scw.substr(dash_pos - n);
if (checkword(chunk, NULL, NULL)) {
result.append(chunk);
std::string st = pSMgr->suggest_morph(chunk);
if (!st.empty()) {
result.append(st);
}
return line_tok(result, MSEP_REC);
}
}
}
}
// no analysis found
return slst;
}
// Public entry point: forwards the word and the list of morphological
// descriptions to the implementation object's generate().
std::vector<std::string> Hunspell::generate(const std::string& word, const std::vector<std::string>& pl) {
  std::vector<std::string> forms = m_Impl->generate(word, pl);
  return forms;
}
std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::vector<std::string>& pl) {
std::vector<std::string> slst;
if (!pSMgr || pl.empty())
return slst;
std::vector<std::string> pl2 = analyze(word);
int captype = NOCAP;
int abbv = 0;
std::string cw;
cleanword(cw, word, &captype, &abbv);
std::string result;
for (size_t i = 0; i < pl.size(); ++i) {
cat_result(result, pSMgr->suggest_gen(pl2, pl[i]));
}
if (!result.empty()) {
// allcap
if (captype == ALLCAP)
mkallcap(result);
// line split
slst = line_tok(result, MSEP_REC);
// capitalize
if (captype == INITCAP || captype == HUHINITCAP) {
for (size_t j = 0; j < slst.size(); ++j) {
mkinitcap(slst[j]);
}
}
// temporary filtering of prefix related errors (eg.
// generate("undrinkable", "eats") --> "undrinkables" and "*undrinks")
std::vector<std::string>::iterator it = slst.begin();
while (it != slst.end()) {
if (!spell(*it)) {
it = slst.erase(it);
} else {
++it;
}
}
}
return slst;
}
// Public entry point: forwards the word and the pattern word to the
// implementation object's generate().
std::vector<std::string> Hunspell::generate(const std::string& word, const std::string& pattern) {
  std::vector<std::string> forms = m_Impl->generate(word, pattern);
  return forms;
}
std::vector<std::string> HunspellImpl::generate(const std::string& word, const std::string& pattern) {
std::vector<std::string> pl = analyze(pattern);
std::vector<std::string> slst = generate(word, pl);
uniqlist(slst);
return slst;
}
// minimal XML parser functions
// Extracts the text of an XML parameter.
//
// `par` points at the delimiter that opens the value:
//   '>'          element content, read up to the next '<';
//   '\'' or '"'  attribute value, read up to the matching quote.
// Reading also stops at the end of the string. In the extracted text "&lt;"
// is replaced by "<" and then "&amp;" by "&".
//
// Returns the decoded text, or an empty string when `par` is null or does
// not start with one of the delimiters above (bad XML).
std::string HunspellImpl::get_xml_par(const char* par) {
  std::string dest;
  if (!par)
    return dest;
  char end = *par;
  if (end == '>')
    end = '<';
  else if (end != '\'' && end != '"')
    // bad XML: return the empty string. The former `return 0;` built a
    // std::string from a null pointer, which is undefined behaviour.
    return dest;
  for (par++; *par != '\0' && *par != end; ++par) {
    dest.push_back(*par);
  }
  // "&lt;" must be decoded before "&amp;" so that "&amp;lt;" yields "&lt;"
  mystrrep(dest, "&lt;", "<");
  mystrrep(dest, "&amp;", "&");
  return dest;
}
// Public accessor: forwards to the implementation object and returns its
// language number.
int Hunspell::get_langnum() const {
  const int lang = m_Impl->get_langnum();
  return lang;
}
int HunspellImpl::get_langnum() const {
return langnum;
}
// Public entry point: forwards to the implementation object's input_conv(),
// which fills `dest`, and returns its status.
bool Hunspell::input_conv(const std::string& word, std::string& dest) {
  const bool converted = m_Impl->input_conv(word, dest);
  return converted;
}
// C-buffer flavour of input_conv().
// Runs the std::string overload on `word`; when it reports success and the
// result is shorter than `destsize`, the result is copied into `dest` with
// strncpy() and 1 is returned. In every other case `dest` is left untouched
// and 0 is returned.
int Hunspell::input_conv(const char* word, char* dest, size_t destsize) {
  std::string converted;
  if (!input_conv(word, converted))
    return 0;
  if (converted.size() >= destsize)
    return 0;
  strncpy(dest, converted.c_str(), destsize);
  return 1;
}
bool HunspellImpl::input_conv(const std::string& word, std::string& dest) {
RepList* rl = pAMgr ? pAMgr->get_iconvtable() : NULL;
if (rl) {
return rl->conv(word, dest);
}
dest.assign(word);
return false;
}
// Locates a position inside the XML element that starts at `s`.
//
//   attr == NULL : returns the position of the first '>' at or after `s`
//                  (NULL when there is none).
//   attr != NULL : returns the position just past the first occurrence of
//                  `attr` that lies before that '>' and is directly preceded
//                  by a space or a newline; 0 when there is no such
//                  occurrence or no '>'.
//
// Fixes over the previous loop: the search now advances past a rejected
// match (previously the same match was found again and the loop never
// ended), the character before a match at `s` itself is never read, and a
// missing '>' is handled explicitly instead of being compared against.
const char* HunspellImpl::get_xml_pos(const char* s, const char* attr) {
  if (!s)
    return 0;
  const char* end = strchr(s, '>');
  if (attr == NULL)
    return end;
  if (!end)
    return 0;  // no closing '>': the attribute cannot be inside the element
  const size_t attrlen = strlen(attr);
  for (const char* p = strstr(s, attr); p && p < end; p = strstr(p + 1, attr)) {
    // accept only a match that starts a new token
    if (p > s && (p[-1] == ' ' || p[-1] == '\n'))
      return p + attrlen;
  }
  return 0;
}
// Tests whether attribute `attr` of the XML element at `q` has the value
// `value`. The attribute is located with get_xml_pos() and its value read
// with get_xml_par(). Returns 1 on an exact match, 0 otherwise.
int HunspellImpl::check_xml_par(const char* q,
                                const char* attr,
                                const char* value) {
  const char* pos = get_xml_pos(q, attr);
  const std::string found = get_xml_par(pos);
  return (found == value) ? 1 : 0;
}
// Collects the contents of consecutive `tag` elements found in `list`.
// For every occurrence of `tag`, get_xml_par() is applied starting at the
// tag's last character; collecting stops at the first occurrence that yields
// an empty text. A null `list` gives an empty result.
std::vector<std::string> HunspellImpl::get_xml_list(const char* list, const char* tag) {
  std::vector<std::string> items;
  if (!list)
    return items;
  const char* cur = strstr(list, tag);
  while (cur != NULL) {
    std::string text = get_xml_par(cur + strlen(tag) - 1);
    if (text.empty())
      break;
    items.push_back(text);
    // resume the search one character past the current match
    cur = strstr(cur + 1, tag);
  }
  return items;
}
// Handles an XML request of the form <query type="..."><word>...</word>...
//
// The input must contain "<query", a '>' after it, and a "<word" after that;
// otherwise an empty list is returned. The `type=` attribute of the query
// selects the operation:
//   analyze  : analyses of the word, packed into one string of the form
//              <code><a>ana1</a><a>ana2</a></code> (tabs become spaces,
//              '&' and '<' are escaped);
//   stem     : stems of the word;
//   generate : forms of the first word generated either after a second
//              <word> element, or from the <a> items of a <code> element.
// Any other case yields an empty list.
std::vector<std::string> HunspellImpl::spellml(const std::string& in_word) {
  std::vector<std::string> slst;

  const char* const xml = in_word.c_str();
  const char* query = strstr(xml, "<query");
  if (!query)
    return slst;  // bad XML input
  const char* wordtag = strchr(query, '>');
  if (!wordtag)
    return slst;  // bad XML input
  wordtag = strstr(wordtag, "<word");
  if (!wordtag)
    return slst;  // bad XML input

  if (check_xml_par(query, "type=", "analyze")) {
    std::string cw = get_xml_par(strchr(wordtag, '>'));
    if (!cw.empty())
      slst = analyze(cw);
    if (slst.empty())
      return slst;
    // convert the result to <code><a>ana1</a><a>ana2</a></code> format
    std::string packed("<code>");
    for (size_t i = 0; i < slst.size(); ++i) {
      std::string entry(slst[i]);
      mystrrep(entry, "\t", " ");
      mystrrep(entry, "&", "&amp;");
      mystrrep(entry, "<", "&lt;");
      packed.append("<a>");
      packed.append(entry);
      packed.append("</a>");
    }
    packed.append("</code>");
    return std::vector<std::string>(1, packed);
  }

  if (check_xml_par(query, "type=", "stem")) {
    std::string cw = get_xml_par(strchr(wordtag, '>'));
    if (!cw.empty())
      return stem(cw);
    return slst;
  }

  if (check_xml_par(query, "type=", "generate")) {
    std::string cw = get_xml_par(strchr(wordtag, '>'));
    if (cw.empty())
      return slst;

    // a second <word> element takes precedence over a <code> list
    const char* second = strstr(wordtag + 1, "<word");
    if (second) {
      std::string cw2 = get_xml_par(strchr(second, '>'));
      if (!cw2.empty()) {
        return generate(cw, cw2);
      }
      return slst;
    }

    const char* code = strstr(wordtag + 1, "<code");
    if (code != NULL) {
      std::vector<std::string> patterns = get_xml_list(strchr(code, '>'), "<a>");
      if (!patterns.empty()) {
        slst = generate(cw, patterns);
        uniqlist(slst);
        return slst;
      }
    }
  }

  return slst;
}
// C-string flavour of spell().
// Forwards to the implementation object; a root buffer is requested from it
// only when the caller passed a non-null `root`. In that case `*root`
// receives a mystrdup() copy of the reported root, or NULL when the root is
// empty. Returns the implementation's result.
int Hunspell::spell(const char* word, int* info, char** root) {
  std::string sroot;
  std::string* const rootbuf = root ? &sroot : NULL;
  const bool ok = m_Impl->spell(word, info, rootbuf);
  if (root) {
    char* copy = NULL;
    if (!sroot.empty())
      copy = mystrdup(sroot.c_str());
    *root = copy;
  }
  return ok;
}
namespace {
// Converts `items` into a malloc()ed array of mystrdup()ed C strings stored
// in `*slst` and returns the number of entries.
// An empty `items` sets `*slst` to NULL and returns 0. If the array cannot be
// allocated, `*slst` is NULL and 0 is returned.
int munge_vector(char*** slst, const std::vector<std::string>& items) {
  const size_t count = items.size();
  if (count == 0) {
    *slst = NULL;
    return 0;
  }
  char** out = static_cast<char**>(malloc(sizeof(char*) * count));
  *slst = out;
  if (!out)
    return 0;
  for (size_t i = 0; i < count; ++i)
    out[i] = mystrdup(items[i].c_str());
  return count;
}
}
void Hunspell::free_list(char*** slst, int n) {
Hunspell_free_list((Hunhandle*)(this), slst, n);
}
int Hunspell::suggest(char*** slst, const char* word) {
return Hunspell_suggest((Hunhandle*)(this), slst, word);
}
// char*** flavour of suffix_suggest(): obtains the list from the
// implementation object and converts it with munge_vector(), whose return
// value is passed on.
int Hunspell::suffix_suggest(char*** slst, const char* root_word) {
  const std::vector<std::string> found = m_Impl->suffix_suggest(root_word);
  const int count = munge_vector(slst, found);
  return count;
}
// Returns a pointer to the first element of the implementation object's
// dic_encoding_vec buffer.
// NOTE(review): element 0 is accessed unconditionally, so the buffer is
// assumed to be non-empty -- confirm where it is filled.
char* Hunspell::get_dic_encoding() {
  char* first = &(m_Impl->dic_encoding_vec[0]);
  return first;
}
int Hunspell::stem(char*** slst, char** desc, int n) {
return Hunspell_stem2((Hunhandle*)(this), slst, desc, n);
}
int Hunspell::stem(char*** slst, const char* word) {
return Hunspell_stem((Hunhandle*)(this), slst, word);
}
int Hunspell::analyze(char*** slst, const char* word) {
return Hunspell_analyze((Hunhandle*)(this), slst, word);
}
int Hunspell::generate(char*** slst, const char* word, char** pl, int pln) {
return Hunspell_generate2((Hunhandle*)(this), slst, word, pl, pln);
}
int Hunspell::generate(char*** slst, const char* word, const char* pattern) {
return Hunspell_generate((Hunhandle*)(this), slst, word, pattern);
}
// C API: creates a Hunspell object from an affix file path and a dictionary
// file path and returns it as an opaque handle. When HUNSPELL_CHROME_CLIENT
// is defined nothing is created and NULL is returned.
Hunhandle* Hunspell_create(const char* affpath, const char* dpath) {
#ifdef HUNSPELL_CHROME_CLIENT
  return NULL;
#else
  Hunspell* instance = new Hunspell(affpath, dpath);
  return reinterpret_cast<Hunhandle*>(instance);
#endif
}
// C API: like Hunspell_create(), with an additional `key` argument handed to
// the Hunspell constructor. When HUNSPELL_CHROME_CLIENT is defined nothing is
// created and NULL is returned.
Hunhandle* Hunspell_create_key(const char* affpath,
                               const char* dpath,
                               const char* key) {
#ifdef HUNSPELL_CHROME_CLIENT
  return NULL;
#else
  Hunspell* instance = new Hunspell(affpath, dpath, key);
  return reinterpret_cast<Hunhandle*>(instance);
#endif
}
// C API: destroys the Hunspell object behind the handle.
void Hunspell_destroy(Hunhandle* pHunspell) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  delete instance;
}
#ifndef HUNSPELL_CHROME_CLIENT
// C API: forwards to Hunspell::add_dic() on the object behind the handle.
// Not compiled when HUNSPELL_CHROME_CLIENT is defined.
int Hunspell_add_dic(Hunhandle* pHunspell, const char* dpath) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->add_dic(dpath);
}
#endif
// C API: spell-checks `word` with the object behind the handle, using the
// std::string overload of Hunspell::spell().
int Hunspell_spell(Hunhandle* pHunspell, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->spell(std::string(word));
}
// C API: forwards to Hunspell::get_dic_encoding() on the object behind the
// handle.
char* Hunspell_get_dic_encoding(Hunhandle* pHunspell) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->get_dic_encoding();
}
// C API: obtains suggestions for `word` from the object behind the handle
// and converts them with munge_vector(), whose return value is passed on.
int Hunspell_suggest(Hunhandle* pHunspell, char*** slst, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->suggest(word);
  return munge_vector(slst, found);
}
// C API: obtains the analyses of `word` from the object behind the handle
// and converts them with munge_vector(), whose return value is passed on.
int Hunspell_analyze(Hunhandle* pHunspell, char*** slst, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->analyze(word);
  return munge_vector(slst, found);
}
// C API: obtains the stems of `word` from the object behind the handle and
// converts them with munge_vector(), whose return value is passed on.
int Hunspell_stem(Hunhandle* pHunspell, char*** slst, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->stem(word);
  return munge_vector(slst, found);
}
// C API: stems from a list of `n` description strings. The C strings are
// copied into a vector, passed to the vector overload of Hunspell::stem(),
// and the result is converted with munge_vector().
int Hunspell_stem2(Hunhandle* pHunspell, char*** slst, char** desc, int n) {
  std::vector<std::string> morph;
  if (n > 0)
    morph.assign(desc, desc + n);
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->stem(morph);
  return munge_vector(slst, found);
}
// C API: generates forms of `word` from the pattern word `pattern` with the
// object behind the handle and converts them with munge_vector(), whose
// return value is passed on.
int Hunspell_generate(Hunhandle* pHunspell,
                      char*** slst,
                      const char* word,
                      const char* pattern) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->generate(word, pattern);
  return munge_vector(slst, found);
}
// C API: generates forms of `word` from a list of `n` description strings.
// The C strings are copied into a vector, passed to the vector overload of
// Hunspell::generate(), and the result is converted with munge_vector().
int Hunspell_generate2(Hunhandle* pHunspell,
                       char*** slst,
                       const char* word,
                       char** desc,
                       int n) {
  std::vector<std::string> morph;
  if (n > 0)
    morph.assign(desc, desc + n);
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  const std::vector<std::string> found = instance->generate(word, morph);
  return munge_vector(slst, found);
}
/* functions for run-time modification of the dictionary */
/* add word to the run-time dictionary; forwards to Hunspell::add() on the
 * object behind the handle and returns its result */
int Hunspell_add(Hunhandle* pHunspell, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->add(word);
}
/* add word to the run-time dictionary with affix flags of
 * the example (a dictionary word): Hunspell will recognize
 * affixed forms of the new word, too.
 * Forwards to Hunspell::add_with_affix() on the object behind the handle.
 */
int Hunspell_add_with_affix(Hunhandle* pHunspell,
                            const char* word,
                            const char* example) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->add_with_affix(word, example);
}
/* remove word from the run-time dictionary; forwards to Hunspell::remove()
 * on the object behind the handle and returns its result */
int Hunspell_remove(Hunhandle* pHunspell, const char* word) {
  Hunspell* instance = reinterpret_cast<Hunspell*>(pHunspell);
  return instance->remove(word);
}
// C API: frees a string list: the first `n` entries of `*list`, then the
// array itself, and finally resets `*list` to NULL. Does nothing when `list`
// or `*list` is null. The handle argument is unused.
void Hunspell_free_list(Hunhandle*, char*** list, int n) {
  if (!list || !*list)
    return;
  char** entries = *list;
  for (int i = 0; i < n; i++)
    free(entries[i]);
  free(entries);
  *list = NULL;
}
// Public entry point: forwards the root word to the implementation object's
// suffix_suggest() and returns its result.
std::vector<std::string> Hunspell::suffix_suggest(const std::string& root_word) {
  std::vector<std::string> found = m_Impl->suffix_suggest(root_word);
  return found;
}
// Returns the words the affix manager's get_suffix_words() produces for
// `root_word`.
//
// If the affix manager defines ignored characters, they are first removed
// from a copy of the word. The resulting word is looked up in the hash
// managers in order; the first entry found supplies the affix flags
// (astr/alen) passed to get_suffix_words() together with the original
// `root_word`. An empty list is returned when the word is empty after
// cleaning or is not found in any hash manager.
// pAMgr is dereferenced without a null check.
std::vector<std::string> HunspellImpl::suffix_suggest(const std::string& root_word) {
  std::vector<std::string> slst;

  // word to look up: the input itself, or its copy without ignored chars
  std::string cleaned;
  const char* word = root_word.c_str();
  const char* ignoredchars = pAMgr->get_ignore();
  if (ignoredchars != NULL) {
    cleaned.assign(root_word);
    if (utf8) {
      remove_ignored_chars_utf(cleaned, pAMgr->get_ignore_utf16());
    } else {
      remove_ignored_chars(cleaned, ignoredchars);
    }
    word = cleaned.c_str();
  }

  const int len = strlen(word);
  if (len == 0)
    return slst;

  // first dictionary containing the word wins
  struct hentry* he = NULL;
  for (size_t i = 0; i < m_HMgrs.size(); ++i) {
    he = m_HMgrs[i]->lookup(word);
    if (he)
      break;
  }
  if (!he)
    return slst;

  slst = pAMgr->get_suffix_words(he->astr, he->alen, root_word.c_str());
  return slst;
}