// blob: 4ab9ece0838e3bacf4cbe1e12946d654347b273e [file] [log] [blame]
//
// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
//
// Distributed under the Boost Software License, Version 1.0. (See
// accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
//
#ifndef BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
#define BOOST_LOCALE_IMPL_WCONV_CODEPAGE_HPP
#include <boost/locale/encoding.hpp>
#include <algorithm>
#include <cstring>
#include <string>
#include "conv.hpp"
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <vector>
namespace boost {
namespace locale {
namespace conv {
namespace impl {
// One row of the encoding-name -> Windows code page lookup table.
struct windows_encoding {
    // Encoding name; rows are ordered and matched on this field with strcmp.
    const char *name;
    // Windows code page identifier handed to IsValidCodePage and returned to callers.
    unsigned int codepage;
    // 0 until IsValidCodePage has succeeded once for this row, then 1.
    unsigned int was_tested;
};
// Orders two table rows by their encoding name (byte-wise, via strcmp);
// the code page and the was_tested flag do not take part in the comparison.
bool operator<(windows_encoding const &l,windows_encoding const &r)
{
    int const order = strcmp(l.name,r.name);
    return order < 0;
}
// Lookup table: { encoding name, Windows code page, was_tested flag }.
//
// The rows MUST stay sorted by name in strcmp order, because
// encoding_to_windows_codepage() searches this array with std::lower_bound
// using operator<(windows_encoding, windows_encoding).
// Several names map to the same code page (e.g. "cp932", "shiftjis", "sjis",
// "windows932" -> 932).
// The third field starts at 0 and is set to 1 by encoding_to_windows_codepage()
// once IsValidCodePage() has accepted the code page, hence the array is not const.
windows_encoding all_windows_encodings[] = {
{ "big5", 950, 0 },
{ "cp1250", 1250, 0 },
{ "cp1251", 1251, 0 },
{ "cp1252", 1252, 0 },
{ "cp1253", 1253, 0 },
{ "cp1254", 1254, 0 },
{ "cp1255", 1255, 0 },
{ "cp1256", 1256, 0 },
{ "cp1257", 1257, 0 },
{ "cp874", 874, 0 },
{ "cp932", 932, 0 },
{ "cp936", 936, 0 },
{ "eucjp", 20932, 0 },
{ "euckr", 51949, 0 },
{ "gb18030", 54936, 0 },
{ "gb2312", 20936, 0 },
{ "gbk", 936, 0 },
{ "iso2022jp", 50220, 0 },
{ "iso2022kr", 50225, 0 },
{ "iso88591", 28591, 0 },
{ "iso885913", 28603, 0 },
{ "iso885915", 28605, 0 },
{ "iso88592", 28592, 0 },
{ "iso88593", 28593, 0 },
{ "iso88594", 28594, 0 },
{ "iso88595", 28595, 0 },
{ "iso88596", 28596, 0 },
{ "iso88597", 28597, 0 },
{ "iso88598", 28598, 0 },
{ "iso88599", 28599, 0 },
{ "koi8r", 20866, 0 },
{ "koi8u", 21866, 0 },
{ "ms936", 936, 0 },
{ "shiftjis", 932, 0 },
{ "sjis", 932, 0 },
{ "usascii", 20127, 0 },
{ "utf8", 65001, 0 },
{ "windows1250", 1250, 0 },
{ "windows1251", 1251, 0 },
{ "windows1252", 1252, 0 },
{ "windows1253", 1253, 0 },
{ "windows1254", 1254, 0 },
{ "windows1255", 1255, 0 },
{ "windows1256", 1256, 0 },
{ "windows1257", 1257, 0 },
{ "windows874", 874, 0 },
{ "windows932", 932, 0 },
{ "windows936", 936, 0 },
};
// Strips every zero byte from v in place and returns the resulting size.
// The relative order of the remaining bytes is preserved; a vector that
// contains no zero byte is left as it is.
size_t remove_substitutions(std::vector<char> &v)
{
    std::vector<char>::iterator const kept_end = std::remove(v.begin(),v.end(),'\0');
    v.erase(kept_end,v.end());
    return v.size();
}
// Converts [begin,end) from the given code page to wide characters one
// character at a time, appending the results to buf.
//
// Each step takes 2 bytes when IsDBCSLeadByteEx reports a lead byte and
// 1 byte otherwise. A character for which MultiByteToWideChar produces
// nothing contributes no output and is simply stepped over. Conversion stops
// silently when a lead byte is the very last byte of the input.
void multibyte_to_wide_one_by_one(int codepage,char const *begin,char const *end,std::vector<wchar_t> &buf)
{
    buf.reserve(end-begin);
    for(char const *cur = begin; cur != end; ) {
        int const char_len = IsDBCSLeadByteEx(codepage,*cur) ? 2 : 1;
        // Lead byte without its trail byte: nothing more can be decoded.
        if(char_len == 2 && cur+1 == end)
            return;
        wchar_t converted[4];
        int const produced = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,cur,char_len,converted,4);
        // produced is 0 when the call fails, which makes this an empty append.
        buf.insert(buf.end(),converted,converted+produced);
        cur += char_len;
    }
}
// Converts [begin,end) from the given code page into wide characters in buf.
//
// The whole range is converted with a size query followed by the real
// MultiByteToWideChar call. If the size query fails:
//   - do_skip == true : fall back to multibyte_to_wide_one_by_one();
//   - do_skip == false: throw conversion_error.
// A failure of the second call always throws conversion_error.
// An empty input leaves buf untouched.
void multibyte_to_wide(int codepage,char const *begin,char const *end,bool do_skip,std::vector<wchar_t> &buf)
{
    if(begin==end)
        return;

    int const required = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,0,0);
    if(required == 0) {
        if(!do_skip)
            throw conversion_error();
        multibyte_to_wide_one_by_one(codepage,begin,end,buf);
        return;
    }

    buf.resize(required,0);
    int const written = MultiByteToWideChar(codepage,MB_ERR_INVALID_CHARS,begin,end-begin,&buf.front(),buf.size());
    if(written == 0)
        throw conversion_error();
}
// Converts the wide range [begin,end) to the given code page and stores the
// bytes in buf (buf is resized, previous content is discarded).
//
// The input is expected to contain no NUL characters: NUL is used as the
// substitution character so that, when do_skip is true, unrepresentable
// characters can be dropped afterwards by removing every zero byte.
//
// For code pages 65001 (UTF-8) and 65000 (UTF-7) no default character and no
// "used default char" flag are passed, because WideCharToMultiByte requires
// both pointers to be NULL for them.
//
// Throws conversion_error if WideCharToMultiByte fails, or if a substitution
// happened and do_skip is false. An empty input leaves buf untouched.
void wide_to_multibyte_non_zero(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
{
    if(begin==end)
        return;
    bool const is_utf = codepage == 65001 || codepage == 65000;
    BOOL substitute = FALSE;
    BOOL *substitute_ptr = is_utf ? 0 : &substitute;
    char subst_char = 0;
    char *subst_char_ptr = is_utf ? 0 : &subst_char;

    int n = WideCharToMultiByte(codepage,0,begin,end-begin,0,0,subst_char_ptr,substitute_ptr);
    // The input is non-empty, so 0 means the size query failed. Bail out
    // before touching buf[0]: indexing an empty vector is undefined.
    if(n <= 0)
        throw conversion_error();
    buf.resize(n);

    if(WideCharToMultiByte(codepage,0,begin,end-begin,&buf[0],n,subst_char_ptr,substitute_ptr)==0)
        throw conversion_error();
    if(substitute) {
        if(do_skip)
            remove_substitutions(buf);
        else
            throw conversion_error();
    }
}
// Converts the wide range [begin,end) to the given code page, appending the
// bytes to buf. Embedded NUL characters are preserved.
//
// The input is cut into NUL-free runs, each run is converted with
// wide_to_multibyte_non_zero(), and a single '\0' byte is emitted for every
// NUL separator. The split is needed because the per-run conversion uses NUL
// as its substitution marker and strips all zero bytes in skip mode.
//
// do_skip selects between dropping unrepresentable characters (true) and
// throwing conversion_error (false); errors from the per-run conversion
// propagate to the caller.
void wide_to_multibyte(int codepage,wchar_t const *begin,wchar_t const *end,bool do_skip,std::vector<char> &buf)
{
    if(begin==end)
        return;
    buf.reserve(end-begin);
    wchar_t const *b = begin;
    wchar_t const *e = std::find(b,end,L'\0');
    for(;;) {
        std::vector<char> tmp;
        wide_to_multibyte_non_zero(codepage,b,e,do_skip,tmp);
        buf.insert(buf.end(),tmp.begin(),tmp.end());
        if(e==end)
            break;
        buf.push_back('\0');
        b=e+1;
        // Look for the next NUL separator (L'\0', not the digit L'0').
        e=std::find(b,end,L'\0');
    }
}
int encoding_to_windows_codepage(char const *ccharset)
{
std::string charset = normalize_encoding(ccharset);
windows_encoding ref;
ref.name = charset.c_str();
size_t n = sizeof(all_windows_encodings)/sizeof(all_windows_encodings[0]);
windows_encoding *begin = all_windows_encodings;
windows_encoding *end = all_windows_encodings + n;
windows_encoding *ptr = std::lower_bound(begin,end,ref);
if(ptr!=end && strcmp(ptr->name,charset.c_str())==0) {
if(ptr->was_tested) {
return ptr->codepage;
}
else if(IsValidCodePage(ptr->codepage)) {
// the thread safety is not an issue, maximum
// it would be checked more then once
ptr->was_tested=1;
return ptr->codepage;
}
else {
return -1;
}
}
return -1;
}
template<typename CharType>
bool validate_utf16(CharType const *str,unsigned len)
{
CharType const *begin = str;
CharType const *end = str+len;
while(begin!=end) {
utf::code_point c = utf::utf_traits<CharType,2>::template decode<CharType const *>(begin,end);
if(c==utf::illegal || c==utf::incomplete)
return false;
}
return true;
}
// Copies the UTF-16 code units str[0..len) to out, leaving out malformed
// surrogate usage:
//   - a high surrogate (D800..DBFF) directly followed by a low surrogate
//     (DC00..DFFF) is copied as a pair;
//   - a high surrogate followed by anything else is dropped together with
//     the unit that follows it;
//   - a high surrogate that is the last unit ends processing;
//   - a low surrogate on its own is dropped;
//   - every other unit is copied unchanged.
// Units are appended to out; existing content of out is kept.
template<typename CharType,typename OutChar>
void clean_invalid_utf16(CharType const *str,unsigned len,std::vector<OutChar> &out)
{
    out.reserve(len);
    unsigned pos = 0;
    while(pos < len) {
        uint16_t const unit = static_cast<uint16_t>(str[pos++]);
        bool const is_high = unit >= 0xD800 && unit <= 0xDBFF;
        bool const is_low = unit >= 0xDC00 && unit <= 0xDFFF;

        if(is_low)
            continue;
        if(!is_high) {
            out.push_back(static_cast<OutChar>(unit));
            continue;
        }

        // High surrogate: it needs a partner.
        if(pos >= len)
            return;
        uint16_t const partner = static_cast<uint16_t>(str[pos++]);
        if(partner >= 0xDC00 && partner <= 0xDFFF) {
            out.push_back(static_cast<OutChar>(unit));
            out.push_back(static_cast<OutChar>(partner));
        }
    }
}
class wconv_between : public converter_between {
public:
wconv_between() :
how_(skip),
to_code_page_ (-1),
from_code_page_ ( -1)
{
}
bool open(char const *to_charset,char const *from_charset,method_type how)
{
how_ = how;
to_code_page_ = encoding_to_windows_codepage(to_charset);
from_code_page_ = encoding_to_windows_codepage(from_charset);
if(to_code_page_ == -1 || from_code_page_ == -1)
return false;
return true;
}
virtual std::string convert(char const *begin,char const *end)
{
if(to_code_page_ == 65001 && from_code_page_ == 65001)
return utf_to_utf<char>(begin,end,how_);
std::string res;
std::vector<wchar_t> tmp; // buffer for mb2w
std::wstring tmps; // buffer for utf_to_utf
wchar_t const *wbegin=0;
wchar_t const *wend=0;
if(from_code_page_ == 65001) {
tmps = utf_to_utf<wchar_t>(begin,end,how_);
if(tmps.empty())
return res;
wbegin = tmps.c_str();
wend = wbegin + tmps.size();
}
else {
multibyte_to_wide(from_code_page_,begin,end,how_ == skip,tmp);
if(tmp.empty())
return res;
wbegin = &tmp[0];
wend = wbegin + tmp.size();
}
if(to_code_page_ == 65001) {
return utf_to_utf<char>(wbegin,wend,how_);
}
std::vector<char> ctmp;
wide_to_multibyte(to_code_page_,wbegin,wend,how_ == skip,ctmp);
if(ctmp.empty())
return res;
res.assign(&ctmp.front(),ctmp.size());
return res;
}
private:
method_type how_;
int to_code_page_;
int from_code_page_;
};
// Primary templates, declared only. The second parameter defaults to
// sizeof(CharType), so the specializations that follow are selected by the
// width of the character type (1, 2 or 4 bytes).

// Converter from a code-page encoded narrow string to a CharType string.
template<typename CharType,int size = sizeof(CharType) >
class wconv_to_utf;
// Converter from a CharType string to a code-page encoded narrow string.
template<typename CharType,int size = sizeof(CharType) >
class wconv_from_utf;
// char specialization: converting to UTF-8 is a narrow-to-narrow conversion,
// so everything is delegated to wconv_between with "UTF-8" as the target.
template<>
class wconv_to_utf<char,1> : public converter_to_utf<char> , public wconv_between {
public:
    // cs names the source encoding; returns false if it cannot be opened.
    virtual bool open(char const *cs,method_type how)
    {
        bool const opened = wconv_between::open("UTF-8",cs,how);
        return opened;
    }

    // Converts [begin,end) from the source encoding to UTF-8.
    virtual std::string convert(char const *begin,char const *end)
    {
        return wconv_between::convert(begin,end);
    }
};
// char specialization: converting from UTF-8 is a narrow-to-narrow
// conversion, so everything is delegated to wconv_between with "UTF-8" as
// the source.
template<>
class wconv_from_utf<char,1> : public converter_from_utf<char> , public wconv_between {
public:
    // cs names the target encoding; returns false if it cannot be opened.
    virtual bool open(char const *cs,method_type how)
    {
        bool const opened = wconv_between::open(cs,"UTF-8",how);
        return opened;
    }

    // Converts the UTF-8 range [begin,end) to the target encoding.
    virtual std::string convert(char const *begin,char const *end)
    {
        return wconv_between::convert(begin,end);
    }
};
// Specialization for 2-byte character types: the wide output of
// multibyte_to_wide is copied into the result string as is.
template<typename CharType>
class wconv_to_utf<CharType,2> : public converter_to_utf<CharType> {
public:
    typedef CharType char_type;
    typedef std::basic_string<char_type> string_type;

    wconv_to_utf() :
        how_(skip),
        code_page_(-1)
    {
    }

    // Resolves charset to a code page; returns false if that fails.
    virtual bool open(char const *charset,method_type how)
    {
        how_ = how;
        code_page_ = encoding_to_windows_codepage(charset);
        if(code_page_ == -1)
            return false;
        return true;
    }

    // Converts [begin,end) from the opened code page to a CharType string.
    // UTF-8 input (code page 65001) goes through utf_to_utf instead.
    virtual string_type convert(char const *begin,char const *end)
    {
        if(code_page_ == 65001)
            return utf_to_utf<char_type>(begin,end,how_);

        std::vector<wchar_t> wide;
        multibyte_to_wide(code_page_,begin,end,how_ == skip,wide);
        if(wide.empty())
            return string_type();
        // wchar_t and char_type are both 2 bytes wide in this specialization.
        return string_type(reinterpret_cast<char_type *>(&wide.front()),wide.size());
    }
private:
    method_type how_;   // error policy given to open()
    int code_page_;     // source code page, -1 if unresolved
};
// Specialization for 2-byte character types: the input is treated as UTF-16
// and handed to the Windows API after validation.
template<typename CharType>
class wconv_from_utf<CharType,2> : public converter_from_utf<CharType> {
public:
    typedef CharType char_type;
    typedef std::basic_string<char_type> string_type;

    wconv_from_utf() :
        how_(skip),
        code_page_(-1)
    {
    }

    // Resolves charset to a code page; returns false if that fails.
    virtual bool open(char const *charset,method_type how)
    {
        how_ = how;
        code_page_ = encoding_to_windows_codepage(charset);
        if(code_page_ == -1)
            return false;
        return true;
    }

    // Converts the UTF-16 range [begin,end) to the opened code page.
    // Malformed UTF-16 throws conversion_error when the policy is stop;
    // otherwise the input is cleaned with clean_invalid_utf16 first.
    virtual std::string convert(CharType const *begin,CharType const *end)
    {
        if(code_page_ == 65001)
            return utf_to_utf<char>(begin,end,how_);
        if(begin == end)
            return std::string();

        std::vector<wchar_t> cleaned;   // only used for malformed input
        wchar_t const *wbegin = 0;
        wchar_t const *wend = 0;
        if(validate_utf16(begin,end-begin)) {
            // Well-formed input can be used in place.
            wbegin = reinterpret_cast<wchar_t const *>(begin);
            wend = reinterpret_cast<wchar_t const *>(end);
        }
        else if(how_ == stop) {
            throw conversion_error();
        }
        else {
            clean_invalid_utf16(begin,end-begin,cleaned);
            if(cleaned.empty())
                return std::string();
            wbegin = &cleaned[0];
            wend = wbegin + cleaned.size();
        }

        std::vector<char> narrow;
        wide_to_multibyte(code_page_,wbegin,wend,how_ == skip,narrow);
        if(narrow.empty())
            return std::string();
        return std::string(&narrow.front(),narrow.size());
    }
private:
    method_type how_;   // error policy given to open()
    int code_page_;     // target code page, -1 if unresolved
};
// Specialization for 4-byte character types: the input is first converted to
// wchar_t by the Windows API and then re-encoded with utf_to_utf.
template<typename CharType>
class wconv_to_utf<CharType,4> : public converter_to_utf<CharType> {
public:
    typedef CharType char_type;
    typedef std::basic_string<char_type> string_type;

    wconv_to_utf() :
        how_(skip),
        code_page_(-1)
    {
    }

    // Resolves charset to a code page; returns false if that fails.
    virtual bool open(char const *charset,method_type how)
    {
        how_ = how;
        code_page_ = encoding_to_windows_codepage(charset);
        if(code_page_ == -1)
            return false;
        return true;
    }

    // Converts [begin,end) from the opened code page to a CharType string.
    // UTF-8 input (code page 65001) goes straight through utf_to_utf.
    virtual string_type convert(char const *begin,char const *end)
    {
        if(code_page_ == 65001)
            return utf_to_utf<char_type>(begin,end,how_);

        std::vector<wchar_t> wide;
        multibyte_to_wide(code_page_,begin,end,how_ == skip,wide);
        if(wide.empty())
            return string_type();

        wchar_t const *wbegin = &wide[0];
        return utf_to_utf<CharType>(wbegin,wbegin+wide.size(),how_);
    }
private:
    method_type how_;   // error policy given to open()
    int code_page_;     // source code page, -1 if unresolved
};
// Specialization for 4-byte character types: the input is first re-encoded
// to wchar_t with utf_to_utf and then converted by the Windows API.
template<typename CharType>
class wconv_from_utf<CharType,4> : public converter_from_utf<CharType> {
public:
    typedef CharType char_type;
    typedef std::basic_string<char_type> string_type;

    wconv_from_utf() :
        how_(skip),
        code_page_(-1)
    {
    }

    // Resolves charset to a code page; returns false if that fails.
    virtual bool open(char const *charset,method_type how)
    {
        how_ = how;
        code_page_ = encoding_to_windows_codepage(charset);
        if(code_page_ == -1)
            return false;
        return true;
    }

    // Converts the range [begin,end) to the opened code page.
    // A UTF-8 target (code page 65001) goes straight through utf_to_utf.
    virtual std::string convert(CharType const *begin,CharType const *end)
    {
        if(code_page_ == 65001)
            return utf_to_utf<char>(begin,end,how_);

        std::wstring wide = utf_to_utf<wchar_t>(begin,end,how_);
        wchar_t const *wbegin = wide.c_str();

        std::vector<char> narrow;
        wide_to_multibyte(code_page_,wbegin,wbegin+wide.size(),how_ == skip,narrow);
        if(narrow.empty())
            return std::string();
        return std::string(&narrow.front(),narrow.size());
    }
private:
    method_type how_;   // error policy given to open()
    int code_page_;     // target code page, -1 if unresolved
};
} // impl
} // conv
} // locale
} // boost
#endif
// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4