boost/libs/locale/src/util/codecvt_converter.cpp - nest-cam/4320010/boost - Git at Google

 //
 //  Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
 //
 //  Distributed under the Boost Software License, Version 1.0. (See
 //  accompanying file LICENSE_1_0.txt or copy at
 //  http://www.boost.org/LICENSE_1_0.txt)
 //
 #define BOOST_LOCALE_SOURCE
 #include <boost/locale/generator.hpp>
 #include <boost/locale/encoding.hpp>

 #include "../encoding/conv.hpp"

 #include <boost/locale/util.hpp>

 #ifdef BOOST_MSVC
 #  pragma warning(disable : 4244 4996) // loose data
 #endif

 #include <cstddef>
 #include <string.h>
 #include <vector>
 #include <algorithm>

 //#define DEBUG_CODECVT

 #ifdef DEBUG_CODECVT
 #include <iostream>
 #endif

 namespace boost {
 namespace locale {
 namespace util {

     class utf8_converter  : public base_converter {
     public:
         virtual int max_len() const
         {
             return 4;
         }

         virtual utf8_converter *clone() const
         {
             return new utf8_converter();
         }

         bool is_thread_safe() const
         {
             return true;
         }

         virtual uint32_t to_unicode(char const *&begin,char const *end)
         {
             char const *p=begin;

             utf::code_point c = utf::utf_traits<char>::decode(p,end);

             if(c==utf::illegal)
                 return illegal;

             if(c==utf::incomplete)
                 return incomplete;

             begin = p;
             return c;
         }

         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
         {
             if(!utf::is_valid_codepoint(u))
                 return illegal;
             int width = utf::utf_traits<char>::width(u);
             std::ptrdiff_t d=end-begin;
             if(d < width)
                 return incomplete;
             utf::utf_traits<char>::encode(u,begin);
             return width;
         }
     }; // utf8_converter

     /// base_converter implementation for single-byte encodings, driven by two
     /// lookup tables that are filled once in the constructor:
     ///  - to_unicode_tbl_ maps every byte value to its code point, or to
     ///    \c illegal when the byte has no mapping;
     ///  - from_unicode_tbl_ groups the mapped byte values by the low 8 bits of
     ///    their code point, so from_unicode() only has to scan one bucket.
     class simple_converter : public base_converter {
     public:

         virtual ~simple_converter()
         {
         }

         /// Builds both tables for \a encoding.
         simple_converter(std::string const &encoding)
         {
             // The lower half is mapped to itself without consulting the encoding.
             for(unsigned i=0;i<128;i++)
                 to_unicode_tbl_[i]=i;
             // The upper half is probed one byte at a time: a byte is mapped only
             // if it converts to exactly one wide character; a conversion error
             // or any other result length marks it as illegal.
             for(unsigned i=128;i<256;i++) {
                 char buf[2] = { char(i) , 0 };
                 try {
                     std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop);
                     if(tmp.size() == 1) {
                         to_unicode_tbl_[i] = tmp[0];
                     }
                     else {
                         to_unicode_tbl_[i] = illegal;
                     }
                 }
                 catch(conv::conversion_error const &/*e*/) {
                     to_unicode_tbl_[i] = illegal;
                 }
             }
             from_unicode_tbl_.resize(256);
             for(unsigned i=0;i<256;i++) {
                 // Bytes without a mapping are left out of the reverse table;
                 // otherwise from_unicode() would match the illegal marker
                 // itself and emit an unmapped byte for it.
                 if(to_unicode_tbl_[i] == illegal)
                     continue;
                 from_unicode_tbl_[to_unicode_tbl_[i] & 0xFF].push_back(i);
             }
         }

         /// Every code point occupies exactly one byte.
         virtual int max_len() const
         {
             return 1;
         }

         /// Always reports true; the tables are only written by the constructor.
         virtual bool is_thread_safe() const
         {
             return true;
         }
         /// Returns a heap-allocated copy of this converter, tables included.
         virtual base_converter *clone() const
         {
            return new simple_converter(*this);
         }
         /// Decodes the byte at \a begin.
         ///
         /// Returns \c incomplete when the range is empty.  Otherwise returns
         /// the table entry for the byte; \a begin is advanced only when that
         /// entry is a real code point, so an unmapped byte is reported as
         /// \c illegal with \a begin still pointing at it (the same convention
         /// utf8_converter in this file follows).
         virtual uint32_t to_unicode(char const *&begin,char const *end)
         {
             if(begin==end)
                 return incomplete;
             uint32_t const u = to_unicode_tbl_[static_cast<unsigned char>(*begin)];
             if(u != illegal)
                 ++begin;
             return u;
         }
         /// Encodes \a u as a single byte at \a begin.
         ///
         /// Returns 1 on success, \c incomplete when the output range is empty
         /// and \c illegal when no byte of this encoding maps to \a u.
         virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
         {
             if(begin==end)
                 return incomplete;
             // Only the bucket selected by the low 8 bits can contain a match.
             std::vector<unsigned char> const &tbl = from_unicode_tbl_[u & 0xFF];
             for(std::vector<unsigned char>::const_iterator p=tbl.begin();p!=tbl.end();++p) {
                 if(to_unicode_tbl_[*p]==u) {
                     *begin++ = *p;
                     return 1;
                 }
             }
             return illegal;
         }
     private:
         uint32_t to_unicode_tbl_[256];                              // byte -> code point or illegal
         std::vector<std::vector<unsigned char> > from_unicode_tbl_; // (code point & 0xFF) -> candidate bytes
     };

     namespace {
         // Normalized names of the encodings create_simple_converter() accepts.
         // The table is searched with std::binary_search using compare_strings,
         // so the entries have to stay in ascending strcmp() order.
         char const *simple_encoding_table[] = {
             "cp1250", "cp1251", "cp1252", "cp1253",
             "cp1254", "cp1255", "cp1256", "cp1257",
             "iso88591", "iso885913", "iso885915",
             "iso88592", "iso88593", "iso88594", "iso88595",
             "iso88596", "iso88597", "iso88598", "iso88599",
             "koi8r", "koi8u",
             "usascii",
             "windows1250", "windows1251", "windows1252", "windows1253",
             "windows1254", "windows1255", "windows1256", "windows1257"
         };

         // "Less than" for C strings, as decided by strcmp().
         bool compare_strings(char const *l,char const *r)
         {
             int const order = strcmp(l,r);
             return order < 0;
         }
     }


     std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding)
     {
         std::auto_ptr<base_converter> res;
         std::string norm = conv::impl::normalize_encoding(encoding.c_str());
         if(std::binary_search<char const **>( simple_encoding_table,
                         simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *),
                         norm.c_str(),
                         compare_strings))
         {
             res.reset(new simple_converter(encoding));
         }
         return res;
     }


     /// Returns a newly allocated utf8_converter owned by the returned pointer.
     std::auto_ptr<base_converter> create_utf8_converter()
     {
         return std::auto_ptr<base_converter>(new utf8_converter());
     }

     //
     // Traits for sizeof char
     //
     // Only declared, never defined: a character type whose size is neither 2
     // nor 4 bytes has no uint_type and fails to compile when it is used.
     template<typename Char,int Size=sizeof(Char)>
     struct uchar_traits;

     // 2-byte character types are processed as uint16_t units.
     template<typename Char>
     struct uchar_traits<Char,2> {
         typedef uint16_t uint_type;
     };

     // 4-byte character types are processed as uint32_t units.
     template<typename Char>
     struct uchar_traits<Char,4> {
         typedef uint32_t uint_type;
     };

     // Real codecvt

     template<typename CharType>
     class code_converter : public std::codecvt<CharType,char,std::mbstate_t>
     {
     public:
         code_converter(std::auto_ptr<base_converter> cvt,size_t refs = 0) :
           std::codecvt<CharType,char,std::mbstate_t>(refs),
             cvt_(cvt)
         {
             max_len_ = cvt_->max_len();
         }
     protected:

         typedef CharType uchar;

         virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char *from,char * /*to*/,char *&next) const
         {
             uint16_t &state = *reinterpret_cast<uint16_t *>(&s);
 #ifdef DEBUG_CODECVT
             std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
 #endif
             if(state != 0)
                 return std::codecvt_base::error;
             next=from;
             return std::codecvt_base::ok;
         }
         virtual int do_encoding() const throw()
         {
             return 0;
         }
         virtual int do_max_length() const throw()
         {
             return max_len_;
         }
         virtual bool do_always_noconv() const throw()
         {
             return false;
         }

         virtual std::codecvt_base::result
         do_in(  std::mbstate_t &state,
                 char const *from,
                 char const *from_end,
                 char const *&from_next,
                 uchar *uto,
                 uchar *uto_end,
                 uchar *&uto_next) const
         {
             typedef typename uchar_traits<uchar>::uint_type uint_type;
             uint_type *to=reinterpret_cast<uint_type *>(uto);
             uint_type *to_end=reinterpret_cast<uint_type *>(uto_end);
             uint_type *&to_next=reinterpret_cast<uint_type *&>(uto_next);
             return do_real_in(state,from,from_end,from_next,to,to_end,to_next);
         }

         virtual int
         do_length(  std::mbstate_t &state,
                 char const *from,
                 char const *from_end,
                 size_t max) const
         {
             char const *from_next=from;
             std::vector<uchar> chrs(max+1);
             uchar *to=&chrs.front();
             uchar *to_end=to+max;
             uchar *to_next=to;
             do_in(state,from,from_end,from_next,to,to_end,to_next);
             return from_next-from;
         }

         virtual std::codecvt_base::result
         do_out( std::mbstate_t &state,
                 uchar const *ufrom,
                 uchar const *ufrom_end,
                 uchar const *&ufrom_next,
                 char *to,
                 char *to_end,
                 char *&to_next) const
         {
             typedef typename uchar_traits<uchar>::uint_type uint_type;
             uint_type const *from=reinterpret_cast<uint_type const *>(ufrom);
             uint_type const *from_end=reinterpret_cast<uint_type const *>(ufrom_end);
             uint_type const *&from_next=reinterpret_cast<uint_type const *&>(ufrom_next);
             return do_real_out(state,from,from_end,from_next,to,to_end,to_next);
         }


     private:

         //
         // Implementation for UTF-32
         //
         std::codecvt_base::result
         do_real_in( std::mbstate_t &/*state*/,
                     char const *from,
                     char const *from_end,
                     char const *&from_next,
                     uint32_t *to,
                     uint32_t *to_end,
                     uint32_t *&to_next) const
         {
             std::auto_ptr<base_converter> cvtp;
             base_converter *cvt = 0;
             if(cvt_->is_thread_safe()) {
                 cvt = cvt_.get();
             }
             else {
                 cvtp.reset(cvt_->clone());
                 cvt = cvtp.get();
             }
             std::codecvt_base::result r=std::codecvt_base::ok;
             while(to < to_end && from < from_end)
             {
                 uint32_t ch=cvt->to_unicode(from,from_end);
                 if(ch==base_converter::illegal) {
                     r=std::codecvt_base::error;
                     break;
                 }
                 if(ch==base_converter::incomplete) {
                     r=std::codecvt_base::partial;
                     break;
                 }
                 *to++=ch;
             }
             from_next=from;
             to_next=to;
             if(r!=std::codecvt_base::ok)
                 return r;
             if(from!=from_end)
                 return std::codecvt_base::partial;
             return r;
         }

         //
         // Implementation for UTF-32
         //
         std::codecvt_base::result
         do_real_out(std::mbstate_t &/*state*/, // state is not used there
                     uint32_t const *from,
                     uint32_t const *from_end,
                     uint32_t const *&from_next,
                     char *to,
                     char *to_end,
                     char *&to_next) const
         {
             std::auto_ptr<base_converter> cvtp;
             base_converter *cvt = 0;
             if(cvt_->is_thread_safe()) {
                 cvt = cvt_.get();
             }
             else {
                 cvtp.reset(cvt_->clone());
                 cvt = cvtp.get();
             }

             std::codecvt_base::result r=std::codecvt_base::ok;
             while(to < to_end && from < from_end)
             {
                 uint32_t len=cvt->from_unicode(*from,to,to_end);
                 if(len==base_converter::illegal) {
                     r=std::codecvt_base::error;
                     break;
                 }
                 if(len==base_converter::incomplete) {
                     r=std::codecvt_base::partial;
                     break;
                 }
                 from++;
                 to+=len;
             }
             from_next=from;
             to_next=to;
             if(r!=std::codecvt_base::ok)
                 return r;
             if(from!=from_end)
                 return std::codecvt_base::partial;
             return r;
         }

         //
         // Implementation for UTF-16
         //
         std::codecvt_base::result
         do_real_in( std::mbstate_t &std_state,
                     char const *from,
                     char const *from_end,
                     char const *&from_next,
                     uint16_t *to,
                     uint16_t *to_end,
                     uint16_t *&to_next) const
         {
             std::auto_ptr<base_converter> cvtp;
             base_converter *cvt = 0;
             if(cvt_->is_thread_safe()) {
                 cvt = cvt_.get();
             }
             else {
                 cvtp.reset(cvt_->clone());
                 cvt = cvtp.get();
             }
             std::codecvt_base::result r=std::codecvt_base::ok;
             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
             // according to standard. We use it to keed a flag 0/1 for surrogate pair writing
             //
             // if 0 no code above >0xFFFF observed, of 1 a code above 0xFFFF observerd
             // and first pair is written, but no input consumed
             uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
             while(to < to_end && from < from_end)
             {
 #ifdef DEBUG_CODECVT
                 std::cout << "Entering IN--------------" << std::endl;
                 std::cout << "State " << std::hex << state <<std::endl;
                 std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
 #endif
                 char const *from_saved = from;
                 uint32_t ch=cvt->to_unicode(from,from_end);
                 if(ch==base_converter::illegal) {
                     r=std::codecvt_base::error;
                     break;
                 }
                 if(ch==base_converter::incomplete) {
                     r=std::codecvt_base::partial;
                     break;
                 }
                 // Normal codepoints go direcly to stream
                 if(ch <= 0xFFFF) {
                     *to++=ch;
                 }
                 else {
                     // for  other codepoints we do following
                     //
                     // 1. We can't consume our input as we may find ourselfs
                     //    in state where all input consumed but not all output written,i.e. only
                     //    1st pair is written
                     // 2. We only write first pair and mark this in the state, we also revert back
                     //    the from pointer in order to make sure this codepoint would be read
                     //    once again and then we would consume our input together with writing
                     //    second surrogate pair
                     ch-=0x10000;
                     uint16_t vh = ch >> 10;
                     uint16_t vl = ch & 0x3FF;
                     uint16_t w1 = vh + 0xD800;
                     uint16_t w2 = vl + 0xDC00;
                     if(state == 0) {
                         from = from_saved;
                         *to++ = w1;
                         state = 1;
                     }
                     else {
                         *to++ = w2;
                         state = 0;
                     }
                 }
             }
             from_next=from;
             to_next=to;
             if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
                 r = std::codecvt_base::partial;
 #ifdef DEBUG_CODECVT
             std::cout << "Returning ";
             switch(r) {
             case std::codecvt_base::ok:
                 std::cout << "ok" << std::endl;
                 break;
             case std::codecvt_base::partial:
                 std::cout << "partial" << std::endl;
                 break;
             case std::codecvt_base::error:
                 std::cout << "error" << std::endl;
                 break;
             default:
                 std::cout << "other" << std::endl;
                 break;
             }
             std::cout << "State " << std::hex << state <<std::endl;
             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
 #endif
             return r;
         }

         //encoding// Implementation for UTF-16
         //
         std::codecvt_base::result
         do_real_out(std::mbstate_t &std_state,
                     uint16_t const *from,
                     uint16_t const *from_end,
                     uint16_t const *&from_next,
                     char *to,
                     char *to_end,
                     char *&to_next) const
         {
             std::auto_ptr<base_converter> cvtp;
             base_converter *cvt = 0;
             if(cvt_->is_thread_safe()) {
                 cvt = cvt_.get();
             }
             else {
                 cvtp.reset(cvt_->clone());
                 cvt = cvtp.get();
             }
             std::codecvt_base::result r=std::codecvt_base::ok;
             // mbstate_t is POD type and should be initialized to 0 (i.a. state = stateT())
             // according to standard. We assume that sizeof(mbstate_t) >=2 in order
             // to be able to store first observerd surrogate pair
             //
             // State: state!=0 - a first surrogate pair was observerd (state = first pair),
             // we expect the second one to come and then zero the state
             ///
             uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
             while(to < to_end && from < from_end)
             {
 #ifdef DEBUG_CODECVT
             std::cout << "Entering OUT --------------" << std::endl;
             std::cout << "State " << std::hex << state <<std::endl;
             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
 #endif
                 uint32_t ch=0;
                 if(state != 0) {
                     // if the state idecates that 1st surrogate pair was written
                     // we should make sure that the second one that comes is actually
                     // second surrogate
                     uint16_t w1 = state;
                     uint16_t w2 = *from;
                     // we don't forward from as writing may fail to incomplete or
                     // partial conversion
                     if(0xDC00 <= w2 && w2<=0xDFFF) {
                         uint16_t vh = w1 - 0xD800;
                         uint16_t vl = w2 - 0xDC00;
                         ch=((uint32_t(vh) << 10)  | vl) + 0x10000;
                     }
                     else {
                         // Invalid surrogate
                         r=std::codecvt_base::error;
                         break;
                     }
                 }
                 else {
                     ch = *from;
                     if(0xD800 <= ch && ch<=0xDBFF) {
                         // if this is a first surrogate pair we put
                         // it into the state and consume it, note we don't
                         // go forward as it should be illegal so we increase
                         // the from pointer manually
                         state = ch;
                         from++;
                         continue;
                     }
                     else if(0xDC00 <= ch && ch<=0xDFFF) {
                         // if we observe second surrogate pair and
                         // first only may be expected we should break from the loop with error
                         // as it is illegal input
                         r=std::codecvt_base::error;
                         break;
                     }
                 }

                 uint32_t len=cvt->from_unicode(ch,to,to_end);
                 if(len==base_converter::illegal) {
                     r=std::codecvt_base::error;
                     break;
                 }
                 if(len==base_converter::incomplete) {
                     r=std::codecvt_base::partial;
                     break;
                 }
                 state = 0;
                 to+=len;
                 from++;
             }
             from_next=from;
             to_next=to;
             if(r==std::codecvt_base::ok && from!=from_end)
                 r = std::codecvt_base::partial;
 #ifdef DEBUG_CODECVT
             std::cout << "Returning ";
             switch(r) {
             case std::codecvt_base::ok:
                 std::cout << "ok" << std::endl;
                 break;
             case std::codecvt_base::partial:
                 std::cout << "partial" << std::endl;
                 break;
             case std::codecvt_base::error:
                 std::cout << "error" << std::endl;
                 break;
             default:
                 std::cout << "other" << std::endl;
                 break;
             }
             std::cout << "State " << std::hex << state <<std::endl;
             std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
 #endif
             return r;
         }

         int max_len_;
         std::auto_ptr<base_converter> cvt_;

     };

     // Compile-time guard: the array bound becomes negative, which is ill-formed,
     // when std::mbstate_t is smaller than the 16 bits code_converter keeps in it.
     static const char ensure_mbstate_size_is_at_least_2[(sizeof(std::mbstate_t) < 2) ? -1 : 1] = {0};

     /// Specialization for char: the converter argument is ignored and every
     /// conversion function is the one inherited from std::codecvt<char,char,std::mbstate_t>.
     template<>
     class code_converter<char> : public std::codecvt<char,char,std::mbstate_t>
     {
         typedef std::codecvt<char,char,std::mbstate_t> base_type;
     public:
         code_converter(std::auto_ptr<base_converter> /*cvt*/,size_t refs = 0)
             : base_type(refs)
         {
         }
     };


     /// Returns a copy of \a in with a code_converter facet installed for the
     /// character type selected by \a type; \a in is returned unchanged for any
     /// type not handled below.  An empty \a cvt is replaced by a plain
     /// base_converter before it is handed to the facet.
     std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type)
     {
         if(cvt.get() == 0)
             cvt.reset(new base_converter());

         if(type == char_facet)
             return std::locale(in,new code_converter<char>(cvt));
         if(type == wchar_t_facet)
             return std::locale(in,new code_converter<wchar_t>(cvt));
         #if defined(BOOST_HAS_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
         if(type == char16_t_facet)
             return std::locale(in,new code_converter<char16_t>(cvt));
         #endif
         #if defined(BOOST_HAS_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
         if(type == char32_t_facet)
             return std::locale(in,new code_converter<char32_t>(cvt));
         #endif
         return in;
     }


 } // util
 } // locale
 } // boost

 // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
	//
	// Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
	//
	// Distributed under the Boost Software License, Version 1.0. (See
	// accompanying file LICENSE_1_0.txt or copy at
	// http://www.boost.org/LICENSE_1_0.txt)
	//
	#define BOOST_LOCALE_SOURCE
	#include <boost/locale/generator.hpp>
	#include <boost/locale/encoding.hpp>

	#include "../encoding/conv.hpp"

	#include <boost/locale/util.hpp>

	#ifdef BOOST_MSVC
	# pragma warning(disable : 4244 4996) // loose data
	#endif

	#include <cstddef>
	#include <string.h>
	#include <vector>
	#include <algorithm>

	//#define DEBUG_CODECVT

	#ifdef DEBUG_CODECVT
	#include <iostream>
	#endif

	namespace boost {
	namespace locale {
	namespace util {

	/// base_converter implementation for UTF-8, delegating the actual
	/// decoding and encoding to utf::utf_traits<char>.
	class utf8_converter : public base_converter {
	public:
	    /// Upper bound on the number of bytes one code point occupies.
	    virtual int max_len() const
	    {
	        return 4;
	    }

	    /// The class declares no data members, so a clone is a fresh instance.
	    virtual utf8_converter *clone() const
	    {
	        return new utf8_converter();
	    }

	    /// Always reports true.
	    bool is_thread_safe() const
	    {
	        return true;
	    }

	    /// Decodes one code point from [begin,end).  \a begin is advanced only
	    /// on success; for utf::illegal / utf::incomplete it is left untouched
	    /// and the matching \c illegal / \c incomplete constant is returned.
	    virtual uint32_t to_unicode(char const *&begin,char const *end)
	    {
	        char const *p=begin;

	        utf::code_point c = utf::utf_traits<char>::decode(p,end);

	        if(c==utf::illegal)
	            return illegal;

	        if(c==utf::incomplete)
	            return incomplete;

	        begin = p;
	        return c;
	    }

	    /// Encodes \a u into [begin,end).  Returns the number of bytes written,
	    /// \c illegal for an invalid code point, or \c incomplete when the
	    /// output range is shorter than the encoded width.
	    virtual uint32_t from_unicode(uint32_t u,char *begin,char const *end)
	    {
	        if(!utf::is_valid_codepoint(u))
	            return illegal;
	        int width = utf::utf_traits<char>::width(u);
	        std::ptrdiff_t d=end-begin;
	        if(d < width)
	            return incomplete;
	        utf::utf_traits<char>::encode(u,begin);
	        return width;
	    }
	}; // utf8_converter

	class simple_converter : public base_converter {
	public:

	virtual ~simple_converter()
	{
	}

	simple_converter(std::string const &encoding)
	{
	for(unsigned i=0;i<128;i++)
	to_unicode_tbl_[i]=i;
	for(unsigned i=128;i<256;i++) {
	char buf[2] = { char(i) , 0 };
	try {
	std::wstring const tmp = conv::to_utf<wchar_t>(buf,buf+1,encoding,conv::stop);
	if(tmp.size() == 1) {
	to_unicode_tbl_[i] = tmp[0];
	}
	else {
	to_unicode_tbl_[i] = illegal;
	}
	}
	catch(conv::conversion_error const &/e/) {
	to_unicode_tbl_[i] = illegal;
	}
	}
	from_unicode_tbl_.resize(256);
	for(unsigned i=0;i<256;i++) {
	from_unicode_tbl_[to_unicode_tbl_[i] & 0xFF].push_back(i);
	}
	}

	virtual int max_len() const
	{
	return 1;
	}

	virtual bool is_thread_safe() const
	{
	return true;
	}
	virtual base_converter *clone() const
	{
	return new simple_converter(*this);
	}
	virtual uint32_t to_unicode(char const &begin,char const end)
	{
	if(begin==end)
	return incomplete;
	unsigned char c = *begin++;
	return to_unicode_tbl_[c];
	}
	virtual uint32_t from_unicode(uint32_t u,char begin,char const end)
	{
	if(begin==end)
	return incomplete;
	std::vector<unsigned char> const &tbl = from_unicode_tbl_[u & 0xFF];
	for(std::vector<unsigned char>::const_iterator p=tbl.begin();p!=tbl.end();++p) {
	if(to_unicode_tbl_[*p]==u) {
	begin++ = p;
	return 1;
	}
	}
	return illegal;
	}
	private:
	uint32_t to_unicode_tbl_[256];
	std::vector<std::vector<unsigned char> > from_unicode_tbl_;
	};

	namespace {
	    // Normalized names of the encodings create_simple_converter() accepts.
	    // The table is searched with std::binary_search using compare_strings,
	    // so the entries have to stay in ascending strcmp() order.
	    char const *simple_encoding_table[] = {
	        "cp1250",
	        "cp1251",
	        "cp1252",
	        "cp1253",
	        "cp1254",
	        "cp1255",
	        "cp1256",
	        "cp1257",
	        "iso88591",
	        "iso885913",
	        "iso885915",
	        "iso88592",
	        "iso88593",
	        "iso88594",
	        "iso88595",
	        "iso88596",
	        "iso88597",
	        "iso88598",
	        "iso88599",
	        "koi8r",
	        "koi8u",
	        "usascii",
	        "windows1250",
	        "windows1251",
	        "windows1252",
	        "windows1253",
	        "windows1254",
	        "windows1255",
	        "windows1256",
	        "windows1257"
	    };

	    // "Less than" for C strings, as decided by strcmp().  The parameters
	    // are pointers to the strings, which is what strcmp() and the
	    // std::binary_search call over simple_encoding_table require.
	    bool compare_strings(char const *l,char const *r)
	    {
	        return strcmp(l,r) < 0;
	    }
	}


	std::auto_ptr<base_converter> create_simple_converter(std::string const &encoding)
	{
	std::auto_ptr<base_converter> res;
	std::string norm = conv::impl::normalize_encoding(encoding.c_str());
	if(std::binary_search<char const **>( simple_encoding_table,
	simple_encoding_table + sizeof(simple_encoding_table)/sizeof(char const *),
	norm.c_str(),
	compare_strings))
	{
	res.reset(new simple_converter(encoding));
	}
	return res;
	}



	/// Returns a newly allocated utf8_converter owned by the returned pointer.
	std::auto_ptr<base_converter> create_utf8_converter()
	{
	    return std::auto_ptr<base_converter>(new utf8_converter());
	}

	//
	// Traits for sizeof char
	//
	// Only declared, never defined: a character type whose size is neither 2
	// nor 4 bytes has no uint_type and fails to compile when it is used.
	template<typename Char,int Size=sizeof(Char)>
	struct uchar_traits;

	// 2-byte character types are processed as uint16_t units.
	template<typename Char>
	struct uchar_traits<Char,2> {
	    typedef uint16_t uint_type;
	};

	// 4-byte character types are processed as uint32_t units.
	template<typename Char>
	struct uchar_traits<Char,4> {
	    typedef uint32_t uint_type;
	};

	// Real codecvt

	template<typename CharType>
	class code_converter : public std::codecvt<CharType,char,std::mbstate_t>
	{
	public:
	code_converter(std::auto_ptr<base_converter> cvt,size_t refs = 0) :
	std::codecvt<CharType,char,std::mbstate_t>(refs),
	cvt_(cvt)
	{
	max_len_ = cvt_->max_len();
	}
	protected:

	typedef CharType uchar;

	virtual std::codecvt_base::result do_unshift(std::mbstate_t &s,char from,char /to/,char *&next) const
	{
	uint16_t &state = reinterpret_cast<uint16_t >(&s);
	#ifdef DEBUG_CODECVT
	std::cout << "Entering unshift " << std::hex << state << std::dec << std::endl;
	#endif
	if(state != 0)
	return std::codecvt_base::error;
	next=from;
	return std::codecvt_base::ok;
	}
	virtual int do_encoding() const throw()
	{
	return 0;
	}
	virtual int do_max_length() const throw()
	{
	return max_len_;
	}
	virtual bool do_always_noconv() const throw()
	{
	return false;
	}

	virtual std::codecvt_base::result
	do_in( std::mbstate_t &state,
	char const *from,
	char const *from_end,
	char const *&from_next,
	uchar *uto,
	uchar *uto_end,
	uchar *&uto_next) const
	{
	typedef typename uchar_traits<uchar>::uint_type uint_type;
	uint_type to=reinterpret_cast<uint_type >(uto);
	uint_type to_end=reinterpret_cast<uint_type >(uto_end);
	uint_type &to_next=reinterpret_cast<uint_type &>(uto_next);
	return do_real_in(state,from,from_end,from_next,to,to_end,to_next);
	}

	virtual int
	do_length( std::mbstate_t &state,
	char const *from,
	char const *from_end,
	size_t max) const
	{
	char const *from_next=from;
	std::vector<uchar> chrs(max+1);
	uchar *to=&chrs.front();
	uchar *to_end=to+max;
	uchar *to_next=to;
	do_in(state,from,from_end,from_next,to,to_end,to_next);
	return from_next-from;
	}

	virtual std::codecvt_base::result
	do_out( std::mbstate_t &state,
	uchar const *ufrom,
	uchar const *ufrom_end,
	uchar const *&ufrom_next,
	char *to,
	char *to_end,
	char *&to_next) const
	{
	typedef typename uchar_traits<uchar>::uint_type uint_type;
	uint_type const from=reinterpret_cast<uint_type const >(ufrom);
	uint_type const from_end=reinterpret_cast<uint_type const >(ufrom_end);
	uint_type const &from_next=reinterpret_cast<uint_type const &>(ufrom_next);
	return do_real_out(state,from,from_end,from_next,to,to_end,to_next);
	}


	private:

	//
	// Implementation for UTF-32
	//
	std::codecvt_base::result
	do_real_in( std::mbstate_t &/state/,
	char const *from,
	char const *from_end,
	char const *&from_next,
	uint32_t *to,
	uint32_t *to_end,
	uint32_t *&to_next) const
	{
	std::auto_ptr<base_converter> cvtp;
	base_converter *cvt = 0;
	if(cvt_->is_thread_safe()) {
	cvt = cvt_.get();
	}
	else {
	cvtp.reset(cvt_->clone());
	cvt = cvtp.get();
	}
	std::codecvt_base::result r=std::codecvt_base::ok;
	while(to < to_end && from < from_end)
	{
	uint32_t ch=cvt->to_unicode(from,from_end);
	if(ch==base_converter::illegal) {
	r=std::codecvt_base::error;
	break;
	}
	if(ch==base_converter::incomplete) {
	r=std::codecvt_base::partial;
	break;
	}
	*to++=ch;
	}
	from_next=from;
	to_next=to;
	if(r!=std::codecvt_base::ok)
	return r;
	if(from!=from_end)
	return std::codecvt_base::partial;
	return r;
	}

	//
	// Implementation for UTF-32
	//
	std::codecvt_base::result
	do_real_out(std::mbstate_t &/state/, // state is not used there
	uint32_t const *from,
	uint32_t const *from_end,
	uint32_t const *&from_next,
	char *to,
	char *to_end,
	char *&to_next) const
	{
	std::auto_ptr<base_converter> cvtp;
	base_converter *cvt = 0;
	if(cvt_->is_thread_safe()) {
	cvt = cvt_.get();
	}
	else {
	cvtp.reset(cvt_->clone());
	cvt = cvtp.get();
	}

	std::codecvt_base::result r=std::codecvt_base::ok;
	while(to < to_end && from < from_end)
	{
	uint32_t len=cvt->from_unicode(*from,to,to_end);
	if(len==base_converter::illegal) {
	r=std::codecvt_base::error;
	break;
	}
	if(len==base_converter::incomplete) {
	r=std::codecvt_base::partial;
	break;
	}
	from++;
	to+=len;
	}
	from_next=from;
	to_next=to;
	if(r!=std::codecvt_base::ok)
	return r;
	if(from!=from_end)
	return std::codecvt_base::partial;
	return r;
	}

	//
	// Implementation for UTF-16
	//
	/// Decode narrow characters from [from, from_end) into UTF-16 code units
	/// written to [to, to_end).
	///
	/// Code points up to 0xFFFF are written as a single unit. Larger code points
	/// are written as a surrogate pair over two loop iterations; a 0/1 flag kept
	/// in std_state records whether the first surrogate has already been written.
	///
	/// \param std_state conversion state; its first 16 bits hold the flag
	/// \param from_next set to the first char that was not consumed
	/// \param to_next   set to one past the last code unit written
	/// \return error   - illegal input sequence,
	///         partial - incomplete input, input remaining, or a surrogate pair
	///                   that is only half written,
	///         ok      - all input consumed and no pair pending.
	std::codecvt_base::result
	do_real_in( std::mbstate_t &std_state,
	            char const *from,
	            char const *from_end,
	            char const *&from_next,
	            uint16_t *to,
	            uint16_t *to_end,
	            uint16_t *&to_next) const
	{
	    // Use the shared converter directly when it is thread safe,
	    // otherwise work on a private clone owned by cvtp.
	    std::auto_ptr<base_converter> cvtp;
	    base_converter *cvt = 0;
	    if(cvt_->is_thread_safe()) {
	        cvt = cvt_.get();
	    }
	    else {
	        cvtp.reset(cvt_->clone());
	        cvt = cvtp.get();
	    }
	    std::codecvt_base::result r=std::codecvt_base::ok;
	    // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
	    // according to the standard. We use it to keep a 0/1 flag for surrogate pair writing:
	    //
	    // 0 - no pending pair; 1 - a code point above 0xFFFF was observed and the
	    // first surrogate has been written, but no input has been consumed yet
	    uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
	    while(to < to_end && from < from_end)
	    {
	#ifdef DEBUG_CODECVT
	        std::cout << "Entering IN--------------" << std::endl;
	        std::cout << "State " << std::hex << state <<std::endl;
	        std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
	#endif
	        char const *from_saved = from;
	        uint32_t ch=cvt->to_unicode(from,from_end);
	        if(ch==base_converter::illegal) {
	            r=std::codecvt_base::error;
	            break;
	        }
	        if(ch==base_converter::incomplete) {
	            r=std::codecvt_base::partial;
	            break;
	        }
	        // Normal code points go directly to the stream
	        if(ch <= 0xFFFF) {
	            *to++=ch;
	        }
	        else {
	            // For other code points we do the following:
	            //
	            // 1. We can't consume our input, as we may find ourselves
	            //    in a state where all input is consumed but not all output
	            //    is written, i.e. only the first surrogate is written.
	            // 2. We only write the first surrogate and mark this in the state;
	            //    we also revert the from pointer so that this code point is
	            //    read once again, and then we consume the input together
	            //    with writing the second surrogate.
	            ch-=0x10000;
	            uint16_t vh = ch >> 10;
	            uint16_t vl = ch & 0x3FF;
	            uint16_t w1 = vh + 0xD800;
	            uint16_t w2 = vl + 0xDC00;
	            if(state == 0) {
	                from = from_saved;
	                *to++ = w1;
	                state = 1;
	            }
	            else {
	                *to++ = w2;
	                state = 0;
	            }
	        }
	    }
	    from_next=from;
	    to_next=to;
	    // A pending second surrogate (state!=0) also means the conversion is not finished
	    if(r == std::codecvt_base::ok && (from!=from_end || state!=0))
	        r = std::codecvt_base::partial;
	#ifdef DEBUG_CODECVT
	    std::cout << "Returning ";
	    switch(r) {
	    case std::codecvt_base::ok:
	        std::cout << "ok" << std::endl;
	        break;
	    case std::codecvt_base::partial:
	        std::cout << "partial" << std::endl;
	        break;
	    case std::codecvt_base::error:
	        std::cout << "error" << std::endl;
	        break;
	    default:
	        std::cout << "other" << std::endl;
	        break;
	    }
	    std::cout << "State " << std::hex << state <<std::endl;
	    std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
	#endif
	    return r;
	}

	//
	// Implementation for UTF-16
	//
	/// Encode UTF-16 code units from [from, from_end) into narrow characters
	/// written to [to, to_end).
	///
	/// A lead surrogate (0xD800-0xDBFF) is consumed and remembered in std_state;
	/// the following trail surrogate (0xDC00-0xDFFF) is combined with it into one
	/// code point, which is then passed to base_converter::from_unicode().
	///
	/// \param std_state conversion state; its first 16 bits hold the pending
	///                  lead surrogate, or 0 when none is pending
	/// \param from_next set to the first code unit that was not consumed
	/// \param to_next   set to one past the last char written
	/// \return error   - unpaired surrogate or unrepresentable code point,
	///         partial - output buffer too small or input remaining,
	///         ok      - all input consumed.
	///
	/// NOTE(review): if the input ends right after a lead surrogate, that unit is
	/// consumed into the state and ok is returned - confirm callers expect this.
	std::codecvt_base::result
	do_real_out(std::mbstate_t &std_state,
	            uint16_t const *from,
	            uint16_t const *from_end,
	            uint16_t const *&from_next,
	            char *to,
	            char *to_end,
	            char *&to_next) const
	{
	    // Use the shared converter directly when it is thread safe,
	    // otherwise work on a private clone owned by cvtp.
	    std::auto_ptr<base_converter> cvtp;
	    base_converter *cvt = 0;
	    if(cvt_->is_thread_safe()) {
	        cvt = cvt_.get();
	    }
	    else {
	        cvtp.reset(cvt_->clone());
	        cvt = cvtp.get();
	    }
	    std::codecvt_base::result r=std::codecvt_base::ok;
	    // mbstate_t is a POD type and should be initialized to 0 (i.e. state = stateT())
	    // according to the standard. We assume that sizeof(mbstate_t) >= 2 in order
	    // to be able to store the first observed surrogate.
	    //
	    // State: state!=0 - a first surrogate was observed (state = first surrogate),
	    // we expect the second one to come and then zero the state
	    //
	    uint16_t &state = *reinterpret_cast<uint16_t *>(&std_state);
	    while(to < to_end && from < from_end)
	    {
	#ifdef DEBUG_CODECVT
	        std::cout << "Entering OUT --------------" << std::endl;
	        std::cout << "State " << std::hex << state <<std::endl;
	        std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
	#endif
	        uint32_t ch=0;
	        if(state != 0) {
	            // if the state indicates that the first surrogate was seen
	            // we should make sure that the unit that comes next is actually
	            // a second surrogate
	            uint16_t w1 = state;
	            uint16_t w2 = *from;
	            // we don't advance from yet, as writing may fail with an
	            // incomplete or partial conversion
	            if(0xDC00 <= w2 && w2<=0xDFFF) {
	                uint16_t vh = w1 - 0xD800;
	                uint16_t vl = w2 - 0xDC00;
	                ch=((uint32_t(vh) << 10) | vl) + 0x10000;
	            }
	            else {
	                // Invalid surrogate
	                r=std::codecvt_base::error;
	                break;
	            }
	        }
	        else {
	            ch = *from;
	            if(0xD800 <= ch && ch<=0xDBFF) {
	                // if this is a first surrogate we put it into the state
	                // and consume it; note we don't pass it to the converter
	                // as it would be illegal on its own, so we advance
	                // the from pointer manually
	                state = ch;
	                from++;
	                continue;
	            }
	            else if(0xDC00 <= ch && ch<=0xDFFF) {
	                // if we observe a second surrogate where only a first
	                // one may be expected we break from the loop with an error,
	                // as it is illegal input
	                r=std::codecvt_base::error;
	                break;
	            }
	        }

	        uint32_t len=cvt->from_unicode(ch,to,to_end);
	        if(len==base_converter::illegal) {
	            r=std::codecvt_base::error;
	            break;
	        }
	        if(len==base_converter::incomplete) {
	            // state is left untouched so the pair can be retried with more room
	            r=std::codecvt_base::partial;
	            break;
	        }
	        state = 0;
	        to+=len;
	        from++;
	    }
	    from_next=from;
	    to_next=to;
	    if(r==std::codecvt_base::ok && from!=from_end)
	        r = std::codecvt_base::partial;
	#ifdef DEBUG_CODECVT
	    std::cout << "Returning ";
	    switch(r) {
	    case std::codecvt_base::ok:
	        std::cout << "ok" << std::endl;
	        break;
	    case std::codecvt_base::partial:
	        std::cout << "partial" << std::endl;
	        break;
	    case std::codecvt_base::error:
	        std::cout << "error" << std::endl;
	        break;
	    default:
	        std::cout << "other" << std::endl;
	        break;
	    }
	    std::cout << "State " << std::hex << state <<std::endl;
	    std::cout << "Left in " << std::dec << from_end - from << " out " << to_end -to << std::endl;
	#endif
	    return r;
	}

	// NOTE(review): presumably caches the converter's max_len(); it is not
	// used in the visible part of the class - confirm against the constructor.
	int max_len_;
	// Converter that performs the actual char <-> code point conversion.
	// The do_real_in/do_real_out methods use it directly when it reports
	// is_thread_safe(), otherwise they clone it for each call.
	std::auto_ptr<base_converter> cvt_;

	};

	// Compile-time check: the array size becomes -1 (ill-formed) when
	// std::mbstate_t is smaller than 2 bytes. The UTF-16 conversion code
	// above keeps a 16-bit value inside the mbstate_t object.
	static const char ensure_mbstate_size_is_at_least_2[sizeof(std::mbstate_t) >= 2 ? 1 : -1] = {0};

	/// Specialization for narrow characters: char to char needs no conversion,
	/// so this facet only forwards to std::codecvt<char,char,std::mbstate_t>.
	template<>
	class code_converter<char> : public std::codecvt<char,char,std::mbstate_t>
	{
	public:
	    /// The converter argument is accepted so that all specializations can be
	    /// constructed the same way, but it is not used here. Because it is
	    /// taken by value, the passed converter is released when the
	    /// constructor call finishes.
	    ///
	    /// \param refs forwarded to the std::codecvt constructor
	    code_converter(std::auto_ptr<base_converter> /*cvt*/,size_t refs = 0) :
	        std::codecvt<char,char,std::mbstate_t>(refs)
	    {
	    }
	};


	/// Build a copy of locale \a in with a code_converter facet installed for the
	/// character type selected by \a type.
	///
	/// \param in   locale to extend
	/// \param cvt  converter handed over to the new facet; when null a
	///             default-constructed base_converter is used instead
	/// \param type which character type the facet is created for
	/// \return the extended locale, or \a in unchanged when \a type is not one
	///         of the facet types handled here
	std::locale create_codecvt(std::locale const &in,std::auto_ptr<base_converter> cvt,character_facet_type type)
	{
	    // Fall back to the default converter when the caller passed none
	    if(cvt.get() == 0)
	        cvt.reset(new base_converter());

	    if(type == char_facet)
	        return std::locale(in,new code_converter<char>(cvt));

	    if(type == wchar_t_facet)
	        return std::locale(in,new code_converter<wchar_t>(cvt));

	#if defined(BOOST_HAS_CHAR16_T) && !defined(BOOST_NO_CHAR16_T_CODECVT)
	    if(type == char16_t_facet)
	        return std::locale(in,new code_converter<char16_t>(cvt));
	#endif

	#if defined(BOOST_HAS_CHAR32_T) && !defined(BOOST_NO_CHAR32_T_CODECVT)
	    if(type == char32_t_facet)
	        return std::locale(in,new code_converter<char32_t>(cvt));
	#endif

	    // Any other facet type: nothing to install
	    return in;
	}


	} // util
	} // locale
	} // boost

	// vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4