// boost_1_45_0/libs/iostreams/test/detail/utf8_codecvt_facet.cpp - nest-learning-thermostat/5.0/boost - Git at Google

 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 // utf8_codecvt_facet.cpp

 // Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
 // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
 // Distributed under the Boost Software License, Version 1.0. (See accompany-
 // ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

 // See http://www.boost.org/libs/iostreams for documentation.

 //#include <cstdlib> // for multi-byte conversion routines

 // Jonathan Turkanis:
 //   - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
 //     BOOST_IOSTREAMS_NO_WIDE_STREAMS;
 //   - Derived from codecvt_helper instead of codecvt.

 #include <boost/config.hpp>
 #include <boost/iostreams/detail/config/wide_streams.hpp>
 #ifdef BOOST_IOSTREAMS_NO_LOCALES
 # error "C++ locales not supported on this platform"
 #else

 #include <cassert>
 #include <cstddef>

 #include <boost/detail/workaround.hpp>
 #include "./utf8_codecvt_facet.hpp"

 #if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
 # pragma warn -sig // Conversion may lose significant digits
 # pragma warn -rng // Constant is out of range in comparison
 #endif

 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 // implementation for wchar_t

 // Translate incoming UTF-8 into UCS-4
 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
     std::mbstate_t&,
     const char * from,
     const char * from_end,
     const char * & from_next,
     wchar_t * to,
     wchar_t * to_end,
     wchar_t * & to_next
 ) const {
     // Basic algorithm:  The first octet determines how many
     // octets total make up the UCS-4 character.  The remaining
     // "continuing octets" all begin with "10". To convert, subtract
     // the amount that specifies the number of octets from the first
     // octet.  Subtract 0x80 (1000 0000) from each continuing octet,
     // then mash the whole lot together.  Note that each continuing
     // octet only uses 6 bits as unique values, so only shift by
     // multiples of 6 to combine.
     while (from != from_end && to != to_end) {

         // Error checking   on the first octet
         if (invalid_leading_octet(*from)){
             from_next = from;
             to_next = to;
             return std::codecvt_base::error;
         }

         // The first octet is   adjusted by a value dependent upon
         // the number   of "continuing octets" encoding the character
         const   int cont_octet_count = get_cont_octet_count(*from);
         const   wchar_t octet1_modifier_table[] =   {
             0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
         };

         // The unsigned char conversion is necessary in case char is
         // signed   (I learned this the hard way)
         wchar_t ucs_result =
             (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];

         // Invariants   :
         //   1) At the start of the loop,   'i' continuing characters have been
         //    processed
         //   2) *from   points to the next continuing character to be processed.
         int i   = 0;
         while(i != cont_octet_count && from != from_end) {

             // Error checking on continuing characters
             if (invalid_continuing_octet(*from)) {
                 from_next   = from;
                 to_next =   to;
                 return std::codecvt_base::error;
             }

             ucs_result *= (1 << 6);

             // each continuing character has an extra (10xxxxxx)b attached to
             // it that must be removed.
             ucs_result += (unsigned char)(*from++) - 0x80;
             ++i;
         }

         // If   the buffer ends with an incomplete unicode character...
         if (from == from_end && i   != cont_octet_count) {
             // rewind "from" to before the current character translation
             from_next = from - (i+1);
             to_next = to;
             return std::codecvt_base::partial;
         }
         *to++   = ucs_result;
     }
     from_next = from;
     to_next = to;

     // Were we done converting or did we run out of destination space?
     if(from == from_end) return std::codecvt_base::ok;
     else return std::codecvt_base::partial;
 }

 std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
     std::mbstate_t &,
     const wchar_t *   from,
     const wchar_t * from_end,
     const wchar_t * & from_next,
     char * to,
     char * to_end,
     char * & to_next
 ) const
 {
     // RG - consider merging this table with the other one
     const wchar_t octet1_modifier_table[] = {
         0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
     };

     while (from != from_end && to != to_end) {

 #define BOOST_NULL // Prevent macro expansion
         // Check for invalid UCS-4 character
         if (*from  > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
             from_next = from;
             to_next = to;
             return std::codecvt_base::error;
         }
 #undef BOOST_NULL

         int cont_octet_count = get_cont_octet_out_count(*from);

         // RG  - comment this formula better
         int shift_exponent = (cont_octet_count) *   6;

         // Process the first character
         *to++ = octet1_modifier_table[cont_octet_count] +
             (unsigned char)(*from / (1 << shift_exponent));

         // Process the continuation characters
         // Invariants: At   the start of the loop:
         //   1) 'i' continuing octets   have been generated
         //   2) '*to'   points to the next location to place an octet
         //   3) shift_exponent is   6 more than needed for the next octet
         int i   = 0;
         while   (i != cont_octet_count && to != to_end) {
             shift_exponent -= 6;
             *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6));
             ++i;
         }
         // If   we filled up the out buffer before encoding the character
         if(to   == to_end && i != cont_octet_count) {
             from_next = from;
             to_next = to - (i+1);
             return std::codecvt_base::partial;
         }
         *from++;
     }
     from_next = from;
     to_next = to;
     // Were we done or did we run out of destination space
     if(from == from_end) return std::codecvt_base::ok;
     else return std::codecvt_base::partial;
 }

 // How many char objects can I process to get <= max_limit
 // wchar_t objects?
 //
 // Walks [from, from_end) one UTF-8 character at a time, using the leading
 // octet of each character to find its length, and stops after max_limit
 // characters or at the first character that does not fit completely in the
 // buffer.
 //
 // Returns the number of octets, counted from 'from', covered by the
 // complete characters that were measured.
 int utf8_codecvt_facet_wchar_t::do_length(
     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
     const char * from,
     const char * from_end,
     std::size_t max_limit
 ) const throw()
 {
     const char * from_next = from;
     std::size_t char_count = 0;

     // Testing from_next < from_end before dereferencing keeps the read
     // inside the buffer.
     while (char_count < max_limit && from_next < from_end) {
         const std::size_t octet_count = get_octet_count(*from_next);

         // The buffer may end with an incomplete character; do not count it.
         if (octet_count > static_cast<std::size_t>(from_end - from_next))
             break;

         from_next += octet_count;
         ++char_count;
     }

     // Distance from the start of the input, not from its end.
     return static_cast<int>(from_next - from);
 }

 // Total number of octets in the UTF-8 character introduced by lead_octet.
 //
 // 0x00-0x7f yields 1; 0xc0-0xfd yields 2 to 6 depending on the run of set
 // bits at the top of the octet.  Any other value trips the assert; when
 // assertions are compiled out it falls through to 6.
 unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
     unsigned char   lead_octet
 ){
     // Top bit clear: a single-octet character.
     if (lead_octet < 0x80) return 1;

     // Anything else must be a multi-octet leading octet.
     assert(lead_octet >= 0xc0 && lead_octet <= 0xfd);

     if (lead_octet >= 0xc0) {
         if (lead_octet < 0xe0) return 2;
         if (lead_octet < 0xf0) return 3;
         if (lead_octet < 0xf8) return 4;
         if (lead_octet < 0xfc) return 5;
     }
     return 6;
 }

 namespace {
 // Number of continuing octets needed to encode 'word', for any wchar_t
 // size other than the one specialized below.  Reports at most two.
 template<std::size_t s>
 int get_cont_octet_out_count_impl(wchar_t word){
     // Each threshold crossed costs one more continuing octet.
     int continuing = 0;
     if (word >= 0x80)  ++continuing;
     if (word >= 0x800) ++continuing;
     return continuing;
 }

 // Note: the following code will generate warnings on some platforms where
 // wchar_t is defined as UCS2.  The warnings are superfluous as
 // the specialization is never instantiated with such compilers.
 //
 // Four-byte wchar_t: reports up to five continuing octets.
 template<>
 int get_cont_octet_out_count_impl<4>(wchar_t word)
 {
     // Each threshold crossed costs one more continuing octet.
     int continuing = 0;
     if (word >= 0x80)      ++continuing;
     if (word >= 0x800)     ++continuing;
     if (word >= 0x10000)   ++continuing;
     if (word >= 0x200000)  ++continuing;
     if (word >= 0x4000000) ++continuing;
     return continuing;
 }

 } // namespace anonymous

 // How many "continuing octets" will be needed for this word
 // ==   total octets - 1.
 //
 // The work is done by get_cont_octet_out_count_impl, selected at compile
 // time by the size of wchar_t.
 int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
     wchar_t word
 ) const {
     const int continuing =
         get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
     return continuing;
 }

 #if 0 // not used?
 /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
 // implementation for char

 // Char-to-char variant of do_in (compiled out by the enclosing "#if 0").
 //
 // Each pass decodes one wide character with base_class::do_in and then
 // re-encodes it with std::wctomb at to_next.  Returns the first non-ok
 // result reported by base_class::do_in, otherwise ok.
 //
 // NOTE(review): 'to' and 'to_end' are never used, so nothing here bounds
 // the writes made through to_next - confirm before re-enabling.
 std::codecvt_base::result utf8_codecvt_facet_char::do_in(
     std::mbstate_t & state,
     const char * from,
     const char * from_end,
     const char * & from_next,
     char * to,
     char * to_end,
     char * & to_next
 ) const
 {
     // NOTE(review): from_next is tested here before this function has
     // assigned it; its value on entry comes from the caller.
     while(from_next < from_end){
         wchar_t w;
         wchar_t *wnext = & w;
         utf8_codecvt_facet_wchar_t::result ucs4_result;
         // Decode into the one-element range [&w, &w + 1).
         // NOTE(review): the input start passed is 'from', which is never
         // advanced in this loop - presumably from_next was intended.
         ucs4_result = base_class::do_in(
             state,
             from, from_end, from_next,
             wnext, wnext + 1, wnext
         );
         if(codecvt_base::ok != ucs4_result)
             return ucs4_result;
         // if the conversion succeeds.
         int length = std::wctomb(to_next, w);
         assert(-1 != length);
         to_next += length;
     }
     return codecvt_base::ok;
 }

 // Char-to-char variant of do_out (compiled out by the enclosing "#if 0").
 //
 // Each pass reads one multibyte character at from_next with std::mbtowc
 // and hands the resulting wide character to base_class::do_out, which
 // writes at to_next.  Returns the first non-ok result reported by
 // base_class::do_out, otherwise ok.
 //
 // NOTE(review): 'from' and 'to' are never used, and from_next / to_next are
 // read before this function assigns them - confirm before re-enabling.
 std::codecvt_base::result utf8_codecvt_facet_char::do_out(
     mbstate_t & state,
     const char * from,
     const char * from_end,
     const char * & from_next,
     char * to,
     char * to_end,
     char * & to_next
 ) const
 {
     while(from_next < from_end){
         wchar_t w;
         // NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible
         // file; the <climits> macro is spelled MB_LEN_MAX - confirm.
         int result = std::mbtowc(&w, from_next,  MB_LENGTH_MAX);
         assert(-1 != result);
         from_next += result;
         utf8_codecvt_facet_wchar_t::result ucs4_result;

         // Encode the single wide character in [&w, &w + 1).
         const wchar_t *wptr = & w;
         ucs4_result = base_class::do_out(
             state,
             wptr, wptr+1, wptr,
             to_next, to_end, to_next
         );
         if(codecvt_base::ok != ucs4_result)
             return ucs4_result;
     }
     return codecvt_base::ok;
 }

 // How many bytes objects can I process to get <= max_limit
 // char objects?
 //
 // Char-to-char variant of do_length (compiled out by the enclosing
 // "#if 0").  Decodes one wide character at a time with base_class::do_in,
 // measures its multibyte length with wctomb, and charges that length
 // against max_limit.  Returns the number of input bytes, counted from the
 // initial from_next, covered by the characters that fit.
 int utf8_codecvt_facet_char::do_length(
     // it seems that the standard doesn't use const so these librarires
     // would be in error
     BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
     utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
     const char * from_next,
     const char * from_end,
     std::size_t max_limit
 ) const
 {
     int total_length = 0;
     // Remember where the input started; from_next is advanced below.
     const char *from = from_next;
     // Work on a copy so the caller's state object is left untouched.
     mbstate_t state = initial_state;
     while(from_next < from_end){
         wchar_t w;
         wchar_t *wnext = & w;
         utf8_codecvt_facet_wchar_t::result ucs4_result;
         // Decode one character into [&w, &w + 1); from_next is updated
         // through the reference argument.
         ucs4_result = base_class::do_in(
             state,
             from_next, from_end, from_next,
             wnext, wnext + 1, wnext
         );

         // Stop at the first character that does not decode cleanly.
         if(codecvt_base::ok != ucs4_result)
             break;

         // NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible
         // file; the <climits> macro is spelled MB_LEN_MAX - confirm.
         char carray[MB_LENGTH_MAX];
         std::size_t count = wctomb(carray, w);
         // Stop once this character would exceed the remaining budget.
         if(count > max_limit)
             break;

         max_limit -= count;
         total_length = from_next - from;
     }
     return total_length;
 }
 #endif

 #endif // BOOST_IOSTREAMS_NO_LOCALES
	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	// utf8_codecvt_facet.cpp

	// Copyright (c) 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu).
	// Distributed under the Boost Software License, Version 1.0. (See accompany-
	// ing file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

	// See http://www.boost.org/libs/iostreams for documentation.

	//#include <cstdlib> // for multi-byte conversion routines

	// Jonathan Turkanis:
	// - Replaced test for BOOST_NO_STD_WSTREAMBUF with test for
	// BOOST_IOSTREAMS_NO_WIDE_STREAMS;
	// - Derived from codecvt_helper instead of codecvt.

	#include <boost/config.hpp>
	#include <boost/iostreams/detail/config/wide_streams.hpp>
	#ifdef BOOST_IOSTREAMS_NO_LOCALES
	# error "C++ locales not supported on this platform"
	#else

	#include <cassert>
	#include <cstddef>

	#include <boost/detail/workaround.hpp>
	#include "./utf8_codecvt_facet.hpp"

	#if BOOST_WORKAROUND(__BORLANDC__, <= 0x600)
	# pragma warn -sig // Conversion may lose significant digits
	# pragma warn -rng // Constant is out of range in comparison
	#endif

	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	// implementation for wchar_t

	// Translate incoming UTF-8 into UCS-4
	std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_in(
	std::mbstate_t&,
	const char * from,
	const char * from_end,
	const char * & from_next,
	wchar_t * to,
	wchar_t * to_end,
	wchar_t * & to_next
	) const {
	// Basic algorithm: The first octet determines how many
	// octets total make up the UCS-4 character. The remaining
	// "continuing octets" all begin with "10". To convert, subtract
	// the amount that specifies the number of octets from the first
	// octet. Subtract 0x80 (1000 0000) from each continuing octet,
	// then mash the whole lot together. Note that each continuing
	// octet only uses 6 bits as unique values, so only shift by
	// multiples of 6 to combine.
	while (from != from_end && to != to_end) {

	// Error checking on the first octet
	if (invalid_leading_octet(*from)){
	from_next = from;
	to_next = to;
	return std::codecvt_base::error;
	}

	// The first octet is adjusted by a value dependent upon
	// the number of "continuing octets" encoding the character
	const int cont_octet_count = get_cont_octet_count(*from);
	const wchar_t octet1_modifier_table[] = {
	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};

	// The unsigned char conversion is necessary in case char is
	// signed (I learned this the hard way)
	wchar_t ucs_result =
	(unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];

	// Invariants :
	// 1) At the start of the loop, 'i' continuing characters have been
	// processed
	// 2) *from points to the next continuing character to be processed.
	int i = 0;
	while(i != cont_octet_count && from != from_end) {

	// Error checking on continuing characters
	if (invalid_continuing_octet(*from)) {
	from_next = from;
	to_next = to;
	return std::codecvt_base::error;
	}

	ucs_result *= (1 << 6);

	// each continuing character has an extra (10xxxxxx)b attached to
	// it that must be removed.
	ucs_result += (unsigned char)(*from++) - 0x80;
	++i;
	}

	// If the buffer ends with an incomplete unicode character...
	if (from == from_end && i != cont_octet_count) {
	// rewind "from" to before the current character translation
	from_next = from - (i+1);
	to_next = to;
	return std::codecvt_base::partial;
	}
	*to++ = ucs_result;
	}
	from_next = from;
	to_next = to;

	// Were we done converting or did we run out of destination space?
	if(from == from_end) return std::codecvt_base::ok;
	else return std::codecvt_base::partial;
	}

	std::codecvt_base::result utf8_codecvt_facet_wchar_t::do_out(
	std::mbstate_t &,
	const wchar_t * from,
	const wchar_t * from_end,
	const wchar_t * & from_next,
	char * to,
	char * to_end,
	char * & to_next
	) const
	{
	// RG - consider merging this table with the other one
	const wchar_t octet1_modifier_table[] = {
	0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};

	while (from != from_end && to != to_end) {

	#define BOOST_NULL // Prevent macro expansion
	// Check for invalid UCS-4 character
	if (*from > std::numeric_limits<wchar_t>::max BOOST_NULL ()) {
	from_next = from;
	to_next = to;
	return std::codecvt_base::error;
	}
	#undef BOOST_NULL

	int cont_octet_count = get_cont_octet_out_count(*from);

	// RG - comment this formula better
	int shift_exponent = (cont_octet_count) * 6;

	// Process the first character
	*to++ = octet1_modifier_table[cont_octet_count] +
	(unsigned char)(*from / (1 << shift_exponent));

	// Process the continuation characters
	// Invariants: At the start of the loop:
	// 1) 'i' continuing octets have been generated
	// 2) '*to' points to the next location to place an octet
	// 3) shift_exponent is 6 more than needed for the next octet
	int i = 0;
	while (i != cont_octet_count && to != to_end) {
	shift_exponent -= 6;
	to++ = 0x80 + ((from / (1 << shift_exponent)) % (1 << 6));
	++i;
	}
	// If we filled up the out buffer before encoding the character
	if(to == to_end && i != cont_octet_count) {
	from_next = from;
	to_next = to - (i+1);
	return std::codecvt_base::partial;
	}
	*from++;
	}
	from_next = from;
	to_next = to;
	// Were we done or did we run out of destination space
	if(from == from_end) return std::codecvt_base::ok;
	else return std::codecvt_base::partial;
	}

	// How many char objects can I process to get <= max_limit
	// wchar_t objects?
	//
	// Walks [from, from_end) one UTF-8 character at a time, using the leading
	// octet of each character to find its length, and stops after max_limit
	// characters or at the first character that does not fit completely in the
	// buffer.
	//
	// Returns the number of octets, counted from 'from', covered by the
	// complete characters that were measured.
	int utf8_codecvt_facet_wchar_t::do_length(
	    BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER std::mbstate_t &,
	    const char * from,
	    const char * from_end,
	    std::size_t max_limit
	) const throw()
	{
	    const char * from_next = from;
	    std::size_t char_count = 0;

	    // Testing from_next < from_end before dereferencing keeps the read
	    // inside the buffer.
	    while (char_count < max_limit && from_next < from_end) {
	        const std::size_t octet_count = get_octet_count(*from_next);

	        // The buffer may end with an incomplete character; do not count it.
	        if (octet_count > static_cast<std::size_t>(from_end - from_next))
	            break;

	        from_next += octet_count;
	        ++char_count;
	    }

	    // Distance from the start of the input, not from its end.
	    return static_cast<int>(from_next - from);
	}

	// Total number of octets in the UTF-8 character introduced by lead_octet.
	//
	// 0x00-0x7f yields 1; 0xc0-0xfd yields 2 to 6 depending on the run of set
	// bits at the top of the octet.  Any other value trips the assert; when
	// assertions are compiled out it falls through to 6.
	unsigned int utf8_codecvt_facet_wchar_t::get_octet_count(
	    unsigned char lead_octet
	){
	    // Top bit clear: a single-octet character.
	    if (lead_octet < 0x80) return 1;

	    // Anything else must be a multi-octet leading octet.
	    assert(lead_octet >= 0xc0 && lead_octet <= 0xfd);

	    if (lead_octet >= 0xc0) {
	        if (lead_octet < 0xe0) return 2;
	        if (lead_octet < 0xf0) return 3;
	        if (lead_octet < 0xf8) return 4;
	        if (lead_octet < 0xfc) return 5;
	    }
	    return 6;
	}

	namespace {
	// Number of continuing octets needed to encode 'word', for any wchar_t
	// size other than the one specialized below.  Reports at most two.
	template<std::size_t s>
	int get_cont_octet_out_count_impl(wchar_t word){
	    // Each threshold crossed costs one more continuing octet.
	    int continuing = 0;
	    if (word >= 0x80)  ++continuing;
	    if (word >= 0x800) ++continuing;
	    return continuing;
	}

	// Note: the following code will generate warnings on some platforms where
	// wchar_t is defined as UCS2. The warnings are superfluous as
	// the specialization is never instantiated with such compilers.
	//
	// Four-byte wchar_t: reports up to five continuing octets.
	template<>
	int get_cont_octet_out_count_impl<4>(wchar_t word)
	{
	    // Each threshold crossed costs one more continuing octet.
	    int continuing = 0;
	    if (word >= 0x80)      ++continuing;
	    if (word >= 0x800)     ++continuing;
	    if (word >= 0x10000)   ++continuing;
	    if (word >= 0x200000)  ++continuing;
	    if (word >= 0x4000000) ++continuing;
	    return continuing;
	}

	} // namespace anonymous

	// How many "continuing octets" will be needed for this word
	// == total octets - 1.
	//
	// The work is done by get_cont_octet_out_count_impl, selected at compile
	// time by the size of wchar_t.
	int utf8_codecvt_facet_wchar_t::get_cont_octet_out_count(
	    wchar_t word
	) const {
	    const int continuing =
	        get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
	    return continuing;
	}

	#if 0 // not used?
	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	// implementation for char

	// Char-to-char variant of do_in (compiled out by the enclosing "#if 0").
	//
	// Each pass decodes one wide character with base_class::do_in and then
	// re-encodes it with std::wctomb at to_next.  Returns the first non-ok
	// result reported by base_class::do_in, otherwise ok.
	//
	// NOTE(review): 'to' and 'to_end' are never used, so nothing here bounds
	// the writes made through to_next - confirm before re-enabling.
	std::codecvt_base::result utf8_codecvt_facet_char::do_in(
	std::mbstate_t & state,
	const char * from,
	const char * from_end,
	const char * & from_next,
	char * to,
	char * to_end,
	char * & to_next
	) const
	{
	// NOTE(review): from_next is tested here before this function has
	// assigned it; its value on entry comes from the caller.
	while(from_next < from_end){
	wchar_t w;
	wchar_t *wnext = & w;
	utf8_codecvt_facet_wchar_t::result ucs4_result;
	// Decode into the one-element range [&w, &w + 1).
	// NOTE(review): the input start passed is 'from', which is never
	// advanced in this loop - presumably from_next was intended.
	ucs4_result = base_class::do_in(
	state,
	from, from_end, from_next,
	wnext, wnext + 1, wnext
	);
	if(codecvt_base::ok != ucs4_result)
	return ucs4_result;
	// if the conversion succeeds.
	int length = std::wctomb(to_next, w);
	assert(-1 != length);
	to_next += length;
	}
	return codecvt_base::ok;
	}

	// Char-to-char variant of do_out (compiled out by the enclosing "#if 0").
	//
	// Each pass reads one multibyte character at from_next with std::mbtowc
	// and hands the resulting wide character to base_class::do_out, which
	// writes at to_next.  Returns the first non-ok result reported by
	// base_class::do_out, otherwise ok.
	//
	// NOTE(review): 'from' and 'to' are never used, and from_next / to_next are
	// read before this function assigns them - confirm before re-enabling.
	std::codecvt_base::result utf8_codecvt_facet_char::do_out(
	mbstate_t & state,
	const char * from,
	const char * from_end,
	const char * & from_next,
	char * to,
	char * to_end,
	char * & to_next
	) const
	{
	while(from_next < from_end){
	wchar_t w;
	// NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible
	// file; the <climits> macro is spelled MB_LEN_MAX - confirm.
	int result = std::mbtowc(&w, from_next, MB_LENGTH_MAX);
	assert(-1 != result);
	from_next += result;
	utf8_codecvt_facet_wchar_t::result ucs4_result;

	// Encode the single wide character in [&w, &w + 1).
	const wchar_t *wptr = & w;
	ucs4_result = base_class::do_out(
	state,
	wptr, wptr+1, wptr,
	to_next, to_end, to_next
	);
	if(codecvt_base::ok != ucs4_result)
	return ucs4_result;
	}
	return codecvt_base::ok;
	}

	// How many bytes objects can I process to get <= max_limit
	// char objects?
	//
	// Char-to-char variant of do_length (compiled out by the enclosing
	// "#if 0").  Decodes one wide character at a time with base_class::do_in,
	// measures its multibyte length with wctomb, and charges that length
	// against max_limit.  Returns the number of input bytes, counted from the
	// initial from_next, covered by the characters that fit.
	int utf8_codecvt_facet_char::do_length(
	// it seems that the standard doesn't use const so these librarires
	// would be in error
	BOOST_IOSTREAMS_CODECVT_CV_QUALIFIER
	utf8_codecvt_facet_wchar_t::mbstate_t & initial_state,
	const char * from_next,
	const char * from_end,
	std::size_t max_limit
	) const
	{
	int total_length = 0;
	// Remember where the input started; from_next is advanced below.
	const char *from = from_next;
	// Work on a copy so the caller's state object is left untouched.
	mbstate_t state = initial_state;
	while(from_next < from_end){
	wchar_t w;
	wchar_t *wnext = & w;
	utf8_codecvt_facet_wchar_t::result ucs4_result;
	// Decode one character into [&w, &w + 1); from_next is updated
	// through the reference argument.
	ucs4_result = base_class::do_in(
	state,
	from_next, from_end, from_next,
	wnext, wnext + 1, wnext
	);

	// Stop at the first character that does not decode cleanly.
	if(codecvt_base::ok != ucs4_result)
	break;

	// NOTE(review): MB_LENGTH_MAX is not defined anywhere in the visible
	// file; the <climits> macro is spelled MB_LEN_MAX - confirm.
	char carray[MB_LENGTH_MAX];
	std::size_t count = wctomb(carray, w);
	// Stop once this character would exceed the remaining budget.
	if(count > max_limit)
	break;

	max_limit -= count;
	total_length = from_next - from;
	}
	return total_length;
	}
	#endif

	#endif // BOOST_IOSTREAMS_NO_LOCALES