libstdc++
codecvt_specializations.h
Go to the documentation of this file.
00001 // Locale support (codecvt) -*- C++ -*-
00002 
00003 // Copyright (C) 2000-2013 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 //
00026 // ISO C++ 14882: 22.2.1.5 Template class codecvt
00027 //
00028 
00029 // Written by Benjamin Kosnik <bkoz@redhat.com>
00030 
00031 /** @file ext/codecvt_specializations.h
00032  *  This file is a GNU extension to the Standard C++ Library.
00033  */
00034 
00035 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
00036 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
00037 
00038 #include <bits/c++config.h>
00039 #include <locale>
00040 #include <iconv.h>
00041 
00042 namespace __gnu_cxx _GLIBCXX_VISIBILITY(default)
00043 {
00044 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00045 
00046   /// Extension to use iconv for dealing with character encodings.
00047   // This includes conversions and comparisons between various character
00048   // sets.  This object encapsulates data that may need to be shared between
00049   // char_traits, codecvt and ctype.
00050   class encoding_state
00051   {
00052   public:
00053     // Types: 
00054     // NB: A conversion descriptor subsumes and enhances the
00055     // functionality of a simple state type such as mbstate_t.
00056     typedef iconv_t descriptor_type;
00057     
00058   protected:
00059     // Name of internal character set encoding.
00060     std::string         _M_int_enc;
00061 
00062     // Name of external character set encoding.
00063     std::string     _M_ext_enc;
00064 
00065     // Conversion descriptor between external encoding to internal encoding.
00066     descriptor_type _M_in_desc;
00067 
00068     // Conversion descriptor between internal encoding to external encoding.
00069     descriptor_type _M_out_desc;
00070 
00071     // The byte-order marker for the external encoding, if necessary.
00072     int         _M_ext_bom;
00073 
00074     // The byte-order marker for the internal encoding, if necessary.
00075     int         _M_int_bom;
00076 
00077     // Number of external bytes needed to construct one complete
00078     // character in the internal encoding.
00079     // NB: -1 indicates variable, or stateful, encodings.
00080     int         _M_bytes;
00081 
00082   public:
00083     explicit 
00084     encoding_state() 
00085     : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
00086     { }
00087 
00088     explicit 
00089     encoding_state(const char* __int, const char* __ext, 
00090            int __ibom = 0, int __ebom = 0, int __bytes = 1)
00091     : _M_int_enc(__int), _M_ext_enc(__ext), _M_in_desc(0), _M_out_desc(0), 
00092       _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
00093     { init(); }
00094 
00095     // 21.1.2 traits typedefs
00096     // p4
00097     // typedef STATE_T state_type
00098     // requires: state_type shall meet the requirements of
00099     // CopyConstructible types (20.1.3)
00100     // NB: This does not preserve the actual state of the conversion
00101     // descriptor member, but it does duplicate the encoding
00102     // information.
00103     encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
00104     { construct(__obj); }
00105 
00106     // Need assignment operator as well.
00107     encoding_state&
00108     operator=(const encoding_state& __obj)
00109     {
00110       construct(__obj);
00111       return *this;
00112     }
00113 
00114     ~encoding_state()
00115     { destroy(); } 
00116 
00117     bool
00118     good() const throw()
00119     { 
00120       const descriptor_type __err = (iconv_t)(-1);
00121       bool __test = _M_in_desc && _M_in_desc != __err; 
00122       __test &=  _M_out_desc && _M_out_desc != __err;
00123       return __test;
00124     }
00125     
00126     int
00127     character_ratio() const
00128     { return _M_bytes; }
00129 
00130     const std::string
00131     internal_encoding() const
00132     { return _M_int_enc; }
00133 
00134     int 
00135     internal_bom() const
00136     { return _M_int_bom; }
00137 
00138     const std::string
00139     external_encoding() const
00140     { return _M_ext_enc; }
00141 
00142     int 
00143     external_bom() const
00144     { return _M_ext_bom; }
00145 
00146     const descriptor_type&
00147     in_descriptor() const
00148     { return _M_in_desc; }
00149 
00150     const descriptor_type&
00151     out_descriptor() const
00152     { return _M_out_desc; }
00153 
00154   protected:
00155     void
00156     init()
00157     {
00158       const descriptor_type __err = (iconv_t)(-1);
00159       const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
00160       if (!_M_in_desc && __have_encodings)
00161     {
00162       _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
00163       if (_M_in_desc == __err)
00164         std::__throw_runtime_error(__N("encoding_state::_M_init "
00165                     "creating iconv input descriptor failed"));
00166     }
00167       if (!_M_out_desc && __have_encodings)
00168     {
00169       _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
00170       if (_M_out_desc == __err)
00171         std::__throw_runtime_error(__N("encoding_state::_M_init "
00172                   "creating iconv output descriptor failed"));
00173     }
00174     }
00175 
00176     void
00177     construct(const encoding_state& __obj)
00178     {
00179       destroy();
00180       _M_int_enc = __obj._M_int_enc;
00181       _M_ext_enc = __obj._M_ext_enc;
00182       _M_ext_bom = __obj._M_ext_bom;
00183       _M_int_bom = __obj._M_int_bom;
00184       _M_bytes = __obj._M_bytes;
00185       init();
00186     }
00187 
00188     void
00189     destroy() throw()
00190     {
00191       const descriptor_type __err = (iconv_t)(-1);
00192       if (_M_in_desc && _M_in_desc != __err) 
00193     {
00194       iconv_close(_M_in_desc);
00195       _M_in_desc = 0;
00196     }
00197       if (_M_out_desc && _M_out_desc != __err) 
00198     {
00199       iconv_close(_M_out_desc);
00200       _M_out_desc = 0;
00201     }
00202     }
00203   };
00204 
00205   /// encoding_char_traits
00206   // Custom traits type with encoding_state for the state type, and the
00207   // associated fpos<encoding_state> for the position type, all other
00208   // bits equivalent to the required char_traits instantiations.
00209   template<typename _CharT>
00210     struct encoding_char_traits : public std::char_traits<_CharT>
00211     {
00212       typedef encoding_state                state_type;
00213       typedef typename std::fpos<state_type>        pos_type;
00214     };
00215 
00216 _GLIBCXX_END_NAMESPACE_VERSION
00217 } // namespace
00218 
00219 
00220 namespace std _GLIBCXX_VISIBILITY(default)
00221 {
00222 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00223 
00224   using __gnu_cxx::encoding_state;
00225 
00226   /// codecvt<InternT, _ExternT, encoding_state> specialization.
00227   // This partial specialization takes advantage of iconv to provide
00228   // code conversions between a large number of character encodings.
00229   template<typename _InternT, typename _ExternT>
00230     class codecvt<_InternT, _ExternT, encoding_state>
00231     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
00232     {
00233     public:      
00234       // Types:
00235       typedef codecvt_base::result          result;
00236       typedef _InternT                  intern_type;
00237       typedef _ExternT                  extern_type;
00238       typedef __gnu_cxx::encoding_state         state_type;
00239       typedef state_type::descriptor_type       descriptor_type;
00240 
00241       // Data Members:
00242       static locale::id         id;
00243 
00244       explicit 
00245       codecvt(size_t __refs = 0)
00246       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00247       { }
00248 
00249       explicit 
00250       codecvt(state_type& __enc, size_t __refs = 0)
00251       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00252       { }
00253 
00254      protected:
00255       virtual 
00256       ~codecvt() { }
00257 
00258       virtual result
00259       do_out(state_type& __state, const intern_type* __from, 
00260          const intern_type* __from_end, const intern_type*& __from_next,
00261          extern_type* __to, extern_type* __to_end,
00262          extern_type*& __to_next) const;
00263 
00264       virtual result
00265       do_unshift(state_type& __state, extern_type* __to, 
00266          extern_type* __to_end, extern_type*& __to_next) const;
00267 
00268       virtual result
00269       do_in(state_type& __state, const extern_type* __from, 
00270         const extern_type* __from_end, const extern_type*& __from_next,
00271         intern_type* __to, intern_type* __to_end, 
00272         intern_type*& __to_next) const;
00273 
00274       virtual int 
00275       do_encoding() const throw();
00276 
00277       virtual bool 
00278       do_always_noconv() const throw();
00279 
00280       virtual int 
00281       do_length(state_type&, const extern_type* __from, 
00282         const extern_type* __end, size_t __max) const;
00283 
00284       virtual int 
00285       do_max_length() const throw();
00286     };
00287 
00288   template<typename _InternT, typename _ExternT>
00289     locale::id 
00290     codecvt<_InternT, _ExternT, encoding_state>::id;
00291 
00292   // This adaptor works around the signature problems of the second
00293   // argument to iconv():  SUSv2 and others use 'const char**', but glibc 2.2
00294   // uses 'char**', which matches the POSIX 1003.1-2001 standard.
00295   // Using this adaptor, g++ will do the work for us.
00296   template<typename _Tp>
00297     inline size_t
00298     __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
00299                     iconv_t __cd, char** __inbuf, size_t* __inbytes,
00300                     char** __outbuf, size_t* __outbytes)
00301     { return __func(__cd, (_Tp)__inbuf, __inbytes, __outbuf, __outbytes); }
00302 
00303   template<typename _InternT, typename _ExternT>
00304     codecvt_base::result
00305     codecvt<_InternT, _ExternT, encoding_state>::
00306     do_out(state_type& __state, const intern_type* __from, 
00307        const intern_type* __from_end, const intern_type*& __from_next,
00308        extern_type* __to, extern_type* __to_end,
00309        extern_type*& __to_next) const
00310     {
00311       result __ret = codecvt_base::error;
00312       if (__state.good())
00313     {
00314       const descriptor_type& __desc = __state.out_descriptor();
00315       const size_t __fmultiple = sizeof(intern_type);
00316       size_t __fbytes = __fmultiple * (__from_end - __from);
00317       const size_t __tmultiple = sizeof(extern_type);
00318       size_t __tbytes = __tmultiple * (__to_end - __to); 
00319       
00320       // Argument list for iconv specifies a byte sequence. Thus,
00321       // all to/from arrays must be brutally casted to char*.
00322       char* __cto = reinterpret_cast<char*>(__to);
00323       char* __cfrom;
00324       size_t __conv;
00325 
00326       // Some encodings need a byte order marker as the first item
00327       // in the byte stream, to designate endian-ness. The default
00328       // value for the byte order marker is NULL, so if this is
00329       // the case, it's not necessary and we can just go on our
00330       // merry way.
00331       int __int_bom = __state.internal_bom();
00332       if (__int_bom)
00333         {     
00334           size_t __size = __from_end - __from;
00335           intern_type* __cfixed = static_cast<intern_type*>
00336         (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
00337           __cfixed[0] = static_cast<intern_type>(__int_bom);
00338           char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
00339           __cfrom = reinterpret_cast<char*>(__cfixed);
00340           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00341                                         &__fbytes, &__cto, &__tbytes); 
00342         }
00343       else
00344         {
00345           intern_type* __cfixed = const_cast<intern_type*>(__from);
00346           __cfrom = reinterpret_cast<char*>(__cfixed);
00347           __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 
00348                        &__cto, &__tbytes); 
00349         }
00350 
00351       if (__conv != size_t(-1))
00352         {
00353           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00354           __to_next = reinterpret_cast<extern_type*>(__cto);
00355           __ret = codecvt_base::ok;
00356         }
00357       else 
00358         {
00359           if (__fbytes < __fmultiple * (__from_end - __from))
00360         {
00361           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00362           __to_next = reinterpret_cast<extern_type*>(__cto);
00363           __ret = codecvt_base::partial;
00364         }
00365           else
00366         __ret = codecvt_base::error;
00367         }
00368     }
00369       return __ret; 
00370     }
00371 
00372   template<typename _InternT, typename _ExternT>
00373     codecvt_base::result
00374     codecvt<_InternT, _ExternT, encoding_state>::
00375     do_unshift(state_type& __state, extern_type* __to, 
00376            extern_type* __to_end, extern_type*& __to_next) const
00377     {
00378       result __ret = codecvt_base::error;
00379       if (__state.good())
00380     {
00381       const descriptor_type& __desc = __state.in_descriptor();
00382       const size_t __tmultiple = sizeof(intern_type);
00383       size_t __tlen = __tmultiple * (__to_end - __to); 
00384       
00385       // Argument list for iconv specifies a byte sequence. Thus,
00386       // all to/from arrays must be brutally casted to char*.
00387       char* __cto = reinterpret_cast<char*>(__to);
00388       size_t __conv = __iconv_adaptor(iconv,__desc, 0, 0,
00389                                           &__cto, &__tlen); 
00390       
00391       if (__conv != size_t(-1))
00392         {
00393           __to_next = reinterpret_cast<extern_type*>(__cto);
00394           if (__tlen == __tmultiple * (__to_end - __to))
00395         __ret = codecvt_base::noconv;
00396           else if (__tlen == 0)
00397         __ret = codecvt_base::ok;
00398           else
00399         __ret = codecvt_base::partial;
00400         }
00401       else 
00402         __ret = codecvt_base::error;
00403     }
00404       return __ret; 
00405     }
00406    
00407   template<typename _InternT, typename _ExternT>
00408     codecvt_base::result
00409     codecvt<_InternT, _ExternT, encoding_state>::
00410     do_in(state_type& __state, const extern_type* __from, 
00411       const extern_type* __from_end, const extern_type*& __from_next,
00412       intern_type* __to, intern_type* __to_end, 
00413       intern_type*& __to_next) const
00414     { 
00415       result __ret = codecvt_base::error;
00416       if (__state.good())
00417     {
00418       const descriptor_type& __desc = __state.in_descriptor();
00419       const size_t __fmultiple = sizeof(extern_type);
00420       size_t __flen = __fmultiple * (__from_end - __from);
00421       const size_t __tmultiple = sizeof(intern_type);
00422       size_t __tlen = __tmultiple * (__to_end - __to); 
00423       
00424       // Argument list for iconv specifies a byte sequence. Thus,
00425       // all to/from arrays must be brutally casted to char*.
00426       char* __cto = reinterpret_cast<char*>(__to);
00427       char* __cfrom;
00428       size_t __conv;
00429 
00430       // Some encodings need a byte order marker as the first item
00431       // in the byte stream, to designate endian-ness. The default
00432       // value for the byte order marker is NULL, so if this is
00433       // the case, it's not necessary and we can just go on our
00434       // merry way.
00435       int __ext_bom = __state.external_bom();
00436       if (__ext_bom)
00437         {     
00438           size_t __size = __from_end - __from;
00439           extern_type* __cfixed =  static_cast<extern_type*>
00440         (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
00441           __cfixed[0] = static_cast<extern_type>(__ext_bom);
00442           char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
00443           __cfrom = reinterpret_cast<char*>(__cfixed);
00444           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00445                                        &__flen, &__cto, &__tlen); 
00446         }
00447       else
00448         {
00449           extern_type* __cfixed = const_cast<extern_type*>(__from);
00450           __cfrom = reinterpret_cast<char*>(__cfixed);
00451           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00452                                        &__flen, &__cto, &__tlen); 
00453         }
00454 
00455       
00456       if (__conv != size_t(-1))
00457         {
00458           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00459           __to_next = reinterpret_cast<intern_type*>(__cto);
00460           __ret = codecvt_base::ok;
00461         }
00462       else 
00463         {
00464           if (__flen < static_cast<size_t>(__from_end - __from))
00465         {
00466           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00467           __to_next = reinterpret_cast<intern_type*>(__cto);
00468           __ret = codecvt_base::partial;
00469         }
00470           else
00471         __ret = codecvt_base::error;
00472         }
00473     }
00474       return __ret; 
00475     }
00476   
00477   template<typename _InternT, typename _ExternT>
00478     int 
00479     codecvt<_InternT, _ExternT, encoding_state>::
00480     do_encoding() const throw()
00481     {
00482       int __ret = 0;
00483       if (sizeof(_ExternT) <= sizeof(_InternT))
00484     __ret = sizeof(_InternT) / sizeof(_ExternT);
00485       return __ret; 
00486     }
00487   
00488   template<typename _InternT, typename _ExternT>
00489     bool 
00490     codecvt<_InternT, _ExternT, encoding_state>::
00491     do_always_noconv() const throw()
00492     { return false; }
00493   
00494   template<typename _InternT, typename _ExternT>
00495     int 
00496     codecvt<_InternT, _ExternT, encoding_state>::
00497     do_length(state_type&, const extern_type* __from, 
00498           const extern_type* __end, size_t __max) const
00499     { return std::min(__max, static_cast<size_t>(__end - __from)); }
00500 
00501   // _GLIBCXX_RESOLVE_LIB_DEFECTS
00502   // 74.  Garbled text for codecvt::do_max_length
00503   template<typename _InternT, typename _ExternT>
00504     int 
00505     codecvt<_InternT, _ExternT, encoding_state>::
00506     do_max_length() const throw()
00507     { return 1; }
00508 
00509 _GLIBCXX_END_NAMESPACE_VERSION
00510 } // namespace
00511 
00512 #endif