MyGUI  3.0.1
MyGUI_UString.h
Go to the documentation of this file.
1 // Modified from OpenGUI under lenient license
2 // Original copyright details and licensing below:
3 // OpenGUI (http://opengui.sourceforge.net)
4 // This source code is released under the BSD License
5 
6 // Permission is given to the Ogre project to use the contents of file within its
7 // source and binary applications, as well as any derivative works, in accordance
8 // with the terms of any license under which Ogre is or will be distributed.
9 //
10 // Ogre may relicense its copy of this file, as well as any OpenGUI released updates
11 // to this file, under any terms that it deems fit, and is not required to maintain
12 // the original BSD licensing terms of this file, however OpenGUI retains the right
13 // to present its copy of this file under the terms of any license under which
14 // OpenGUI is distributed.
15 //
16 // Ogre is not required to release to OpenGUI any future changes that it makes to
17 // this file, and understands and agrees that any such changes that are released
18 // back to OpenGUI will become available under the terms of any license under which
19 // OpenGUI is distributed.
20 //
21 // For brevity, this permission text may be removed from this file if desired.
22 // The original record kept within the SourceForge (http://sourceforge.net/) tracker
23 // is sufficient.
24 //
25 // - Eric Shorkey (zero/zeroskill) <opengui@rightbracket.com> [January 20th, 2007]
26 
27 #ifndef __MYGUI_U_STRING_H__
28 #define __MYGUI_U_STRING_H__
29 
30 #include "MyGUI_Prerequest.h"
31 #include "MyGUI_Types.h"
32 #include "MyGUI_Diagnostic.h"
33 #include "MyGUI_LogManager.h"
34 
35 // these are explained later
36 #include <iterator>
37 #include <string>
38 #include <stdexcept>
39 
40 // this pragma used to avoid warnings from some advanced gcc warnings flags
41 #if MYGUI_COMPILER == MYGUI_COMPILER_GNUC
42 #pragma GCC system_header
43 #endif
44 
45 // Workaround for VC7:
46 // When building with /MD or /MDd, VC7 has both std::basic_string<unsigned short> and
47 // basic_string<__wchar_t> instantiated in msvcprt[d].lib/MSVCP71[D].dll, but the header
48 // files tell the compiler that only one of them exists there (depending on the /Zc:wchar_t
49 // compile option). Because this file uses both of them, the compiler would instantiate the
50 // other one in user object code, which leads to duplicate symbols with msvcprt.lib/MSVCP71[D].dll.
51 //
52 #if MYGUI_COMPILER == MYGUI_COMPILER_MSVC && (1300 <= MYGUI_COMP_VER && MYGUI_COMP_VER <= 1310)
53 
54 # if defined(_DLL_CPPLIB)
55 
56 namespace std
57 {
58  template class _CRTIMP2 basic_string<unsigned short, char_traits<unsigned short>,
59  allocator<unsigned short> >;
60 
61  template class _CRTIMP2 basic_string<__wchar_t, char_traits<__wchar_t>,
62  allocator<__wchar_t> >;
63 }
64 
65 # endif // defined(_DLL_CPPLIB)
66 
67 #endif // MYGUI_COMPILER == MYGUI_COMPILER_MSVC && MYGUI_COMP_VER == 1300
68 
69 
70 namespace MyGUI
71 {
72 
73  /* READ THIS NOTICE BEFORE USING IN YOUR OWN APPLICATIONS
74  =NOTICE=
75  This class is not a complete Unicode solution. It purposefully does not
76  provide certain functionality, such as proper lexical sorting for
77  Unicode values. It does provide comparison operators for the sole purpose
78  of using UString as an index with std::map and other operator< sorted
79  containers, but it should NOT be relied upon for meaningful lexical
80  operations, such as alphabetical sorts. If you need this type of
81  functionality, look into using ICU instead (http://icu.sourceforge.net/).
82 
83  =REQUIREMENTS=
84  There are a few requirements for proper operation. They are fairly small,
85  and shouldn't restrict usage on any reasonable target.
86  * Compiler must support unsigned 16-bit integer types
87  * Compiler must support signed 32-bit integer types
88  * wchar_t must be either UTF-16 or UTF-32 encoding, and specified as such
89  using the WCHAR_UTF16 macro as outlined below.
90  * You must include <iterator>, <string>, and <wchar.h>. Probably more, but
91  these are the most obvious.
92 
93  =REQUIRED PREPROCESSOR MACROS=
94  This class requires two preprocessor macros to be defined in order to
95  work as advertised.
96  INT32 - must be mapped to a signed 32 bit integer (ex. #define INT32 int)
97  UINT16 - must be mapped to an unsigned 16 bit integer (ex. #define UINT16 unsigned short)
98 
99  Additionally, a third macro should be defined to control the evaluation of wchar_t:
100  WCHAR_UTF16 - should be defined when wchar_t represents UTF-16 code points,
101  such as in Windows. Otherwise it is assumed that wchar_t is a 32-bit
102  integer representing UTF-32 code points.
103  */
104 
105  // THIS IS A VERY BRIEF AUTO DETECTION. YOU MAY NEED TO TWEAK THIS
106 #ifdef __STDC_ISO_10646__
107 // for any compiler that provides this, wchar_t is guaranteed to hold any Unicode value with a single code point (32-bit or larger)
108 // so we can safely skip the rest of the testing
109 #else // #ifdef __STDC_ISO_10646__
110 #if defined( __WIN32__ ) || defined( _WIN32 )
111 #define WCHAR_UTF16 // All currently known Windows platforms utilize UTF-16 encoding in wchar_t
112 #else // #if defined( __WIN32__ ) || defined( _WIN32 )
113 #if WCHAR_MAX <= 0xFFFF // this is a last resort fall back test; WCHAR_MAX is defined in <wchar.h>
114 #define WCHAR_UTF16 // best we can tell, wchar_t is not larger than 16-bit
115 #endif // #if WCHAR_MAX <= 0xFFFF
116 #endif // #if defined( __WIN32__ ) || defined( _WIN32 )
117 #endif // #ifdef __STDC_ISO_10646__
118 
119 
120 // MYGUI_IS_NATIVE_WCHAR_T means that wchar_t isn't a typedef of
121 // uint16 or uint32.
122 #if MYGUI_COMPILER == MYGUI_COMPILER_MSVC
123 
124 // Don't define wchar_t related functions since it'll duplicate
125 // with UString::code_point related functions when compile
126 // without /Zc:wchar_t, because in this case both of them are
127 // a typedef of uint16.
128 # if defined(_NATIVE_WCHAR_T_DEFINED)
129 # define MYGUI_IS_NATIVE_WCHAR_T 1
130 # else
131 # define MYGUI_IS_NATIVE_WCHAR_T 0
132 # endif
133 
134 #else // MYGUI_COMPILER != MYGUI_COMPILER_MSVC
135 
136 // Assume wchar_t is a native (distinct) type on all other compilers
137 # define MYGUI_IS_NATIVE_WCHAR_T 1
138 
139 #endif // MYGUI_COMPILER == MYGUI_COMPILER_MSVC
140 
142 
167  class UString
168  {
169  // constants used in UTF-8 conversions
170  static const unsigned char _lead1 = 0xC0; //110xxxxx
171  static const unsigned char _lead1_mask = 0x1F; //00011111
172  static const unsigned char _lead2 = 0xE0; //1110xxxx
173  static const unsigned char _lead2_mask = 0x0F; //00001111
174  static const unsigned char _lead3 = 0xF0; //11110xxx
175  static const unsigned char _lead3_mask = 0x07; //00000111
176  static const unsigned char _lead4 = 0xF8; //111110xx
177  static const unsigned char _lead4_mask = 0x03; //00000011
178  static const unsigned char _lead5 = 0xFC; //1111110x
179  static const unsigned char _lead5_mask = 0x01; //00000001
180  static const unsigned char _cont = 0x80; //10xxxxxx
181  static const unsigned char _cont_mask = 0x3F; //00111111
182 
183  public:
185  typedef size_t size_type;
187  static const size_type npos = ~(size_t)0;
188 
191 
194 
197 
198  typedef std::basic_string<code_point> dstring; // data string
199 
201  typedef std::basic_string<unicode_char> utf32string;
202 
204  class invalid_data: public std::runtime_error
205  { /* i don't know why the beautifier is freaking out on this line */
206  public:
208  explicit invalid_data( const std::string& _Message ): std::runtime_error( _Message )
209  {
210  /* The thing is, Bob, it's not that I'm lazy, it's that I just don't care. */
211  }
212  };
213 
214  //#########################################################################
216  class _base_iterator: public std::iterator<std::random_access_iterator_tag, value_type>
217  { /* i don't know why the beautifier is freaking out on this line */
218  friend class UString;
219  protected:
221  {
222  mString = 0;
223  }
224 
225  void _seekFwd( size_type c )
226  {
227  mIter += c;
228  }
229  void _seekRev( size_type c )
230  {
231  mIter -= c;
232  }
233  void _become( const _base_iterator& i )
234  {
235  mIter = i.mIter;
236  mString = i.mString;
237  }
238  bool _test_begin() const
239  {
240  return mIter == mString->mData.begin();
241  }
242  bool _test_end() const
243  {
244  return mIter == mString->mData.end();
245  }
247  {
248  return mIter - mString->mData.begin();
249  }
250  void _jump_to( size_type index )
251  {
252  mIter = mString->mData.begin() + index;
253  }
254 
256  {
257  size_type current_index = _get_index();
258  return mString->getChar( current_index );
259  }
261  {
262  size_type current_index = _get_index();
263  int change = mString->setChar( current_index, uc );
264  _jump_to( current_index );
265  return change;
266  }
267 
268  void _moveNext()
269  {
270  _seekFwd( 1 ); // move 1 code point forward
271  if ( _test_end() ) return; // exit if we hit the end
272  if ( _utf16_surrogate_follow( mIter[0] ) )
273  {
274  // landing on a follow code point means we might be part of a bigger character
275  // so we test for that
276  code_point lead_half = 0;
277  //NB: we can't possibly be at the beginning here, so no need to test
278  lead_half = mIter[-1]; // check the previous code point to see if we're part of a surrogate pair
279  if ( _utf16_surrogate_lead( lead_half ) )
280  {
281  _seekFwd( 1 ); // if so, then advance 1 more code point
282  }
283  }
284  }
285  void _movePrev()
286  {
287  _seekRev( 1 ); // move 1 code point backwards
288  if ( _test_begin() ) return; // exit if we hit the beginning
289  if ( _utf16_surrogate_follow( mIter[0] ) )
290  {
291  // landing on a follow code point means we might be part of a bigger character
292  // so we test for that
293  code_point lead_half = 0;
294  lead_half = mIter[-1]; // check the previous character to see if we're part of a surrogate pair
295  if ( _utf16_surrogate_lead( lead_half ) )
296  {
297  _seekRev( 1 ); // if so, then rewind 1 more code point
298  }
299  }
300  }
301 
302  dstring::iterator mIter;
304  };
305 
306  //#########################################################################
307  // FORWARD ITERATORS
308  //#########################################################################
309  class _const_fwd_iterator; // forward declaration
310 
313  { /* i don't know why the beautifier is freaking out on this line */
314  friend class _const_fwd_iterator;
315  public:
318  {
319  _become( i );
320  }
321 
324  {
325  _seekFwd( 1 );
326  return *this;
327  }
330  {
331  _fwd_iterator tmp( *this );
332  _seekFwd( 1 );
333  return tmp;
334  }
335 
338  {
339  _seekRev( 1 );
340  return *this;
341  }
344  {
345  _fwd_iterator tmp( *this );
346  _seekRev( 1 );
347  return tmp;
348  }
349 
352  {
353  _fwd_iterator tmp( *this );
354  tmp._seekFwd( n );
355  return tmp;
356  }
358  _fwd_iterator operator+( difference_type n )
359  {
360  _fwd_iterator tmp( *this );
361  if ( n < 0 )
362  tmp._seekRev( -n );
363  else
364  tmp._seekFwd( n );
365  return tmp;
366  }
369  {
370  _fwd_iterator tmp( *this );
371  tmp._seekRev( n );
372  return tmp;
373  }
375  _fwd_iterator operator-( difference_type n )
376  {
377  _fwd_iterator tmp( *this );
378  if ( n < 0 )
379  tmp._seekFwd( -n );
380  else
381  tmp._seekRev( n );
382  return tmp;
383  }
384 
387  {
388  _seekFwd( n );
389  return *this;
390  }
392  _fwd_iterator& operator+=( difference_type n )
393  {
394  if ( n < 0 )
395  _seekRev( -n );
396  else
397  _seekFwd( n );
398  return *this;
399  }
402  {
403  _seekRev( n );
404  return *this;
405  }
407  _fwd_iterator& operator-=( difference_type n )
408  {
409  if ( n < 0 )
410  _seekFwd( -n );
411  else
412  _seekRev( n );
413  return *this;
414  }
415 
418  {
419  return *mIter;
420  }
421 
424  {
425  _fwd_iterator tmp( *this );
426  tmp += n;
427  return *tmp;
428  }
430  value_type& operator[]( difference_type n ) const
431  {
432  _fwd_iterator tmp( *this );
433  tmp += n;
434  return *tmp;
435  }
436 
439  {
440  _moveNext();
441  return *this;
442  }
445  {
446  _movePrev();
447  return *this;
448  }
451  {
452  return _getCharacter();
453  }
456  {
457  return _setCharacter( uc );
458  }
459  };
460 
461 
462  //#########################################################################
465  { /* i don't know why the beautifier is freaking out on this line */
466  public:
469  {
470  _become( i );
471  }
473  {
474  _become( i );
475  }
476 
479  {
480  _seekFwd( 1 );
481  return *this;
482  }
485  {
486  _const_fwd_iterator tmp( *this );
487  _seekFwd( 1 );
488  return tmp;
489  }
490 
493  {
494  _seekRev( 1 );
495  return *this;
496  }
499  {
500  _const_fwd_iterator tmp( *this );
501  _seekRev( 1 );
502  return tmp;
503  }
504 
507  {
508  _const_fwd_iterator tmp( *this );
509  tmp._seekFwd( n );
510  return tmp;
511  }
513  _const_fwd_iterator operator+( difference_type n )
514  {
515  _const_fwd_iterator tmp( *this );
516  if ( n < 0 )
517  tmp._seekRev( -n );
518  else
519  tmp._seekFwd( n );
520  return tmp;
521  }
524  {
525  _const_fwd_iterator tmp( *this );
526  tmp._seekRev( n );
527  return tmp;
528  }
530  _const_fwd_iterator operator-( difference_type n )
531  {
532  _const_fwd_iterator tmp( *this );
533  if ( n < 0 )
534  tmp._seekFwd( -n );
535  else
536  tmp._seekRev( n );
537  return tmp;
538  }
539 
542  {
543  _seekFwd( n );
544  return *this;
545  }
547  _const_fwd_iterator& operator+=( difference_type n )
548  {
549  if ( n < 0 )
550  _seekRev( -n );
551  else
552  _seekFwd( n );
553  return *this;
554  }
557  {
558  _seekRev( n );
559  return *this;
560  }
562  _const_fwd_iterator& operator-=( difference_type n )
563  {
564  if ( n < 0 )
565  _seekFwd( -n );
566  else
567  _seekRev( n );
568  return *this;
569  }
570 
572  const value_type& operator*() const
573  {
574  return *mIter;
575  }
576 
578  const value_type& operator[]( size_type n ) const
579  {
580  _const_fwd_iterator tmp( *this );
581  tmp += n;
582  return *tmp;
583  }
585  const value_type& operator[]( difference_type n ) const
586  {
587  _const_fwd_iterator tmp( *this );
588  tmp += n;
589  return *tmp;
590  }
591 
594  {
595  _moveNext();
596  return *this;
597  }
600  {
601  _movePrev();
602  return *this;
603  }
606  {
607  return _getCharacter();
608  }
609 
611  friend size_type operator-( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
613  friend bool operator==( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
615  friend bool operator!=( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
617  friend bool operator<( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
619  friend bool operator<=( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
621  friend bool operator>( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
623  friend bool operator>=( const _const_fwd_iterator& left, const _const_fwd_iterator& right );
624 
625  };
626 
627  //#########################################################################
628  // REVERSE ITERATORS
629  //#########################################################################
630  class _const_rev_iterator; // forward declaration
633  { /* i don't know why the beautifier is freaking out on this line */
634  friend class _const_rev_iterator;
635  public:
638  {
639  _become( i );
640  }
641 
644  {
645  _seekRev( 1 );
646  return *this;
647  }
650  {
651  _rev_iterator tmp( *this );
652  _seekRev( 1 );
653  return tmp;
654  }
655 
658  {
659  _seekFwd( 1 );
660  return *this;
661  }
664  {
665  _rev_iterator tmp( *this );
666  _seekFwd( 1 );
667  return tmp;
668  }
669 
672  {
673  _rev_iterator tmp( *this );
674  tmp._seekRev( n );
675  return tmp;
676  }
678  _rev_iterator operator+( difference_type n )
679  {
680  _rev_iterator tmp( *this );
681  if ( n < 0 )
682  tmp._seekFwd( -n );
683  else
684  tmp._seekRev( n );
685  return tmp;
686  }
689  {
690  _rev_iterator tmp( *this );
691  tmp._seekFwd( n );
692  return tmp;
693  }
695  _rev_iterator operator-( difference_type n )
696  {
697  _rev_iterator tmp( *this );
698  if ( n < 0 )
699  tmp._seekRev( -n );
700  else
701  tmp._seekFwd( n );
702  return tmp;
703  }
704 
707  {
708  _seekRev( n );
709  return *this;
710  }
712  _rev_iterator& operator+=( difference_type n )
713  {
714  if ( n < 0 )
715  _seekFwd( -n );
716  else
717  _seekRev( n );
718  return *this;
719  }
722  {
723  _seekFwd( n );
724  return *this;
725  }
727  _rev_iterator& operator-=( difference_type n )
728  {
729  if ( n < 0 )
730  _seekRev( -n );
731  else
732  _seekFwd( n );
733  return *this;
734  }
735 
738  {
739  return mIter[-1];
740  }
741 
744  {
745  _rev_iterator tmp( *this );
746  tmp -= n;
747  return *tmp;
748  }
750  value_type& operator[]( difference_type n ) const
751  {
752  _rev_iterator tmp( *this );
753  tmp -= n;
754  return *tmp;
755  }
756  };
757  //#########################################################################
760  { /* i don't know why the beautifier is freaking out on this line */
761  public:
764  {
765  _become( i );
766  }
768  {
769  _become( i );
770  }
773  {
774  _seekRev( 1 );
775  return *this;
776  }
779  {
780  _const_rev_iterator tmp( *this );
781  _seekRev( 1 );
782  return tmp;
783  }
784 
787  {
788  _seekFwd( 1 );
789  return *this;
790  }
793  {
794  _const_rev_iterator tmp( *this );
795  _seekFwd( 1 );
796  return tmp;
797  }
798 
801  {
802  _const_rev_iterator tmp( *this );
803  tmp._seekRev( n );
804  return tmp;
805  }
807  _const_rev_iterator operator+( difference_type n )
808  {
809  _const_rev_iterator tmp( *this );
810  if ( n < 0 )
811  tmp._seekFwd( -n );
812  else
813  tmp._seekRev( n );
814  return tmp;
815  }
818  {
819  _const_rev_iterator tmp( *this );
820  tmp._seekFwd( n );
821  return tmp;
822  }
824  _const_rev_iterator operator-( difference_type n )
825  {
826  _const_rev_iterator tmp( *this );
827  if ( n < 0 )
828  tmp._seekRev( -n );
829  else
830  tmp._seekFwd( n );
831  return tmp;
832  }
833 
836  {
837  _seekRev( n );
838  return *this;
839  }
841  _const_rev_iterator& operator+=( difference_type n )
842  {
843  if ( n < 0 )
844  _seekFwd( -n );
845  else
846  _seekRev( n );
847  return *this;
848  }
851  {
852  _seekFwd( n );
853  return *this;
854  }
856  _const_rev_iterator& operator-=( difference_type n )
857  {
858  if ( n < 0 )
859  _seekRev( -n );
860  else
861  _seekFwd( n );
862  return *this;
863  }
864 
866  const value_type& operator*() const
867  {
868  return mIter[-1];
869  }
870 
872  const value_type& operator[]( size_type n ) const
873  {
874  _const_rev_iterator tmp( *this );
875  tmp -= n;
876  return *tmp;
877  }
879  const value_type& operator[]( difference_type n ) const
880  {
881  _const_rev_iterator tmp( *this );
882  tmp -= n;
883  return *tmp;
884  }
885 
887  friend size_type operator-( const _const_rev_iterator& left, const _const_rev_iterator& right );
889  friend bool operator==( const _const_rev_iterator& left, const _const_rev_iterator& right );
891  friend bool operator!=( const _const_rev_iterator& left, const _const_rev_iterator& right );
893  friend bool operator<( const _const_rev_iterator& left, const _const_rev_iterator& right );
895  friend bool operator<=( const _const_rev_iterator& left, const _const_rev_iterator& right );
897  friend bool operator>( const _const_rev_iterator& left, const _const_rev_iterator& right );
899  friend bool operator>=( const _const_rev_iterator& left, const _const_rev_iterator& right );
900  };
901  //#########################################################################
902 
907 
908 
910 
911 
913  {
914  _init();
915  }
917  UString( const UString& copy )
918  {
919  _init();
920  mData = copy.mData;
921  }
924  {
925  _init();
926  assign( length, ch );
927  }
929  UString( const code_point* str )
930  {
931  _init();
932  assign( str );
933  }
936  {
937  _init();
938  assign( str, length );
939  }
941  UString( const UString& str, size_type index, size_type length )
942  {
943  _init();
944  assign( str, index, length );
945  }
946 #if MYGUI_IS_NATIVE_WCHAR_T
947 
948  UString( const wchar_t* w_str )
949  {
950  _init();
951  assign( w_str );
952  }
954  UString( const wchar_t* w_str, size_type length )
955  {
956  _init();
957  assign( w_str, length );
958  }
959 #endif
960 
961  UString( const std::wstring& wstr )
962  {
963  _init();
964  assign( wstr );
965  }
967  UString( const char* c_str )
968  {
969  _init();
970  assign( c_str );
971  }
973  UString( const char* c_str, size_type length )
974  {
975  _init();
976  assign( c_str, length );
977  }
979  UString( const std::string& str )
980  {
981  _init();
982  assign( str );
983  }
986  {
987  _cleanBuffer();
988  }
990 
992 
994 
995 
996  size_type size() const
997  {
998  return mData.size();
999  }
1002  {
1003  return size();
1004  }
1006 
1008  {
1009  const_iterator i = begin(), ie = end();
1010  size_type c = 0;
1011  while ( i != ie )
1012  {
1013  i.moveNext();
1014  ++c;
1015  }
1016  return c;
1017  }
1020  {
1021  return mData.max_size();
1022  }
1025  {
1026  mData.reserve( size );
1027  }
1029  void resize( size_type num, const code_point& val = 0 )
1030  {
1031  mData.resize( num, val );
1032  }
1034  void swap( UString& from )
1035  {
1036  mData.swap( from.mData );
1037  }
1039  bool empty() const
1040  {
1041  return mData.empty();
1042  }
1044  const code_point* c_str() const
1045  {
1046  return mData.c_str();
1047  }
1049  const code_point* data() const
1050  {
1051  return c_str();
1052  }
1055  {
1056  return mData.capacity();
1057  }
1059  void clear()
1060  {
1061  mData.clear();
1062  }
1064 
1065  UString substr( size_type index, size_type num = npos ) const
1066  {
1067  // this could avoid the extra copy if we used a private specialty constructor
1068  dstring data = mData.substr( index, num );
1069  UString tmp;
1070  tmp.mData.swap( data );
1071  return tmp;
1072  }
1075  {
1076  code_point cp[2];
1077  size_t c = _utf32_to_utf16( val, cp );
1078  if ( c > 0 ) push_back( cp[0] );
1079  if ( c > 1 ) push_back( cp[1] );
1080  }
1081 #if MYGUI_IS_NATIVE_WCHAR_T
1082 
1083  void push_back( wchar_t val )
1084  {
1085  // we do this because the Unicode method still preserves UTF-16 code points
1086  mData.push_back( static_cast<unicode_char>( val ) );
1087  }
1088 #endif
1089 
1090 
1092  void push_back( code_point val )
1093  {
1094  mData.push_back( val );
1095  }
1097 
1098  void push_back( char val )
1099  {
1100  mData.push_back( static_cast<code_point>( val ) );
1101  }
1103  bool inString( unicode_char ch ) const
1104  {
1105  const_iterator i, ie = end();
1106  for ( i = begin(); i != ie; i.moveNext() )
1107  {
1108  if ( i.getCharacter() == ch )
1109  return true;
1110  }
1111  return false;
1112  }
1114 
1116 
1118 
1119 
1120  const std::string& asUTF8() const
1121  {
1122  _load_buffer_UTF8();
1123  return *m_buffer.mStrBuffer;
1124  }
1126  const char* asUTF8_c_str() const
1127  {
1128  _load_buffer_UTF8();
1129  return m_buffer.mStrBuffer->c_str();
1130  }
1132  const utf32string& asUTF32() const
1133  {
1134  _load_buffer_UTF32();
1135  return *m_buffer.mUTF32StrBuffer;
1136  }
1139  {
1140  _load_buffer_UTF32();
1141  return m_buffer.mUTF32StrBuffer->c_str();
1142  }
1144  const std::wstring& asWStr() const
1145  {
1146  _load_buffer_WStr();
1147  return *m_buffer.mWStrBuffer;
1148  }
1150  const wchar_t* asWStr_c_str() const
1151  {
1152  _load_buffer_WStr();
1153  return m_buffer.mWStrBuffer->c_str();
1154  }
1156 
1158 
1160 
1161 
1163  {
1164  return mData.at( loc );
1165  }
1167  const code_point& at( size_type loc ) const
1168  {
1169  return mData.at( loc );
1170  }
1172 
1177  {
1178  const code_point* ptr = c_str();
1179  unicode_char uc;
1180  size_t len = _utf16_char_length( ptr[loc] );
1181  code_point cp[2] = { /* blame the code beautifier */ 0, 0 };
1182  cp[0] = ptr[loc];
1183 
1184  if ( len == 2 && ( loc + 1 ) < mData.length() )
1185  {
1186  cp[1] = ptr[loc+1];
1187  }
1188  _utf16_to_utf32( cp, uc );
1189  return uc;
1190  }
1192 
1201  {
1202  code_point cp[2] = { /* blame the code beautifier */ 0, 0 };
1203  size_t lc = _utf32_to_utf16( ch, cp );
1204  unicode_char existingChar = getChar( loc );
1205  size_t existingSize = _utf16_char_length( existingChar );
1206  size_t newSize = _utf16_char_length( ch );
1207 
1208  if ( newSize > existingSize )
1209  {
1210  at( loc ) = cp[0];
1211  insert( loc + 1, 1, cp[1] );
1212  return 1;
1213  }
1214  if ( newSize < existingSize )
1215  {
1216  erase( loc, 1 );
1217  at( loc ) = cp[0];
1218  return -1;
1219  }
1220 
1221  // newSize == existingSize
1222  at( loc ) = cp[0];
1223  if ( lc == 2 ) at( loc + 1 ) = cp[1];
1224  return 0;
1225  }
1227 
1229 
1231 
1232 
1234  {
1235  iterator i;
1236  i.mIter = mData.begin();
1237  i.mString = this;
1238  return i;
1239  }
1242  {
1243  const_iterator i;
1244  i.mIter = const_cast<UString*>( this )->mData.begin();
1245  i.mString = const_cast<UString*>( this );
1246  return i;
1247  }
1250  {
1251  iterator i;
1252  i.mIter = mData.end();
1253  i.mString = this;
1254  return i;
1255  }
1258  {
1259  const_iterator i;
1260  i.mIter = const_cast<UString*>( this )->mData.end();
1261  i.mString = const_cast<UString*>( this );
1262  return i;
1263  }
1266  {
1267  reverse_iterator i;
1268  i.mIter = mData.end();
1269  i.mString = this;
1270  return i;
1271  }
1274  {
1276  i.mIter = const_cast<UString*>( this )->mData.end();
1277  i.mString = const_cast<UString*>( this );
1278  return i;
1279  }
1282  {
1283  reverse_iterator i;
1284  i.mIter = mData.begin();
1285  i.mString = this;
1286  return i;
1287  }
1290  {
1292  i.mIter = const_cast<UString*>( this )->mData.begin();
1293  i.mString = const_cast<UString*>( this );
1294  return i;
1295  }
1297 
1299 
1301 
1302 
1304  {
1305  mData.assign( start.mIter, end.mIter );
1306  return *this;
1307  }
1309  UString& assign( const UString& str )
1310  {
1311  mData.assign( str.mData );
1312  return *this;
1313  }
1315  UString& assign( const code_point* str )
1316  {
1317  mData.assign( str );
1318  return *this;
1319  }
1321  UString& assign( const code_point* str, size_type num )
1322  {
1323  mData.assign( str, num );
1324  return *this;
1325  }
1327  UString& assign( const UString& str, size_type index, size_type len )
1328  {
1329  mData.assign( str.mData, index, len );
1330  return *this;
1331  }
1333  UString& assign( size_type num, const code_point& ch )
1334  {
1335  mData.assign( num, ch );
1336  return *this;
1337  }
1339  UString& assign( const std::wstring& wstr )
1340  {
1341  mData.clear();
1342  mData.reserve( wstr.length() ); // best guess bulk allocate
1343 #ifdef WCHAR_UTF16 // if we're already working in UTF-16, this is easy
1344  code_point tmp;
1345  std::wstring::const_iterator i, ie = wstr.end();
1346  for ( i = wstr.begin(); i != ie; ++i )
1347  {
1348  tmp = static_cast<code_point>( *i );
1349  mData.push_back( tmp );
1350  }
1351 #else // otherwise we do it the safe way (which is still 100% safe to pass UTF-16 through, just slower)
1352  code_point cp[3] = { 0, 0, 0 };
1353  unicode_char tmp;
1354  std::wstring::const_iterator i, ie = wstr.end();
1355  for ( i = wstr.begin(); i != ie; i++ )
1356  {
1357  tmp = static_cast<unicode_char>( *i );
1358  size_t lc = _utf32_to_utf16( tmp, cp );
1359  if ( lc > 0 ) mData.push_back( cp[0] );
1360  if ( lc > 1 ) mData.push_back( cp[1] );
1361  }
1362 #endif
1363  return *this;
1364  }
1365 #if MYGUI_IS_NATIVE_WCHAR_T
1366 
1367  UString& assign( const wchar_t* w_str )
1368  {
1369  std::wstring tmp;
1370  tmp.assign( w_str );
1371  return assign( tmp );
1372  }
1374  UString& assign( const wchar_t* w_str, size_type num )
1375  {
1376  std::wstring tmp;
1377  tmp.assign( w_str, num );
1378  return assign( tmp );
1379  }
1380 #endif
1381 
1382  UString& assign( const std::string& str )
1383  {
1384  size_type len = _verifyUTF8( str );
1385  clear(); // empty our contents, if there are any
1386  reserve( len ); // best guess bulk capacity growth
1387 
1388  // This is a 3 step process, converting each byte in the UTF-8 stream to UTF-32,
1389  // then converting it to UTF-16, then finally appending the data buffer
1390 
1391  unicode_char uc; // temporary Unicode character buffer
1392  unsigned char utf8buf[7]; // temporary UTF-8 buffer
1393  utf8buf[6] = 0;
1394  size_t utf8len; // UTF-8 length
1395  code_point utf16buff[3]; // temporary UTF-16 buffer
1396  utf16buff[2] = 0;
1397  size_t utf16len; // UTF-16 length
1398 
1399  std::string::const_iterator i, ie = str.end();
1400  for ( i = str.begin(); i != ie; ++i )
1401  {
1402  utf8len = _utf8_char_length( static_cast<unsigned char>( *i ) ); // estimate bytes to load
1403  for ( size_t j = 0; j < utf8len; j++ )
1404  { // load the needed UTF-8 bytes
1405  utf8buf[j] = ( static_cast<unsigned char>( *( i + j ) ) ); // we don't increment 'i' here just in case the estimate is wrong (shouldn't happen, but we're being careful)
1406  }
1407  utf8buf[utf8len] = 0; // nul terminate so we throw an exception before running off the end of the buffer
1408  utf8len = _utf8_to_utf32( utf8buf, uc ); // do the UTF-8 -> UTF-32 conversion
1409  i += utf8len - 1; // we subtract 1 for the increment of the 'for' loop
1410 
1411  utf16len = _utf32_to_utf16( uc, utf16buff ); // UTF-32 -> UTF-16 conversion
1412  append( utf16buff, utf16len ); // append the characters to the string
1413  }
1414  return *this;
1415  }
1417  UString& assign( const char* c_str )
1418  {
1419  std::string tmp( c_str );
1420  return assign( tmp );
1421  }
1423  UString& assign( const char* c_str, size_type num )
1424  {
1425  std::string tmp;
1426  tmp.assign( c_str, num );
1427  return assign( tmp );
1428  }
1430 
1432 
1434 
1435 
1436  UString& append( const UString& str )
1437  {
1438  mData.append( str.mData );
1439  return *this;
1440  }
1442  UString& append( const code_point* str )
1443  {
1444  mData.append( str );
1445  return *this;
1446  }
1448  UString& append( const UString& str, size_type index, size_type len )
1449  {
1450  mData.append( str.mData, index, len );
1451  return *this;
1452  }
1454  UString& append( const code_point* str, size_type num )
1455  {
1456  mData.append( str, num );
1457  return *this;
1458  }
1461  {
1462  mData.append( num, ch );
1463  return *this;
1464  }
1467  {
1468  mData.append( start.mIter, end.mIter );
1469  return *this;
1470  }
1471 #if MYGUI_IS_NATIVE_WCHAR_T
1472 
1473  UString& append( const wchar_t* w_str, size_type num )
1474  {
1475  std::wstring tmp( w_str, num );
1476  return append( tmp );
1477  }
1479  UString& append( size_type num, wchar_t ch )
1480  {
1481  return append( num, static_cast<unicode_char>( ch ) );
1482  }
1483 #endif
1484 
1485  UString& append( const char* c_str, size_type num )
1486  {
1487  UString tmp( c_str, num );
1488  append( tmp );
1489  return *this;
1490  }
1492  UString& append( size_type num, char ch )
1493  {
1494  append( num, static_cast<code_point>( ch ) );
1495  return *this;
1496  }
1499  {
1500  code_point cp[2] = { 0, 0 };
1501  if ( _utf32_to_utf16( ch, cp ) == 2 )
1502  {
1503  for ( size_type i = 0; i < num; i++ )
1504  {
1505  append( 1, cp[0] );
1506  append( 1, cp[1] );
1507  }
1508  }
1509  else
1510  {
1511  for ( size_type i = 0; i < num; i++ )
1512  {
1513  append( 1, cp[0] );
1514  }
1515  }
1516  return *this;
1517  }
1519 
1521 
1523 
1524 
1526  {
1527  iterator ret;
1528  ret.mIter = mData.insert( i.mIter, ch );
1529  ret.mString = this;
1530  return ret;
1531  }
1533  UString& insert( size_type index, const UString& str )
1534  {
1535  mData.insert( index, str.mData );
1536  return *this;
1537  }
1539  UString& insert( size_type index, const code_point* str )
1540  {
1541  mData.insert( index, str );
1542  return *this;
1543  }
1545  UString& insert( size_type index1, const UString& str, size_type index2, size_type num )
1546  {
1547  mData.insert( index1, str.mData, index2, num );
1548  return *this;
1549  }
1552  {
1553  mData.insert( i.mIter, start.mIter, end.mIter );
1554  }
1556  UString& insert( size_type index, const code_point* str, size_type num )
1557  {
1558  mData.insert( index, str, num );
1559  return *this;
1560  }
1561 #if MYGUI_IS_NATIVE_WCHAR_T
1562 
1563  UString& insert( size_type index, const wchar_t* w_str, size_type num )
1564  {
1565  UString tmp( w_str, num );
1566  insert( index, tmp );
1567  return *this;
1568  }
1569 #endif
1570 
1571  UString& insert( size_type index, const char* c_str, size_type num )
1572  {
1573  UString tmp( c_str, num );
1574  insert( index, tmp );
1575  return *this;
1576  }
1579  {
1580  mData.insert( index, num, ch );
1581  return *this;
1582  }
1583 #if MYGUI_IS_NATIVE_WCHAR_T
1584 
1585  UString& insert( size_type index, size_type num, wchar_t ch )
1586  {
1587  insert( index, num, static_cast<unicode_char>( ch ) );
1588  return *this;
1589  }
1590 #endif
1591 
1592  UString& insert( size_type index, size_type num, char ch )
1593  {
1594  insert( index, num, static_cast<code_point>( ch ) );
1595  return *this;
1596  }
1599  {
1600  code_point cp[3] = { 0, 0, 0 };
1601  size_t lc = _utf32_to_utf16( ch, cp );
1602  if ( lc == 1 )
1603  {
1604  return insert( index, num, cp[0] );
1605  }
1606  for ( size_type c = 0; c < num; c++ )
1607  {
1608  // insert in reverse order to preserve ordering after insert
1609  insert( index, 1, cp[1] );
1610  insert( index, 1, cp[0] );
1611  }
1612  return *this;
1613  }
1615  void insert( iterator i, size_type num, const code_point& ch )
1616  {
1617  mData.insert( i.mIter, num, ch );
1618  }
1619 #if MYGUI_IS_NATIVE_WCHAR_T
1620 
1621  void insert( iterator i, size_type num, const wchar_t& ch )
1622  {
1623  insert( i, num, static_cast<unicode_char>( ch ) );
1624  }
1625 #endif
1626 
1627  void insert( iterator i, size_type num, const char& ch )
1628  {
1629  insert( i, num, static_cast<code_point>( ch ) );
1630  }
1632  void insert( iterator i, size_type num, const unicode_char& ch )
1633  {
1634  code_point cp[3] = { 0, 0, 0 };
1635  size_t lc = _utf32_to_utf16( ch, cp );
1636  if ( lc == 1 )
1637  {
1638  insert( i, num, cp[0] );
1639  }
1640  else
1641  {
1642  for ( size_type c = 0; c < num; c++ )
1643  {
1644  // insert in reverse order to preserve ordering after insert
1645  insert( i, 1, cp[1] );
1646  insert( i, 1, cp[0] );
1647  }
1648  }
1649  }
1651 
1653 
1655 
1656 
1658  {
1659  iterator ret;
1660  ret.mIter = mData.erase( loc.mIter );
1661  ret.mString = this;
1662  return ret;
1663  }
1666  {
1667  iterator ret;
1668  ret.mIter = mData.erase( start.mIter, end.mIter );
1669  ret.mString = this;
1670  return ret;
1671  }
1673  UString& erase( size_type index = 0, size_type num = npos )
1674  {
1675  if ( num == npos )
1676  mData.erase( index );
1677  else
1678  mData.erase( index, num );
1679  return *this;
1680  }
1682 
1684 
1686 
1687 
1688  UString& replace( size_type index1, size_type num1, const UString& str )
1689  {
1690  mData.replace( index1, num1, str.mData, 0, npos );
1691  return *this;
1692  }
1694  UString& replace( size_type index1, size_type num1, const UString& str, size_type num2 )
1695  {
1696  mData.replace( index1, num1, str.mData, 0, num2 );
1697  return *this;
1698  }
1700  UString& replace( size_type index1, size_type num1, const UString& str, size_type index2, size_type num2 )
1701  {
1702  mData.replace( index1, num1, str.mData, index2, num2 );
1703  return *this;
1704  }
1706  UString& replace( iterator start, iterator end, const UString& str, size_type num = npos )
1707  {
1708  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1709 
1710  size_type index1 = begin() - st;
1711  size_type num1 = end - st;
1712  return replace( index1, num1, str, 0, num );
1713  }
1716  {
1717  mData.replace( index, num1, num2, ch );
1718  return *this;
1719  }
1722  {
1723  _const_fwd_iterator st(start); //Work around for gcc, allow it to find correct overload
1724 
1725  size_type index1 = begin() - st;
1726  size_type num1 = end - st;
1727  return replace( index1, num1, num, ch );
1728  }
1730 
1732 
1734 
1735 
1736  int compare( const UString& str ) const
1737  {
1738  return mData.compare( str.mData );
1739  }
1741  int compare( const code_point* str ) const
1742  {
1743  return mData.compare( str );
1744  }
1746  int compare( size_type index, size_type length, const UString& str ) const
1747  {
1748  return mData.compare( index, length, str.mData );
1749  }
1751  int compare( size_type index, size_type length, const UString& str, size_type index2, size_type length2 ) const
1752  {
1753  return mData.compare( index, length, str.mData, index2, length2 );
1754  }
1756  int compare( size_type index, size_type length, const code_point* str, size_type length2 ) const
1757  {
1758  return mData.compare( index, length, str, length2 );
1759  }
1760 #if MYGUI_IS_NATIVE_WCHAR_T
1761 
1762  int compare( size_type index, size_type length, const wchar_t* w_str, size_type length2 ) const
1763  {
1764  UString tmp( w_str, length2 );
1765  return compare( index, length, tmp );
1766  }
1767 #endif
1768 
1769  int compare( size_type index, size_type length, const char* c_str, size_type length2 ) const
1770  {
1771  UString tmp( c_str, length2 );
1772  return compare( index, length, tmp );
1773  }
1775 
1777 
1779 
1780 
1781 
1782  size_type find( const UString& str, size_type index = 0 ) const
1783  {
1784  return mData.find( str.c_str(), index );
1785  }
1787 
1788  size_type find( const code_point* cp_str, size_type index, size_type length ) const
1789  {
1790  UString tmp( cp_str );
1791  return mData.find( tmp.c_str(), index, length );
1792  }
1794 
1795  size_type find( const char* c_str, size_type index, size_type length ) const
1796  {
1797  UString tmp( c_str );
1798  return mData.find( tmp.c_str(), index, length );
1799  }
1800 #if MYGUI_IS_NATIVE_WCHAR_T
1801 
1802 
1803  size_type find( const wchar_t* w_str, size_type index, size_type length ) const
1804  {
1805  UString tmp( w_str );
1806  return mData.find( tmp.c_str(), index, length );
1807  }
1808 #endif
1809 
1810 
1811  size_type find( char ch, size_type index = 0 ) const
1812  {
1813  return find( static_cast<code_point>( ch ), index );
1814  }
1816 
1817  size_type find( code_point ch, size_type index = 0 ) const
1818  {
1819  return mData.find( ch, index );
1820  }
1821 #if MYGUI_IS_NATIVE_WCHAR_T
1822 
1823 
1824  size_type find( wchar_t ch, size_type index = 0 ) const
1825  {
1826  return find( static_cast<unicode_char>( ch ), index );
1827  }
1828 #endif
1829 
1830 
1831  size_type find( unicode_char ch, size_type index = 0 ) const
1832  {
1833  code_point cp[3] = { 0, 0, 0 };
1834  size_t lc = _utf32_to_utf16( ch, cp );
1835  return find( UString( cp, lc ), index );
1836  }
1837 
1839  size_type rfind( const UString& str, size_type index = 0 ) const
1840  {
1841  return mData.rfind( str.c_str(), index );
1842  }
1844  size_type rfind( const code_point* cp_str, size_type index, size_type num ) const
1845  {
1846  UString tmp( cp_str );
1847  return mData.rfind( tmp.c_str(), index, num );
1848  }
1850  size_type rfind( const char* c_str, size_type index, size_type num ) const
1851  {
1852  UString tmp( c_str );
1853  return mData.rfind( tmp.c_str(), index, num );
1854  }
1855 #if MYGUI_IS_NATIVE_WCHAR_T
1856 
1857  size_type rfind( const wchar_t* w_str, size_type index, size_type num ) const
1858  {
1859  UString tmp( w_str );
1860  return mData.rfind( tmp.c_str(), index, num );
1861  }
1862 #endif
1863 
1864  size_type rfind( char ch, size_type index = 0 ) const
1865  {
1866  return rfind( static_cast<code_point>( ch ), index );
1867  }
1870  {
1871  return mData.rfind( ch, index );
1872  }
1873 #if MYGUI_IS_NATIVE_WCHAR_T
1874 
1875  size_type rfind( wchar_t ch, size_type index = 0 ) const
1876  {
1877  return rfind( static_cast<unicode_char>( ch ), index );
1878  }
1879 #endif
1880 
1881  size_type rfind( unicode_char ch, size_type index = 0 ) const
1882  {
1883  code_point cp[3] = { 0, 0, 0 };
1884  size_t lc = _utf32_to_utf16( ch, cp );
1885  return rfind( UString( cp, lc ), index );
1886  }
1888 
1890 
1892 
1893 
1894  size_type find_first_of( const UString &str, size_type index = 0, size_type num = npos ) const
1895  {
1896  size_type i = 0;
1897  const size_type len = length();
1898  while ( i < num && ( index + i ) < len )
1899  {
1900  unicode_char ch = getChar( index + i );
1901  if ( str.inString( ch ) )
1902  return index + i;
1903  i += _utf16_char_length( ch ); // increment by the Unicode character length
1904  }
1905  return npos;
1906  }
1909  {
1910  UString tmp;
1911  tmp.assign( 1, ch );
1912  return find_first_of( tmp, index );
1913  }
1915  size_type find_first_of( char ch, size_type index = 0 ) const
1916  {
1917  return find_first_of( static_cast<code_point>( ch ), index );
1918  }
1919 #if MYGUI_IS_NATIVE_WCHAR_T
1920 
1921  size_type find_first_of( wchar_t ch, size_type index = 0 ) const
1922  {
1923  return find_first_of( static_cast<unicode_char>( ch ), index );
1924  }
1925 #endif
1926 
1928  {
1929  code_point cp[3] = { 0, 0, 0 };
1930  size_t lc = _utf32_to_utf16( ch, cp );
1931  return find_first_of( UString( cp, lc ), index );
1932  }
1933 
1935  size_type find_first_not_of( const UString& str, size_type index = 0, size_type num = npos ) const
1936  {
1937  size_type i = 0;
1938  const size_type len = length();
1939  while ( i < num && ( index + i ) < len )
1940  {
1941  unicode_char ch = getChar( index + i );
1942  if ( !str.inString( ch ) )
1943  return index + i;
1944  i += _utf16_char_length( ch ); // increment by the Unicode character length
1945  }
1946  return npos;
1947  }
1950  {
1951  UString tmp;
1952  tmp.assign( 1, ch );
1953  return find_first_not_of( tmp, index );
1954  }
1956  size_type find_first_not_of( char ch, size_type index = 0 ) const
1957  {
1958  return find_first_not_of( static_cast<code_point>( ch ), index );
1959  }
1960 #if MYGUI_IS_NATIVE_WCHAR_T
1961 
1962  size_type find_first_not_of( wchar_t ch, size_type index = 0 ) const
1963  {
1964  return find_first_not_of( static_cast<unicode_char>( ch ), index );
1965  }
1966 #endif
1967 
1969  {
1970  code_point cp[3] = { 0, 0, 0 };
1971  size_t lc = _utf32_to_utf16( ch, cp );
1972  return find_first_not_of( UString( cp, lc ), index );
1973  }
1974 
1976  size_type find_last_of( const UString& str, size_type index = npos, size_type num = npos ) const
1977  {
1978  size_type i = 0;
1979  const size_type len = length();
1980  if ( index > len ) index = len - 1;
1981 
1982  while ( i < num && ( index - i ) != npos )
1983  {
1984  size_type j = index - i;
1985  // careful to step full Unicode characters
1986  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) )
1987  {
1988  j = index - ++i;
1989  }
1990  // and back to the usual dull test
1991  unicode_char ch = getChar( j );
1992  if ( str.inString( ch ) )
1993  return j;
1994  i++;
1995  }
1996  return npos;
1997  }
2000  {
2001  UString tmp;
2002  tmp.assign( 1, ch );
2003  return find_last_of( tmp, index );
2004  }
2006  size_type find_last_of( char ch, size_type index = npos ) const
2007  {
2008  return find_last_of( static_cast<code_point>( ch ), index );
2009  }
2010 #if MYGUI_IS_NATIVE_WCHAR_T
2011 
2012  size_type find_last_of( wchar_t ch, size_type index = npos ) const
2013  {
2014  return find_last_of( static_cast<unicode_char>( ch ), index );
2015  }
2016 #endif
2017 
2019  {
2020  code_point cp[3] = { 0, 0, 0 };
2021  size_t lc = _utf32_to_utf16( ch, cp );
2022  return find_last_of( UString( cp, lc ), index );
2023  }
2024 
2026  size_type find_last_not_of( const UString& str, size_type index = npos, size_type num = npos ) const
2027  {
2028  size_type i = 0;
2029  const size_type len = length();
2030  if ( index > len ) index = len - 1;
2031 
2032  while ( i < num && ( index - i ) != npos )
2033  {
2034  size_type j = index - i;
2035  // careful to step full Unicode characters
2036  if ( j != 0 && _utf16_surrogate_follow( at( j ) ) && _utf16_surrogate_lead( at( j - 1 ) ) )
2037  {
2038  j = index - ++i;
2039  }
2040  // and back to the usual dull test
2041  unicode_char ch = getChar( j );
2042  if ( !str.inString( ch ) )
2043  return j;
2044  i++;
2045  }
2046  return npos;
2047  }
2050  {
2051  UString tmp;
2052  tmp.assign( 1, ch );
2053  return find_last_not_of( tmp, index );
2054  }
2056  size_type find_last_not_of( char ch, size_type index = npos ) const
2057  {
2058  return find_last_not_of( static_cast<code_point>( ch ), index );
2059  }
2060 #if MYGUI_IS_NATIVE_WCHAR_T
2061 
2062  size_type find_last_not_of( wchar_t ch, size_type index = npos ) const
2063  {
2064  return find_last_not_of( static_cast<unicode_char>( ch ), index );
2065  }
2066 #endif
2067 
2069  {
2070  code_point cp[3] = { 0, 0, 0 };
2071  size_t lc = _utf32_to_utf16( ch, cp );
2072  return find_last_not_of( UString( cp, lc ), index );
2073  }
2075 
2077 
2079 
2080 
2081  bool operator<( const UString& right ) const
2082  {
2083  return compare( right ) < 0;
2084  }
2086  bool operator<=( const UString& right ) const
2087  {
2088  return compare( right ) <= 0;
2089  }
2091  bool operator>( const UString& right ) const
2092  {
2093  return compare( right ) > 0;
2094  }
2096  bool operator>=( const UString& right ) const
2097  {
2098  return compare( right ) >= 0;
2099  }
2101  bool operator==( const UString& right ) const
2102  {
2103  return compare( right ) == 0;
2104  }
2106  bool operator!=( const UString& right ) const
2107  {
2108  return !operator==( right );
2109  }
2112  {
2113  return assign( s );
2114  }
2117  {
2118  clear();
2119  return append( 1, ch );
2120  }
2122  UString& operator=( char ch )
2123  {
2124  clear();
2125  return append( 1, ch );
2126  }
2127 #if MYGUI_IS_NATIVE_WCHAR_T
2128 
2129  UString& operator=( wchar_t ch )
2130  {
2131  clear();
2132  return append( 1, ch );
2133  }
2134 #endif
2135 
2137  {
2138  clear();
2139  return append( 1, ch );
2140  }
2143  {
2144  return at( index );
2145  }
2147  const code_point& operator[]( size_type index ) const
2148  {
2149  return at( index );
2150  }
2152 
2154 
2156 
2157 
2158  operator std::string() const
2159  {
2160  return std::string( asUTF8() );
2161  }
2163  operator std::wstring() const
2164  {
2165  return std::wstring( asWStr() );
2166  }
2168 
2170 
2172 
2173 
2175  {
2176  if ( 0xD800 <= cp && cp <= 0xDFFF ) // tests if the cp is within the surrogate pair range
2177  return false; // it matches a surrogate pair signature
2178  return true; // everything else is a standalone code point
2179  }
2182  {
2183  if ( 0xD800 <= cp && cp <= 0xDBFF ) // tests if the cp is within the 2nd word of a surrogate pair
2184  return true; // it is a 1st word
2185  return false; // it isn't
2186  }
2189  {
2190  if ( 0xDC00 <= cp && cp <= 0xDFFF ) // tests if the cp is within the 2nd word of a surrogate pair
2191  return true; // it is a 2nd word
2192  return false; // everything else isn't
2193  }
2195  static size_t _utf16_char_length( code_point cp )
2196  {
2197  if ( 0xD800 <= cp && cp <= 0xDBFF ) // test if cp is the beginning of a surrogate pair
2198  return 2; // if it is, then we are 2 words long
2199  return 1; // otherwise we are only 1 word long
2200  }
2202  static size_t _utf16_char_length( unicode_char uc )
2203  {
2204  if ( uc > 0xFFFF ) // test if uc is greater than the single word maximum
2205  return 2; // if so, we need a surrogate pair
2206  return 1; // otherwise we can stuff it into a single word
2207  }
2209 
2213  static size_t _utf16_to_utf32( const code_point in_cp[2], unicode_char& out_uc )
2214  {
2215  const code_point& cp1 = in_cp[0];
2216  const code_point& cp2 = in_cp[1];
2217  bool wordPair = false;
2218 
2219  // does it look like a surrogate pair?
2220  if ( 0xD800 <= cp1 && cp1 <= 0xDBFF )
2221  {
2222  // looks like one, but does the other half match the algorithm as well?
2223  if ( 0xDC00 <= cp2 && cp2 <= 0xDFFF )
2224  wordPair = true; // yep!
2225  }
2226 
2227  if ( !wordPair )
2228  { // if we aren't a 100% authentic surrogate pair, then just copy the value
2229  out_uc = cp1;
2230  return 1;
2231  }
2232 
2233  unsigned short cU = cp1, cL = cp2; // copy upper and lower words of surrogate pair to writable buffers
2234  cU -= 0xD800; // remove the encoding markers
2235  cL -= 0xDC00;
2236 
2237  out_uc = ( cU & 0x03FF ) << 10; // grab the 10 upper bits and set them in their proper location
2238  out_uc |= ( cL & 0x03FF ); // combine in the lower 10 bits
2239  out_uc += 0x10000; // add back in the value offset
2240 
2241  return 2; // this whole operation takes to words, so that's what we'll return
2242  }
2244 
2249  static size_t _utf32_to_utf16( const unicode_char& in_uc, code_point out_cp[2] )
2250  {
2251  if ( in_uc <= 0xFFFF )
2252  { // we blindly preserve sentinel values because our decoder understands them
2253  out_cp[0] = in_uc;
2254  return 1;
2255  }
2256  unicode_char uc = in_uc; // copy to writable buffer
2257  unsigned short tmp; // single code point buffer
2258  uc -= 0x10000; // subtract value offset
2259 
2260  //process upper word
2261  tmp = ( uc >> 10 ) & 0x03FF; // grab the upper 10 bits
2262  tmp += 0xD800; // add encoding offset
2263  out_cp[0] = tmp; // write
2264 
2265  // process lower word
2266  tmp = uc & 0x03FF; // grab the lower 10 bits
2267  tmp += 0xDC00; // add encoding offset
2268  out_cp[1] = tmp; // write
2269 
2270  return 2; // return used word count (2 for surrogate pairs)
2271  }
2273 
2275 
2277 
2278 
2279  static bool _utf8_start_char( unsigned char cp )
2280  {
2281  return ( cp & ~_cont_mask ) != _cont;
2282  }
2284  static size_t _utf8_char_length( unsigned char cp )
2285  {
2286  if ( !( cp & 0x80 ) ) return 1;
2287  if (( cp & ~_lead1_mask ) == _lead1 ) return 2;
2288  if (( cp & ~_lead2_mask ) == _lead2 ) return 3;
2289  if (( cp & ~_lead3_mask ) == _lead3 ) return 4;
2290  if (( cp & ~_lead4_mask ) == _lead4 ) return 5;
2291  if (( cp & ~_lead5_mask ) == _lead5 ) return 6;
2292  throw invalid_data( "invalid UTF-8 sequence header value" );
2293  }
2295  static size_t _utf8_char_length( unicode_char uc )
2296  {
2297  /*
2298  7 bit: U-00000000 - U-0000007F: 0xxxxxxx
2299  11 bit: U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
2300  16 bit: U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
2301  21 bit: U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2302  26 bit: U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2303  31 bit: U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2304  */
2305  if ( !( uc & ~0x0000007F ) ) return 1;
2306  if ( !( uc & ~0x000007FF ) ) return 2;
2307  if ( !( uc & ~0x0000FFFF ) ) return 3;
2308  if ( !( uc & ~0x001FFFFF ) ) return 4;
2309  if ( !( uc & ~0x03FFFFFF ) ) return 5;
2310  if ( !( uc & ~0x7FFFFFFF ) ) return 6;
2311  throw invalid_data( "invalid UTF-32 value" );
2312  }
2313 
2315  static size_t _utf8_to_utf32( const unsigned char in_cp[6], unicode_char& out_uc )
2316  {
2317  size_t len = _utf8_char_length( in_cp[0] );
2318  if ( len == 1 )
2319  { // if we are only 1 byte long, then just grab it and exit
2320  out_uc = in_cp[0];
2321  return 1;
2322  }
2323 
2324  unicode_char c = 0; // temporary buffer
2325  size_t i = 0;
2326  switch ( len )
2327  { // load header byte
2328  case 6:
2329  c = in_cp[i] & _lead5_mask;
2330  break;
2331  case 5:
2332  c = in_cp[i] & _lead4_mask;
2333  break;
2334  case 4:
2335  c = in_cp[i] & _lead3_mask;
2336  break;
2337  case 3:
2338  c = in_cp[i] & _lead2_mask;
2339  break;
2340  case 2:
2341  c = in_cp[i] & _lead1_mask;
2342  break;
2343  }
2344 
2345  for ( ++i; i < len; i++ )
2346  { // load each continuation byte
2347  if (( in_cp[i] & ~_cont_mask ) != _cont )
2348  throw invalid_data( "bad UTF-8 continuation byte" );
2349  c <<= 6;
2350  c |= ( in_cp[i] & _cont_mask );
2351  }
2352 
2353  out_uc = c; // write the final value and return the used byte length
2354  return len;
2355  }
2357  static size_t _utf32_to_utf8( const unicode_char& in_uc, unsigned char out_cp[6] )
2358  {
2359  size_t len = _utf8_char_length( in_uc ); // predict byte length of sequence
2360  unicode_char c = in_uc; // copy to temp buffer
2361 
2362  //stuff all of the lower bits
2363  for ( size_t i = len - 1; i > 0; i-- )
2364  {
2365  out_cp[i] = (( c ) & _cont_mask ) | _cont;
2366  c >>= 6;
2367  }
2368 
2369  //now write the header byte
2370  switch ( len )
2371  {
2372  case 6:
2373  out_cp[0] = (( c ) & _lead5_mask ) | _lead5;
2374  break;
2375  case 5:
2376  out_cp[0] = (( c ) & _lead4_mask ) | _lead4;
2377  break;
2378  case 4:
2379  out_cp[0] = (( c ) & _lead3_mask ) | _lead3;
2380  break;
2381  case 3:
2382  out_cp[0] = (( c ) & _lead2_mask ) | _lead2;
2383  break;
2384  case 2:
2385  out_cp[0] = (( c ) & _lead1_mask ) | _lead1;
2386  break;
2387  case 1:
2388  default:
2389  out_cp[0] = ( c ) & 0x7F;
2390  break;
2391  }
2392 
2393  // return the byte length of the sequence
2394  return len;
2395  }
2396 
2398  static size_type _verifyUTF8( const unsigned char* c_str )
2399  {
2400  std::string tmp( reinterpret_cast<const char*>( c_str ) );
2401  return _verifyUTF8( tmp );
2402  }
2404  static size_type _verifyUTF8( const std::string& str )
2405  {
2406  std::string::const_iterator i, ie = str.end();
2407  i = str.begin();
2408  size_type length = 0;
2409 
2410  while ( i != ie )
2411  {
2412  // characters pass until we find an extended sequence
2413  if (( *i ) & 0x80 )
2414  {
2415  unsigned char c = ( *i );
2416  size_t contBytes = 0;
2417 
2418  // get continuation byte count and test for overlong sequences
2419  if (( c & ~_lead1_mask ) == _lead1 )
2420  { // 1 additional byte
2421  if ( c == _lead1 ) throw invalid_data( "overlong UTF-8 sequence" );
2422  contBytes = 1;
2423 
2424  }
2425  else if (( c & ~_lead2_mask ) == _lead2 )
2426  { // 2 additional bytes
2427  contBytes = 2;
2428  if ( c == _lead2 )
2429  { // possible overlong UTF-8 sequence
2430  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
2431  if (( c & _lead2 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
2432  }
2433 
2434  }
2435  else if (( c & ~_lead3_mask ) == _lead3 )
2436  { // 3 additional bytes
2437  contBytes = 3;
2438  if ( c == _lead3 )
2439  { // possible overlong UTF-8 sequence
2440  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
2441  if (( c & _lead3 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
2442  }
2443 
2444  }
2445  else if (( c & ~_lead4_mask ) == _lead4 )
2446  { // 4 additional bytes
2447  contBytes = 4;
2448  if ( c == _lead4 )
2449  { // possible overlong UTF-8 sequence
2450  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
2451  if (( c & _lead4 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
2452  }
2453 
2454  }
2455  else if (( c & ~_lead5_mask ) == _lead5 )
2456  { // 5 additional bytes
2457  contBytes = 5;
2458  if ( c == _lead5 )
2459  { // possible overlong UTF-8 sequence
2460  c = ( *( i + 1 ) ); // look ahead to next byte in sequence
2461  if (( c & _lead5 ) == _cont ) throw invalid_data( "overlong UTF-8 sequence" );
2462  }
2463  }
2464 
2465  // check remaining continuation bytes for
2466  while ( contBytes-- )
2467  {
2468  c = ( *( ++i ) ); // get next byte in sequence
2469  if (( c & ~_cont_mask ) != _cont )
2470  throw invalid_data( "bad UTF-8 continuation byte" );
2471  }
2472  }
2473  length++;
2474  i++;
2475  }
2476  return length;
2477  }
2479 
2480  private:
2481  //template<class ITER_TYPE> friend class _iterator;
2482  dstring mData;
2483 
//! Identifies which kind of string object the scratch buffer m_buffer currently holds.
enum BufferType
{
    bt_none,       //!< no buffer type recorded
    bt_string,     //!< m_buffer.mStrBuffer (std::string)
    bt_wstring,    //!< m_buffer.mWStrBuffer (std::wstring)
    bt_utf32string //!< m_buffer.mUTF32StrBuffer (utf32string)
};
2492 
2494  void _init()
2495  {
2496  m_buffer.mVoidBuffer = 0;
2497  m_bufferType = bt_none;
2498  m_bufferSize = 0;
2499  }
2500 
2502  // Scratch buffer
2504  void _cleanBuffer() const
2505  {
2506  if ( m_buffer.mVoidBuffer != 0 )
2507  {
2508  switch ( m_bufferType )
2509  {
2510  case bt_string:
2511  delete m_buffer.mStrBuffer;
2512  break;
2513  case bt_wstring:
2514  delete m_buffer.mWStrBuffer;
2515  break;
2516  case bt_utf32string:
2517  delete m_buffer.mUTF32StrBuffer;
2518  break;
2519  case bt_none: // under the worse of circumstances, this is all we can do, and hope it works out
2520  default:
2521  //delete m_buffer.mVoidBuffer;
2522  // delete void* is undefined, don't do that
2523  MYGUI_ASSERT(false, "This should never happen - mVoidBuffer should never contain something if we "
2524  "don't know the type");
2525  break;
2526  }
2527  m_buffer.mVoidBuffer = 0;
2528  m_bufferSize = 0;
2529  }
2530  }
2531 
2533  void _getBufferStr() const
2534  {
2535  if ( m_bufferType != bt_string )
2536  {
2537  _cleanBuffer();
2538  m_buffer.mStrBuffer = new std::string();
2539  m_bufferType = bt_string;
2540  }
2541  m_buffer.mStrBuffer->clear();
2542  }
2544  void _getBufferWStr() const
2545  {
2546  if ( m_bufferType != bt_wstring )
2547  {
2548  _cleanBuffer();
2549  m_buffer.mWStrBuffer = new std::wstring();
2550  m_bufferType = bt_wstring;
2551  }
2552  m_buffer.mWStrBuffer->clear();
2553  }
2555  void _getBufferUTF32Str() const
2556  {
2557  if ( m_bufferType != bt_utf32string )
2558  {
2559  _cleanBuffer();
2560  m_buffer.mUTF32StrBuffer = new utf32string();
2561  m_bufferType = bt_utf32string;
2562  }
2563  m_buffer.mUTF32StrBuffer->clear();
2564  }
2565 
2566  void _load_buffer_UTF8() const
2567  {
2568  _getBufferStr();
2569  std::string& buffer = ( *m_buffer.mStrBuffer );
2570  buffer.reserve( length() );
2571 
2572  unsigned char utf8buf[6];
2573  char* charbuf = ( char* )utf8buf;
2574  unicode_char c;
2575  size_t len;
2576 
2577  const_iterator i, ie = end();
2578  for ( i = begin(); i != ie; i.moveNext() )
2579  {
2580  c = i.getCharacter();
2581  len = _utf32_to_utf8( c, utf8buf );
2582  size_t j = 0;
2583  while ( j < len )
2584  buffer.push_back( charbuf[j++] );
2585  }
2586  }
2587  void _load_buffer_WStr() const
2588  {
2589  _getBufferWStr();
2590  std::wstring& buffer = ( *m_buffer.mWStrBuffer );
2591  buffer.reserve( length() ); // may over reserve, but should be close enough
2592 #ifdef WCHAR_UTF16 // wchar_t matches UTF-16
2593  const_iterator i, ie = end();
2594  for ( i = begin(); i != ie; ++i )
2595  {
2596  buffer.push_back(( wchar_t )( *i ) );
2597  }
2598 #else // wchar_t fits UTF-32
2599  unicode_char c;
2600  const_iterator i, ie = end();
2601  for ( i = begin(); i != ie; i.moveNext() )
2602  {
2603  c = i.getCharacter();
2604  buffer.push_back(( wchar_t )c );
2605  }
2606 #endif
2607  }
2608  void _load_buffer_UTF32() const
2609  {
2610  _getBufferUTF32Str();
2611  utf32string& buffer = ( *m_buffer.mUTF32StrBuffer );
2612  buffer.reserve( length() ); // may over reserve, but should be close enough
2613 
2614  unicode_char c;
2615 
2616  const_iterator i, ie = end();
2617  for ( i = begin(); i != ie; i.moveNext() )
2618  {
2619  c = i.getCharacter();
2620  buffer.push_back( c );
2621  }
2622  }
2623 
2624  mutable BufferType m_bufferType; // identifies the data type held in m_buffer
2625  mutable size_t m_bufferSize; // size of the CString buffer
2626 
2627  // multi-purpose buffer used everywhere we need a throw-away buffer
2628  union Buffer
2629  {
2630  mutable void* mVoidBuffer;
2631  mutable std::string* mStrBuffer;
2632  mutable std::wstring* mWStrBuffer;
2633  mutable utf32string* mUTF32StrBuffer;
2634  }
2635  m_buffer;
2636  };
2637 
2639  inline UString operator+( const UString& s1, const UString& s2 )
2640  {
2641  return UString( s1 ).append( s2 );
2642  }
2644  inline UString operator+( const UString& s1, UString::code_point c )
2645  {
2646  return UString( s1 ).append( 1, c );
2647  }
2649  inline UString operator+( const UString& s1, UString::unicode_char c )
2650  {
2651  return UString( s1 ).append( 1, c );
2652  }
2654  inline UString operator+( const UString& s1, char c )
2655  {
2656  return UString( s1 ).append( 1, c );
2657  }
2658 #if MYGUI_IS_NATIVE_WCHAR_T
2659 
2660  inline UString operator+( const UString& s1, wchar_t c )
2661  {
2662  return UString( s1 ).append( 1, c );
2663  }
2664 #endif
2665 
2666  inline UString operator+( UString::code_point c, const UString& s2 )
2667  {
2668  return UString().append( 1, c ).append( s2 );
2669  }
2671  inline UString operator+( UString::unicode_char c, const UString& s2 )
2672  {
2673  return UString().append( 1, c ).append( s2 );
2674  }
2676  inline UString operator+( char c, const UString& s2 )
2677  {
2678  return UString().append( 1, c ).append( s2 );
2679  }
2680 #if MYGUI_IS_NATIVE_WCHAR_T
2681 
2682  inline UString operator+( wchar_t c, const UString& s2 )
2683  {
2684  return UString().append( 1, c ).append( s2 );
2685  }
2686 #endif
2687 
2688  // (const) forward iterator common operators
2690  {
2691  return ( left.mIter - right.mIter );
2692  }
2694  {
2695  return left.mIter == right.mIter;
2696  }
2698  {
2699  return left.mIter != right.mIter;
2700  }
2702  {
2703  return left.mIter < right.mIter;
2704  }
2706  {
2707  return left.mIter <= right.mIter;
2708  }
2710  {
2711  return left.mIter > right.mIter;
2712  }
2714  {
2715  return left.mIter >= right.mIter;
2716  }
2717 
2718  // (const) reverse iterator common operators
2719  // NB: many of these operations are evaluated in reverse because this is a reverse iterator wrapping a forward iterator
2721  {
2722  return ( right.mIter - left.mIter );
2723  }
2725  {
2726  return left.mIter == right.mIter;
2727  }
2729  {
2730  return left.mIter != right.mIter;
2731  }
2733  {
2734  return right.mIter < left.mIter;
2735  }
2737  {
2738  return right.mIter <= left.mIter;
2739  }
2741  {
2742  return right.mIter > left.mIter;
2743  }
2745  {
2746  return right.mIter >= left.mIter;
2747  }
2748 
2750  inline std::ostream& operator << ( std::ostream& os, const UString& s )
2751  {
2752  return os << s.asUTF8();
2753  }
2754 
2756  //inline std::wostream& operator << ( std::wostream& os, const UString& s )
2757  //{
2758  // return os << s.asWStr();
2759  //}
2760 
2761 
2762 
2763 }
2764 
2765 #endif // __MYGUI_U_STRING_H__