src/NCollection/NCollection_UtfIterator.lxx

   1 // Created on: 2013-01-28
   2 // Created by: Kirill GAVRILOV
   3 // Copyright (c) 2013-2014 OPEN CASCADE SAS
   4 //
   5 // This file is part of Open CASCADE Technology software library.
   6 //
   7 // This library is free software; you can redistribute it and/or modify it under
   8 // the terms of the GNU Lesser General Public License version 2.1 as published
   9 // by the Free Software Foundation, with special exception defined in the file
  10 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
  11 // distribution for complete text of the license and disclaimer of any warranty.
  12 //
  13 // Alternatively, this file may be used under the terms of Open CASCADE
  14 // commercial license or contractual agreement.
  15
  16 // Portions of code are copyrighted by Unicode, Inc.
  17 //
  18 // Copyright (c) 2001-2004 Unicode, Inc.
  19 //
  20 // Disclaimer
  21 //
  22 // This source code is provided as is by Unicode, Inc. No claims are
  23 // made as to fitness for any particular purpose. No warranties of any
  24 // kind are expressed or implied. The recipient agrees to determine
  25 // applicability of information provided. If this file has been
  26 // purchased on magnetic or optical media from Unicode, Inc., the
  27 // sole remedy for any claim will be exchange of defective media
  28 // within 90 days of receipt.
  29 //
  30 // Limitations on Rights to Redistribute This Code
  31 //
  32 // Unicode, Inc. hereby grants the right to freely use the information
  33 // supplied in this file in the creation of products supporting the
  34 // Unicode Standard, and to make copies of this file in any form
  35 // for internal or external distribution as long as this notice
  36 // remains attached.
  37
  38 //! The first character in a UTF-8 sequence indicates how many bytes
  39 //! to read (among other things).
  40 template<typename Type>
  41 const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] =
  42 {
  43   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  44   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  45   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  46   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  47   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  48   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  49   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  50   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
  51 };
  52
  53 //! Magic values subtracted from a buffer value during UTF-8 conversion.
  54 //! This table contains as many values as there might be trailing bytes
  55 //! in a UTF-8 sequence.
  56 template<typename Type>
  57 const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] =
  58 {
  59   0x00000000UL, 0x00003080UL, 0x000E2080UL,
  60   0x03C82080UL, 0xFA082080UL, 0x82082080UL
  61 };
  62
  63 //! The first character in a UTF-8 sequence indicates how many bytes to read.
  64 template<typename Type>
  65 const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  66
  67 // =======================================================================
  68 // function : readUTF8
  69 // purpose  : Get a UTF-8 character; leave the tracking pointer at the start of the next character.
  70 //            Not protected against invalid UTF-8.
  71 // =======================================================================
  72 template<typename Type>
  73 inline void NCollection_UtfIterator<Type>::readUTF8()
  74 {
  75   // unsigned arithmetic used
  76   Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext;
  77   const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos];
  78   myCharUtf32 = 0;
  79   switch (aBytesToRead)
  80   {
  81     case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
  82     case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
  83     case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
  84     case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
  85     case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
  86     case 0: myCharUtf32 += *aPos++;
  87   }
  88   myCharUtf32 -= offsetsFromUTF8[aBytesToRead];
  89   myPosNext = (Type* )aPos;
  90 }
  91
  92 // magic numbers
  93 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;
  94 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;
  95 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;
  96 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END   = 0xDBFF;
  97 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START  = 0xDC00;
  98 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END    = 0xDFFF;
  99 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;
 100 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE   = 0x0010000UL;
 101 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK   = 0x3FFUL;
 102 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP   = 0x0000FFFFUL;
 103 template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;
 104
 105 // =======================================================================
 106 // function : readUTF16
 107 // purpose  :
 108 // =======================================================================
 109 template<typename Type> inline
 110 void NCollection_UtfIterator<Type>::readUTF16()
 111 {
 112   Standard_Utf32Char aChar = *myPosNext++;
 113   // if we have the first half of the surrogate pair
 114   if (aChar >= UTF16_SURROGATE_HIGH_START
 115    && aChar <= UTF16_SURROGATE_HIGH_END)
 116   {
 117     const Standard_Utf32Char aChar2 = *myPosNext;
 118     // complete the surrogate pair
 119     if (aChar2 >= UTF16_SURROGATE_LOW_START
 120      && aChar2 <= UTF16_SURROGATE_LOW_END)
 121     {
 122       aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
 123             + (aChar2 - UTF16_SURROGATE_LOW_START)   + UTF16_SURROGATE_LOW_BASE;
 124       ++myPosNext;
 125     }
 126   }
 127   myCharUtf32 = aChar;
 128 }
 129
 130 // =======================================================================
 131 // function : AdvanceBytesUtf8
 132 // purpose  :
 133 // =======================================================================
 134 template<typename Type> inline
 135 Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
 136 {
 137   if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
 138    && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
 139    {
 140     // UTF-16 surrogate values are illegal in UTF-32
 141     return 0;
 142   }
 143   else if (myCharUtf32 < Standard_Utf32Char(0x80))
 144   {
 145     return 1;
 146   }
 147   else if (myCharUtf32 < Standard_Utf32Char(0x800))
 148   {
 149     return 2;
 150   }
 151   else if (myCharUtf32 < Standard_Utf32Char(0x10000))
 152   {
 153     return 3;
 154   }
 155   else if (myCharUtf32 <= UTF32_MAX_LEGAL)
 156   {
 157     return 4;
 158   }
 159   else
 160   {
 161     // illegal
 162     return 0;
 163   }
 164 }
 165
 166 // =======================================================================
 167 // function : GetUtf8
 168 // purpose  :
 169 // =======================================================================
 170 template<typename Type> inline
 171 Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const
 172 {
 173   // unsigned arithmetic used
 174   return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
 175 }
 176
 177 // =======================================================================
 178 // function : GetUtf8
 179 // purpose  :
 180 // =======================================================================
 181 template<typename Type> inline
 182 Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const
 183 {
 184   Standard_Utf32Char aChar = myCharUtf32;
 185   if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
 186    && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
 187   {
 188     // UTF-16 surrogate values are illegal in UTF-32
 189     return theBuffer;
 190   }
 191   else if (myCharUtf32 < Standard_Utf32Char(0x80))
 192   {
 193     *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]);
 194     return theBuffer;
 195   }
 196   else if (myCharUtf32 < Standard_Utf32Char(0x800))
 197   {
 198     *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 199     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]);
 200     return theBuffer + 2;
 201   }
 202   else if (myCharUtf32 < Standard_Utf32Char(0x10000))
 203   {
 204     theBuffer += 3;
 205     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 206     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 207     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]);
 208     return theBuffer + 3;
 209   }
 210   else if (myCharUtf32 <= UTF32_MAX_LEGAL)
 211   {
 212     theBuffer += 4;
 213     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 214     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 215     *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
 216     *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]);
 217     return theBuffer + 4;
 218   }
 219   else
 220   {
 221     // illegal
 222     return theBuffer;
 223   }
 224 }
 225
 226 // =======================================================================
 227 // function : AdvanceBytesUtf16
 228 // purpose  :
 229 // =======================================================================
 230 template<typename Type> inline
 231 Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
 232 {
 233   if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
 234   {
 235     // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
 236     if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
 237      && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
 238     {
 239       return 0;
 240     }
 241     else
 242     {
 243       return Standard_Integer(sizeof(Standard_Utf16Char));
 244     }
 245   }
 246   else if (myCharUtf32 > UTF32_MAX_LEGAL)
 247   {
 248     // illegal
 249     return 0;
 250   }
 251   else
 252   {
 253     // target is a character in range 0xFFFF - 0x10FFFF
 254     // surrogate pair
 255     return Standard_Integer(sizeof(Standard_Utf16Char) * 2);
 256   }
 257 }
 258
 259 // =======================================================================
 260 // function : GetUtf16
 261 // purpose  :
 262 // =======================================================================
 263 template<typename Type> inline
 264 Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const
 265 {
 266   if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
 267   {
 268     // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
 269     if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
 270      && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
 271     {
 272       return theBuffer;
 273     }
 274     else
 275     {
 276       *theBuffer++ = Standard_Utf16Char(myCharUtf32);
 277       return theBuffer;
 278     }
 279   }
 280   else if (myCharUtf32 > UTF32_MAX_LEGAL)
 281   {
 282     // illegal
 283     return theBuffer;
 284   }
 285   else
 286   {
 287     // surrogate pair
 288     Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
 289     *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
 290     *theBuffer++ = Standard_Utf16Char((aChar &  UTF16_SURROGATE_LOW_MASK)   + UTF16_SURROGATE_LOW_START);
 291     return theBuffer;
 292   }
 293 }
 294
 295 // =======================================================================
 296 // function : GetUtf32
 297 // purpose  :
 298 // =======================================================================
 299 template<typename Type> inline
 300 Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const
 301 {
 302   *theBuffer++ = myCharUtf32;
 303   return theBuffer;
 304 }
 305
 306 // =======================================================================
 307 // function : AdvanceBytesUtf
 308 // purpose  :
 309 // =======================================================================
 310 template<typename Type> template<typename TypeWrite> inline
 311 Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf() const
 312 {
 313   switch (sizeof(TypeWrite))
 314   {
 315     case sizeof(Standard_Utf8Char):  return AdvanceBytesUtf8();
 316     case sizeof(Standard_Utf16Char): return AdvanceBytesUtf16();
 317     case sizeof(Standard_Utf32Char): return AdvanceBytesUtf32();
 318     default:                         return 0; // invalid case
 319   }
 320 }
 321
 322 // =======================================================================
 323 // function : GetUtf
 324 // purpose  :
 325 // =======================================================================
 326 template<typename Type> template<typename TypeWrite> inline
 327 TypeWrite* NCollection_UtfIterator<Type>::GetUtf (TypeWrite* theBuffer) const
 328 {
 329   switch (sizeof(TypeWrite))
 330   {
 331     case sizeof(Standard_Utf8Char):  return (TypeWrite* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
 332     case sizeof(Standard_Utf16Char): return (TypeWrite* )GetUtf16((Standard_Utf16Char* )theBuffer);
 333     case sizeof(Standard_Utf32Char): return (TypeWrite* )GetUtf32((Standard_Utf32Char* )theBuffer);
 334     default:                         return NULL; // invalid case
 335   }
 336 }