src/NCollection/NCollection_UtfIterator.hxx

   1 // Created on: 2013-01-28
   2 // Created by: Kirill GAVRILOV
   3 // Copyright (c) 2013-2014 OPEN CASCADE SAS
   4 //
   5 // This file is part of Open CASCADE Technology software library.
   6 //
   7 // This library is free software; you can redistribute it and/or modify it under
   8 // the terms of the GNU Lesser General Public License version 2.1 as published
   9 // by the Free Software Foundation, with special exception defined in the file
  10 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
  11 // distribution for complete text of the license and disclaimer of any warranty.
  12 //
  13 // Alternatively, this file may be used under the terms of Open CASCADE
  14 // commercial license or contractual agreement.
  15
  16 #ifndef _NCollection_UtfIterator_H__
  17 #define _NCollection_UtfIterator_H__
  18
  19 #include <Standard_TypeDef.hxx>
  20
  21 //! Template class for Unicode strings support.
  22 //! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16)
  23 //! and convert it from one to another.
  24 //! The current value of iterator returned as UTF-32 Unicode code.
  25 template<typename Type>
  26 class NCollection_UtfIterator
  27 {
  28
  29 public:
  30
  31   //! Constructor.
  32   //! @param theString buffer to iterate
  33   NCollection_UtfIterator (const Type* theString)
  34   : myPosition(theString),
  35     myPosNext(theString),
  36     myCharIndex(0),
  37     myCharUtf32(0)
  38   {
  39     if (theString != NULL)
  40     {
  41       ++(*this);
  42       myCharIndex = 0;
  43     }
  44   }
  45
  46   //! Initialize iterator within specified NULL-terminated string.
  47   void Init (const Type* theString)
  48   {
  49     myPosition  = theString;
  50     myPosNext   = theString;
  51     myCharUtf32 = 0;
  52     if (theString != NULL)
  53     {
  54       ++(*this);
  55     }
  56     myCharIndex = 0;
  57   }
  58
  59   //! Pre-increment operator. Reads the next unicode character.
  60   //! Notice - no protection against overrun!
  61   NCollection_UtfIterator& operator++()
  62   {
  63     myPosition = myPosNext;
  64     ++myCharIndex;
  65     switch (sizeof(Type))
  66     {
  67       case 1: readUTF8();  break;
  68       case 2: readUTF16(); break;
  69       case 4: // UTF-32
  70       default:
  71         myCharUtf32 = *myPosNext++;
  72     }
  73     return *this;
  74   }
  75
  76   //! Post-increment operator.
  77   //! Notice - no protection against overrun!
  78   NCollection_UtfIterator operator++ (int )
  79   {
  80     NCollection_UtfIterator aCopy = *this;
  81     ++*this;
  82     return aCopy;
  83   }
  84
  85   //! Equality operator.
  86   bool operator== (const NCollection_UtfIterator& theRight) const
  87   {
  88     return myPosition == theRight.myPosition;
  89   }
  90
  91   //! Dereference operator.
  92   //! @return the UTF-32 codepoint of the character currently pointed by iterator.
  93   Standard_Utf32Char operator*() const
  94   {
  95     return myCharUtf32;
  96   }
  97
  98   //! Buffer-fetching getter.
  99   const Type* BufferHere() const { return myPosition; }
 100
 101   //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
 102   Type* ChangeBufferHere() { return (Type* )myPosition; }
 103
 104   //! Buffer-fetching getter.
 105   const Type* BufferNext() const { return myPosNext; }
 106
 107   //! @return the index displacement from iterator intialization
 108   Standard_Integer Index() const
 109   {
 110     return myCharIndex;
 111   }
 112
 113   //! @return the advance in bytes to store current symbol in UTF-8.
 114   //! 0 means an invalid symbol;
 115   //! 1-4 bytes are valid range.
 116   Standard_Integer AdvanceBytesUtf8() const;
 117
 118   //! @return the advance in bytes to store current symbol in UTF-16.
 119   //! 0 means an invalid symbol;
 120   //! 2 bytes is a general case;
 121   //! 4 bytes for surrogate pair.
 122   Standard_Integer AdvanceBytesUtf16() const;
 123
 124   //! @return the advance in bytes to store current symbol in UTF-32.
 125   //! Always 4 bytes (method for consistency).
 126   Standard_Integer AdvanceBytesUtf32() const
 127   {
 128     return Standard_Integer(sizeof(Standard_Utf32Char));
 129   }
 130
 131   //! Fill the UTF-8 buffer within current Unicode symbol.
 132   //! Use method AdvanceUtf8() to allocate buffer with enough size.
 133   //! @param theBuffer buffer to fill
 134   //! @return new buffer position (for next char)
 135   Standard_Utf8Char*  GetUtf8 (Standard_Utf8Char*  theBuffer) const;
 136   Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
 137
 138   //! Fill the UTF-16 buffer within current Unicode symbol.
 139   //! Use method AdvanceUtf16() to allocate buffer with enough size.
 140   //! @param theBuffer buffer to fill
 141   //! @return new buffer position (for next char)
 142   Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
 143
 144   //! Fill the UTF-32 buffer within current Unicode symbol.
 145   //! Use method AdvanceUtf32() to allocate buffer with enough size.
 146   //! @param theBuffer buffer to fill
 147   //! @return new buffer position (for next char)
 148   Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
 149
 150   //! @return the advance in TypeWrite chars needed to store current symbol
 151   template<typename TypeWrite>
 152   Standard_Integer AdvanceBytesUtf() const;
 153
 154   //! Fill the UTF-** buffer within current Unicode symbol.
 155   //! Use method AdvanceUtf**() to allocate buffer with enough size.
 156   //! @param theBuffer buffer to fill
 157   //! @return new buffer position (for next char)
 158   template<typename TypeWrite>
 159   TypeWrite* GetUtf (TypeWrite* theBuffer) const;
 160
 161 private:
 162
 163   //! Helper function for reading a single UTF8 character from the string.
 164   //! Updates internal state appropriately.
 165   void readUTF8();
 166
 167   //! Helper function for reading a single UTF16 character from the string.
 168   //! Updates internal state appropriately.
 169   void readUTF16();
 170
 171 private: //! @name unicode magic numbers
 172
 173   static const unsigned char UTF8_BYTES_MINUS_ONE[256];
 174   static const unsigned long offsetsFromUTF8[6];
 175   static const unsigned char UTF8_FIRST_BYTE_MARK[7];
 176   static const unsigned long UTF8_BYTE_MASK;
 177   static const unsigned long UTF8_BYTE_MARK;
 178   static const unsigned long UTF16_SURROGATE_HIGH_START;
 179   static const unsigned long UTF16_SURROGATE_HIGH_END;
 180   static const unsigned long UTF16_SURROGATE_LOW_START;
 181   static const unsigned long UTF16_SURROGATE_LOW_END;
 182   static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
 183   static const unsigned long UTF16_SURROGATE_LOW_BASE;
 184   static const unsigned long UTF16_SURROGATE_LOW_MASK;
 185   static const unsigned long UTF32_MAX_BMP;
 186   static const unsigned long UTF32_MAX_LEGAL;
 187
 188 private: //! @name private fields
 189
 190   const Type*        myPosition;  //!< buffer position of the first element in the current character
 191   const Type*        myPosNext;   //!< buffer position of the first element in the next character
 192   Standard_Integer   myCharIndex; //!< index displacement from iterator intialization
 193   Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
 194
 195 };
 196
 197 typedef NCollection_UtfIterator<Standard_Utf8Char>  NCollection_Utf8Iter;    //!< iterator over Standard_Utf8Char strings
 198 typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;   //!< iterator over Standard_Utf16Char strings
 199 typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;   //!< iterator over Standard_Utf32Char strings
 200 typedef NCollection_UtfIterator<Standard_WideChar>  NCollection_UtfWideIter; //!< iterator over Standard_WideChar strings (decoder follows sizeof(Standard_WideChar))
 201
 202 // template implementation
 203 #include "NCollection_UtfIterator.lxx"
 204
 205 #endif // _NCollection_UtfIterator_H__