src/NCollection/NCollection_UtfIterator.hxx

   1 // Created on: 2013-01-28
   2 // Created by: Kirill GAVRILOV
   3 // Copyright (c) 2013-2014 OPEN CASCADE SAS
   4 //
   5 // This file is part of Open CASCADE Technology software library.
   6 //
   7 // This library is free software; you can redistribute it and/or modify it under
   8 // the terms of the GNU Lesser General Public License version 2.1 as published
   9 // by the Free Software Foundation, with special exception defined in the file
  10 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
  11 // distribution for complete text of the license and disclaimer of any warranty.
  12 //
  13 // Alternatively, this file may be used under the terms of Open CASCADE
  14 // commercial license or contractual agreement.
  15
  16 #ifndef _NCollection_UtfIterator_H__
  17 #define _NCollection_UtfIterator_H__
  18
  19 #include <Standard_TypeDef.hxx>
  20
  21 //! Template class for Unicode strings support.
  22 //! It defines an iterator and provides a correct way to read multi-byte text (UTF-8 and UTF-16)
  23 //! and to convert it from one encoding to another.
  24 //! The current value of the iterator is returned as a UTF-32 Unicode code point.
  25 template<typename Type>
  26 class NCollection_UtfIterator
  27 {
  28
  29 public:
  30
  31   //! Constructor.
  32   //! @param theString buffer to iterate
  33   NCollection_UtfIterator (const Type* theString)
  34   : myPosition(theString),
  35     myPosNext(theString),
  36     myCharIndex(0),
  37     myCharUtf32(0)
  38   {
  39     if (theString != NULL)
  40     {
  41       ++(*this);
  42       myCharIndex = 0;
  43     }
  44   }
  45
  46   //! Initialize iterator within specified NULL-terminated string.
  47   void Init (const Type* theString)
  48   {
  49     myPosition  = theString;
  50     myPosNext   = theString;
  51     myCharUtf32 = 0;
  52     if (theString != NULL)
  53     {
  54       ++(*this);
  55     }
  56     myCharIndex = 0;
  57   }
  58
  59   //! Pre-increment operator. Reads the next unicode character.
  60   //! Notice - no protection against overrun!
  61   NCollection_UtfIterator& operator++()
  62   {
  63     myPosition = myPosNext;
  64     ++myCharIndex;
  65     switch (sizeof(Type))
  66     {
  67       case 1: readUTF8();  break;
  68       case 2: readUTF16(); break;
  69       case 4: // UTF-32
  70       default:
  71         myCharUtf32 = *myPosNext++;
  72     }
  73     return *this;
  74   }
  75
  76   //! Post-increment operator.
  77   //! Notice - no protection against overrun!
  78   NCollection_UtfIterator operator++ (int )
  79   {
  80     NCollection_UtfIterator aCopy = *this;
  81     ++*this;
  82     return aCopy;
  83   }
  84
  85   //! Equality operator.
  86   bool operator== (const NCollection_UtfIterator& theRight) const
  87   {
  88     return myPosition == theRight.myPosition;
  89   }
  90
  91   //! Return true if Unicode symbol is within valid range.
  92   bool IsValid() const
  93   {
  94     return myCharUtf32 <= UTF32_MAX_LEGAL;
  95   }
  96
  97   //! Dereference operator.
  98   //! @return the UTF-32 codepoint of the character currently pointed by iterator.
  99   Standard_Utf32Char operator*() const
 100   {
 101     return myCharUtf32;
 102   }
 103
 104   //! Buffer-fetching getter.
 105   const Type* BufferHere() const { return myPosition; }
 106
 107   //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
 108   Type* ChangeBufferHere() { return (Type* )myPosition; }
 109
 110   //! Buffer-fetching getter.
 111   const Type* BufferNext() const { return myPosNext; }
 112
 113   //! @return the index displacement from iterator intialization
 114   Standard_Integer Index() const
 115   {
 116     return myCharIndex;
 117   }
 118
 119   //! @return the advance in bytes to store current symbol in UTF-8.
 120   //! 0 means an invalid symbol;
 121   //! 1-4 bytes are valid range.
 122   Standard_Integer AdvanceBytesUtf8() const;
 123
 124   //! @return the advance in bytes to store current symbol in UTF-16.
 125   //! 0 means an invalid symbol;
 126   //! 2 bytes is a general case;
 127   //! 4 bytes for surrogate pair.
 128   Standard_Integer AdvanceBytesUtf16() const;
 129
 130   //! @return the advance in bytes to store current symbol in UTF-16.
 131   //! 0 means an invalid symbol;
 132   //! 1 16-bit code unit is a general case;
 133   //! 2 16-bit code units for surrogate pair.
 134   Standard_Integer AdvanceCodeUnitsUtf16() const;
 135
 136   //! @return the advance in bytes to store current symbol in UTF-32.
 137   //! Always 4 bytes (method for consistency).
 138   Standard_Integer AdvanceBytesUtf32() const
 139   {
 140     return Standard_Integer(sizeof(Standard_Utf32Char));
 141   }
 142
 143   //! Fill the UTF-8 buffer within current Unicode symbol.
 144   //! Use method AdvanceUtf8() to allocate buffer with enough size.
 145   //! @param theBuffer buffer to fill
 146   //! @return new buffer position (for next char)
 147   Standard_Utf8Char*  GetUtf8 (Standard_Utf8Char*  theBuffer) const;
 148   Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
 149
 150   //! Fill the UTF-16 buffer within current Unicode symbol.
 151   //! Use method AdvanceUtf16() to allocate buffer with enough size.
 152   //! @param theBuffer buffer to fill
 153   //! @return new buffer position (for next char)
 154   Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
 155
 156   //! Fill the UTF-32 buffer within current Unicode symbol.
 157   //! Use method AdvanceUtf32() to allocate buffer with enough size.
 158   //! @param theBuffer buffer to fill
 159   //! @return new buffer position (for next char)
 160   Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
 161
 162   //! @return the advance in TypeWrite chars needed to store current symbol
 163   template<typename TypeWrite>
 164   Standard_Integer AdvanceBytesUtf() const;
 165
 166   //! Fill the UTF-** buffer within current Unicode symbol.
 167   //! Use method AdvanceUtf**() to allocate buffer with enough size.
 168   //! @param theBuffer buffer to fill
 169   //! @return new buffer position (for next char)
 170   template<typename TypeWrite>
 171   TypeWrite* GetUtf (TypeWrite* theBuffer) const;
 172
 173 private:
 174
 175   //! Helper function for reading a single UTF8 character from the string.
 176   //! Updates internal state appropriately.
 177   void readUTF8();
 178
 179   //! Helper function for reading a single UTF16 character from the string.
 180   //! Updates internal state appropriately.
 181   void readUTF16();
 182
 183 private: //! @name unicode magic numbers
 184
 185   static const unsigned char UTF8_BYTES_MINUS_ONE[256];
 186   static const unsigned long offsetsFromUTF8[6];
 187   static const unsigned char UTF8_FIRST_BYTE_MARK[7];
 188   static const unsigned long UTF8_BYTE_MASK;
 189   static const unsigned long UTF8_BYTE_MARK;
 190   static const unsigned long UTF16_SURROGATE_HIGH_START;
 191   static const unsigned long UTF16_SURROGATE_HIGH_END;
 192   static const unsigned long UTF16_SURROGATE_LOW_START;
 193   static const unsigned long UTF16_SURROGATE_LOW_END;
 194   static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
 195   static const unsigned long UTF16_SURROGATE_LOW_BASE;
 196   static const unsigned long UTF16_SURROGATE_LOW_MASK;
 197   static const unsigned long UTF32_MAX_BMP;
 198   static const unsigned long UTF32_MAX_LEGAL;
 199
 200 private: //! @name private fields
 201
 202   const Type*        myPosition;  //!< buffer position of the first element in the current character
 203   const Type*        myPosNext;   //!< buffer position of the first element in the next character
 204   Standard_Integer   myCharIndex; //!< index displacement from iterator intialization
 205   Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
 206
 207 };
 208
 209 typedef NCollection_UtfIterator<Standard_Utf8Char>  NCollection_Utf8Iter;    //!< iterator over a Standard_Utf8Char buffer
 210 typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;   //!< iterator over a Standard_Utf16Char buffer
 211 typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;   //!< iterator over a Standard_Utf32Char buffer
 212 typedef NCollection_UtfIterator<Standard_WideChar>  NCollection_UtfWideIter; //!< iterator over a Standard_WideChar buffer; the decoder is picked by sizeof(Standard_WideChar) in operator++
 213
 214 // template implementation
 215 #include "NCollection_UtfIterator.lxx"
 216
 217 #endif // _NCollection_UtfIterator_H__