1 // Created on: 2013-01-28
2 // Created by: Kirill GAVRILOV
3 // Copyright (c) 2013-2014 OPEN CASCADE SAS
5 // This file is part of Open CASCADE Technology software library.
7 // This library is free software; you can redistribute it and/or modify it under
8 // the terms of the GNU Lesser General Public License version 2.1 as published
9 // by the Free Software Foundation, with special exception defined in the file
10 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11 // distribution for complete text of the license and disclaimer of any warranty.
13 // Alternatively, this file may be used under the terms of Open CASCADE
14 // commercial license or contractual agreement.
16 #ifndef _NCollection_UtfIterator_H__
17 #define _NCollection_UtfIterator_H__
19 #include <Standard_TypeDef.hxx>
21 //! Template class for Unicode string support.
22 //! It defines an iterator and provides a correct way to read multi-byte text (UTF-8 and UTF-16)
23 //! and to convert it from one encoding to another.
24 //! The current value of the iterator is returned as a UTF-32 Unicode code point.
//! @tparam Type element type of the buffer being iterated (the iterator walks a const Type* buffer)
25 template<typename Type>
26 class NCollection_UtfIterator
32 //! @param theString buffer to iterate; it is compared against NULL in the constructor body
//! The current position (myPosition) is initialized to theString.
//! NOTE(review): the constructor is not declared explicit, so a const Type* converts
//! implicitly to an iterator - confirm that this is intended.
33 NCollection_UtfIterator (const Type* theString)
34 : myPosition(theString),
39 if (theString != NULL)
46 //! Initialize the iterator with the specified NULL-terminated string.
//! Both the current position and the next position are reset to theString,
//! and theString is then compared against NULL.
//! @param theString buffer to iterate
47 void Init (const Type* theString)
49 myPosition = theString;
50 myPosNext = theString;
52 if (theString != NULL)
59 //! Pre-increment operator. Reads the next Unicode character.
60 //! Notice - no protection against overrun!
//! The current position is moved to the start of the next character, which is then decoded
//! by readUTF8() (case 1), by readUTF16() (case 2), or otherwise by taking one buffer element
//! directly as the UTF-32 code and stepping the next position past it.
//! NOTE(review): the dispatch value is presumably the size of Type in bytes - confirm.
//! @return iterator reference (presumably this iterator - confirm)
61 NCollection_UtfIterator& operator++()
63 myPosition = myPosNext;
67 case 1: readUTF8(); break;
68 case 2: readUTF16(); break;
71 myCharUtf32 = *myPosNext++;
76 //! Post-increment operator.
77 //! Notice - no protection against overrun!
//! A copy of the current iterator state (aCopy) is taken first; the result is returned by value.
//! NOTE(review): presumably the returned value is that copy, i.e. the state before the increment - confirm.
78 NCollection_UtfIterator operator++ (int )
80 NCollection_UtfIterator aCopy = *this;
85 //! Equality operator.
//! Only the current buffer positions (myPosition) are compared;
//! the decoded character and the character index take no part in the comparison.
//! @param theRight iterator to compare with
//! @return true if both iterators have the same current buffer position
86 bool operator== (const NCollection_UtfIterator& theRight) const
88 return myPosition == theRight.myPosition;
91 //! Return true if the current Unicode symbol is within the valid range.
//! The current UTF-32 code is compared against UTF32_MAX_LEGAL, which is an inclusive upper bound.
94 return myCharUtf32 <= UTF32_MAX_LEGAL;
97 //! Dereference operator.
98 //! @return the UTF-32 code point of the character the iterator currently points to.
99 Standard_Utf32Char operator*() const
104 //! Buffer-fetching getter.
//! @return read-only pointer to the buffer position of the first element in the current character
105 const Type* BufferHere() const { return myPosition; }
107 //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
//! Returns the current buffer position with its constness cast away.
//! @return writable pointer to the buffer position of the first element in the current character
//! NOTE(review): a C-style cast is used to drop const; const_cast<Type*> would state the intent explicitly.
108 Type* ChangeBufferHere() { return (Type* )myPosition; }
110 //! Buffer-fetching getter.
//! @return read-only pointer to the buffer position of the first element in the next character
111 const Type* BufferNext() const { return myPosNext; }
113 //! @return the index displacement from iterator initialization
114 Standard_Integer Index() const
119 //! @return the advance in bytes needed to store the current symbol in UTF-8.
120 //! 0 means an invalid symbol;
121 //! 1-4 bytes are the valid range.
122 Standard_Integer AdvanceBytesUtf8() const;
124 //! @return the advance in bytes needed to store the current symbol in UTF-16.
125 //! 0 means an invalid symbol;
126 //! 2 bytes is the general case;
127 //! 4 bytes for a surrogate pair.
128 Standard_Integer AdvanceBytesUtf16() const;
130 //! @return the advance in 16-bit code units needed to store the current symbol in UTF-16.
131 //! 0 means an invalid symbol;
132 //! 1 16-bit code unit is the general case;
133 //! 2 16-bit code units for a surrogate pair.
134 Standard_Integer AdvanceCodeUnitsUtf16() const;
136 //! @return the advance in bytes needed to store the current symbol in UTF-32.
137 //! Always 4 bytes (method for consistency).
//! The value is sizeof(Standard_Utf32Char) and does not depend on the current symbol.
138 Standard_Integer AdvanceBytesUtf32() const
140 return Standard_Integer(sizeof(Standard_Utf32Char));
143 //! Write the current Unicode symbol into the UTF-8 buffer.
144 //! Use method AdvanceBytesUtf8() to allocate a buffer of sufficient size.
145 //! @param theBuffer buffer to fill
146 //! @return new buffer position (for next char)
//! Two overloads are declared: one for Standard_Utf8Char and one for Standard_Utf8UChar buffers.
147 Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
148 Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
150 //! Write the current Unicode symbol into the UTF-16 buffer.
151 //! Use method AdvanceBytesUtf16() to allocate a buffer of sufficient size.
152 //! @param theBuffer buffer to fill
153 //! @return new buffer position (for next char)
154 Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
156 //! Write the current Unicode symbol into the UTF-32 buffer.
157 //! Use method AdvanceBytesUtf32() to allocate a buffer of sufficient size.
158 //! @param theBuffer buffer to fill
159 //! @return new buffer position (for next char)
160 Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
162 //! @return the advance in TypeWrite chars needed to store current symbol
//! @tparam TypeWrite character type of the target buffer
//! NOTE(review): the method name says "Bytes" while the comment above says "TypeWrite chars" -
//! confirm the actual unit against the implementation.
163 template<typename TypeWrite>
164 Standard_Integer AdvanceBytesUtf() const;
166 //! Write the current Unicode symbol into the UTF-** buffer.
167 //! Use method AdvanceBytesUtf**() to allocate a buffer of sufficient size.
168 //! @param theBuffer buffer to fill
169 //! @return new buffer position (for next char)
//! @tparam TypeWrite character type of the target buffer
170 template<typename TypeWrite>
171 TypeWrite* GetUtf (TypeWrite* theBuffer) const;
175 //! Helper function for reading a single UTF8 character from the string.
176 //! Updates internal state appropriately.
179 //! Helper function for reading a single UTF16 character from the string.
180 //! Updates internal state appropriately.
183 private: //! @name Unicode magic numbers
//! The constants below are only declared here, without initializers;
//! their values are presumably defined in the template implementation file - confirm.
185 static const unsigned char UTF8_BYTES_MINUS_ONE[256];
186 static const unsigned long offsetsFromUTF8[6];
187 static const unsigned char UTF8_FIRST_BYTE_MARK[7];
188 static const unsigned long UTF8_BYTE_MASK;
189 static const unsigned long UTF8_BYTE_MARK;
190 static const unsigned long UTF16_SURROGATE_HIGH_START;
191 static const unsigned long UTF16_SURROGATE_HIGH_END;
192 static const unsigned long UTF16_SURROGATE_LOW_START;
193 static const unsigned long UTF16_SURROGATE_LOW_END;
194 static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
195 static const unsigned long UTF16_SURROGATE_LOW_BASE;
196 static const unsigned long UTF16_SURROGATE_LOW_MASK;
197 static const unsigned long UTF32_MAX_BMP;
198 static const unsigned long UTF32_MAX_LEGAL; //!< inclusive upper bound used by the validity check
200 private: //! @name private fields
202 const Type* myPosition; //!< buffer position of the first element in the current character
203 const Type* myPosNext; //!< buffer position of the first element in the next character
204 Standard_Integer myCharIndex; //!< index displacement from iterator initialization
205 Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
//! Convenience aliases of the iterator for each supported buffer element type.
209 typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter; //!< iterator over a Standard_Utf8Char buffer
210 typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter; //!< iterator over a Standard_Utf16Char buffer
211 typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter; //!< iterator over a Standard_Utf32Char buffer
212 typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter; //!< iterator over a Standard_WideChar buffer
214 // template implementation
215 #include "NCollection_UtfIterator.lxx"
217 #endif // _NCollection_UtfIterator_H__