1 // Created on: 2013-01-28
2 // Created by: Kirill GAVRILOV
3 // Copyright (c) 2013-2014 OPEN CASCADE SAS
5 // This file is part of Open CASCADE Technology software library.
7 // This library is free software; you can redistribute it and/or modify it under
8 // the terms of the GNU Lesser General Public License version 2.1 as published
9 // by the Free Software Foundation, with special exception defined in the file
10 // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11 // distribution for complete text of the license and disclaimer of any warranty.
13 // Alternatively, this file may be used under the terms of Open CASCADE
14 // commercial license or contractual agreement.
16 #ifndef _NCollection_UtfIterator_H__
17 #define _NCollection_UtfIterator_H__
19 #include <Standard_TypeDef.hxx>
21 //! Template class for Unicode strings support.
22 //! It defines an iterator and provides a correct way to read multi-byte text (UTF-8 and UTF-16)
23 //! and convert it from one encoding to another.
24 //! The current value of the iterator is returned as a UTF-32 Unicode code point.
//! @tparam Type element type of the buffer being iterated
25 template<typename Type>
26 class NCollection_UtfIterator
//! Constructor. Starts with the current position at the beginning of the given buffer.
32 //! @param theString buffer to iterate; a NULL pointer is explicitly checked for below
33 NCollection_UtfIterator (const Type* theString)
34 : myPosition(theString),
39 if (theString != NULL)
46 //! Initialize the iterator within the specified NULL-terminated string.
//! Both the current and the next positions are reset to the start of the buffer.
//! @param theString buffer to iterate; a NULL pointer is explicitly checked for below
47 void Init (const Type* theString)
49 myPosition = theString;
50 myPosNext = theString;
52 if (theString != NULL)
59 //! Pre-increment operator. Reads the next Unicode character.
60 //! Notice - no protection against overrun!
61 NCollection_UtfIterator& operator++()
63 myPosition = myPosNext; // the new current character starts where the next one was recorded
// NOTE(review): the switch expression is not visible here; the case labels presumably
// correspond to the size of Type in bytes - confirm.
67 case 1: readUTF8(); break;
68 case 2: readUTF16(); break;
71 myCharUtf32 = *myPosNext++; // take the element as-is and step the next position past it
76 //! Post-increment operator.
77 //! Notice - no protection against overrun!
//! The result is returned by value, so this form copies the iterator.
78 NCollection_UtfIterator operator++ (int )
80 NCollection_UtfIterator aCopy = *this; // snapshot of the current iterator state
85 //! Equality operator.
//! Two iterators are equal when their current buffer positions coincide;
//! no other field takes part in the comparison.
//! @param theRight iterator to compare with
//! @return true if both iterators point to the same buffer position
86 bool operator== (const NCollection_UtfIterator& theRight) const
88 return myPosition == theRight.myPosition;
91 //! Dereference operator.
92 //! @return the UTF-32 code point of the character currently pointed to by the iterator.
93 Standard_Utf32Char operator*() const
98 //! Buffer-fetching getter.
//! @return the stored current position (first element of the current character)
99 const Type* BufferHere() const { return myPosition; }
101 //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
//! Casts away the constness of the stored current position.
//! @return writable pointer to the current position
102 Type* ChangeBufferHere() { return (Type* )myPosition; }
104 //! Buffer-fetching getter.
//! @return the stored next position (first element of the next character)
105 const Type* BufferNext() const { return myPosNext; }
107 //! @return the index displacement from iterator initialization
108 Standard_Integer Index() const
113 //! @return the number of bytes needed to store the current symbol in UTF-8.
114 //! 0 means an invalid symbol;
115 //! 1-4 bytes are the valid range.
116 Standard_Integer AdvanceBytesUtf8() const;
118 //! @return the number of bytes needed to store the current symbol in UTF-16.
119 //! 0 means an invalid symbol;
120 //! 2 bytes is the general case;
121 //! 4 bytes for a surrogate pair.
122 Standard_Integer AdvanceBytesUtf16() const;
124 //! @return the number of bytes needed to store the current symbol in UTF-32.
125 //! Always 4 bytes (method for consistency).
126 Standard_Integer AdvanceBytesUtf32() const
128 return Standard_Integer(sizeof(Standard_Utf32Char)); // size of a single UTF-32 element
131 //! Fill the UTF-8 buffer with the current Unicode symbol.
132 //! Use method AdvanceBytesUtf8() to allocate a buffer of sufficient size.
133 //! @param theBuffer buffer to fill
134 //! @return new buffer position (for next char)
135 Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
//! Overload of GetUtf8() taking a Standard_Utf8UChar buffer.
136 Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
138 //! Fill the UTF-16 buffer with the current Unicode symbol.
139 //! Use method AdvanceBytesUtf16() to allocate a buffer of sufficient size.
140 //! @param theBuffer buffer to fill
141 //! @return new buffer position (for next char)
142 Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
144 //! Fill the UTF-32 buffer with the current Unicode symbol.
145 //! Use method AdvanceBytesUtf32() to allocate a buffer of sufficient size.
146 //! @param theBuffer buffer to fill
147 //! @return new buffer position (for next char)
148 Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
150 //! @return the advance in TypeWrite chars needed to store current symbol
//! @tparam TypeWrite element type of the destination buffer
151 template<typename TypeWrite>
152 Standard_Integer AdvanceBytesUtf() const;
154 //! Fill the UTF-** buffer with the current Unicode symbol.
155 //! Use method AdvanceBytesUtf() to allocate a buffer of sufficient size.
//! @tparam TypeWrite element type of the destination buffer
156 //! @param theBuffer buffer to fill
157 //! @return new buffer position (for next char)
158 template<typename TypeWrite>
159 TypeWrite* GetUtf (TypeWrite* theBuffer) const;
163 //! Helper function for reading a single UTF-8 character from the string.
164 //! Updates internal state appropriately.
167 //! Helper function for reading a single UTF-16 character from the string.
168 //! Updates internal state appropriately.
171 private: //! @name Unicode magic numbers
// The constants are only declared here; their values are defined elsewhere
// (presumably in the template implementation file included by this header - confirm).
173 static const unsigned char UTF8_BYTES_MINUS_ONE[256];
174 static const unsigned long offsetsFromUTF8[6];
175 static const unsigned char UTF8_FIRST_BYTE_MARK[7];
176 static const unsigned long UTF8_BYTE_MASK;
177 static const unsigned long UTF8_BYTE_MARK;
178 static const unsigned long UTF16_SURROGATE_HIGH_START;
179 static const unsigned long UTF16_SURROGATE_HIGH_END;
180 static const unsigned long UTF16_SURROGATE_LOW_START;
181 static const unsigned long UTF16_SURROGATE_LOW_END;
182 static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
183 static const unsigned long UTF16_SURROGATE_LOW_BASE;
184 static const unsigned long UTF16_SURROGATE_LOW_MASK;
185 static const unsigned long UTF32_MAX_BMP;
186 static const unsigned long UTF32_MAX_LEGAL;
188 private: //! @name private fields
190 const Type* myPosition; //!< buffer position of the first element in the current character
191 const Type* myPosNext; //!< buffer position of the first element in the next character
192 Standard_Integer myCharIndex; //!< index displacement from iterator initialization
193 Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
// Shorthand names for the iterator instantiated over each Standard_* character type.
197 typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter;
198 typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
199 typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
200 typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter;
202 // template implementation
203 #include "NCollection_UtfIterator.lxx"
205 #endif // _NCollection_UtfIterator_H__