0030131: Foundation Classes - support of Linear builder for 2D BVH trees
[occt.git] / src / NCollection / NCollection_UtfIterator.hxx
CommitLineData
a174a3c5 1// Created on: 2013-01-28
2// Created by: Kirill GAVRILOV
d5f74e42 3// Copyright (c) 2013-2014 OPEN CASCADE SAS
a174a3c5 4//
973c2be1 5// This file is part of Open CASCADE Technology software library.
a174a3c5 6//
d5f74e42 7// This library is free software; you can redistribute it and/or modify it under
8// the terms of the GNU Lesser General Public License version 2.1 as published
973c2be1 9// by the Free Software Foundation, with special exception defined in the file
10// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11// distribution for complete text of the license and disclaimer of any warranty.
a174a3c5 12//
973c2be1 13// Alternatively, this file may be used under the terms of Open CASCADE
14// commercial license or contractual agreement.
a174a3c5 15
16#ifndef _NCollection_UtfIterator_H__
17#define _NCollection_UtfIterator_H__
18
cf0786da 19#include <Standard_Handle.hxx>
a174a3c5 20
//! Template class for Unicode strings support.
//!
//! It defines an iterator and provides a correct way to read multi-byte text (UTF-8 and UTF-16)
//! and to convert it from one encoding to another.
//! The current value of the iterator is returned as a UTF-32 Unicode symbol.
//!
//! Here and below the term "Unicode symbol" is used as
//! a synonym of "Unicode code point".
a174a3c5 29template<typename Type>
30class NCollection_UtfIterator
31{
32
33public:
34
35 //! Constructor.
36 //! @param theString buffer to iterate
37 NCollection_UtfIterator (const Type* theString)
38 : myPosition(theString),
39 myPosNext(theString),
40 myCharIndex(0),
41 myCharUtf32(0)
42 {
43 if (theString != NULL)
44 {
45 ++(*this);
46 myCharIndex = 0;
47 }
48 }
49
50 //! Initialize iterator within specified NULL-terminated string.
51 void Init (const Type* theString)
52 {
53 myPosition = theString;
54 myPosNext = theString;
55 myCharUtf32 = 0;
56 if (theString != NULL)
57 {
58 ++(*this);
59 }
60 myCharIndex = 0;
61 }
62
cf0786da 63 //! Pre-increment operator. Reads the next unicode symbol.
a174a3c5 64 //! Notice - no protection against overrun!
65 NCollection_UtfIterator& operator++()
66 {
67 myPosition = myPosNext;
68 ++myCharIndex;
cf0786da 69 readNext (static_cast<const typename CharTypeChooser<Type>::type*>(0));
a174a3c5 70 return *this;
71 }
72
73 //! Post-increment operator.
74 //! Notice - no protection against overrun!
75 NCollection_UtfIterator operator++ (int )
76 {
77 NCollection_UtfIterator aCopy = *this;
78 ++*this;
79 return aCopy;
80 }
81
82 //! Equality operator.
83 bool operator== (const NCollection_UtfIterator& theRight) const
84 {
85 return myPosition == theRight.myPosition;
86 }
87
fb0b0531 88 //! Return true if Unicode symbol is within valid range.
89 bool IsValid() const
90 {
91 return myCharUtf32 <= UTF32_MAX_LEGAL;
92 }
93
a174a3c5 94 //! Dereference operator.
cf0786da 95 //! @return the UTF-32 codepoint of the symbol currently pointed by iterator.
a174a3c5 96 Standard_Utf32Char operator*() const
97 {
98 return myCharUtf32;
99 }
100
101 //! Buffer-fetching getter.
102 const Type* BufferHere() const { return myPosition; }
103
104 //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
105 Type* ChangeBufferHere() { return (Type* )myPosition; }
106
107 //! Buffer-fetching getter.
108 const Type* BufferNext() const { return myPosNext; }
109
110 //! @return the index displacement from iterator intialization
cf0786da 111 //! (first symbol has index 0)
a174a3c5 112 Standard_Integer Index() const
113 {
114 return myCharIndex;
115 }
116
117 //! @return the advance in bytes to store current symbol in UTF-8.
118 //! 0 means an invalid symbol;
119 //! 1-4 bytes are valid range.
120 Standard_Integer AdvanceBytesUtf8() const;
121
122 //! @return the advance in bytes to store current symbol in UTF-16.
123 //! 0 means an invalid symbol;
124 //! 2 bytes is a general case;
125 //! 4 bytes for surrogate pair.
126 Standard_Integer AdvanceBytesUtf16() const;
127
fb0b0531 128 //! @return the advance in bytes to store current symbol in UTF-16.
129 //! 0 means an invalid symbol;
130 //! 1 16-bit code unit is a general case;
131 //! 2 16-bit code units for surrogate pair.
132 Standard_Integer AdvanceCodeUnitsUtf16() const;
133
a174a3c5 134 //! @return the advance in bytes to store current symbol in UTF-32.
135 //! Always 4 bytes (method for consistency).
136 Standard_Integer AdvanceBytesUtf32() const
137 {
138 return Standard_Integer(sizeof(Standard_Utf32Char));
139 }
140
141 //! Fill the UTF-8 buffer within current Unicode symbol.
142 //! Use method AdvanceUtf8() to allocate buffer with enough size.
143 //! @param theBuffer buffer to fill
144 //! @return new buffer position (for next char)
145 Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
146 Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
147
148 //! Fill the UTF-16 buffer within current Unicode symbol.
149 //! Use method AdvanceUtf16() to allocate buffer with enough size.
150 //! @param theBuffer buffer to fill
151 //! @return new buffer position (for next char)
152 Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
153
154 //! Fill the UTF-32 buffer within current Unicode symbol.
155 //! Use method AdvanceUtf32() to allocate buffer with enough size.
156 //! @param theBuffer buffer to fill
157 //! @return new buffer position (for next char)
158 Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
159
160 //! @return the advance in TypeWrite chars needed to store current symbol
161 template<typename TypeWrite>
cf0786da 162 inline Standard_Integer AdvanceBytesUtf() const
163 {
164 return advanceBytes(static_cast<const typename CharTypeChooser<TypeWrite>::type*>(0));
165 }
a174a3c5 166
167 //! Fill the UTF-** buffer within current Unicode symbol.
168 //! Use method AdvanceUtf**() to allocate buffer with enough size.
169 //! @param theBuffer buffer to fill
170 //! @return new buffer position (for next char)
171 template<typename TypeWrite>
cf0786da 172 inline TypeWrite* GetUtf (TypeWrite* theBuffer) const
173 {
174 return (TypeWrite*)(getUtf (reinterpret_cast<typename CharTypeChooser<TypeWrite>::type*>(theBuffer)));
175 }
a174a3c5 176
177private:
178
cf0786da 179 //! Helper template class dispatching its argument class
180 //! to the equivalent (by size) character (Unicode code unit) type.
181 //! The code unit type is defined as nested typedef "type".
182 //!
183 //! In practice this is relevant for wchar_t type:
184 //! typename CharTypeChooser<wchar_t>::type resolves to
185 //! Standard_Utf16Char on Windows and to Standard_Utf32Char on Linux.
186 template <typename TypeChar>
187 class CharTypeChooser :
188 public opencascade::std::conditional< sizeof(TypeChar) == 1, Standard_Utf8Char,
189 typename opencascade::std::conditional< sizeof(TypeChar) == 2, Standard_Utf16Char,
190 typename opencascade::std::conditional< sizeof(TypeChar) == 4, Standard_Utf32Char, void >::type >::type >
191 {
192 };
193
194 //! Helper function for reading a single Unicode symbol from the UTF-8 string.
a174a3c5 195 //! Updates internal state appropriately.
196 void readUTF8();
197
cf0786da 198 //! Helper function for reading a single Unicode symbol from the UTF-16 string.
a174a3c5 199 //! Updates internal state appropriately.
200 void readUTF16();
201
cf0786da 202 //! Helper overload methods to dispatch reading function depending on code unit size
203 void readNext (const Standard_Utf8Char*) { readUTF8(); }
204 void readNext (const Standard_Utf16Char*) { readUTF16(); }
205 void readNext (const Standard_Utf32Char*) { myCharUtf32 = *myPosNext++; }
206
207 //! Helper overload methods to dispatch advance function depending on code unit size
208 Standard_Integer advanceBytes (const Standard_Utf8Char*) const { return AdvanceBytesUtf8(); }
209 Standard_Integer advanceBytes (const Standard_Utf16Char*) const { return AdvanceBytesUtf16(); }
210 Standard_Integer advanceBytes (const Standard_Utf32Char*) const { return AdvanceBytesUtf32(); }
211
212 //! Helper overload methods to dispatch getter function depending on code unit size
213 Standard_Utf8Char* getUtf (Standard_Utf8Char* theBuffer) const { return GetUtf8 (theBuffer); }
214 Standard_Utf16Char* getUtf (Standard_Utf16Char* theBuffer) const { return GetUtf16(theBuffer); }
215 Standard_Utf32Char* getUtf (Standard_Utf32Char* theBuffer) const { return GetUtf32(theBuffer); }
216
a174a3c5 217private: //! @name unicode magic numbers
218
219 static const unsigned char UTF8_BYTES_MINUS_ONE[256];
220 static const unsigned long offsetsFromUTF8[6];
221 static const unsigned char UTF8_FIRST_BYTE_MARK[7];
222 static const unsigned long UTF8_BYTE_MASK;
223 static const unsigned long UTF8_BYTE_MARK;
224 static const unsigned long UTF16_SURROGATE_HIGH_START;
225 static const unsigned long UTF16_SURROGATE_HIGH_END;
226 static const unsigned long UTF16_SURROGATE_LOW_START;
227 static const unsigned long UTF16_SURROGATE_LOW_END;
228 static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
229 static const unsigned long UTF16_SURROGATE_LOW_BASE;
230 static const unsigned long UTF16_SURROGATE_LOW_MASK;
231 static const unsigned long UTF32_MAX_BMP;
232 static const unsigned long UTF32_MAX_LEGAL;
233
234private: //! @name private fields
235
cf0786da 236 const Type* myPosition; //!< buffer position of the first element in the current symbol
237 const Type* myPosNext; //!< buffer position of the first element in the next symbol
a174a3c5 238 Standard_Integer myCharIndex; //!< index displacement from iterator intialization
cf0786da 239 Standard_Utf32Char myCharUtf32; //!< Unicode symbol stored at the current buffer position
a174a3c5 240
241};
242
243typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter;
244typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
245typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
246typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter;
247
248// template implementation
249#include "NCollection_UtfIterator.lxx"
250
251#endif // _NCollection_UtfIterator_H__