// Created on: 2013-01-28
// Created by: Kirill GAVRILOV
// Copyright (c) 2013-2014 OPEN CASCADE SAS
//
// This file is part of Open CASCADE Technology software library.
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License version 2.1 as published
// by the Free Software Foundation, with special exception defined in the file
// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
// distribution for complete text of the license and disclaimer of any warranty.
//
// Alternatively, this file may be used under the terms of Open CASCADE
// commercial license or contractual agreement.
15 | |
16 | #ifndef _NCollection_UtfIterator_H__ |
17 | #define _NCollection_UtfIterator_H__ |
18 | |
cf0786da |
19 | #include <Standard_Handle.hxx> |
a174a3c5 |
20 | |
21 | //! Template class for Unicode strings support. |
cf0786da |
22 | //! |
a174a3c5 |
23 | //! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16) |
24 | //! and convert it from one to another. |
cf0786da |
25 | //! The current value of iterator is returned as UTF-32 Unicode symbol. |
26 | //! |
27 | //! Here and below term "Unicode symbol" is used as |
28 | //! synonym of "Unicode code point". |
a174a3c5 |
29 | template<typename Type> |
30 | class NCollection_UtfIterator |
31 | { |
32 | |
33 | public: |
34 | |
35 | //! Constructor. |
36 | //! @param theString buffer to iterate |
37 | NCollection_UtfIterator (const Type* theString) |
38 | : myPosition(theString), |
39 | myPosNext(theString), |
40 | myCharIndex(0), |
41 | myCharUtf32(0) |
42 | { |
43 | if (theString != NULL) |
44 | { |
45 | ++(*this); |
46 | myCharIndex = 0; |
47 | } |
48 | } |
49 | |
50 | //! Initialize iterator within specified NULL-terminated string. |
51 | void Init (const Type* theString) |
52 | { |
53 | myPosition = theString; |
54 | myPosNext = theString; |
55 | myCharUtf32 = 0; |
56 | if (theString != NULL) |
57 | { |
58 | ++(*this); |
59 | } |
60 | myCharIndex = 0; |
61 | } |
62 | |
cf0786da |
63 | //! Pre-increment operator. Reads the next unicode symbol. |
a174a3c5 |
64 | //! Notice - no protection against overrun! |
65 | NCollection_UtfIterator& operator++() |
66 | { |
67 | myPosition = myPosNext; |
68 | ++myCharIndex; |
cf0786da |
69 | readNext (static_cast<const typename CharTypeChooser<Type>::type*>(0)); |
a174a3c5 |
70 | return *this; |
71 | } |
72 | |
73 | //! Post-increment operator. |
74 | //! Notice - no protection against overrun! |
75 | NCollection_UtfIterator operator++ (int ) |
76 | { |
77 | NCollection_UtfIterator aCopy = *this; |
78 | ++*this; |
79 | return aCopy; |
80 | } |
81 | |
82 | //! Equality operator. |
83 | bool operator== (const NCollection_UtfIterator& theRight) const |
84 | { |
85 | return myPosition == theRight.myPosition; |
86 | } |
87 | |
fb0b0531 |
88 | //! Return true if Unicode symbol is within valid range. |
89 | bool IsValid() const |
90 | { |
91 | return myCharUtf32 <= UTF32_MAX_LEGAL; |
92 | } |
93 | |
a174a3c5 |
94 | //! Dereference operator. |
cf0786da |
95 | //! @return the UTF-32 codepoint of the symbol currently pointed by iterator. |
a174a3c5 |
96 | Standard_Utf32Char operator*() const |
97 | { |
98 | return myCharUtf32; |
99 | } |
100 | |
101 | //! Buffer-fetching getter. |
102 | const Type* BufferHere() const { return myPosition; } |
103 | |
104 | //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change. |
105 | Type* ChangeBufferHere() { return (Type* )myPosition; } |
106 | |
107 | //! Buffer-fetching getter. |
108 | const Type* BufferNext() const { return myPosNext; } |
109 | |
110 | //! @return the index displacement from iterator intialization |
cf0786da |
111 | //! (first symbol has index 0) |
a174a3c5 |
112 | Standard_Integer Index() const |
113 | { |
114 | return myCharIndex; |
115 | } |
116 | |
117 | //! @return the advance in bytes to store current symbol in UTF-8. |
118 | //! 0 means an invalid symbol; |
119 | //! 1-4 bytes are valid range. |
120 | Standard_Integer AdvanceBytesUtf8() const; |
121 | |
122 | //! @return the advance in bytes to store current symbol in UTF-16. |
123 | //! 0 means an invalid symbol; |
124 | //! 2 bytes is a general case; |
125 | //! 4 bytes for surrogate pair. |
126 | Standard_Integer AdvanceBytesUtf16() const; |
127 | |
fb0b0531 |
128 | //! @return the advance in bytes to store current symbol in UTF-16. |
129 | //! 0 means an invalid symbol; |
130 | //! 1 16-bit code unit is a general case; |
131 | //! 2 16-bit code units for surrogate pair. |
132 | Standard_Integer AdvanceCodeUnitsUtf16() const; |
133 | |
a174a3c5 |
134 | //! @return the advance in bytes to store current symbol in UTF-32. |
135 | //! Always 4 bytes (method for consistency). |
136 | Standard_Integer AdvanceBytesUtf32() const |
137 | { |
138 | return Standard_Integer(sizeof(Standard_Utf32Char)); |
139 | } |
140 | |
141 | //! Fill the UTF-8 buffer within current Unicode symbol. |
142 | //! Use method AdvanceUtf8() to allocate buffer with enough size. |
143 | //! @param theBuffer buffer to fill |
144 | //! @return new buffer position (for next char) |
145 | Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const; |
146 | Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const; |
147 | |
148 | //! Fill the UTF-16 buffer within current Unicode symbol. |
149 | //! Use method AdvanceUtf16() to allocate buffer with enough size. |
150 | //! @param theBuffer buffer to fill |
151 | //! @return new buffer position (for next char) |
152 | Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const; |
153 | |
154 | //! Fill the UTF-32 buffer within current Unicode symbol. |
155 | //! Use method AdvanceUtf32() to allocate buffer with enough size. |
156 | //! @param theBuffer buffer to fill |
157 | //! @return new buffer position (for next char) |
158 | Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const; |
159 | |
160 | //! @return the advance in TypeWrite chars needed to store current symbol |
161 | template<typename TypeWrite> |
cf0786da |
162 | inline Standard_Integer AdvanceBytesUtf() const |
163 | { |
164 | return advanceBytes(static_cast<const typename CharTypeChooser<TypeWrite>::type*>(0)); |
165 | } |
a174a3c5 |
166 | |
167 | //! Fill the UTF-** buffer within current Unicode symbol. |
168 | //! Use method AdvanceUtf**() to allocate buffer with enough size. |
169 | //! @param theBuffer buffer to fill |
170 | //! @return new buffer position (for next char) |
171 | template<typename TypeWrite> |
cf0786da |
172 | inline TypeWrite* GetUtf (TypeWrite* theBuffer) const |
173 | { |
174 | return (TypeWrite*)(getUtf (reinterpret_cast<typename CharTypeChooser<TypeWrite>::type*>(theBuffer))); |
175 | } |
a174a3c5 |
176 | |
177 | private: |
178 | |
cf0786da |
179 | //! Helper template class dispatching its argument class |
180 | //! to the equivalent (by size) character (Unicode code unit) type. |
181 | //! The code unit type is defined as nested typedef "type". |
182 | //! |
183 | //! In practice this is relevant for wchar_t type: |
184 | //! typename CharTypeChooser<wchar_t>::type resolves to |
185 | //! Standard_Utf16Char on Windows and to Standard_Utf32Char on Linux. |
186 | template <typename TypeChar> |
187 | class CharTypeChooser : |
188 | public opencascade::std::conditional< sizeof(TypeChar) == 1, Standard_Utf8Char, |
189 | typename opencascade::std::conditional< sizeof(TypeChar) == 2, Standard_Utf16Char, |
190 | typename opencascade::std::conditional< sizeof(TypeChar) == 4, Standard_Utf32Char, void >::type >::type > |
191 | { |
192 | }; |
193 | |
194 | //! Helper function for reading a single Unicode symbol from the UTF-8 string. |
a174a3c5 |
195 | //! Updates internal state appropriately. |
196 | void readUTF8(); |
197 | |
cf0786da |
198 | //! Helper function for reading a single Unicode symbol from the UTF-16 string. |
a174a3c5 |
199 | //! Updates internal state appropriately. |
200 | void readUTF16(); |
201 | |
cf0786da |
202 | //! Helper overload methods to dispatch reading function depending on code unit size |
203 | void readNext (const Standard_Utf8Char*) { readUTF8(); } |
204 | void readNext (const Standard_Utf16Char*) { readUTF16(); } |
205 | void readNext (const Standard_Utf32Char*) { myCharUtf32 = *myPosNext++; } |
206 | |
207 | //! Helper overload methods to dispatch advance function depending on code unit size |
208 | Standard_Integer advanceBytes (const Standard_Utf8Char*) const { return AdvanceBytesUtf8(); } |
209 | Standard_Integer advanceBytes (const Standard_Utf16Char*) const { return AdvanceBytesUtf16(); } |
210 | Standard_Integer advanceBytes (const Standard_Utf32Char*) const { return AdvanceBytesUtf32(); } |
211 | |
212 | //! Helper overload methods to dispatch getter function depending on code unit size |
213 | Standard_Utf8Char* getUtf (Standard_Utf8Char* theBuffer) const { return GetUtf8 (theBuffer); } |
214 | Standard_Utf16Char* getUtf (Standard_Utf16Char* theBuffer) const { return GetUtf16(theBuffer); } |
215 | Standard_Utf32Char* getUtf (Standard_Utf32Char* theBuffer) const { return GetUtf32(theBuffer); } |
216 | |
a174a3c5 |
217 | private: //! @name unicode magic numbers |
218 | |
219 | static const unsigned char UTF8_BYTES_MINUS_ONE[256]; |
220 | static const unsigned long offsetsFromUTF8[6]; |
221 | static const unsigned char UTF8_FIRST_BYTE_MARK[7]; |
222 | static const unsigned long UTF8_BYTE_MASK; |
223 | static const unsigned long UTF8_BYTE_MARK; |
224 | static const unsigned long UTF16_SURROGATE_HIGH_START; |
225 | static const unsigned long UTF16_SURROGATE_HIGH_END; |
226 | static const unsigned long UTF16_SURROGATE_LOW_START; |
227 | static const unsigned long UTF16_SURROGATE_LOW_END; |
228 | static const unsigned long UTF16_SURROGATE_HIGH_SHIFT; |
229 | static const unsigned long UTF16_SURROGATE_LOW_BASE; |
230 | static const unsigned long UTF16_SURROGATE_LOW_MASK; |
231 | static const unsigned long UTF32_MAX_BMP; |
232 | static const unsigned long UTF32_MAX_LEGAL; |
233 | |
234 | private: //! @name private fields |
235 | |
cf0786da |
236 | const Type* myPosition; //!< buffer position of the first element in the current symbol |
237 | const Type* myPosNext; //!< buffer position of the first element in the next symbol |
a174a3c5 |
238 | Standard_Integer myCharIndex; //!< index displacement from iterator intialization |
cf0786da |
239 | Standard_Utf32Char myCharUtf32; //!< Unicode symbol stored at the current buffer position |
a174a3c5 |
240 | |
241 | }; |
242 | |
243 | typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter; |
244 | typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter; |
245 | typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter; |
246 | typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter; |
247 | |
248 | // template implementation |
249 | #include "NCollection_UtfIterator.lxx" |
250 | |
251 | #endif // _NCollection_UtfIterator_H__ |