a174a3c5 |
1 | // Created on: 2013-01-28 |
2 | // Created by: Kirill GAVRILOV |
3 | // Copyright (c) 2013 OPEN CASCADE SAS |
4 | // |
5 | // The content of this file is subject to the Open CASCADE Technology Public |
6 | // License Version 6.5 (the "License"). You may not use the content of this file |
7 | // except in compliance with the License. Please obtain a copy of the License |
8 | // at http://www.opencascade.org and read it completely before using this file. |
9 | // |
10 | // The Initial Developer of the Original Code is Open CASCADE S.A.S., having its |
11 | // main offices at: 1, place des Freres Montgolfier, 78280 Guyancourt, France. |
12 | // |
13 | // The Original Code and all software distributed under the License is |
14 | // distributed on an "AS IS" basis, without warranty of any kind, and the |
15 | // Initial Developer hereby disclaims all such warranties, including without |
16 | // limitation, any warranties of merchantability, fitness for a particular |
17 | // purpose or non-infringement. Please see the License for the specific terms |
18 | // and conditions governing the rights and limitations under the License. |
19 | |
20 | // Portions of code are copyrighted by Unicode, Inc. |
21 | // |
22 | // Copyright © 2001-2004 Unicode, Inc. |
23 | // |
24 | // Disclaimer |
25 | // |
26 | // This source code is provided as is by Unicode, Inc. No claims are |
27 | // made as to fitness for any particular purpose. No warranties of any |
28 | // kind are expressed or implied. The recipient agrees to determine |
29 | // applicability of information provided. If this file has been |
30 | // purchased on magnetic or optical media from Unicode, Inc., the |
31 | // sole remedy for any claim will be exchange of defective media |
32 | // within 90 days of receipt. |
33 | // |
34 | // Limitations on Rights to Redistribute This Code |
35 | // |
36 | // Unicode, Inc. hereby grants the right to freely use the information |
37 | // supplied in this file in the creation of products supporting the |
38 | // Unicode Standard, and to make copies of this file in any form |
39 | // for internal or external distribution as long as this notice |
40 | // remains attached. |
41 | |
42 | //! The first character in a UTF-8 sequence indicates how many bytes |
43 | //! to read (among other things). |
44 | template<typename Type> |
45 | const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] = |
46 | { |
47 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
48 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
49 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
50 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
51 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
52 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
53 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
54 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 |
55 | }; |
56 | |
57 | //! Magic values subtracted from a buffer value during UTF-8 conversion. |
58 | //! This table contains as many values as there might be trailing bytes |
59 | //! in a UTF-8 sequence. |
60 | template<typename Type> |
61 | const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] = |
62 | { |
63 | 0x00000000UL, 0x00003080UL, 0x000E2080UL, |
64 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL |
65 | }; |
66 | |
67 | //! The first character in a UTF-8 sequence indicates how many bytes to read. |
68 | template<typename Type> |
69 | const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
70 | |
71 | // ======================================================================= |
72 | // function : readUTF8 |
73 | // purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character. |
74 | // Not protected against invalid UTF-8. |
75 | // ======================================================================= |
76 | template<typename Type> |
77 | inline void NCollection_UtfIterator<Type>::readUTF8() |
78 | { |
79 | // unsigned arithmetic used |
80 | Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext; |
81 | const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos]; |
82 | myCharUtf32 = 0; |
83 | switch (aBytesToRead) |
84 | { |
85 | case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 |
86 | case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 |
87 | case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
88 | case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
89 | case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
90 | case 0: myCharUtf32 += *aPos++; |
91 | } |
92 | myCharUtf32 -= offsetsFromUTF8[aBytesToRead]; |
93 | myPosNext = (Type* )aPos; |
94 | } |
95 | |
96 | // magic numbers |
97 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF; |
98 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80; |
99 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800; |
100 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF; |
101 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00; |
102 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF; |
103 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10; |
104 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x0010000UL; |
105 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x3FFUL; |
106 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL; |
107 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL; |
108 | |
109 | // ======================================================================= |
110 | // function : readUTF16 |
111 | // purpose : |
112 | // ======================================================================= |
113 | template<typename Type> inline |
114 | void NCollection_UtfIterator<Type>::readUTF16() |
115 | { |
116 | Standard_Utf32Char aChar = *myPosNext++; |
117 | // if we have the first half of the surrogate pair |
118 | if (aChar >= UTF16_SURROGATE_HIGH_START |
119 | && aChar <= UTF16_SURROGATE_HIGH_END) |
120 | { |
121 | Standard_Utf32Char aChar2 = *myPosition; |
122 | // complete the surrogate pair |
123 | if (aChar2 >= UTF16_SURROGATE_LOW_START |
124 | && aChar2 <= UTF16_SURROGATE_LOW_END) |
125 | { |
126 | aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT) |
127 | + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE; |
128 | ++myPosNext; |
129 | } |
130 | } |
131 | myCharUtf32 = aChar; |
132 | } |
133 | |
134 | // ======================================================================= |
135 | // function : AdvanceBytesUtf8 |
136 | // purpose : |
137 | // ======================================================================= |
138 | template<typename Type> inline |
139 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const |
140 | { |
141 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
142 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
143 | { |
144 | // UTF-16 surrogate values are illegal in UTF-32 |
145 | return 0; |
146 | } |
147 | else if (myCharUtf32 < Standard_Utf32Char(0x80)) |
148 | { |
149 | return 1; |
150 | } |
151 | else if (myCharUtf32 < Standard_Utf32Char(0x800)) |
152 | { |
153 | return 2; |
154 | } |
155 | else if (myCharUtf32 < Standard_Utf32Char(0x10000)) |
156 | { |
157 | return 3; |
158 | } |
159 | else if (myCharUtf32 <= UTF32_MAX_LEGAL) |
160 | { |
161 | return 4; |
162 | } |
163 | else |
164 | { |
165 | // illegal |
166 | return 0; |
167 | } |
168 | } |
169 | |
170 | // ======================================================================= |
171 | // function : GetUtf8 |
172 | // purpose : |
173 | // ======================================================================= |
174 | template<typename Type> inline |
175 | Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const |
176 | { |
177 | // unsigned arithmetic used |
178 | return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer); |
179 | } |
180 | |
181 | // ======================================================================= |
182 | // function : GetUtf8 |
183 | // purpose : |
184 | // ======================================================================= |
185 | template<typename Type> inline |
186 | Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const |
187 | { |
188 | Standard_Utf32Char aChar = myCharUtf32; |
189 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
190 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
191 | { |
192 | // UTF-16 surrogate values are illegal in UTF-32 |
193 | return theBuffer; |
194 | } |
195 | else if (myCharUtf32 < Standard_Utf32Char(0x80)) |
196 | { |
197 | *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]); |
198 | return theBuffer; |
199 | } |
200 | else if (myCharUtf32 < Standard_Utf32Char(0x800)) |
201 | { |
202 | *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
203 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]); |
204 | return theBuffer + 2; |
205 | } |
206 | else if (myCharUtf32 < Standard_Utf32Char(0x10000)) |
207 | { |
208 | theBuffer += 3; |
209 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
210 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
211 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]); |
212 | return theBuffer + 3; |
213 | } |
214 | else if (myCharUtf32 <= UTF32_MAX_LEGAL) |
215 | { |
216 | theBuffer += 4; |
217 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
218 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
219 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
220 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]); |
221 | return theBuffer + 4; |
222 | } |
223 | else |
224 | { |
225 | // illegal |
226 | return theBuffer; |
227 | } |
228 | } |
229 | |
230 | // ======================================================================= |
231 | // function : AdvanceBytesUtf16 |
232 | // purpose : |
233 | // ======================================================================= |
234 | template<typename Type> inline |
235 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const |
236 | { |
237 | if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF |
238 | { |
239 | // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values |
240 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
241 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
242 | { |
243 | return 0; |
244 | } |
245 | else |
246 | { |
247 | return Standard_Integer(sizeof(Standard_Utf16Char)); |
248 | } |
249 | } |
250 | else if (myCharUtf32 > UTF32_MAX_LEGAL) |
251 | { |
252 | // illegal |
253 | return 0; |
254 | } |
255 | else |
256 | { |
257 | // target is a character in range 0xFFFF - 0x10FFFF |
258 | // surrogate pair |
259 | return Standard_Integer(sizeof(Standard_Utf16Char) * 2); |
260 | } |
261 | } |
262 | |
263 | // ======================================================================= |
264 | // function : GetUtf16 |
265 | // purpose : |
266 | // ======================================================================= |
267 | template<typename Type> inline |
268 | Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const |
269 | { |
270 | if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF |
271 | { |
272 | // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values |
273 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
274 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
275 | { |
276 | return theBuffer; |
277 | } |
278 | else |
279 | { |
280 | *theBuffer++ = Standard_Utf16Char(myCharUtf32); |
281 | return theBuffer; |
282 | } |
283 | } |
284 | else if (myCharUtf32 > UTF32_MAX_LEGAL) |
285 | { |
286 | // illegal |
287 | return theBuffer; |
288 | } |
289 | else |
290 | { |
291 | // surrogate pair |
292 | Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE; |
293 | *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START); |
294 | *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START); |
295 | return theBuffer; |
296 | } |
297 | } |
298 | |
299 | // ======================================================================= |
300 | // function : GetUtf32 |
301 | // purpose : |
302 | // ======================================================================= |
303 | template<typename Type> inline |
304 | Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const |
305 | { |
306 | *theBuffer++ = myCharUtf32; |
307 | return theBuffer; |
308 | } |
309 | |
310 | // ======================================================================= |
311 | // function : AdvanceBytesUtf |
312 | // purpose : |
313 | // ======================================================================= |
314 | template<typename Type> template<typename TypeWrite> inline |
315 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf() const |
316 | { |
317 | switch (sizeof(TypeWrite)) |
318 | { |
319 | case sizeof(Standard_Utf8Char): return AdvanceBytesUtf8(); |
320 | case sizeof(Standard_Utf16Char): return AdvanceBytesUtf16(); |
321 | case sizeof(Standard_Utf32Char): return AdvanceBytesUtf32(); |
322 | default: return 0; // invalid case |
323 | } |
324 | } |
325 | |
326 | // ======================================================================= |
327 | // function : GetUtf |
328 | // purpose : |
329 | // ======================================================================= |
330 | template<typename Type> template<typename TypeWrite> inline |
331 | TypeWrite* NCollection_UtfIterator<Type>::GetUtf (TypeWrite* theBuffer) const |
332 | { |
333 | switch (sizeof(TypeWrite)) |
334 | { |
335 | case sizeof(Standard_Utf8Char): return (TypeWrite* )GetUtf8 ((Standard_Utf8UChar* )theBuffer); |
336 | case sizeof(Standard_Utf16Char): return (TypeWrite* )GetUtf16((Standard_Utf16Char* )theBuffer); |
337 | case sizeof(Standard_Utf32Char): return (TypeWrite* )GetUtf32((Standard_Utf32Char* )theBuffer); |
338 | default: return NULL; // invalid case |
339 | } |
340 | } |