a174a3c5 |
1 | // Created on: 2013-01-28 |
2 | // Created by: Kirill GAVRILOV |
d5f74e42 |
3 | // Copyright (c) 2013-2014 OPEN CASCADE SAS |
a174a3c5 |
4 | // |
973c2be1 |
5 | // This file is part of Open CASCADE Technology software library. |
6 | // |
d5f74e42 |
7 | // This library is free software; you can redistribute it and/or modify it under |
8 | // the terms of the GNU Lesser General Public License version 2.1 as published |
973c2be1 |
9 | // by the Free Software Foundation, with special exception defined in the file |
10 | // OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT |
11 | // distribution for complete text of the license and disclaimer of any warranty. |
12 | // |
13 | // Alternatively, this file may be used under the terms of Open CASCADE |
14 | // commercial license or contractual agreement. |
a174a3c5 |
15 | |
16 | // Portions of code are copyrighted by Unicode, Inc. |
17 | // |
d94fa32e |
18 | // Copyright (c) 2001-2004 Unicode, Inc. |
a174a3c5 |
19 | // |
20 | // Disclaimer |
21 | // |
22 | // This source code is provided as is by Unicode, Inc. No claims are |
23 | // made as to fitness for any particular purpose. No warranties of any |
24 | // kind are expressed or implied. The recipient agrees to determine |
25 | // applicability of information provided. If this file has been |
26 | // purchased on magnetic or optical media from Unicode, Inc., the |
27 | // sole remedy for any claim will be exchange of defective media |
28 | // within 90 days of receipt. |
29 | // |
30 | // Limitations on Rights to Redistribute This Code |
31 | // |
32 | // Unicode, Inc. hereby grants the right to freely use the information |
33 | // supplied in this file in the creation of products supporting the |
34 | // Unicode Standard, and to make copies of this file in any form |
35 | // for internal or external distribution as long as this notice |
36 | // remains attached. |
37 | |
38 | //! The first character in a UTF-8 sequence indicates how many bytes |
39 | //! to read (among other things). |
40 | template<typename Type> |
41 | const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] = |
42 | { |
43 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
44 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
45 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
46 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
47 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
48 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
49 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, |
50 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 |
51 | }; |
52 | |
53 | //! Magic values subtracted from a buffer value during UTF-8 conversion. |
54 | //! This table contains as many values as there might be trailing bytes |
55 | //! in a UTF-8 sequence. |
56 | template<typename Type> |
57 | const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] = |
58 | { |
59 | 0x00000000UL, 0x00003080UL, 0x000E2080UL, |
60 | 0x03C82080UL, 0xFA082080UL, 0x82082080UL |
61 | }; |
62 | |
63 | //! The first character in a UTF-8 sequence indicates how many bytes to read. |
64 | template<typename Type> |
65 | const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; |
66 | |
67 | // ======================================================================= |
68 | // function : readUTF8 |
69 | // purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character. |
70 | // Not protected against invalid UTF-8. |
71 | // ======================================================================= |
72 | template<typename Type> |
73 | inline void NCollection_UtfIterator<Type>::readUTF8() |
74 | { |
75 | // unsigned arithmetic used |
76 | Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext; |
77 | const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos]; |
78 | myCharUtf32 = 0; |
79 | switch (aBytesToRead) |
80 | { |
81 | case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 |
b1811c1d |
82 | Standard_FALLTHROUGH |
a174a3c5 |
83 | case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8 |
b1811c1d |
84 | Standard_FALLTHROUGH |
a174a3c5 |
85 | case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
b1811c1d |
86 | Standard_FALLTHROUGH |
a174a3c5 |
87 | case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
b1811c1d |
88 | Standard_FALLTHROUGH |
a174a3c5 |
89 | case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; |
b1811c1d |
90 | Standard_FALLTHROUGH |
a174a3c5 |
91 | case 0: myCharUtf32 += *aPos++; |
92 | } |
93 | myCharUtf32 -= offsetsFromUTF8[aBytesToRead]; |
94 | myPosNext = (Type* )aPos; |
95 | } |
96 | |
97 | // magic numbers |
98 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF; |
99 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80; |
100 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800; |
101 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF; |
102 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00; |
103 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF; |
104 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10; |
105 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x0010000UL; |
106 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x3FFUL; |
107 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL; |
108 | template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL; |
109 | |
110 | // ======================================================================= |
111 | // function : readUTF16 |
112 | // purpose : |
113 | // ======================================================================= |
114 | template<typename Type> inline |
115 | void NCollection_UtfIterator<Type>::readUTF16() |
116 | { |
117 | Standard_Utf32Char aChar = *myPosNext++; |
118 | // if we have the first half of the surrogate pair |
119 | if (aChar >= UTF16_SURROGATE_HIGH_START |
120 | && aChar <= UTF16_SURROGATE_HIGH_END) |
121 | { |
656ec77a |
122 | const Standard_Utf32Char aChar2 = *myPosNext; |
a174a3c5 |
123 | // complete the surrogate pair |
124 | if (aChar2 >= UTF16_SURROGATE_LOW_START |
125 | && aChar2 <= UTF16_SURROGATE_LOW_END) |
126 | { |
127 | aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT) |
128 | + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE; |
129 | ++myPosNext; |
130 | } |
131 | } |
132 | myCharUtf32 = aChar; |
133 | } |
134 | |
135 | // ======================================================================= |
136 | // function : AdvanceBytesUtf8 |
137 | // purpose : |
138 | // ======================================================================= |
139 | template<typename Type> inline |
140 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const |
141 | { |
142 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
143 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
144 | { |
145 | // UTF-16 surrogate values are illegal in UTF-32 |
146 | return 0; |
147 | } |
148 | else if (myCharUtf32 < Standard_Utf32Char(0x80)) |
149 | { |
150 | return 1; |
151 | } |
152 | else if (myCharUtf32 < Standard_Utf32Char(0x800)) |
153 | { |
154 | return 2; |
155 | } |
156 | else if (myCharUtf32 < Standard_Utf32Char(0x10000)) |
157 | { |
158 | return 3; |
159 | } |
160 | else if (myCharUtf32 <= UTF32_MAX_LEGAL) |
161 | { |
162 | return 4; |
163 | } |
164 | else |
165 | { |
166 | // illegal |
167 | return 0; |
168 | } |
169 | } |
170 | |
171 | // ======================================================================= |
172 | // function : GetUtf8 |
173 | // purpose : |
174 | // ======================================================================= |
175 | template<typename Type> inline |
176 | Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const |
177 | { |
178 | // unsigned arithmetic used |
179 | return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer); |
180 | } |
181 | |
182 | // ======================================================================= |
183 | // function : GetUtf8 |
184 | // purpose : |
185 | // ======================================================================= |
186 | template<typename Type> inline |
187 | Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const |
188 | { |
189 | Standard_Utf32Char aChar = myCharUtf32; |
190 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
191 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
192 | { |
193 | // UTF-16 surrogate values are illegal in UTF-32 |
194 | return theBuffer; |
195 | } |
196 | else if (myCharUtf32 < Standard_Utf32Char(0x80)) |
197 | { |
198 | *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]); |
199 | return theBuffer; |
200 | } |
201 | else if (myCharUtf32 < Standard_Utf32Char(0x800)) |
202 | { |
203 | *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
204 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]); |
205 | return theBuffer + 2; |
206 | } |
207 | else if (myCharUtf32 < Standard_Utf32Char(0x10000)) |
208 | { |
209 | theBuffer += 3; |
210 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
211 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
212 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]); |
213 | return theBuffer + 3; |
214 | } |
215 | else if (myCharUtf32 <= UTF32_MAX_LEGAL) |
216 | { |
217 | theBuffer += 4; |
218 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
219 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
220 | *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6; |
221 | *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]); |
222 | return theBuffer + 4; |
223 | } |
224 | else |
225 | { |
226 | // illegal |
227 | return theBuffer; |
228 | } |
229 | } |
230 | |
231 | // ======================================================================= |
232 | // function : AdvanceBytesUtf16 |
233 | // purpose : |
234 | // ======================================================================= |
235 | template<typename Type> inline |
236 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const |
fb0b0531 |
237 | { |
238 | return AdvanceCodeUnitsUtf16() * sizeof(Standard_Utf16Char); |
239 | } |
240 | |
241 | // ======================================================================= |
242 | // function : AdvanceCodeUnitsUtf16 |
243 | // purpose : |
244 | // ======================================================================= |
245 | template<typename Type> inline |
246 | Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const |
a174a3c5 |
247 | { |
248 | if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF |
249 | { |
250 | // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values |
251 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
252 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
253 | { |
254 | return 0; |
255 | } |
256 | else |
257 | { |
fb0b0531 |
258 | return 1; |
a174a3c5 |
259 | } |
260 | } |
261 | else if (myCharUtf32 > UTF32_MAX_LEGAL) |
262 | { |
263 | // illegal |
264 | return 0; |
265 | } |
266 | else |
267 | { |
268 | // target is a character in range 0xFFFF - 0x10FFFF |
269 | // surrogate pair |
fb0b0531 |
270 | return 2; |
a174a3c5 |
271 | } |
272 | } |
273 | |
274 | // ======================================================================= |
275 | // function : GetUtf16 |
276 | // purpose : |
277 | // ======================================================================= |
278 | template<typename Type> inline |
279 | Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const |
280 | { |
281 | if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF |
282 | { |
283 | // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values |
284 | if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START |
285 | && myCharUtf32 <= UTF16_SURROGATE_LOW_END) |
286 | { |
287 | return theBuffer; |
288 | } |
289 | else |
290 | { |
291 | *theBuffer++ = Standard_Utf16Char(myCharUtf32); |
292 | return theBuffer; |
293 | } |
294 | } |
295 | else if (myCharUtf32 > UTF32_MAX_LEGAL) |
296 | { |
297 | // illegal |
298 | return theBuffer; |
299 | } |
300 | else |
301 | { |
302 | // surrogate pair |
303 | Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE; |
304 | *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START); |
305 | *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START); |
306 | return theBuffer; |
307 | } |
308 | } |
309 | |
310 | // ======================================================================= |
311 | // function : GetUtf32 |
312 | // purpose : |
313 | // ======================================================================= |
314 | template<typename Type> inline |
315 | Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const |
316 | { |
317 | *theBuffer++ = myCharUtf32; |
318 | return theBuffer; |
319 | } |