0029151: GCC 7.1 warnings "this statement may fall through" [-Wimplicit-fallthrough=]
[occt.git] / src / NCollection / NCollection_UtfIterator.lxx
... / ...
CommitLineData
1// Created on: 2013-01-28
2// Created by: Kirill GAVRILOV
3// Copyright (c) 2013-2014 OPEN CASCADE SAS
4//
5// This file is part of Open CASCADE Technology software library.
6//
7// This library is free software; you can redistribute it and/or modify it under
8// the terms of the GNU Lesser General Public License version 2.1 as published
9// by the Free Software Foundation, with special exception defined in the file
10// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11// distribution for complete text of the license and disclaimer of any warranty.
12//
13// Alternatively, this file may be used under the terms of Open CASCADE
14// commercial license or contractual agreement.
15
16// Portions of code are copyrighted by Unicode, Inc.
17//
18// Copyright (c) 2001-2004 Unicode, Inc.
19//
20// Disclaimer
21//
22// This source code is provided as is by Unicode, Inc. No claims are
23// made as to fitness for any particular purpose. No warranties of any
24// kind are expressed or implied. The recipient agrees to determine
25// applicability of information provided. If this file has been
26// purchased on magnetic or optical media from Unicode, Inc., the
27// sole remedy for any claim will be exchange of defective media
28// within 90 days of receipt.
29//
30// Limitations on Rights to Redistribute This Code
31//
32// Unicode, Inc. hereby grants the right to freely use the information
33// supplied in this file in the creation of products supporting the
34// Unicode Standard, and to make copies of this file in any form
35// for internal or external distribution as long as this notice
36// remains attached.
37
38//! The first character in a UTF-8 sequence indicates how many bytes
39//! to read (among other things).
40template<typename Type>
41const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] =
42{
43 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
44 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
45 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
50 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
51};
52
53//! Magic values subtracted from a buffer value during UTF-8 conversion.
54//! This table contains as many values as there might be trailing bytes
55//! in a UTF-8 sequence.
56template<typename Type>
57const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] =
58{
59 0x00000000UL, 0x00003080UL, 0x000E2080UL,
60 0x03C82080UL, 0xFA082080UL, 0x82082080UL
61};
62
63//! The first character in a UTF-8 sequence indicates how many bytes to read.
64template<typename Type>
65const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
66
67// =======================================================================
68// function : readUTF8
69// purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character.
70// Not protected against invalid UTF-8.
71// =======================================================================
72template<typename Type>
73inline void NCollection_UtfIterator<Type>::readUTF8()
74{
75 // unsigned arithmetic used
76 Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext;
77 const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos];
78 myCharUtf32 = 0;
79 switch (aBytesToRead)
80 {
81 case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
82 Standard_FALLTHROUGH
83 case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
84 Standard_FALLTHROUGH
85 case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
86 Standard_FALLTHROUGH
87 case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
88 Standard_FALLTHROUGH
89 case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
90 Standard_FALLTHROUGH
91 case 0: myCharUtf32 += *aPos++;
92 }
93 myCharUtf32 -= offsetsFromUTF8[aBytesToRead];
94 myPosNext = (Type* )aPos;
95}
96
97// magic numbers
98template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;
99template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;
100template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;
101template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF;
102template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00;
103template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF;
104template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;
105template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x0010000UL;
106template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x3FFUL;
107template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL;
108template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;
109
110// =======================================================================
111// function : readUTF16
112// purpose :
113// =======================================================================
114template<typename Type> inline
115void NCollection_UtfIterator<Type>::readUTF16()
116{
117 Standard_Utf32Char aChar = *myPosNext++;
118 // if we have the first half of the surrogate pair
119 if (aChar >= UTF16_SURROGATE_HIGH_START
120 && aChar <= UTF16_SURROGATE_HIGH_END)
121 {
122 const Standard_Utf32Char aChar2 = *myPosNext;
123 // complete the surrogate pair
124 if (aChar2 >= UTF16_SURROGATE_LOW_START
125 && aChar2 <= UTF16_SURROGATE_LOW_END)
126 {
127 aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
128 + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE;
129 ++myPosNext;
130 }
131 }
132 myCharUtf32 = aChar;
133}
134
135// =======================================================================
136// function : AdvanceBytesUtf8
137// purpose :
138// =======================================================================
139template<typename Type> inline
140Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
141{
142 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
143 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
144 {
145 // UTF-16 surrogate values are illegal in UTF-32
146 return 0;
147 }
148 else if (myCharUtf32 < Standard_Utf32Char(0x80))
149 {
150 return 1;
151 }
152 else if (myCharUtf32 < Standard_Utf32Char(0x800))
153 {
154 return 2;
155 }
156 else if (myCharUtf32 < Standard_Utf32Char(0x10000))
157 {
158 return 3;
159 }
160 else if (myCharUtf32 <= UTF32_MAX_LEGAL)
161 {
162 return 4;
163 }
164 else
165 {
166 // illegal
167 return 0;
168 }
169}
170
171// =======================================================================
172// function : GetUtf8
173// purpose :
174// =======================================================================
175template<typename Type> inline
176Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const
177{
178 // unsigned arithmetic used
179 return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
180}
181
182// =======================================================================
183// function : GetUtf8
184// purpose :
185// =======================================================================
186template<typename Type> inline
187Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const
188{
189 Standard_Utf32Char aChar = myCharUtf32;
190 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
191 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
192 {
193 // UTF-16 surrogate values are illegal in UTF-32
194 return theBuffer;
195 }
196 else if (myCharUtf32 < Standard_Utf32Char(0x80))
197 {
198 *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]);
199 return theBuffer;
200 }
201 else if (myCharUtf32 < Standard_Utf32Char(0x800))
202 {
203 *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
204 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]);
205 return theBuffer + 2;
206 }
207 else if (myCharUtf32 < Standard_Utf32Char(0x10000))
208 {
209 theBuffer += 3;
210 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
211 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
212 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]);
213 return theBuffer + 3;
214 }
215 else if (myCharUtf32 <= UTF32_MAX_LEGAL)
216 {
217 theBuffer += 4;
218 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
219 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
220 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
221 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]);
222 return theBuffer + 4;
223 }
224 else
225 {
226 // illegal
227 return theBuffer;
228 }
229}
230
231// =======================================================================
232// function : AdvanceBytesUtf16
233// purpose :
234// =======================================================================
235template<typename Type> inline
236Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
237{
238 return AdvanceCodeUnitsUtf16() * sizeof(Standard_Utf16Char);
239}
240
241// =======================================================================
242// function : AdvanceCodeUnitsUtf16
243// purpose :
244// =======================================================================
245template<typename Type> inline
246Standard_Integer NCollection_UtfIterator<Type>::AdvanceCodeUnitsUtf16() const
247{
248 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
249 {
250 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
251 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
252 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
253 {
254 return 0;
255 }
256 else
257 {
258 return 1;
259 }
260 }
261 else if (myCharUtf32 > UTF32_MAX_LEGAL)
262 {
263 // illegal
264 return 0;
265 }
266 else
267 {
268 // target is a character in range 0xFFFF - 0x10FFFF
269 // surrogate pair
270 return 2;
271 }
272}
273
274// =======================================================================
275// function : GetUtf16
276// purpose :
277// =======================================================================
278template<typename Type> inline
279Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const
280{
281 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
282 {
283 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
284 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
285 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
286 {
287 return theBuffer;
288 }
289 else
290 {
291 *theBuffer++ = Standard_Utf16Char(myCharUtf32);
292 return theBuffer;
293 }
294 }
295 else if (myCharUtf32 > UTF32_MAX_LEGAL)
296 {
297 // illegal
298 return theBuffer;
299 }
300 else
301 {
302 // surrogate pair
303 Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
304 *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
305 *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START);
306 return theBuffer;
307 }
308}
309
310// =======================================================================
311// function : GetUtf32
312// purpose :
313// =======================================================================
314template<typename Type> inline
315Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const
316{
317 *theBuffer++ = myCharUtf32;
318 return theBuffer;
319}
320
321// =======================================================================
322// function : AdvanceBytesUtf
323// purpose :
324// =======================================================================
325template<typename Type> template<typename TypeWrite> inline
326Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf() const
327{
328 switch (sizeof(TypeWrite))
329 {
330 case sizeof(Standard_Utf8Char): return AdvanceBytesUtf8();
331 case sizeof(Standard_Utf16Char): return AdvanceBytesUtf16();
332 case sizeof(Standard_Utf32Char): return AdvanceBytesUtf32();
333 default: return 0; // invalid case
334 }
335}
336
337// =======================================================================
338// function : GetUtf
339// purpose :
340// =======================================================================
341template<typename Type> template<typename TypeWrite> inline
342TypeWrite* NCollection_UtfIterator<Type>::GetUtf (TypeWrite* theBuffer) const
343{
344 switch (sizeof(TypeWrite))
345 {
346 case sizeof(Standard_Utf8Char): return (TypeWrite* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
347 case sizeof(Standard_Utf16Char): return (TypeWrite* )GetUtf16((Standard_Utf16Char* )theBuffer);
348 case sizeof(Standard_Utf32Char): return (TypeWrite* )GetUtf32((Standard_Utf32Char* )theBuffer);
349 default: return NULL; // invalid case
350 }
351}