0023457: Slow text rendering
[occt.git] / src / NCollection / NCollection_UtfIterator.lxx
CommitLineData
a174a3c5 1// Created on: 2013-01-28
2// Created by: Kirill GAVRILOV
3// Copyright (c) 2013 OPEN CASCADE SAS
4//
5// The content of this file is subject to the Open CASCADE Technology Public
6// License Version 6.5 (the "License"). You may not use the content of this file
7// except in compliance with the License. Please obtain a copy of the License
8// at http://www.opencascade.org and read it completely before using this file.
9//
10// The Initial Developer of the Original Code is Open CASCADE S.A.S., having its
11// main offices at: 1, place des Freres Montgolfier, 78280 Guyancourt, France.
12//
13// The Original Code and all software distributed under the License is
14// distributed on an "AS IS" basis, without warranty of any kind, and the
15// Initial Developer hereby disclaims all such warranties, including without
16// limitation, any warranties of merchantability, fitness for a particular
17// purpose or non-infringement. Please see the License for the specific terms
18// and conditions governing the rights and limitations under the License.
19
20// Portions of code are copyrighted by Unicode, Inc.
21//
22// Copyright © 2001-2004 Unicode, Inc.
23//
24// Disclaimer
25//
26// This source code is provided as is by Unicode, Inc. No claims are
27// made as to fitness for any particular purpose. No warranties of any
28// kind are expressed or implied. The recipient agrees to determine
29// applicability of information provided. If this file has been
30// purchased on magnetic or optical media from Unicode, Inc., the
31// sole remedy for any claim will be exchange of defective media
32// within 90 days of receipt.
33//
34// Limitations on Rights to Redistribute This Code
35//
36// Unicode, Inc. hereby grants the right to freely use the information
37// supplied in this file in the creation of products supporting the
38// Unicode Standard, and to make copies of this file in any form
39// for internal or external distribution as long as this notice
40// remains attached.
41
42//! The first character in a UTF-8 sequence indicates how many bytes
43//! to read (among other things).
44template<typename Type>
45const unsigned char NCollection_UtfIterator<Type>::UTF8_BYTES_MINUS_ONE[256] =
46{
47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
54 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
55};
56
57//! Magic values subtracted from a buffer value during UTF-8 conversion.
58//! This table contains as many values as there might be trailing bytes
59//! in a UTF-8 sequence.
60template<typename Type>
61const unsigned long NCollection_UtfIterator<Type>::offsetsFromUTF8[6] =
62{
63 0x00000000UL, 0x00003080UL, 0x000E2080UL,
64 0x03C82080UL, 0xFA082080UL, 0x82082080UL
65};
66
67//! The first character in a UTF-8 sequence indicates how many bytes to read.
68template<typename Type>
69const unsigned char NCollection_UtfIterator<Type>::UTF8_FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
70
71// =======================================================================
72// function : readUTF8
73// purpose : Get a UTF-8 character; leave the tracking pointer at the start of the next character.
74// Not protected against invalid UTF-8.
75// =======================================================================
76template<typename Type>
77inline void NCollection_UtfIterator<Type>::readUTF8()
78{
79 // unsigned arithmetic used
80 Standard_Utf8UChar* aPos = (Standard_Utf8UChar* )myPosNext;
81 const unsigned char aBytesToRead = UTF8_BYTES_MINUS_ONE[*aPos];
82 myCharUtf32 = 0;
83 switch (aBytesToRead)
84 {
85 case 5: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
86 case 4: myCharUtf32 += *aPos++; myCharUtf32 <<= 6; // remember, illegal UTF-8
87 case 3: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
88 case 2: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
89 case 1: myCharUtf32 += *aPos++; myCharUtf32 <<= 6;
90 case 0: myCharUtf32 += *aPos++;
91 }
92 myCharUtf32 -= offsetsFromUTF8[aBytesToRead];
93 myPosNext = (Type* )aPos;
94}
95
96// magic numbers
97template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MASK = 0xBF;
98template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF8_BYTE_MARK = 0x80;
99template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_START = 0xD800;
100template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_END = 0xDBFF;
101template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_START = 0xDC00;
102template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_END = 0xDFFF;
103template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_HIGH_SHIFT = 10;
104template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_BASE = 0x0010000UL;
105template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF16_SURROGATE_LOW_MASK = 0x3FFUL;
106template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_BMP = 0x0000FFFFUL;
107template<typename Type> const unsigned long NCollection_UtfIterator<Type>::UTF32_MAX_LEGAL = 0x0010FFFFUL;
108
109// =======================================================================
110// function : readUTF16
111// purpose :
112// =======================================================================
113template<typename Type> inline
114void NCollection_UtfIterator<Type>::readUTF16()
115{
116 Standard_Utf32Char aChar = *myPosNext++;
117 // if we have the first half of the surrogate pair
118 if (aChar >= UTF16_SURROGATE_HIGH_START
119 && aChar <= UTF16_SURROGATE_HIGH_END)
120 {
121 Standard_Utf32Char aChar2 = *myPosition;
122 // complete the surrogate pair
123 if (aChar2 >= UTF16_SURROGATE_LOW_START
124 && aChar2 <= UTF16_SURROGATE_LOW_END)
125 {
126 aChar = ((aChar - UTF16_SURROGATE_HIGH_START) << UTF16_SURROGATE_HIGH_SHIFT)
127 + (aChar2 - UTF16_SURROGATE_LOW_START) + UTF16_SURROGATE_LOW_BASE;
128 ++myPosNext;
129 }
130 }
131 myCharUtf32 = aChar;
132}
133
134// =======================================================================
135// function : AdvanceBytesUtf8
136// purpose :
137// =======================================================================
138template<typename Type> inline
139Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf8() const
140{
141 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
142 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
143 {
144 // UTF-16 surrogate values are illegal in UTF-32
145 return 0;
146 }
147 else if (myCharUtf32 < Standard_Utf32Char(0x80))
148 {
149 return 1;
150 }
151 else if (myCharUtf32 < Standard_Utf32Char(0x800))
152 {
153 return 2;
154 }
155 else if (myCharUtf32 < Standard_Utf32Char(0x10000))
156 {
157 return 3;
158 }
159 else if (myCharUtf32 <= UTF32_MAX_LEGAL)
160 {
161 return 4;
162 }
163 else
164 {
165 // illegal
166 return 0;
167 }
168}
169
170// =======================================================================
171// function : GetUtf8
172// purpose :
173// =======================================================================
174template<typename Type> inline
175Standard_Utf8Char* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8Char* theBuffer) const
176{
177 // unsigned arithmetic used
178 return (Standard_Utf8Char* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
179}
180
181// =======================================================================
182// function : GetUtf8
183// purpose :
184// =======================================================================
185template<typename Type> inline
186Standard_Utf8UChar* NCollection_UtfIterator<Type>::GetUtf8 (Standard_Utf8UChar* theBuffer) const
187{
188 Standard_Utf32Char aChar = myCharUtf32;
189 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
190 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
191 {
192 // UTF-16 surrogate values are illegal in UTF-32
193 return theBuffer;
194 }
195 else if (myCharUtf32 < Standard_Utf32Char(0x80))
196 {
197 *theBuffer++ = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[1]);
198 return theBuffer;
199 }
200 else if (myCharUtf32 < Standard_Utf32Char(0x800))
201 {
202 *++theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
203 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[2]);
204 return theBuffer + 2;
205 }
206 else if (myCharUtf32 < Standard_Utf32Char(0x10000))
207 {
208 theBuffer += 3;
209 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
210 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
211 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[3]);
212 return theBuffer + 3;
213 }
214 else if (myCharUtf32 <= UTF32_MAX_LEGAL)
215 {
216 theBuffer += 4;
217 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
218 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
219 *--theBuffer = Standard_Utf8UChar((aChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); aChar >>= 6;
220 *--theBuffer = Standard_Utf8UChar (aChar | UTF8_FIRST_BYTE_MARK[4]);
221 return theBuffer + 4;
222 }
223 else
224 {
225 // illegal
226 return theBuffer;
227 }
228}
229
230// =======================================================================
231// function : AdvanceBytesUtf16
232// purpose :
233// =======================================================================
234template<typename Type> inline
235Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf16() const
236{
237 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
238 {
239 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
240 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
241 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
242 {
243 return 0;
244 }
245 else
246 {
247 return Standard_Integer(sizeof(Standard_Utf16Char));
248 }
249 }
250 else if (myCharUtf32 > UTF32_MAX_LEGAL)
251 {
252 // illegal
253 return 0;
254 }
255 else
256 {
257 // target is a character in range 0xFFFF - 0x10FFFF
258 // surrogate pair
259 return Standard_Integer(sizeof(Standard_Utf16Char) * 2);
260 }
261}
262
263// =======================================================================
264// function : GetUtf16
265// purpose :
266// =======================================================================
267template<typename Type> inline
268Standard_Utf16Char* NCollection_UtfIterator<Type>::GetUtf16 (Standard_Utf16Char* theBuffer) const
269{
270 if (myCharUtf32 <= UTF32_MAX_BMP) // target is a character <= 0xFFFF
271 {
272 // UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values
273 if (myCharUtf32 >= UTF16_SURROGATE_HIGH_START
274 && myCharUtf32 <= UTF16_SURROGATE_LOW_END)
275 {
276 return theBuffer;
277 }
278 else
279 {
280 *theBuffer++ = Standard_Utf16Char(myCharUtf32);
281 return theBuffer;
282 }
283 }
284 else if (myCharUtf32 > UTF32_MAX_LEGAL)
285 {
286 // illegal
287 return theBuffer;
288 }
289 else
290 {
291 // surrogate pair
292 Standard_Utf32Char aChar = myCharUtf32 - UTF16_SURROGATE_LOW_BASE;
293 *theBuffer++ = Standard_Utf16Char((aChar >> UTF16_SURROGATE_HIGH_SHIFT) + UTF16_SURROGATE_HIGH_START);
294 *theBuffer++ = Standard_Utf16Char((aChar & UTF16_SURROGATE_LOW_MASK) + UTF16_SURROGATE_LOW_START);
295 return theBuffer;
296 }
297}
298
299// =======================================================================
300// function : GetUtf32
301// purpose :
302// =======================================================================
303template<typename Type> inline
304Standard_Utf32Char* NCollection_UtfIterator<Type>::GetUtf32 (Standard_Utf32Char* theBuffer) const
305{
306 *theBuffer++ = myCharUtf32;
307 return theBuffer;
308}
309
310// =======================================================================
311// function : AdvanceBytesUtf
312// purpose :
313// =======================================================================
314template<typename Type> template<typename TypeWrite> inline
315Standard_Integer NCollection_UtfIterator<Type>::AdvanceBytesUtf() const
316{
317 switch (sizeof(TypeWrite))
318 {
319 case sizeof(Standard_Utf8Char): return AdvanceBytesUtf8();
320 case sizeof(Standard_Utf16Char): return AdvanceBytesUtf16();
321 case sizeof(Standard_Utf32Char): return AdvanceBytesUtf32();
322 default: return 0; // invalid case
323 }
324}
325
326// =======================================================================
327// function : GetUtf
328// purpose :
329// =======================================================================
330template<typename Type> template<typename TypeWrite> inline
331TypeWrite* NCollection_UtfIterator<Type>::GetUtf (TypeWrite* theBuffer) const
332{
333 switch (sizeof(TypeWrite))
334 {
335 case sizeof(Standard_Utf8Char): return (TypeWrite* )GetUtf8 ((Standard_Utf8UChar* )theBuffer);
336 case sizeof(Standard_Utf16Char): return (TypeWrite* )GetUtf16((Standard_Utf16Char* )theBuffer);
337 case sizeof(Standard_Utf32Char): return (TypeWrite* )GetUtf32((Standard_Utf32Char* )theBuffer);
338 default: return NULL; // invalid case
339 }
340}