0023457: Slow text rendering
[occt.git] / src / NCollection / NCollection_UtfIterator.hxx
CommitLineData
// Created on: 2013-01-28
// Created by: Kirill GAVRILOV
// Copyright (c) 2013 OPEN CASCADE SAS
//
// The content of this file is subject to the Open CASCADE Technology Public
// License Version 6.5 (the "License"). You may not use the content of this file
// except in compliance with the License. Please obtain a copy of the License
// at http://www.opencascade.org and read it completely before using this file.
//
// The Initial Developer of the Original Code is Open CASCADE S.A.S., having its
// main offices at: 1, place des Freres Montgolfier, 78280 Guyancourt, France.
//
// The Original Code and all software distributed under the License is
// distributed on an "AS IS" basis, without warranty of any kind, and the
// Initial Developer hereby disclaims all such warranties, including without
// limitation, any warranties of merchantability, fitness for a particular
// purpose or non-infringement. Please see the License for the specific terms
// and conditions governing the rights and limitations under the License.
19
20#ifndef _NCollection_UtfIterator_H__
21#define _NCollection_UtfIterator_H__
22
23#include <Standard_TypeDef.hxx>
24
25//! Template class for Unicode strings support.
26//! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16)
27//! and convert it from one to another.
28//! The current value of iterator returned as UTF-32 Unicode code.
29template<typename Type>
30class NCollection_UtfIterator
31{
32
33public:
34
35 //! Constructor.
36 //! @param theString buffer to iterate
37 NCollection_UtfIterator (const Type* theString)
38 : myPosition(theString),
39 myPosNext(theString),
40 myCharIndex(0),
41 myCharUtf32(0)
42 {
43 if (theString != NULL)
44 {
45 ++(*this);
46 myCharIndex = 0;
47 }
48 }
49
50 //! Initialize iterator within specified NULL-terminated string.
51 void Init (const Type* theString)
52 {
53 myPosition = theString;
54 myPosNext = theString;
55 myCharUtf32 = 0;
56 if (theString != NULL)
57 {
58 ++(*this);
59 }
60 myCharIndex = 0;
61 }
62
63 //! Pre-increment operator. Reads the next unicode character.
64 //! Notice - no protection against overrun!
65 NCollection_UtfIterator& operator++()
66 {
67 myPosition = myPosNext;
68 ++myCharIndex;
69 switch (sizeof(Type))
70 {
71 case 1: readUTF8(); break;
72 case 2: readUTF16(); break;
73 case 4: // UTF-32
74 default:
75 myCharUtf32 = *myPosNext++;
76 }
77 return *this;
78 }
79
80 //! Post-increment operator.
81 //! Notice - no protection against overrun!
82 NCollection_UtfIterator operator++ (int )
83 {
84 NCollection_UtfIterator aCopy = *this;
85 ++*this;
86 return aCopy;
87 }
88
89 //! Equality operator.
90 bool operator== (const NCollection_UtfIterator& theRight) const
91 {
92 return myPosition == theRight.myPosition;
93 }
94
95 //! Dereference operator.
96 //! @return the UTF-32 codepoint of the character currently pointed by iterator.
97 Standard_Utf32Char operator*() const
98 {
99 return myCharUtf32;
100 }
101
102 //! Buffer-fetching getter.
103 const Type* BufferHere() const { return myPosition; }
104
105 //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
106 Type* ChangeBufferHere() { return (Type* )myPosition; }
107
108 //! Buffer-fetching getter.
109 const Type* BufferNext() const { return myPosNext; }
110
111 //! @return the index displacement from iterator intialization
112 Standard_Integer Index() const
113 {
114 return myCharIndex;
115 }
116
117 //! @return the advance in bytes to store current symbol in UTF-8.
118 //! 0 means an invalid symbol;
119 //! 1-4 bytes are valid range.
120 Standard_Integer AdvanceBytesUtf8() const;
121
122 //! @return the advance in bytes to store current symbol in UTF-16.
123 //! 0 means an invalid symbol;
124 //! 2 bytes is a general case;
125 //! 4 bytes for surrogate pair.
126 Standard_Integer AdvanceBytesUtf16() const;
127
128 //! @return the advance in bytes to store current symbol in UTF-32.
129 //! Always 4 bytes (method for consistency).
130 Standard_Integer AdvanceBytesUtf32() const
131 {
132 return Standard_Integer(sizeof(Standard_Utf32Char));
133 }
134
135 //! Fill the UTF-8 buffer within current Unicode symbol.
136 //! Use method AdvanceUtf8() to allocate buffer with enough size.
137 //! @param theBuffer buffer to fill
138 //! @return new buffer position (for next char)
139 Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
140 Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
141
142 //! Fill the UTF-16 buffer within current Unicode symbol.
143 //! Use method AdvanceUtf16() to allocate buffer with enough size.
144 //! @param theBuffer buffer to fill
145 //! @return new buffer position (for next char)
146 Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
147
148 //! Fill the UTF-32 buffer within current Unicode symbol.
149 //! Use method AdvanceUtf32() to allocate buffer with enough size.
150 //! @param theBuffer buffer to fill
151 //! @return new buffer position (for next char)
152 Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
153
154 //! @return the advance in TypeWrite chars needed to store current symbol
155 template<typename TypeWrite>
156 Standard_Integer AdvanceBytesUtf() const;
157
158 //! Fill the UTF-** buffer within current Unicode symbol.
159 //! Use method AdvanceUtf**() to allocate buffer with enough size.
160 //! @param theBuffer buffer to fill
161 //! @return new buffer position (for next char)
162 template<typename TypeWrite>
163 TypeWrite* GetUtf (TypeWrite* theBuffer) const;
164
165private:
166
167 //! Helper function for reading a single UTF8 character from the string.
168 //! Updates internal state appropriately.
169 void readUTF8();
170
171 //! Helper function for reading a single UTF16 character from the string.
172 //! Updates internal state appropriately.
173 void readUTF16();
174
175private: //! @name unicode magic numbers
176
177 static const unsigned char UTF8_BYTES_MINUS_ONE[256];
178 static const unsigned long offsetsFromUTF8[6];
179 static const unsigned char UTF8_FIRST_BYTE_MARK[7];
180 static const unsigned long UTF8_BYTE_MASK;
181 static const unsigned long UTF8_BYTE_MARK;
182 static const unsigned long UTF16_SURROGATE_HIGH_START;
183 static const unsigned long UTF16_SURROGATE_HIGH_END;
184 static const unsigned long UTF16_SURROGATE_LOW_START;
185 static const unsigned long UTF16_SURROGATE_LOW_END;
186 static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
187 static const unsigned long UTF16_SURROGATE_LOW_BASE;
188 static const unsigned long UTF16_SURROGATE_LOW_MASK;
189 static const unsigned long UTF32_MAX_BMP;
190 static const unsigned long UTF32_MAX_LEGAL;
191
192private: //! @name private fields
193
194 const Type* myPosition; //!< buffer position of the first element in the current character
195 const Type* myPosNext; //!< buffer position of the first element in the next character
196 Standard_Integer myCharIndex; //!< index displacement from iterator intialization
197 Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
198
199};
200
201typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter;
202typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
203typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
204typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter;
205
206// template implementation
207#include "NCollection_UtfIterator.lxx"
208
209#endif // _NCollection_UtfIterator_H__