a174a3c5 |
1 | // Created on: 2013-01-28 |
2 | // Created by: Kirill GAVRILOV |
3 | // Copyright (c) 2013 OPEN CASCADE SAS |
4 | // |
5 | // The content of this file is subject to the Open CASCADE Technology Public |
6 | // License Version 6.5 (the "License"). You may not use the content of this file |
7 | // except in compliance with the License. Please obtain a copy of the License |
8 | // at http://www.opencascade.org and read it completely before using this file. |
9 | // |
10 | // The Initial Developer of the Original Code is Open CASCADE S.A.S., having its |
11 | // main offices at: 1, place des Freres Montgolfier, 78280 Guyancourt, France. |
12 | // |
13 | // The Original Code and all software distributed under the License is |
14 | // distributed on an "AS IS" basis, without warranty of any kind, and the |
15 | // Initial Developer hereby disclaims all such warranties, including without |
16 | // limitation, any warranties of merchantability, fitness for a particular |
17 | // purpose or non-infringement. Please see the License for the specific terms |
18 | // and conditions governing the rights and limitations under the License. |
19 | |
20 | #ifndef _NCollection_UtfIterator_H__ |
21 | #define _NCollection_UtfIterator_H__ |
22 | |
23 | #include <Standard_TypeDef.hxx> |
24 | |
25 | //! Template class for Unicode strings support. |
26 | //! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16) |
27 | //! and convert it from one to another. |
28 | //! The current value of iterator returned as UTF-32 Unicode code. |
29 | template<typename Type> |
30 | class NCollection_UtfIterator |
31 | { |
32 | |
33 | public: |
34 | |
35 | //! Constructor. |
36 | //! @param theString buffer to iterate |
37 | NCollection_UtfIterator (const Type* theString) |
38 | : myPosition(theString), |
39 | myPosNext(theString), |
40 | myCharIndex(0), |
41 | myCharUtf32(0) |
42 | { |
43 | if (theString != NULL) |
44 | { |
45 | ++(*this); |
46 | myCharIndex = 0; |
47 | } |
48 | } |
49 | |
50 | //! Initialize iterator within specified NULL-terminated string. |
51 | void Init (const Type* theString) |
52 | { |
53 | myPosition = theString; |
54 | myPosNext = theString; |
55 | myCharUtf32 = 0; |
56 | if (theString != NULL) |
57 | { |
58 | ++(*this); |
59 | } |
60 | myCharIndex = 0; |
61 | } |
62 | |
63 | //! Pre-increment operator. Reads the next unicode character. |
64 | //! Notice - no protection against overrun! |
65 | NCollection_UtfIterator& operator++() |
66 | { |
67 | myPosition = myPosNext; |
68 | ++myCharIndex; |
69 | switch (sizeof(Type)) |
70 | { |
71 | case 1: readUTF8(); break; |
72 | case 2: readUTF16(); break; |
73 | case 4: // UTF-32 |
74 | default: |
75 | myCharUtf32 = *myPosNext++; |
76 | } |
77 | return *this; |
78 | } |
79 | |
80 | //! Post-increment operator. |
81 | //! Notice - no protection against overrun! |
82 | NCollection_UtfIterator operator++ (int ) |
83 | { |
84 | NCollection_UtfIterator aCopy = *this; |
85 | ++*this; |
86 | return aCopy; |
87 | } |
88 | |
89 | //! Equality operator. |
90 | bool operator== (const NCollection_UtfIterator& theRight) const |
91 | { |
92 | return myPosition == theRight.myPosition; |
93 | } |
94 | |
95 | //! Dereference operator. |
96 | //! @return the UTF-32 codepoint of the character currently pointed by iterator. |
97 | Standard_Utf32Char operator*() const |
98 | { |
99 | return myCharUtf32; |
100 | } |
101 | |
102 | //! Buffer-fetching getter. |
103 | const Type* BufferHere() const { return myPosition; } |
104 | |
105 | //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change. |
106 | Type* ChangeBufferHere() { return (Type* )myPosition; } |
107 | |
108 | //! Buffer-fetching getter. |
109 | const Type* BufferNext() const { return myPosNext; } |
110 | |
111 | //! @return the index displacement from iterator intialization |
112 | Standard_Integer Index() const |
113 | { |
114 | return myCharIndex; |
115 | } |
116 | |
117 | //! @return the advance in bytes to store current symbol in UTF-8. |
118 | //! 0 means an invalid symbol; |
119 | //! 1-4 bytes are valid range. |
120 | Standard_Integer AdvanceBytesUtf8() const; |
121 | |
122 | //! @return the advance in bytes to store current symbol in UTF-16. |
123 | //! 0 means an invalid symbol; |
124 | //! 2 bytes is a general case; |
125 | //! 4 bytes for surrogate pair. |
126 | Standard_Integer AdvanceBytesUtf16() const; |
127 | |
128 | //! @return the advance in bytes to store current symbol in UTF-32. |
129 | //! Always 4 bytes (method for consistency). |
130 | Standard_Integer AdvanceBytesUtf32() const |
131 | { |
132 | return Standard_Integer(sizeof(Standard_Utf32Char)); |
133 | } |
134 | |
135 | //! Fill the UTF-8 buffer within current Unicode symbol. |
136 | //! Use method AdvanceUtf8() to allocate buffer with enough size. |
137 | //! @param theBuffer buffer to fill |
138 | //! @return new buffer position (for next char) |
139 | Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const; |
140 | Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const; |
141 | |
142 | //! Fill the UTF-16 buffer within current Unicode symbol. |
143 | //! Use method AdvanceUtf16() to allocate buffer with enough size. |
144 | //! @param theBuffer buffer to fill |
145 | //! @return new buffer position (for next char) |
146 | Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const; |
147 | |
148 | //! Fill the UTF-32 buffer within current Unicode symbol. |
149 | //! Use method AdvanceUtf32() to allocate buffer with enough size. |
150 | //! @param theBuffer buffer to fill |
151 | //! @return new buffer position (for next char) |
152 | Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const; |
153 | |
154 | //! @return the advance in TypeWrite chars needed to store current symbol |
155 | template<typename TypeWrite> |
156 | Standard_Integer AdvanceBytesUtf() const; |
157 | |
158 | //! Fill the UTF-** buffer within current Unicode symbol. |
159 | //! Use method AdvanceUtf**() to allocate buffer with enough size. |
160 | //! @param theBuffer buffer to fill |
161 | //! @return new buffer position (for next char) |
162 | template<typename TypeWrite> |
163 | TypeWrite* GetUtf (TypeWrite* theBuffer) const; |
164 | |
165 | private: |
166 | |
167 | //! Helper function for reading a single UTF8 character from the string. |
168 | //! Updates internal state appropriately. |
169 | void readUTF8(); |
170 | |
171 | //! Helper function for reading a single UTF16 character from the string. |
172 | //! Updates internal state appropriately. |
173 | void readUTF16(); |
174 | |
175 | private: //! @name unicode magic numbers |
176 | |
177 | static const unsigned char UTF8_BYTES_MINUS_ONE[256]; |
178 | static const unsigned long offsetsFromUTF8[6]; |
179 | static const unsigned char UTF8_FIRST_BYTE_MARK[7]; |
180 | static const unsigned long UTF8_BYTE_MASK; |
181 | static const unsigned long UTF8_BYTE_MARK; |
182 | static const unsigned long UTF16_SURROGATE_HIGH_START; |
183 | static const unsigned long UTF16_SURROGATE_HIGH_END; |
184 | static const unsigned long UTF16_SURROGATE_LOW_START; |
185 | static const unsigned long UTF16_SURROGATE_LOW_END; |
186 | static const unsigned long UTF16_SURROGATE_HIGH_SHIFT; |
187 | static const unsigned long UTF16_SURROGATE_LOW_BASE; |
188 | static const unsigned long UTF16_SURROGATE_LOW_MASK; |
189 | static const unsigned long UTF32_MAX_BMP; |
190 | static const unsigned long UTF32_MAX_LEGAL; |
191 | |
192 | private: //! @name private fields |
193 | |
194 | const Type* myPosition; //!< buffer position of the first element in the current character |
195 | const Type* myPosNext; //!< buffer position of the first element in the next character |
196 | Standard_Integer myCharIndex; //!< index displacement from iterator intialization |
197 | Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position |
198 | |
199 | }; |
200 | |
201 | typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter; |
202 | typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter; |
203 | typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter; |
204 | typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter; |
205 | |
206 | // template implementation |
207 | #include "NCollection_UtfIterator.lxx" |
208 | |
209 | #endif // _NCollection_UtfIterator_H__ |