0029151: GCC 7.1 warnings "this statement may fall through" [-Wimplicit-fallthrough=]
[occt.git] / src / NCollection / NCollection_UtfIterator.hxx
CommitLineData
a174a3c5 1// Created on: 2013-01-28
2// Created by: Kirill GAVRILOV
d5f74e42 3// Copyright (c) 2013-2014 OPEN CASCADE SAS
a174a3c5 4//
973c2be1 5// This file is part of Open CASCADE Technology software library.
a174a3c5 6//
d5f74e42 7// This library is free software; you can redistribute it and/or modify it under
8// the terms of the GNU Lesser General Public License version 2.1 as published
973c2be1 9// by the Free Software Foundation, with special exception defined in the file
10// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11// distribution for complete text of the license and disclaimer of any warranty.
a174a3c5 12//
973c2be1 13// Alternatively, this file may be used under the terms of Open CASCADE
14// commercial license or contractual agreement.
a174a3c5 15
16#ifndef _NCollection_UtfIterator_H__
17#define _NCollection_UtfIterator_H__
18
19#include <Standard_TypeDef.hxx>
20
21//! Template class for Unicode strings support.
22//! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16)
23//! and convert it from one to another.
24//! The current value of iterator returned as UTF-32 Unicode code.
25template<typename Type>
26class NCollection_UtfIterator
27{
28
29public:
30
31 //! Constructor.
32 //! @param theString buffer to iterate
33 NCollection_UtfIterator (const Type* theString)
34 : myPosition(theString),
35 myPosNext(theString),
36 myCharIndex(0),
37 myCharUtf32(0)
38 {
39 if (theString != NULL)
40 {
41 ++(*this);
42 myCharIndex = 0;
43 }
44 }
45
46 //! Initialize iterator within specified NULL-terminated string.
47 void Init (const Type* theString)
48 {
49 myPosition = theString;
50 myPosNext = theString;
51 myCharUtf32 = 0;
52 if (theString != NULL)
53 {
54 ++(*this);
55 }
56 myCharIndex = 0;
57 }
58
59 //! Pre-increment operator. Reads the next unicode character.
60 //! Notice - no protection against overrun!
61 NCollection_UtfIterator& operator++()
62 {
63 myPosition = myPosNext;
64 ++myCharIndex;
65 switch (sizeof(Type))
66 {
67 case 1: readUTF8(); break;
68 case 2: readUTF16(); break;
69 case 4: // UTF-32
70 default:
71 myCharUtf32 = *myPosNext++;
72 }
73 return *this;
74 }
75
76 //! Post-increment operator.
77 //! Notice - no protection against overrun!
78 NCollection_UtfIterator operator++ (int )
79 {
80 NCollection_UtfIterator aCopy = *this;
81 ++*this;
82 return aCopy;
83 }
84
85 //! Equality operator.
86 bool operator== (const NCollection_UtfIterator& theRight) const
87 {
88 return myPosition == theRight.myPosition;
89 }
90
fb0b0531 91 //! Return true if Unicode symbol is within valid range.
92 bool IsValid() const
93 {
94 return myCharUtf32 <= UTF32_MAX_LEGAL;
95 }
96
a174a3c5 97 //! Dereference operator.
98 //! @return the UTF-32 codepoint of the character currently pointed by iterator.
99 Standard_Utf32Char operator*() const
100 {
101 return myCharUtf32;
102 }
103
104 //! Buffer-fetching getter.
105 const Type* BufferHere() const { return myPosition; }
106
107 //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
108 Type* ChangeBufferHere() { return (Type* )myPosition; }
109
110 //! Buffer-fetching getter.
111 const Type* BufferNext() const { return myPosNext; }
112
113 //! @return the index displacement from iterator intialization
114 Standard_Integer Index() const
115 {
116 return myCharIndex;
117 }
118
119 //! @return the advance in bytes to store current symbol in UTF-8.
120 //! 0 means an invalid symbol;
121 //! 1-4 bytes are valid range.
122 Standard_Integer AdvanceBytesUtf8() const;
123
124 //! @return the advance in bytes to store current symbol in UTF-16.
125 //! 0 means an invalid symbol;
126 //! 2 bytes is a general case;
127 //! 4 bytes for surrogate pair.
128 Standard_Integer AdvanceBytesUtf16() const;
129
fb0b0531 130 //! @return the advance in bytes to store current symbol in UTF-16.
131 //! 0 means an invalid symbol;
132 //! 1 16-bit code unit is a general case;
133 //! 2 16-bit code units for surrogate pair.
134 Standard_Integer AdvanceCodeUnitsUtf16() const;
135
a174a3c5 136 //! @return the advance in bytes to store current symbol in UTF-32.
137 //! Always 4 bytes (method for consistency).
138 Standard_Integer AdvanceBytesUtf32() const
139 {
140 return Standard_Integer(sizeof(Standard_Utf32Char));
141 }
142
143 //! Fill the UTF-8 buffer within current Unicode symbol.
144 //! Use method AdvanceUtf8() to allocate buffer with enough size.
145 //! @param theBuffer buffer to fill
146 //! @return new buffer position (for next char)
147 Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
148 Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
149
150 //! Fill the UTF-16 buffer within current Unicode symbol.
151 //! Use method AdvanceUtf16() to allocate buffer with enough size.
152 //! @param theBuffer buffer to fill
153 //! @return new buffer position (for next char)
154 Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
155
156 //! Fill the UTF-32 buffer within current Unicode symbol.
157 //! Use method AdvanceUtf32() to allocate buffer with enough size.
158 //! @param theBuffer buffer to fill
159 //! @return new buffer position (for next char)
160 Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
161
162 //! @return the advance in TypeWrite chars needed to store current symbol
163 template<typename TypeWrite>
164 Standard_Integer AdvanceBytesUtf() const;
165
166 //! Fill the UTF-** buffer within current Unicode symbol.
167 //! Use method AdvanceUtf**() to allocate buffer with enough size.
168 //! @param theBuffer buffer to fill
169 //! @return new buffer position (for next char)
170 template<typename TypeWrite>
171 TypeWrite* GetUtf (TypeWrite* theBuffer) const;
172
173private:
174
175 //! Helper function for reading a single UTF8 character from the string.
176 //! Updates internal state appropriately.
177 void readUTF8();
178
179 //! Helper function for reading a single UTF16 character from the string.
180 //! Updates internal state appropriately.
181 void readUTF16();
182
183private: //! @name unicode magic numbers
184
185 static const unsigned char UTF8_BYTES_MINUS_ONE[256];
186 static const unsigned long offsetsFromUTF8[6];
187 static const unsigned char UTF8_FIRST_BYTE_MARK[7];
188 static const unsigned long UTF8_BYTE_MASK;
189 static const unsigned long UTF8_BYTE_MARK;
190 static const unsigned long UTF16_SURROGATE_HIGH_START;
191 static const unsigned long UTF16_SURROGATE_HIGH_END;
192 static const unsigned long UTF16_SURROGATE_LOW_START;
193 static const unsigned long UTF16_SURROGATE_LOW_END;
194 static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
195 static const unsigned long UTF16_SURROGATE_LOW_BASE;
196 static const unsigned long UTF16_SURROGATE_LOW_MASK;
197 static const unsigned long UTF32_MAX_BMP;
198 static const unsigned long UTF32_MAX_LEGAL;
199
200private: //! @name private fields
201
202 const Type* myPosition; //!< buffer position of the first element in the current character
203 const Type* myPosNext; //!< buffer position of the first element in the next character
204 Standard_Integer myCharIndex; //!< index displacement from iterator intialization
205 Standard_Utf32Char myCharUtf32; //!< character stored at the current buffer position
206
207};
208
// Convenience aliases of NCollection_UtfIterator, one per buffer element type.
// The decoding routine used by each alias is selected by the size of its
// element type (see the switch on sizeof(Type) in operator++).
209typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter; //!< iterator over Standard_Utf8Char buffers
210typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter; //!< iterator over Standard_Utf16Char buffers
211typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter; //!< iterator over Standard_Utf32Char buffers
212typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter; //!< iterator over Standard_WideChar buffers
213
214// template implementation
215#include "NCollection_UtfIterator.lxx"
216
217#endif // _NCollection_UtfIterator_H__