[occt.git] / src / NCollection / NCollection_UtfIterator.hxx

// Created on: 2013-01-28
// Created by: Kirill GAVRILOV
// Copyright (c) 2013-2014 OPEN CASCADE SAS
//
// This file is part of Open CASCADE Technology software library.
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License version 2.1 as published
// by the Free Software Foundation, with special exception defined in the file
// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
// distribution for complete text of the license and disclaimer of any warranty.
//
// Alternatively, this file may be used under the terms of Open CASCADE
// commercial license or contractual agreement.

#ifndef _NCollection_UtfIterator_H__
#define _NCollection_UtfIterator_H__

#include <Standard_Handle.hxx>

//! Template class for Unicode strings support.
//!
//! It defines an iterator and provides a correct way to read multi-byte text (UTF-8 and UTF-16)
//! and to convert it from one encoding to another.
//! The current value of iterator is returned as UTF-32 Unicode symbol.
//!
//! Here and below term "Unicode symbol" is used as 
//! synonym of "Unicode code point".
template<typename Type>
class NCollection_UtfIterator
{

public:

  //! Constructor.
  //! Positions the iterator on the first Unicode symbol of the string (index 0).
  //! @param theString NULL-terminated buffer to iterate;
  //!                  when NULL, nothing is read and the current symbol stays 0
  NCollection_UtfIterator (const Type* theString)
  : myPosition(theString),
    myPosNext(theString),
    myCharIndex(0),
    myCharUtf32(0)
  {
    // reuse Init() so that construction and re-initialization cannot diverge
    Init (theString);
  }

  //! Initialize iterator within specified NULL-terminated string.
  //! Reads the first Unicode symbol (unless theString is NULL) and resets the index to 0.
  //! @param theString buffer to iterate, may be NULL
  void Init (const Type* theString)
  {
    myPosition  = theString;
    myPosNext   = theString;
    myCharUtf32 = 0;
    if (theString != NULL)
    {
      // decode the first symbol; this also moves myPosNext past it
      ++(*this);
    }
    // the increment above has bumped the index - the first symbol must have index 0
    myCharIndex = 0;
  }

  //! Pre-increment operator. Reads the next unicode symbol.
  //! Notice - no protection against overrun!
  //! @return reference to this iterator
  NCollection_UtfIterator& operator++()
  {
    myPosition = myPosNext;
    ++myCharIndex;
    // the null pointer is only a tag selecting the readNext() overload by code unit size
    readNext (static_cast<const typename CharTypeChooser<Type>::type*>(0));
    return *this;
  }

  //! Post-increment operator.
  //! Notice - no protection against overrun!
  //! @return copy of the iterator as it was before the increment
  NCollection_UtfIterator operator++ (int )
  {
    NCollection_UtfIterator aCopy = *this;
    ++*this;
    return aCopy;
  }

  //! Equality operator.
  //! Only the current buffer positions are compared.
  bool operator== (const NCollection_UtfIterator& theRight) const
  {
    return myPosition == theRight.myPosition;
  }

  //! Inequality operator (negation of the equality operator).
  bool operator!= (const NCollection_UtfIterator& theRight) const
  {
    return !(*this == theRight);
  }

  //! Return true if Unicode symbol is within valid range
  //! (does not exceed UTF32_MAX_LEGAL).
  bool IsValid() const
  {
    return myCharUtf32 <= UTF32_MAX_LEGAL;
  }

  //! Dereference operator.
  //! @return the UTF-32 codepoint of the symbol currently pointed by iterator.
  Standard_Utf32Char operator*() const
  {
    return myCharUtf32;
  }

  //! Buffer-fetching getter.
  //! @return position of the first code unit of the current symbol
  const Type* BufferHere() const { return myPosition; }

  //! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
  //! @return writable pointer to the first code unit of the current symbol
  Type* ChangeBufferHere() { return const_cast<Type*>(myPosition); }

  //! Buffer-fetching getter.
  //! @return position of the first code unit of the next symbol
  const Type* BufferNext() const { return myPosNext; }

  //! @return the index displacement from iterator initialization
  //!         (first symbol has index 0)
  Standard_Integer Index() const
  {
    return myCharIndex;
  }

  //! @return the advance in bytes to store current symbol in UTF-8.
  //! 0 means an invalid symbol;
  //! 1-4 bytes are valid range.
  Standard_Integer AdvanceBytesUtf8() const;

  //! @return the advance in bytes to store current symbol in UTF-16.
  //! 0 means an invalid symbol;
  //! 2 bytes is a general case;
  //! 4 bytes for surrogate pair.
  Standard_Integer AdvanceBytesUtf16() const;

  //! @return the advance in 16-bit code units to store current symbol in UTF-16.
  //! 0 means an invalid symbol;
  //! 1 16-bit code unit is a general case;
  //! 2 16-bit code units for surrogate pair.
  Standard_Integer AdvanceCodeUnitsUtf16() const;

  //! @return the advance in bytes to store current symbol in UTF-32.
  //! Always 4 bytes (method for consistency).
  Standard_Integer AdvanceBytesUtf32() const
  {
    return Standard_Integer(sizeof(Standard_Utf32Char));
  }

  //! Fill the UTF-8 buffer within current Unicode symbol.
  //! Use method AdvanceBytesUtf8() to allocate buffer with enough size.
  //! @param theBuffer buffer to fill
  //! @return new buffer position (for next char)
  Standard_Utf8Char*  GetUtf8 (Standard_Utf8Char*  theBuffer) const;
  Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;

  //! Fill the UTF-16 buffer within current Unicode symbol.
  //! Use method AdvanceBytesUtf16() to allocate buffer with enough size.
  //! @param theBuffer buffer to fill
  //! @return new buffer position (for next char)
  Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;

  //! Fill the UTF-32 buffer within current Unicode symbol.
  //! Use method AdvanceBytesUtf32() to allocate buffer with enough size.
  //! @param theBuffer buffer to fill
  //! @return new buffer position (for next char)
  Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;

  //! Dispatches to AdvanceBytesUtf8(), AdvanceBytesUtf16() or AdvanceBytesUtf32()
  //! depending on the size of TypeWrite.
  //! @return the advance in bytes needed to store current symbol
  //!         in the encoding matching TypeWrite
  template<typename TypeWrite>
  inline Standard_Integer AdvanceBytesUtf() const
  { 
    // the null pointer is only a tag selecting the advanceBytes() overload
    return advanceBytes(static_cast<const typename CharTypeChooser<TypeWrite>::type*>(0));
  }

  //! Fill the UTF-** buffer within current Unicode symbol
  //! (encoding is selected by the size of TypeWrite).
  //! Use method AdvanceBytesUtf<TypeWrite>() to allocate buffer with enough size.
  //! @param theBuffer buffer to fill
  //! @return new buffer position (for next char)
  template<typename TypeWrite>
  inline TypeWrite* GetUtf (TypeWrite* theBuffer) const
  { 
    // view the buffer as the equally-sized code unit type, write the symbol,
    // then convert the advanced position back to the caller's pointer type
    return reinterpret_cast<TypeWrite*>(getUtf (reinterpret_cast<typename CharTypeChooser<TypeWrite>::type*>(theBuffer)));
  }

private:

  //! Helper template class dispatching its argument class
  //! to the equivalent (by size) character (Unicode code unit) type.
  //! The code unit type is defined as nested typedef "type"
  //! (it is void when the size is neither 1, 2 nor 4).
  //! 
  //! In practice this is relevant for wchar_t type:
  //! typename CharTypeChooser<wchar_t>::type resolves to
  //! Standard_Utf16Char on Windows and to Standard_Utf32Char on Linux.
  template <typename TypeChar>
  class CharTypeChooser : 
    public   opencascade::std::conditional< sizeof(TypeChar) == 1, Standard_Utf8Char,
    typename opencascade::std::conditional< sizeof(TypeChar) == 2, Standard_Utf16Char,
    typename opencascade::std::conditional< sizeof(TypeChar) == 4, Standard_Utf32Char, void >::type >::type >
  {
  };

  //! Helper function for reading a single Unicode symbol from the UTF-8 string.
  //! Updates internal state appropriately.
  void readUTF8();

  //! Helper function for reading a single Unicode symbol from the UTF-16 string.
  //! Updates internal state appropriately.
  void readUTF16();

  //! Helper overload methods to dispatch reading function depending on code unit size
  //! (the pointer argument is an unused tag).
  void readNext (const Standard_Utf8Char*)  { readUTF8(); }
  void readNext (const Standard_Utf16Char*) { readUTF16(); }
  //! UTF-32 needs no decoding: one code unit is taken as the symbol.
  void readNext (const Standard_Utf32Char*) { myCharUtf32 = *myPosNext++; }

  //! Helper overload methods to dispatch advance function depending on code unit size
  //! (the pointer argument is an unused tag).
  Standard_Integer advanceBytes (const Standard_Utf8Char*)  const { return AdvanceBytesUtf8(); }
  Standard_Integer advanceBytes (const Standard_Utf16Char*) const { return AdvanceBytesUtf16(); }
  Standard_Integer advanceBytes (const Standard_Utf32Char*) const { return AdvanceBytesUtf32(); }

  //! Helper overload methods to dispatch getter function depending on code unit size
  Standard_Utf8Char*  getUtf (Standard_Utf8Char*  theBuffer) const { return GetUtf8 (theBuffer); }
  Standard_Utf16Char* getUtf (Standard_Utf16Char* theBuffer) const { return GetUtf16(theBuffer); }
  Standard_Utf32Char* getUtf (Standard_Utf32Char* theBuffer) const { return GetUtf32(theBuffer); }

private: //! @name unicode magic numbers

  // Only declared here; the values are defined outside of the class body.
  static const unsigned char UTF8_BYTES_MINUS_ONE[256];
  static const unsigned long offsetsFromUTF8[6];
  static const unsigned char UTF8_FIRST_BYTE_MARK[7];
  static const unsigned long UTF8_BYTE_MASK;
  static const unsigned long UTF8_BYTE_MARK;
  static const unsigned long UTF16_SURROGATE_HIGH_START;
  static const unsigned long UTF16_SURROGATE_HIGH_END;
  static const unsigned long UTF16_SURROGATE_LOW_START;
  static const unsigned long UTF16_SURROGATE_LOW_END;
  static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
  static const unsigned long UTF16_SURROGATE_LOW_BASE;
  static const unsigned long UTF16_SURROGATE_LOW_MASK;
  static const unsigned long UTF32_MAX_BMP;
  static const unsigned long UTF32_MAX_LEGAL;

private: //! @name private fields

  const Type*        myPosition;  //!< buffer position of the first element in the current symbol
  const Type*        myPosNext;   //!< buffer position of the first element in the next symbol
  Standard_Integer   myCharIndex; //!< index displacement from iterator initialization
  Standard_Utf32Char myCharUtf32; //!< Unicode symbol stored at the current buffer position

};

// Convenience aliases of the iterator for each supported code unit type.

//! Iterator over a string of Standard_Utf8Char code units (read as UTF-8).
typedef NCollection_UtfIterator<Standard_Utf8Char>  NCollection_Utf8Iter;
//! Iterator over a string of Standard_Utf16Char code units (read as UTF-16).
typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
//! Iterator over a string of Standard_Utf32Char code units (each unit is one symbol).
typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
//! Iterator over a string of Standard_WideChar code units; the decoding is chosen
//! by sizeof(Standard_WideChar).
//! NOTE(review): Standard_WideChar is presumably wchar_t - confirm against its declaration.
typedef NCollection_UtfIterator<Standard_WideChar>  NCollection_UtfWideIter;

// template implementation
#include "NCollection_UtfIterator.lxx"

#endif // _NCollection_UtfIterator_H__
Commit	Line	Data
a174a3c5	1	// Created on: 2013-01-28
a174a3c5	2	// Created by: Kirill GAVRILOV
d5f74e42	3	// Copyright (c) 2013-2014 OPEN CASCADE SAS
a174a3c5	4	//
973c2be1	5	// This file is part of Open CASCADE Technology software library.
a174a3c5	6	//
d5f74e42	7	// This library is free software; you can redistribute it and/or modify it under
d5f74e42	8	// the terms of the GNU Lesser General Public License version 2.1 as published
973c2be1	9	// by the Free Software Foundation, with special exception defined in the file
	10	// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
	11	// distribution for complete text of the license and disclaimer of any warranty.
a174a3c5	12	//
973c2be1	13	// Alternatively, this file may be used under the terms of Open CASCADE
973c2be1	14	// commercial license or contractual agreement.
a174a3c5	15
	16	#ifndef _NCollection_UtfIterator_H__
	17	#define _NCollection_UtfIterator_H__
	18
cf0786da	19	#include <Standard_Handle.hxx>
a174a3c5	20
a174a3c5	21	//! Template class for Unicode strings support.
cf0786da	22	//!
a174a3c5	23	//! It defines an iterator and provide correct way to read multi-byte text (UTF-8 and UTF-16)
a174a3c5	24	//! and convert it from one to another.
cf0786da	25	//! The current value of iterator is returned as UTF-32 Unicode symbol.
	26	//!
	27	//! Here and below term "Unicode symbol" is used as
	28	//! synonym of "Unicode code point".
a174a3c5	29	template<typename Type>
	30	class NCollection_UtfIterator
	31	{
	32
	33	public:
	34
	35	//! Constructor.
	36	//! @param theString buffer to iterate
	37	NCollection_UtfIterator (const Type* theString)
	38	: myPosition(theString),
	39	myPosNext(theString),
	40	myCharIndex(0),
	41	myCharUtf32(0)
	42	{
	43	if (theString != NULL)
	44	{
	45	++(*this);
	46	myCharIndex = 0;
	47	}
	48	}
	49
	50	//! Initialize iterator within specified NULL-terminated string.
	51	void Init (const Type* theString)
	52	{
	53	myPosition = theString;
	54	myPosNext = theString;
	55	myCharUtf32 = 0;
	56	if (theString != NULL)
	57	{
	58	++(*this);
	59	}
	60	myCharIndex = 0;
	61	}
	62
cf0786da	63	//! Pre-increment operator. Reads the next unicode symbol.
a174a3c5	64	//! Notice - no protection against overrun!
	65	NCollection_UtfIterator& operator++()
	66	{
	67	myPosition = myPosNext;
	68	++myCharIndex;
cf0786da	69	readNext (static_cast<const typename CharTypeChooser<Type>::type*>(0));
a174a3c5	70	return *this;
	71	}
	72
	73	//! Post-increment operator.
	74	//! Notice - no protection against overrun!
	75	NCollection_UtfIterator operator++ (int )
	76	{
	77	NCollection_UtfIterator aCopy = *this;
	78	++*this;
	79	return aCopy;
	80	}
	81
	82	//! Equality operator.
	83	bool operator== (const NCollection_UtfIterator& theRight) const
	84	{
	85	return myPosition == theRight.myPosition;
	86	}
	87
fb0b0531	88	//! Return true if Unicode symbol is within valid range.
	89	bool IsValid() const
	90	{
	91	return myCharUtf32 <= UTF32_MAX_LEGAL;
	92	}
	93
a174a3c5	94	//! Dereference operator.
cf0786da	95	//! @return the UTF-32 codepoint of the symbol currently pointed by iterator.
a174a3c5	96	Standard_Utf32Char operator*() const
	97	{
	98	return myCharUtf32;
	99	}
	100
	101	//! Buffer-fetching getter.
	102	const Type* BufferHere() const { return myPosition; }
	103
	104	//! Buffer-fetching getter. Dangerous! Iterator should be reinitialized on buffer change.
	105	Type* ChangeBufferHere() { return (Type* )myPosition; }
	106
	107	//! Buffer-fetching getter.
	108	const Type* BufferNext() const { return myPosNext; }
	109
	110	//! @return the index displacement from iterator intialization
cf0786da	111	//! (first symbol has index 0)
a174a3c5	112	Standard_Integer Index() const
	113	{
	114	return myCharIndex;
	115	}
	116
	117	//! @return the advance in bytes to store current symbol in UTF-8.
	118	//! 0 means an invalid symbol;
	119	//! 1-4 bytes are valid range.
	120	Standard_Integer AdvanceBytesUtf8() const;
	121
	122	//! @return the advance in bytes to store current symbol in UTF-16.
	123	//! 0 means an invalid symbol;
	124	//! 2 bytes is a general case;
	125	//! 4 bytes for surrogate pair.
	126	Standard_Integer AdvanceBytesUtf16() const;
	127
fb0b0531	128	//! @return the advance in bytes to store current symbol in UTF-16.
	129	//! 0 means an invalid symbol;
	130	//! 1 16-bit code unit is a general case;
	131	//! 2 16-bit code units for surrogate pair.
	132	Standard_Integer AdvanceCodeUnitsUtf16() const;
	133
a174a3c5	134	//! @return the advance in bytes to store current symbol in UTF-32.
	135	//! Always 4 bytes (method for consistency).
	136	Standard_Integer AdvanceBytesUtf32() const
	137	{
	138	return Standard_Integer(sizeof(Standard_Utf32Char));
	139	}
	140
	141	//! Fill the UTF-8 buffer within current Unicode symbol.
	142	//! Use method AdvanceUtf8() to allocate buffer with enough size.
	143	//! @param theBuffer buffer to fill
	144	//! @return new buffer position (for next char)
	145	Standard_Utf8Char* GetUtf8 (Standard_Utf8Char* theBuffer) const;
	146	Standard_Utf8UChar* GetUtf8 (Standard_Utf8UChar* theBuffer) const;
	147
	148	//! Fill the UTF-16 buffer within current Unicode symbol.
	149	//! Use method AdvanceUtf16() to allocate buffer with enough size.
	150	//! @param theBuffer buffer to fill
	151	//! @return new buffer position (for next char)
	152	Standard_Utf16Char* GetUtf16 (Standard_Utf16Char* theBuffer) const;
	153
	154	//! Fill the UTF-32 buffer within current Unicode symbol.
	155	//! Use method AdvanceUtf32() to allocate buffer with enough size.
	156	//! @param theBuffer buffer to fill
	157	//! @return new buffer position (for next char)
	158	Standard_Utf32Char* GetUtf32 (Standard_Utf32Char* theBuffer) const;
	159
	160	//! @return the advance in TypeWrite chars needed to store current symbol
	161	template<typename TypeWrite>
cf0786da	162	inline Standard_Integer AdvanceBytesUtf() const
	163	{
	164	return advanceBytes(static_cast<const typename CharTypeChooser<TypeWrite>::type*>(0));
	165	}
a174a3c5	166
	167	//! Fill the UTF-** buffer within current Unicode symbol.
	168	//! Use method AdvanceUtf**() to allocate buffer with enough size.
	169	//! @param theBuffer buffer to fill
	170	//! @return new buffer position (for next char)
	171	template<typename TypeWrite>
cf0786da	172	inline TypeWrite* GetUtf (TypeWrite* theBuffer) const
	173	{
	174	return (TypeWrite)(getUtf (reinterpret_cast<typename CharTypeChooser<TypeWrite>::type>(theBuffer)));
	175	}
a174a3c5	176
	177	private:
	178
cf0786da	179	//! Helper template class dispatching its argument class
	180	//! to the equivalent (by size) character (Unicode code unit) type.
	181	//! The code unit type is defined as nested typedef "type".
	182	//!
	183	//! In practice this is relevant for wchar_t type:
	184	//! typename CharTypeChooser<wchar_t>::type resolves to
	185	//! Standard_Utf16Char on Windows and to Standard_Utf32Char on Linux.
	186	template <typename TypeChar>
	187	class CharTypeChooser :
	188	public opencascade::std::conditional< sizeof(TypeChar) == 1, Standard_Utf8Char,
	189	typename opencascade::std::conditional< sizeof(TypeChar) == 2, Standard_Utf16Char,
	190	typename opencascade::std::conditional< sizeof(TypeChar) == 4, Standard_Utf32Char, void >::type >::type >
	191	{
	192	};
	193
	194	//! Helper function for reading a single Unicode symbol from the UTF-8 string.
a174a3c5	195	//! Updates internal state appropriately.
	196	void readUTF8();
	197
cf0786da	198	//! Helper function for reading a single Unicode symbol from the UTF-16 string.
a174a3c5	199	//! Updates internal state appropriately.
	200	void readUTF16();
	201
cf0786da	202	//! Helper overload methods to dispatch reading function depending on code unit size
	203	void readNext (const Standard_Utf8Char*) { readUTF8(); }
	204	void readNext (const Standard_Utf16Char*) { readUTF16(); }
	205	void readNext (const Standard_Utf32Char) { myCharUtf32 = myPosNext++; }
	206
	207	//! Helper overload methods to dispatch advance function depending on code unit size
	208	Standard_Integer advanceBytes (const Standard_Utf8Char*) const { return AdvanceBytesUtf8(); }
	209	Standard_Integer advanceBytes (const Standard_Utf16Char*) const { return AdvanceBytesUtf16(); }
	210	Standard_Integer advanceBytes (const Standard_Utf32Char*) const { return AdvanceBytesUtf32(); }
	211
	212	//! Helper overload methods to dispatch getter function depending on code unit size
	213	Standard_Utf8Char* getUtf (Standard_Utf8Char* theBuffer) const { return GetUtf8 (theBuffer); }
	214	Standard_Utf16Char* getUtf (Standard_Utf16Char* theBuffer) const { return GetUtf16(theBuffer); }
	215	Standard_Utf32Char* getUtf (Standard_Utf32Char* theBuffer) const { return GetUtf32(theBuffer); }
	216
a174a3c5	217	private: //! @name unicode magic numbers
	218
	219	static const unsigned char UTF8_BYTES_MINUS_ONE[256];
	220	static const unsigned long offsetsFromUTF8[6];
	221	static const unsigned char UTF8_FIRST_BYTE_MARK[7];
	222	static const unsigned long UTF8_BYTE_MASK;
	223	static const unsigned long UTF8_BYTE_MARK;
	224	static const unsigned long UTF16_SURROGATE_HIGH_START;
	225	static const unsigned long UTF16_SURROGATE_HIGH_END;
	226	static const unsigned long UTF16_SURROGATE_LOW_START;
	227	static const unsigned long UTF16_SURROGATE_LOW_END;
	228	static const unsigned long UTF16_SURROGATE_HIGH_SHIFT;
	229	static const unsigned long UTF16_SURROGATE_LOW_BASE;
	230	static const unsigned long UTF16_SURROGATE_LOW_MASK;
	231	static const unsigned long UTF32_MAX_BMP;
	232	static const unsigned long UTF32_MAX_LEGAL;
	233
	234	private: //! @name private fields
	235
cf0786da	236	const Type* myPosition; //!< buffer position of the first element in the current symbol
cf0786da	237	const Type* myPosNext; //!< buffer position of the first element in the next symbol
a174a3c5	238	Standard_Integer myCharIndex; //!< index displacement from iterator intialization
cf0786da	239	Standard_Utf32Char myCharUtf32; //!< Unicode symbol stored at the current buffer position
a174a3c5	240
	241	};
	242
	243	typedef NCollection_UtfIterator<Standard_Utf8Char> NCollection_Utf8Iter;
	244	typedef NCollection_UtfIterator<Standard_Utf16Char> NCollection_Utf16Iter;
	245	typedef NCollection_UtfIterator<Standard_Utf32Char> NCollection_Utf32Iter;
	246	typedef NCollection_UtfIterator<Standard_WideChar> NCollection_UtfWideIter;
	247
	248	// template implementation
	249	#include "NCollection_UtfIterator.lxx"
	250
	251	#endif // _NCollection_UtfIterator_H__