Added support for a BOM (byte order mark) at the start of an XML stream or file, and made information about the detected BOM available through LDOMParser.
return myError;
}
+//=======================================================================
+//function : GetBOM
+//purpose  : Returns the byte order mark reported by the underlying reader,
+//           or BOM_UNDEFINED when there is no reader
+//=======================================================================
+
+LDOM_OSStream::BOMType LDOMParser::GetBOM() const
+{
+  // Without a reader there is nothing to query
+  if (!myReader)
+    return LDOM_OSStream::BOM_UNDEFINED;
+  return myReader->GetBOM();
+}
+
+
//=======================================================================
//function : parse
//purpose :
GetError (TCollection_AsciiString& aData) const;
// Return text describing a parsing error, or Empty if no error occurred
+ // Returns the byte order mark defined at the start of a stream
+ Standard_EXPORT LDOM_OSStream::BOMType GetBOM() const;
+
protected:
// ---------- PROTECTED METHODS ----------
private:
LDOM_SBuffer myBuffer;
+
+public:
+ // Byte order mark (BOM) that may be present at the start of a stream.
+ // The byte signatures listed next to each value are hexadecimal.
+ enum BOMType {
+ BOM_UNDEFINED, // no BOM has been recognized
+ BOM_UTF8, // EF BB BF
+ BOM_UTF16BE, // FE FF
+ BOM_UTF16LE, // FF FE (not followed by 00 00)
+ BOM_UTF32BE, // 00 00 FE FF
+ BOM_UTF32LE, // FF FE 00 00
+ BOM_UTF7, // 2B 2F 76 followed by 38, 39, 2B or 2F
+ BOM_UTF1, // F7 64 4C
+ BOM_UTFEBCDIC, // DD 73 66 73
+ BOM_SCSU, // 0E FE FF
+ BOM_BOCU1, // FB EE 28
+ BOM_GB18030 // 84 31 95 33
+ };
};
#endif
myLastChild(NULL),
myPtr (&myBuffer[0]),
myEndPtr (&myBuffer[0]),
- myTagPerStep (theTagPerStep)
+ myTagPerStep (theTagPerStep),
+ myBOM (LDOM_OSStream::BOM_UNDEFINED)
{
}
LDOMBasicString anAttrName, anAttrValue;
char anAttDelimiter = '\0';
Standard_Boolean aHasRead = Standard_False;
+ Standard_Boolean isFileStart = !myEOF && theIStream.tellg() == std::iostream::pos_type(0);
for(;;) {
// Check if the current file buffer is exhausted
myBuffer[aBytesRest + aNBytes] = '\0';
}
}
+ if (isFileStart)
+ {
+   // The check is done only once, on the first portion of data read from
+   // the start of the stream.
+   isFileStart = Standard_False;
+   // Look for a BOM (byte order mark): if a known signature is recognized,
+   // remember its type in myBOM and move myPtr past the signature bytes.
+   Standard_Utf8UChar aFirstChar = Standard_Utf8UChar(myPtr[0]);
+   switch(aFirstChar) {
+   case 0xEF: // UTF-8: EF BB BF
+     if (Standard_Utf8UChar(myPtr[1]) == 0xBB && Standard_Utf8UChar(myPtr[2]) == 0xBF)
+     {
+       myBOM = LDOM_OSStream::BOM_UTF8;
+       myPtr += 3;
+     }
+     break;
+   case 0xFE: // UTF-16 (BE): FE FF
+     if (Standard_Utf8UChar(myPtr[1]) == 0xFF)
+     {
+       myBOM = LDOM_OSStream::BOM_UTF16BE;
+       myPtr += 2;
+     }
+     break;
+   case 0xFF: // UTF-32 (LE): FF FE 00 00, otherwise UTF-16 (LE): FF FE
+     if (Standard_Utf8UChar(myPtr[1]) == 0xFE)
+     {
+       if (myPtr[2] == 0 && myPtr[3] == 0)
+       {
+         myBOM = LDOM_OSStream::BOM_UTF32LE;
+         myPtr += 4;
+       }
+       else
+       {
+         myBOM = LDOM_OSStream::BOM_UTF16LE;
+         myPtr += 2;
+       }
+     }
+     break;
+   case 0x00: // UTF-32 (BE): 00 00 FE FF
+     if (myPtr[1] == 0 && Standard_Utf8UChar(myPtr[2]) == 0xFE && Standard_Utf8UChar(myPtr[3]) == 0xFF)
+     {
+       myBOM = LDOM_OSStream::BOM_UTF32BE;
+       myPtr += 4;
+     }
+     break;
+   case 0x2B: // UTF-7: 2B 2F 76 followed by one of 2B, 2F, 38, 39
+     if (myPtr[1] == 47 && myPtr[2] == 118 &&
+         (myPtr[3] == 43 || myPtr[3] == 47 || myPtr[3] == 56 || myPtr[3] == 57))
+     {
+       myBOM = LDOM_OSStream::BOM_UTF7;
+       // The "38 2D" form is five bytes long: the '-' (2D) following 38
+       // belongs to the BOM as well, so it has to be skipped together with it.
+       // The fifth byte must be tested here (index 4), not the fourth one again.
+       if (myPtr[3] == 56 && myPtr[4] == 45)
+         myPtr += 5;
+       else
+         myPtr += 4;
+     }
+     break;
+   case 0xF7: // UTF-1: F7 64 4C
+     if (myPtr[1] == 100 && myPtr[2] == 76)
+     {
+       myBOM = LDOM_OSStream::BOM_UTF1;
+       myPtr += 3;
+     }
+     break;
+   case 0xDD: // UTF-EBCDIC: DD 73 66 73
+     if (myPtr[1] == 115 && myPtr[2] == 102 && myPtr[3] == 115)
+     {
+       myBOM = LDOM_OSStream::BOM_UTFEBCDIC;
+       myPtr += 4;
+     }
+     break;
+   case 0x0E: // SCSU: 0E FE FF
+     if (Standard_Utf8UChar(myPtr[1]) == 0xFE && Standard_Utf8UChar(myPtr[2]) == 0xFF)
+     {
+       myBOM = LDOM_OSStream::BOM_SCSU;
+       myPtr += 3;
+     }
+     break;
+   case 0xFB: // BOCU-1: FB EE 28
+     if (Standard_Utf8UChar(myPtr[1]) == 0xEE && myPtr[2] == 40)
+     {
+       myBOM = LDOM_OSStream::BOM_BOCU1;
+       myPtr += 3;
+     }
+     break;
+   case 0x84: // GB-18030: 84 31 95 33
+     if (myPtr[1] == 49 && Standard_Utf8UChar(myPtr[2]) == 0x95 && myPtr[3] == 51)
+     {
+       myBOM = LDOM_OSStream::BOM_GB18030;
+       myPtr += 4;
+     }
+     break;
+   }
+   // myPtr has been moved past the BOM: restart the loop iteration so that
+   // the buffer state is checked again before the character data is analysed.
+   if (myBOM != LDOM_OSStream::BOM_UNDEFINED)
+     continue;
+ }
// Check the character data
switch (aState) {
#define XML_BUFFER_SIZE 20480
#include <LDOM_BasicElement.hxx>
+#include <LDOM_OSStream.hxx>
class TCollection_AsciiString;
-class LDOM_OSStream;
// Class LDOM_XmlReader
//
const char * theEnd);
// try convert string theStart to LDOM_AsciiInteger, return False on success
+ // Returns the byte order mark defined at the start of a stream
+ LDOM_OSStream::BOMType GetBOM() const { return myBOM; }
+
private:
// ---------- PRIVATE (PROHIBITED) METHODS ----------
LDOM_XmlReader (const LDOM_XmlReader& theOther);
const char * myEndPtr;
char myBuffer [XML_BUFFER_SIZE+4];
Standard_Boolean myTagPerStep;
+ LDOM_OSStream::BOMType myBOM;
};
#endif
di << " AttributeValue = " << itemValue.ToCString() << "\n";
}
-// LDOM_Element element;
-// for ( element = (const LDOM_Element&) root.getFirstChild();
-// !element.isNull();
-// element = (const LDOM_Element&) element.getNextSibling() ) {
LDOM_Element element;
LDOM_Node node;
for ( node = root.getFirstChild(), element = (const LDOM_Element&) node;
di << " AttributeValue = " << itemValue2.ToCString() << "\n";
}
}
+ // Report the BOM reported by the parser, if there is one
+ const LDOM_OSStream::BOMType aBOMType = aParser.GetBOM();
+ if (aBOMType != LDOM_OSStream::BOM_UNDEFINED)
+ {
+   // Printed name of the BOM; stays "unexpected" for any value not listed below
+   const char* aBOMName = "unexpected";
+   switch (aBOMType) {
+     case LDOM_OSStream::BOM_UTF8:      aBOMName = "UTF-8";       break;
+     case LDOM_OSStream::BOM_UTF16BE:   aBOMName = "UTF-16 (BE)"; break;
+     case LDOM_OSStream::BOM_UTF16LE:   aBOMName = "UTF-16 (LE)"; break;
+     case LDOM_OSStream::BOM_UTF32BE:   aBOMName = "UTF-32 (BE)"; break;
+     case LDOM_OSStream::BOM_UTF32LE:   aBOMName = "UTF-32 (LE)"; break;
+     case LDOM_OSStream::BOM_UTF7:      aBOMName = "UTF-7";       break;
+     case LDOM_OSStream::BOM_UTF1:      aBOMName = "UTF-1";       break;
+     case LDOM_OSStream::BOM_UTFEBCDIC: aBOMName = "UTF-EBCDIC";  break;
+     case LDOM_OSStream::BOM_SCSU:      aBOMName = "SCSU";        break;
+     case LDOM_OSStream::BOM_BOCU1:     aBOMName = "BOCU-1";      break;
+     case LDOM_OSStream::BOM_GB18030:   aBOMName = "GB-18030";    break;
+     default: break;
+   }
+   di << "BOM is " << aBOMName << "\n";
+ }
return 0;
}
--- /dev/null
+puts "================"
+puts "0031340: LDOM fails to read XML file starting with BOM"
+puts "================"
+puts ""
+
+pload QAcommands
+
+# Parse the sample file and keep the output of the command for analysis
+set list [OCC983 [locate_data_file bug31340.xml]]
+
+# The document has to be parsed, and the output has to mention the UTF-8 BOM
+if { ![regexp "Document parsed" $list] } {
+  puts "Error : document not parsed"
+} elseif { [lsearch -exact $list "UTF-8"] < 0 } {
+  puts "Error : BOM was not found in $list"
+} else {
+  puts "OK"
+}