0031340: LDOM fails to read XML file starting with BOM

[occt.git] / src / LDOM / LDOM_XmlReader.cxx
diff --git a/src/LDOM/LDOM_XmlReader.cxx b/src/LDOM/LDOM_XmlReader.cxx

index ff2d500..5b12bce 100644 (file)
--- a/src/LDOM/LDOM_XmlReader.cxx
+++ b/src/LDOM/LDOM_XmlReader.cxx
@@ -13,7 +13,7 @@
  // Alternatively, this file may be used under the terms of Open CASCADE
  // commercial license or contractual agreement.
  
-//AGV 060302: Input from istream
+//AGV 060302: Input from std::istream
  //            AGV 130302: bug corr: was error if strlen(root_elem_name) < 7
  
  #include <LDOM_XmlReader.hxx>
@@ -34,7 +34,6 @@
  //#include <ctype.h>
  
  const int XML_MIN_BUFFER = 10;
-const int FILE_NONVALUE  = -1;
  
  typedef enum {
    STATE_WAITING = 0,
@@ -65,14 +64,17 @@ static Standard_Boolean isName          (const char             * aString,
  
  LDOM_XmlReader::LDOM_XmlReader (
                                  const Handle(LDOM_MemManager)&  theDocument,
-                                TCollection_AsciiString&        theErrorString)
+                                TCollection_AsciiString&        theErrorString,
+                                const Standard_Boolean theTagPerStep) // true: ReadRecord fetches input with getline() up to each '>' instead of filling the whole buffer
  : myEOF      (Standard_False),
    myError    (theErrorString),
    myDocument (theDocument),
    myElement  (NULL),
    myLastChild(NULL), 
    myPtr      (&myBuffer[0]),
-  myEndPtr   (&myBuffer[0])
+  myEndPtr   (&myBuffer[0]), // myPtr == myEndPtr: the buffer starts out empty
+  myTagPerStep (theTagPerStep), // selects the stream-reading mode used by ReadRecord
+  myBOM      (LDOM_OSStream::BOM_UNDEFINED) // no BOM recognized yet; ReadRecord sets it when it finds one at the start of the stream
  {
  }
  
@@ -90,39 +92,161 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
    const char * aStartData = NULL, * aNameEnd = NULL, * aPtr;
    LDOMBasicString anAttrName, anAttrValue;
    char anAttDelimiter = '\0';
+  Standard_Boolean aHasRead = Standard_False;
+  Standard_Boolean isFileStart = !myEOF && theIStream.tellg() == std::iostream::pos_type(0);
  
    for(;;) {
      //  Check if the current file buffer is exhausted
      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      //  There should always be some bytes available in the buffer for analysis
      Standard_Integer aBytesRest = (Standard_Integer)(myEndPtr - myPtr);
-    if (aBytesRest < XML_MIN_BUFFER) {
-      if (myEOF == Standard_True) {
+    if (aBytesRest < XML_MIN_BUFFER)
+    {
+      if (myEOF == Standard_True)
+      {
          if (aBytesRest <= 0)
            break;                        // END of processing
-      } else {
-      // If we are reading some data, save the beginning and preserve the state
+      }
+      else if (myTagPerStep && aHasRead)
+      {
+        // in myTagPerStep mode, we should parse the buffer to the end before
+        // getting more characters from the stream.
+      }
+      else
+      {
+        // If we are reading some data, save the beginning and preserve the state
          if (aStartData /* && aState != STATE_WAITING */) {
            if (myPtr > aStartData)
              theData.rdbuf()->sputn(aStartData, myPtr - aStartData);
            aStartData = &myBuffer[0];
          }
-      // Copy the rest of file data to the beginning of buffer
+        // Copy the rest of file data to the beginning of buffer
          if (aBytesRest > 0)
-          memcpy (&myBuffer[0], myPtr, aBytesRest);
+        {
+          // do not use memcpy here because aBytesRest may be greater than myPtr-myBuffer, so, overlap
+          memmove (&myBuffer[0], myPtr, aBytesRest);
+        }
  
-      // Read the full buffer and reset start and end buffer pointers
+        // Read the full buffer and reset start and end buffer pointers
          myPtr    = &myBuffer[0];
          Standard_Size aNBytes;
-          theIStream.read (&myBuffer[aBytesRest],
-                          XML_BUFFER_SIZE - aBytesRest);
-          aNBytes = (Standard_Size)theIStream.gcount();
+
+        if (myTagPerStep)
+        {
+          theIStream.getline (&myBuffer[aBytesRest], XML_BUFFER_SIZE - aBytesRest, '>');
+          aHasRead = Standard_True;
+        }
+        else
+        {
+          theIStream.read (&myBuffer[aBytesRest], XML_BUFFER_SIZE - aBytesRest);
+        }
+        aNBytes = (Standard_Size)theIStream.gcount();
+        
          if (aNBytes == 0)
+        {
            myEOF = Standard_True;                  // END-OF-FILE
+        }
+        else if (myTagPerStep)
+        {
+          // replace \0 (being inserted by getline method) with > 
+          myBuffer[aBytesRest + aNBytes - 1] = '>';
+        }
          myEndPtr = &myBuffer[aBytesRest + aNBytes];
          myBuffer[aBytesRest + aNBytes] = '\0';
        }
      }
+    if (isFileStart)
+    {
+      isFileStart = Standard_False; // the BOM can only appear once, at the very beginning of the stream
+      // Check for a byte order mark (BOM): record its kind in myBOM and advance myPtr past its bytes
+      Standard_Utf8UChar aFirstChar = Standard_Utf8UChar(myPtr[0]);
+      switch(aFirstChar) {
+      case 0xEF: // UTF-8: EF BB BF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xBB && Standard_Utf8UChar(myPtr[2]) == 0xBF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF8;
+          myPtr += 3;
+        }
+        break;
+      case 0xFE: // UTF-16 big-endian: FE FF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF16BE;
+          myPtr += 2;
+        }
+        break;
+      case 0xFF: // UTF-32 little-endian: FF FE 00 00, or UTF-16 little-endian: FF FE
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFE)
+        {
+          if (myEndPtr - myPtr >= 4 && myPtr[2] == 0 && myPtr[3] == 0) // length guard: do not mistake the buffer's NUL terminator for BOM bytes
+          {
+            myBOM = LDOM_OSStream::BOM_UTF32LE;
+            myPtr += 4;
+          }
+          else
+          {
+            myBOM = LDOM_OSStream::BOM_UTF16LE;
+            myPtr += 2;
+          }
+        }
+        break;
+      case 0x00: // UTF-32 big-endian: 00 00 FE FF
+        if (myEndPtr - myPtr >= 4 && myPtr[1] == 0 && Standard_Utf8UChar(myPtr[2]) == 0xFE && Standard_Utf8UChar(myPtr[3]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF32BE;
+          myPtr += 4;
+        }
+        break;
+      case 0x2B: // UTF-7: 2B 2F 76 followed by 2B, 2F, 38 or 39
+        if (myPtr[1] == 47 && myPtr[2] == 118 &&
+            (myPtr[3] == 43 || myPtr[3] == 47 || myPtr[3] == 56 || myPtr[3] == 57))
+        {
+          myBOM = LDOM_OSStream::BOM_UTF7;
+          if (myPtr[3] == 56 && myPtr[4] == 45) // five-byte form 2B 2F 76 38 2D: also skip the trailing '-'
+            myPtr += 5;
+          else
+            myPtr += 4;
+        }
+        break;
+      case 0xF7: // UTF-1: F7 64 4C
+        if (myPtr[1] == 100 && myPtr[2] == 76)
+        {
+          myBOM = LDOM_OSStream::BOM_UTF1;
+          myPtr += 3;
+        }
+        break;
+      case 0xDD: // UTF-EBCDIC: DD 73 66 73
+        if (myPtr[1] == 115 && myPtr[2] == 102 && myPtr[3] == 115)
+        {
+          myBOM = LDOM_OSStream::BOM_UTFEBCDIC;
+          myPtr += 4;
+        }
+        break;
+      case 0x0E: // SCSU: 0E FE FF
+        if (Standard_Utf8UChar(myPtr[1]) == 0xFE && Standard_Utf8UChar(myPtr[2]) == 0xFF)
+        {
+          myBOM = LDOM_OSStream::BOM_SCSU;
+          myPtr += 3;
+        }
+        break;
+      case 0xFB: // BOCU-1: FB EE 28
+        if (Standard_Utf8UChar(myPtr[1]) == 0xEE && myPtr[2] == 40)
+        {
+          myBOM = LDOM_OSStream::BOM_BOCU1;
+          myPtr += 3;
+        }
+        break;
+      case 0x84: // GB18030: 84 31 95 33
+        if (myPtr[1] == 49 && Standard_Utf8UChar(myPtr[2]) == 0x95 && myPtr[3] == 51)
+        {
+          myBOM = LDOM_OSStream::BOM_GB18030;
+          myPtr += 4;
+        }
+        break;
+      }
+      if (myBOM != LDOM_OSStream::BOM_UNDEFINED)
+        continue; // BOM bytes consumed: restart the loop so the buffer fill level is re-checked before parsing
+    }
  
      //  Check the character data
      switch (aState) {
@@ -183,11 +307,11 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
            }       // otherwise ERROR
          }     // end of switch
          myError = "Unknown XML object: ";
-        myError += TCollection_AsciiString ((const Standard_CString)myPtr,
-                                            XML_MIN_BUFFER);
+        myError += TCollection_AsciiString (myPtr, XML_MIN_BUFFER);
          return XML_UNKNOWN;
        case '\0':
          if (myEOF == Standard_True) continue;
+        Standard_FALLTHROUGH
        default:
          //      Limitation: we do not treat '&' as special character
          aPtr = (const char *) memchr (myPtr, '<', myEndPtr - myPtr);
@@ -200,6 +324,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          aState = STATE_TEXT;
          aStartData = myPtr;
          myPtr = myEndPtr;
+        aHasRead = Standard_False;
        }   // end of checking in STATE_WAITING
        continue;
  
@@ -219,6 +344,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          return XML_HEADER;
        }
        myPtr = myEndPtr - 1;
+      aHasRead = Standard_False;
        continue;
  
        // Checking the characters in STATE_DOCTYPE, seek for "]>" sequence
@@ -239,6 +365,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          }
        }
        myPtr = myEndPtr - 1;
+      aHasRead = Standard_False;
        continue;
  
      state_doctype_markup:
@@ -257,6 +384,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          return XML_DOCTYPE;
        }
        myPtr = myEndPtr - 1;
+      aHasRead = Standard_False;
        continue;
  
          // Checking the characters in STATE_COMMENT, seek for "-->" sequence
@@ -278,6 +406,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          }
        }
        myPtr = myEndPtr - 2;
+      aHasRead = Standard_False;
        continue;
  
          // Checking the characters in STATE_TEXT, seek for "<"
@@ -291,6 +420,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          return XML_TEXT;
        }
        myPtr = myEndPtr;
+      aHasRead = Standard_False;
        continue;
  
          // Checking the characters in STATE_CDATA, seek for "]]"
@@ -309,6 +439,7 @@ LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord (Standard_IStream& theIStr
          return XML_CDATA;
        }
        myPtr = myEndPtr - 1;
+      aHasRead = Standard_False;
        continue;
  
          // Checking the characters in STATE_ELEMENT, seek the end of TagName
@@ -397,6 +528,7 @@ attr_name:
        switch (myPtr[0]) {
        case '=' :
          aState = STATE_ATTRIBUTE_VALUE;
+        Standard_FALLTHROUGH
        case ' ' :
        case '\t':
        case '\n':
@@ -464,8 +596,11 @@ attr_name:
            myPtr = aPtr + 1;
            aStartData = NULL;
            aState = STATE_ATTRIBUTE_NAME;
-        } else
+        }
+        else {
            myPtr = myEndPtr;
+          aHasRead = Standard_False;
+        }
          continue;
        }
          // Checking the characters in STATE_ELEMENT_END, seek for ">"
@@ -479,6 +614,7 @@ attr_name:
          return XML_END_ELEMENT;
        }
        myPtr = myEndPtr;
+      aHasRead = Standard_False;
        continue;
      }
    }
@@ -521,6 +657,7 @@ static Standard_Boolean isName (const char  * aString,
            aNameEnd = aPtr;
            return Standard_False;
          }
+        Standard_FALLTHROUGH
        case '.' :
        case '-' :
        case '_' :
@@ -537,6 +674,15 @@ static Standard_Boolean isName (const char  * aString,
    return aResult;
  }
  
+//=======================================================================
+//function : CreateElement
+//purpose  : Calls LDOM_BasicElement::Create (theName, theLen, myDocument) and stores the address of the result in myElement; theLen is presumably the length of theName (confirm in LDOM_BasicElement)
+//=======================================================================
+void LDOM_XmlReader::CreateElement( const char *theName, const Standard_Integer theLen )
+{
+  myElement = &LDOM_BasicElement::Create (theName, theLen, myDocument);
+}
+
  //=======================================================================
  //function : getInteger
  //purpose  : Try to initialize theValue as Integer; return False on success