0024428: Implementation of LGPL license
[occt.git] / src / LDOM / LDOM_XmlReader.cxx
CommitLineData
b311480e 1// Created on: 2001-07-20
2// Created by: Alexander GRIGORIEV
973c2be1 3// Copyright (c) 2001-2014 OPEN CASCADE SAS
b311480e 4//
973c2be1 5// This file is part of Open CASCADE Technology software library.
b311480e 6//
973c2be1 7// This library is free software; you can redistribute it and / or modify it
// under the terms of the GNU Lesser General Public License version 2.1 as published
9// by the Free Software Foundation, with special exception defined in the file
10// OCCT_LGPL_EXCEPTION.txt. Consult the file LICENSE_LGPL_21.txt included in OCCT
11// distribution for complete text of the license and disclaimer of any warranty.
b311480e 12//
973c2be1 13// Alternatively, this file may be used under the terms of Open CASCADE
14// commercial license or contractual agreement.
b311480e 15
16//AGV 060302: Input from istream
7fd59977 17// AGV 130302: bug corr: was error if strlen(root_elem_name) < 7
18
19#include <LDOM_XmlReader.hxx>
20#include <Standard_Stream.hxx>
21#include <LDOM_MemManager.hxx>
22#include <LDOM_BasicAttribute.hxx>
23#include <LDOM_CharReference.hxx>
24#include <LDOM_OSStream.hxx>
25
26#include <string.h>
27#include <errno.h>
28#ifdef WNT
29#include <io.h>
30#else
31#include <unistd.h>
32#endif
33
34//#include <ctype.h>
35
36const int XML_MIN_BUFFER = 10;
7fd59977 37const int FILE_NONVALUE = -1;
38
// States of the record scanner implemented in LDOM_XmlReader::ReadRecord().
enum ParserState {
  STATE_WAITING         = 0,  // between records: blanks, text or markup start
  STATE_HEADER          = 1,  // inside "<? ... ?>"
  STATE_DOCTYPE         = 2,  // inside "<!DOCTYPE ...", before any '['
  STATE_DOCTYPE_MARKUP  = 3,  // inside the "[ ... ]" part of a DOCTYPE
  STATE_ELEMENT         = 4,  // element name continued after a buffer refill
  STATE_ELEMENT_END     = 5,  // inside "</ ... >"
  STATE_ATTRIBUTE_NAME  = 6,  // expecting an attribute name, "/>" or ">"
  STATE_ATTRIBUTE_EQUAL = 7,  // expecting '=' after an attribute name
  STATE_ATTRIBUTE_VALUE = 8,  // expecting / reading a quoted attribute value
  STATE_COMMENT         = 9,  // inside "<!-- ... -->"
  STATE_CDATA           = 10, // inside "<![CDATA[ ..."
  STATE_TEXT            = 11  // text that did not end in the current buffer
};
53
// True when the bytes at aPtr start with the string literal aPattern.
// aPattern must be a literal (or a char array): its length is taken with
// sizeof, the terminating NUL being excluded from the comparison.
#define TEXT_COMPARE(aPtr,aPattern) \
  (0 == memcmp ((aPtr), (aPattern), sizeof(aPattern) - 1))
56
57static Standard_Boolean isName (const char * aString,
58 const char * aStringEnd,
59 const char *& aNameEnd);
60
61//=======================================================================
62//function : LDOM_XmlReader()
63//purpose : Constructor (file descriptor)
64//=======================================================================
65
66LDOM_XmlReader::LDOM_XmlReader (const int aFileDes,
67 const Handle(LDOM_MemManager)& aDocument,
68 TCollection_AsciiString& anErrorString)
69 : myEOF (Standard_False),
70 myFileDes (aFileDes),
71#ifdef WNT
72 myIStream (cin), // one quirk of MSVC6.0: can't initialise by 0
73#else
74 myIStream (* (istream *) UndefinedHandleAddress),
75#endif
76 myError (anErrorString),
77 myDocument (aDocument),
c24d4017 78 myElement (NULL),
7fd59977 79 myPtr (&myBuffer[0]),
80 myEndPtr (&myBuffer[0])
81{}
82
83//=======================================================================
84//function : LDOM_XmlReader()
85//purpose : Constructor (istream)
86//=======================================================================
87
88LDOM_XmlReader::LDOM_XmlReader (istream& anInput,
89 const Handle(LDOM_MemManager)& aDocument,
90 TCollection_AsciiString& anErrorString)
91 : myEOF (Standard_False),
92 myFileDes (FILE_NONVALUE),
93 myIStream (anInput),
94 myError (anErrorString),
95 myDocument (aDocument),
c24d4017 96 myElement (NULL),
7fd59977 97 myPtr (&myBuffer[0]),
98 myEndPtr (&myBuffer[0])
99{}
100
101//=======================================================================
102//function : ReadRecord
103//purpose : Read a record from XML file
104//=======================================================================
105
106LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord
107 (LDOM_OSStream& theData)
108{
109 theData.Clear();
110 myError.Clear();
111 ParserState aState = STATE_WAITING;
1d47d8d0 112 const char * aStartData = NULL, * aNameEnd = NULL, * aPtr;
7fd59977 113 LDOMBasicString anAttrName, anAttrValue;
114 char anAttDelimiter = '\0';
115
302f96fb 116 for(;;) {
7fd59977 117 // Check if the current file buffer is exhausted
118 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
119 // There should always be some bytes available in the buffer for analysis
7dc9e047 120 Standard_Integer aBytesRest = (Standard_Integer)(myEndPtr - myPtr);
7fd59977 121 if (aBytesRest < XML_MIN_BUFFER) {
122 if (myEOF == Standard_True) {
123 if (aBytesRest <= 0)
124 break; // END of processing
125 } else {
126 // If we are reading some data, save the beginning and preserve the state
127 if (aStartData /* && aState != STATE_WAITING */) {
128 if (myPtr > aStartData)
129 theData.rdbuf()->sputn(aStartData, myPtr - aStartData);
130 aStartData = &myBuffer[0];
131 }
132 // Copy the rest of file data to the beginning of buffer
133 if (aBytesRest > 0)
134 memcpy (&myBuffer[0], myPtr, aBytesRest);
135
136 // Read the full buffer and reset start and end buffer pointers
137 myPtr = &myBuffer[0];
60be1f9b 138 Standard_Size aNBytes;
7fd59977 139 if (myFileDes != FILE_NONVALUE)
140 aNBytes = read (myFileDes, &myBuffer[aBytesRest],
141 XML_BUFFER_SIZE - aBytesRest);
142 else {
143 myIStream.read (&myBuffer[aBytesRest],
144 XML_BUFFER_SIZE - aBytesRest);
105aae76 145 aNBytes = (Standard_Size)myIStream.gcount();
7fd59977 146 }
147 if (aNBytes == 0)
148 myEOF = Standard_True; // END-OF-FILE
149 myEndPtr = &myBuffer[aBytesRest + aNBytes];
150 myBuffer[aBytesRest + aNBytes] = '\0';
151 }
152 }
153
154 // Check the character data
155 switch (aState) {
156
157 // Checking the characters in STATE_WAITING (blank, TEXT or markup)
158 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
159 case STATE_WAITING:
160 switch (myPtr[0]) {
161 case ' ':
162 case '\t':
163 case '\n':
164 case '\r':
165 ++ myPtr;
166 continue;
167 case '<':
168 // XML markup found, then make detect the record type
169 switch (myPtr[1]) {
170 case '?':
171 aState = STATE_HEADER;
172 myPtr += 2;
173 aStartData = myPtr;
174 continue;
175 case '/':
176 aState = STATE_ELEMENT_END;
177 myPtr += 2;
178 aStartData = myPtr;
179 continue;
180 case '!':
181 if (myPtr[2] == '-' && myPtr[3] == '-') {
182 aState = STATE_COMMENT;
183 myPtr += 4;
184 } else if (TEXT_COMPARE (&myPtr[2], "DOCTYPE")) {
185 char ch = myPtr[9];
186 if (ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r')
187 break;
188 aState = STATE_DOCTYPE;
189 myPtr += 10;
190 } else if (TEXT_COMPARE (&myPtr[2], "[CDATA[")) {
191 aState = STATE_CDATA;
192 myPtr += 9;
193 } else break; // ERROR
194 aStartData = myPtr;
195 continue;
196 default:
197 if (::isName (&myPtr[1], myEndPtr, aNameEnd)) {
198 aStartData = myPtr + 1;
199 myPtr = aNameEnd;
200 if (myPtr < myEndPtr) {
201 myElement = & LDOM_BasicElement::Create (aStartData,
7dc9e047 202 (Standard_Integer)(myPtr - aStartData),
7fd59977 203 myDocument);
204 myLastChild = NULL;
205 aState = STATE_ATTRIBUTE_NAME;
206 aStartData = NULL;
207 }else
208 aState = STATE_ELEMENT;
209 continue;
210 } // otherwise ERROR
211 } // end of switch
212 myError = "Unknown XML object: ";
213 myError += TCollection_AsciiString ((const Standard_CString)myPtr,
214 XML_MIN_BUFFER);
215 return XML_UNKNOWN;
216 case '\0':
217 if (myEOF == Standard_True) continue;
218 default:
219 // Limitation: we do not treat '&' as special character
220 aPtr = (const char *) memchr (myPtr, '<', myEndPtr - myPtr);
221 if (aPtr) {
222 // The end of text field reached
223 theData.rdbuf()->sputn(myPtr, aPtr - myPtr);
224 myPtr = aPtr;
225 return XML_TEXT;
226 }
227 aState = STATE_TEXT;
228 aStartData = myPtr;
229 myPtr = myEndPtr;
230 } // end of checking in STATE_WAITING
231 continue;
232
233 // Checking the characters in STATE_HEADER, seek for "?>" sequence
234 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
235 case STATE_HEADER:
236 aPtr = (const char *) memchr (aStartData, '?', (myEndPtr-1) - aStartData);
237 if (aPtr) {
238 // The end of XML declaration found
239 if (aPtr[1] != '>') { // ERROR
240 myError = "Character \'>\' is expected in the end of XML declaration";
241 return XML_UNKNOWN;
242 }
243 // The XML declaration is retrieved
244 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
245 myPtr = aPtr + 2;
246 return XML_HEADER;
247 }
248 myPtr = myEndPtr - 1;
249 continue;
250
251 // Checking the characters in STATE_DOCTYPE, seek for "]>" sequence
252 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
253 case STATE_DOCTYPE:
254 for (aPtr = aStartData; aPtr < myEndPtr-1; aPtr++) {
255 const int aChar = aPtr[0];
256 if (aChar == '[') {
257 aState = STATE_DOCTYPE_MARKUP;
258 aStartData = &aPtr[1];
259 goto state_doctype_markup;
260 }
261 if (aChar == '>') {
262 // The DOCTYPE declaration is retrieved
263 theData.rdbuf()->sputn(aStartData, aPtr - aStartData - 1);
264 myPtr = aPtr + 1;
265 return XML_DOCTYPE;
266 }
267 }
268 myPtr = myEndPtr - 1;
269 continue;
270
271 state_doctype_markup:
272 case STATE_DOCTYPE_MARKUP:
273 aPtr = (const char *) memchr (aStartData, ']', (myEndPtr-1) - aStartData);
274 if (aPtr) {
275 // The end of DOCTYPE declaration found
276 if (aPtr[1] != '>') { // ERROR
277 myError =
278 "Character \'>\' is expected in the end of DOCTYPE declaration";
279 return XML_UNKNOWN;
280 }
281 // The DOCTYPE declaration is retrieved
282 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
283 myPtr = aPtr + 2;
284 return XML_DOCTYPE;
285 }
286 myPtr = myEndPtr - 1;
287 continue;
288
289 // Checking the characters in STATE_COMMENT, seek for "-->" sequence
290 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
291 case STATE_COMMENT:
292 aPtr = aStartData;
302f96fb 293 for(;;) {
7fd59977 294 aPtr = (const char *) memchr (aPtr, '-', (myEndPtr - 2) - aPtr);
295 if (aPtr == NULL) break;
296 if (aPtr[1] != '-') ++ aPtr;
297 else {
298 if (aPtr[2] != '>') { // ERROR
299 myError = "Character \'>\' is expected in the end of comment";
300 return XML_UNKNOWN;
301 }
302 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
303 myPtr = aPtr + 3;
304 return XML_COMMENT;
305 }
306 }
307 myPtr = myEndPtr - 2;
308 continue;
309
310 // Checking the characters in STATE_TEXT, seek for "<"
311 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
312 case STATE_TEXT:
313 aPtr = (const char *) memchr (aStartData, '<', myEndPtr - aStartData);
314 if (aPtr) {
315 // The end of text field reached
316 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
317 myPtr = aPtr;
318 return XML_TEXT;
319 }
320 myPtr = myEndPtr;
321 continue;
322
323 // Checking the characters in STATE_CDATA, seek for "]]"
324 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
325 case STATE_CDATA:
326 aPtr = aStartData;
302f96fb 327 for(;;) {
7fd59977 328 aPtr = (const char *) memchr (aPtr, ']', (myEndPtr - 1) - aStartData);
329 if (aPtr == NULL) break;
330 if (aPtr[1] != ']') { // ERROR
331 myError = "Characters \']]\' are expected in the end of CDATA";
332 return XML_UNKNOWN;
333 }
334 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
335 myPtr = aPtr + 2;
336 return XML_CDATA;
337 }
338 myPtr = myEndPtr - 1;
339 continue;
340
341 // Checking the characters in STATE_ELEMENT, seek the end of TagName
342 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
343 case STATE_ELEMENT:
344 if (::isName (myPtr, myEndPtr, aNameEnd) == Standard_False)
345 if (theData.Length() == 0 || aNameEnd != myPtr) {
346 myError = "Invalid tag name";
347 return XML_UNKNOWN;
348 }
349 {
350 theData.rdbuf()->sputn(aStartData, aNameEnd - aStartData);
351 char* aDataString = (char *)theData.str();
352 myElement = & LDOM_BasicElement::Create (aDataString, theData.Length(),
353 myDocument);
354 theData.Clear();
355 myLastChild = NULL;
356 delete [] aDataString;
357 aState = STATE_ATTRIBUTE_NAME;
358 aStartData = NULL;
359 myPtr = aNameEnd;
360 continue;
361 }
362 // Parsing a single attribute (STATE_ATTRIBUTE)
363 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
364 case STATE_ATTRIBUTE_NAME: // attribute name
365 switch (myPtr[0]) {
366 case ' ' :
367 case '\t':
368 case '\n':
369 case '\r':
370 if (aStartData) goto attr_name;
371 ++ myPtr;
372 continue;
373 case '/' :
374 if (aStartData)
375 myError = "Inexpected end of attribute";
376 else if (myPtr[1] != '>')
377 myError = "Improper element tag termination";
378 else {
379 myPtr += 2;
380#ifdef DEB
381 theData.Clear();
382 theData << myElement->GetTagName();
383#endif
384 return XML_FULL_ELEMENT;
385 }
386 return XML_UNKNOWN;
387 case '>' :
388 if (aStartData) {
389 myError = "Inexpected end of attribute";
390 return XML_UNKNOWN;
391 }
392 ++ myPtr;
393#ifdef DEB
394 theData.Clear();
395 theData << myElement->GetTagName();
396#endif
397 return XML_START_ELEMENT;
398 default :
399 if (::isName (myPtr, myEndPtr, aNameEnd) == Standard_False)
400 if (theData.Length() == 0 || aNameEnd != myPtr) {
401 myError = "Invalid attribute name";
402 return XML_UNKNOWN;
403 }
404 if (aNameEnd >= myEndPtr)
405 aStartData = myPtr;
406 else {
407 if (theData.Length() == 0)
7dc9e047 408 anAttrName = LDOMBasicString(myPtr, (Standard_Integer)(aNameEnd - myPtr), myDocument);
7fd59977 409 else {
410 theData.rdbuf()->sputn(myPtr, aNameEnd - myPtr);
411attr_name:
412 char* aDataString = (char *)theData.str();
413 theData.Clear();
414 anAttrName = LDOMBasicString (aDataString, myDocument);
415 delete [] aDataString;
416 }
417 aStartData = NULL;
418 aState = STATE_ATTRIBUTE_EQUAL;
419 }
420 myPtr = aNameEnd;
421 continue;
422 }
423 case STATE_ATTRIBUTE_EQUAL: // attribute 'equal' sign
424 switch (myPtr[0]) {
425 case '=' :
426 aState = STATE_ATTRIBUTE_VALUE;
427 case ' ' :
428 case '\t':
429 case '\n':
430 case '\r':
431 ++ myPtr;
432 continue;
433 default:
434 myError = "Equal sign expected in attribute definition";
435 return XML_UNKNOWN;
436 }
437
438 case STATE_ATTRIBUTE_VALUE: // attribute value
439 switch (myPtr[0]) {
440 case ' ' :
441 case '\t':
442 case '\n':
443 case '\r':
444 if (aStartData == NULL) {
445 ++ myPtr;
446 continue;
447 default:
448 if (anAttDelimiter == '\0') {
449 myError = "Expected an attribute value";
450 return XML_UNKNOWN;
451 case '\"':
452 case '\'':
453 if (aStartData == NULL) {
454 aStartData = &myPtr[1];
455 anAttDelimiter = myPtr[0];
456 }
457 }
458 }
459 // Limitation: we do not take into account that '<' and '&'
460 // are not allowed in attribute values
461 aPtr = (const char *) memchr (aStartData, anAttDelimiter,
462 myEndPtr - aStartData);
463 if (aPtr) {
464 (char&) aPtr[0] = '\0';
465 anAttDelimiter = '\0';
466 char * aDataString = (char *) aStartData;
467 const char * ePtr = aPtr;
468
469 // Append the end of the string to previously taken data
470 if (theData.Length() > 0) {
471 theData.rdbuf()->sputn(aStartData, aPtr-aStartData);
472 aDataString = (char *)theData.str();
473 ePtr = strchr (aDataString, '\0');
474 }
475
476 Standard_Integer aDataLen;
477 aDataString = LDOM_CharReference::Decode (aDataString, aDataLen);
478 if (IsDigit(aDataString[0])) {
479 if (getInteger (anAttrValue, aDataString, ePtr))
480 anAttrValue = LDOMBasicString (aDataString,aDataLen,myDocument);
481 } else
482 anAttrValue = LDOMBasicString (aDataString, aDataLen, myDocument);
483
484 if (theData.Length() > 0) {
485 theData.Clear();
486 delete [] aDataString;
487 }
488 // Create an attribute
489 myLastChild = myElement -> AddAttribute (anAttrName, anAttrValue,
490 myDocument, myLastChild);
491 myPtr = aPtr + 1;
492 aStartData = NULL;
493 aState = STATE_ATTRIBUTE_NAME;
494 } else
495 myPtr = myEndPtr;
496 continue;
497 }
498 // Checking the characters in STATE_ELEMENT_END, seek for ">"
499 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
500 case STATE_ELEMENT_END:
501 aPtr = (const char *) memchr (aStartData, '>', myEndPtr - aStartData);
502 if (aPtr) {
503 // The end of the end-element markup
504 theData.rdbuf()->sputn(aStartData, aPtr - aStartData);
505 myPtr = aPtr + 1;
506 return XML_END_ELEMENT;
507 }
508 myPtr = myEndPtr;
509 continue;
510 }
511 }
512 if (aState != STATE_WAITING) {
513 myError = "Unexpected end of file";
514 return XML_UNKNOWN;
515 }
516 return XML_EOF;
517}
518
519//=======================================================================
520//function : isName
521//type : static
522//purpose : Check if aString is a valid XML Name
523//=======================================================================
524
525static Standard_Boolean isName (const char * aString,
526 const char * aStringEnd,
527 const char *& aNameEnd)
528{
529 Standard_Boolean aResult;
302f96fb 530 char aCh = aString[0];
7fd59977 531 if (IsAlphabetic(aCh) || aCh == '_' || aCh == ':') {
532 const char * aPtr = &aString[1];
533 while (aPtr < aStringEnd) {
534 aCh = * aPtr;
535 switch (aCh) {
536 case ' ' :
537 case '\n':
538 case '\r':
539 case '\t':
540 case '=' :
541 case '\0':
542 case '/' :
543 case '>' :
544 aNameEnd = aPtr;
545 return Standard_True;
546 default:
547 if (IsAlphanumeric(aCh) == 0) {
548 aNameEnd = aPtr;
549 return Standard_False;
550 }
551 case '.' :
552 case '-' :
553 case '_' :
554 case ':' :
555 ++ aPtr;
556 }
557 }
558 aNameEnd = aPtr;
559 aResult = Standard_True;
560 } else {
561 aNameEnd = aString;
562 aResult = Standard_False;
563 }
564 return aResult;
565}
566
567//=======================================================================
568//function : getInteger
569//purpose : Try to initialize theValue as Integer; return False on success
570//=======================================================================
571
572Standard_Boolean LDOM_XmlReader::getInteger (LDOMBasicString& theValue,
573 const char * theStart,
574 const char * theEnd)
575{
576 char * ptr;
577 errno = 0;
578 if (theEnd - theStart == 1 || theStart[0] != '0')
579 {
580 long aResult = strtol (theStart, &ptr, 10);
581 if (ptr == theEnd && errno == 0)
582 {
583 theValue = Standard_Integer(aResult);
584 return Standard_False;
585 }
586 }
587 return Standard_True;
588}