7fd59977 |
1 | // File: LDOM_XmlReader.cxx |
2 | // Created: 20.07.01 15:38:15 |
3 | // Author: Alexander GRIGORIEV |
4 | // Copyright: OpenCascade 2001 |
5 | // History: AGV 060302: Input from istream |
6 | // AGV 130302: bug corr: was error if strlen(root_elem_name) < 7 |
7 | |
8 | #include <LDOM_XmlReader.hxx> |
9 | #include <Standard_Stream.hxx> |
10 | #include <LDOM_MemManager.hxx> |
11 | #include <LDOM_BasicAttribute.hxx> |
12 | #include <LDOM_CharReference.hxx> |
13 | #include <LDOM_OSStream.hxx> |
14 | |
15 | #include <string.h> |
16 | #include <errno.h> |
17 | #ifdef WNT |
18 | #include <io.h> |
19 | #else |
20 | #include <unistd.h> |
21 | #endif |
22 | |
23 | //#include <ctype.h> |
24 | |
// Minimal number of bytes that the parser wants to have available in the
// buffer before analysing it; below this threshold the buffer is refilled.
const int XML_MIN_BUFFER = 10;
const int MAX_ATTRIBUTES = 512;
// Value of the file descriptor meaning "no descriptor, read from istream".
const int FILE_NONVALUE  = -1;

// States of the record parser implemented in LDOM_XmlReader::ReadRecord()
enum ParserState
{
  STATE_WAITING = 0,        // between records: blanks, text or markup start
  STATE_HEADER,             // inside "<? ... ?>"
  STATE_DOCTYPE,            // inside "<!DOCTYPE ... >", before any '['
  STATE_DOCTYPE_MARKUP,     // inside the "[ ... ]" part of DOCTYPE
  STATE_ELEMENT,            // reading a tag name
  STATE_ELEMENT_END,        // inside "</ ... >"
  STATE_ATTRIBUTE_NAME,     // expecting attribute name or end of the tag
  STATE_ATTRIBUTE_EQUAL,    // expecting '=' after the attribute name
  STATE_ATTRIBUTE_VALUE,    // reading a quoted attribute value
  STATE_COMMENT,            // inside "<!-- ... -->"
  STATE_CDATA,              // inside "<![CDATA[ ... ]]"
  STATE_TEXT                // reading character data up to the next '<'
};

// True if the memory at aPtr starts with the string literal aPattern
// (the terminating NUL of the literal is not compared).
#define TEXT_COMPARE(aPtr,aPattern) \
  (memcmp ((aPtr), (aPattern), sizeof(aPattern) - 1) == 0)
46 | |
47 | static Standard_Boolean isName (const char * aString, |
48 | const char * aStringEnd, |
49 | const char *& aNameEnd); |
50 | |
51 | //======================================================================= |
52 | //function : LDOM_XmlReader() |
53 | //purpose : Constructor (file descriptor) |
54 | //======================================================================= |
55 | |
56 | LDOM_XmlReader::LDOM_XmlReader (const int aFileDes, |
57 | const Handle(LDOM_MemManager)& aDocument, |
58 | TCollection_AsciiString& anErrorString) |
59 | : myEOF (Standard_False), |
60 | myFileDes (aFileDes), |
61 | #ifdef WNT |
62 | myIStream (cin), // one quirk of MSVC6.0: can't initialise by 0 |
63 | #else |
64 | myIStream (* (istream *) UndefinedHandleAddress), |
65 | #endif |
66 | myError (anErrorString), |
67 | myDocument (aDocument), |
68 | myPtr (&myBuffer[0]), |
69 | myEndPtr (&myBuffer[0]) |
70 | {} |
71 | |
72 | //======================================================================= |
73 | //function : LDOM_XmlReader() |
74 | //purpose : Constructor (istream) |
75 | //======================================================================= |
76 | |
77 | LDOM_XmlReader::LDOM_XmlReader (istream& anInput, |
78 | const Handle(LDOM_MemManager)& aDocument, |
79 | TCollection_AsciiString& anErrorString) |
80 | : myEOF (Standard_False), |
81 | myFileDes (FILE_NONVALUE), |
82 | myIStream (anInput), |
83 | myError (anErrorString), |
84 | myDocument (aDocument), |
85 | myPtr (&myBuffer[0]), |
86 | myEndPtr (&myBuffer[0]) |
87 | {} |
88 | |
89 | //======================================================================= |
90 | //function : ReadRecord |
91 | //purpose : Read a record from XML file |
92 | //======================================================================= |
93 | |
94 | LDOM_XmlReader::RecordType LDOM_XmlReader::ReadRecord |
95 | (LDOM_OSStream& theData) |
96 | { |
97 | theData.Clear(); |
98 | myError.Clear(); |
99 | ParserState aState = STATE_WAITING; |
100 | const char * aStartData = NULL, * aNameEnd, * aPtr; |
101 | LDOMBasicString anAttrName, anAttrValue; |
102 | char anAttDelimiter = '\0'; |
103 | |
104 | while (1) { |
105 | // Check if the current file buffer is exhausted |
106 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
107 | // There should always be some bytes available in the buffer for analysis |
108 | Standard_Integer aBytesRest = myEndPtr - myPtr; |
109 | if (aBytesRest < XML_MIN_BUFFER) { |
110 | if (myEOF == Standard_True) { |
111 | if (aBytesRest <= 0) |
112 | break; // END of processing |
113 | } else { |
114 | // If we are reading some data, save the beginning and preserve the state |
115 | if (aStartData /* && aState != STATE_WAITING */) { |
116 | if (myPtr > aStartData) |
117 | theData.rdbuf()->sputn(aStartData, myPtr - aStartData); |
118 | aStartData = &myBuffer[0]; |
119 | } |
120 | // Copy the rest of file data to the beginning of buffer |
121 | if (aBytesRest > 0) |
122 | memcpy (&myBuffer[0], myPtr, aBytesRest); |
123 | |
124 | // Read the full buffer and reset start and end buffer pointers |
125 | myPtr = &myBuffer[0]; |
126 | Standard_Integer aNBytes; |
127 | if (myFileDes != FILE_NONVALUE) |
128 | aNBytes = read (myFileDes, &myBuffer[aBytesRest], |
129 | XML_BUFFER_SIZE - aBytesRest); |
130 | else { |
131 | myIStream.read (&myBuffer[aBytesRest], |
132 | XML_BUFFER_SIZE - aBytesRest); |
133 | aNBytes = myIStream.gcount(); |
134 | } |
135 | if (aNBytes == 0) |
136 | myEOF = Standard_True; // END-OF-FILE |
137 | myEndPtr = &myBuffer[aBytesRest + aNBytes]; |
138 | myBuffer[aBytesRest + aNBytes] = '\0'; |
139 | } |
140 | } |
141 | |
142 | // Check the character data |
143 | switch (aState) { |
144 | |
145 | // Checking the characters in STATE_WAITING (blank, TEXT or markup) |
146 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
147 | case STATE_WAITING: |
148 | switch (myPtr[0]) { |
149 | case ' ': |
150 | case '\t': |
151 | case '\n': |
152 | case '\r': |
153 | ++ myPtr; |
154 | continue; |
155 | case '<': |
156 | // XML markup found, then make detect the record type |
157 | switch (myPtr[1]) { |
158 | case '?': |
159 | aState = STATE_HEADER; |
160 | myPtr += 2; |
161 | aStartData = myPtr; |
162 | continue; |
163 | case '/': |
164 | aState = STATE_ELEMENT_END; |
165 | myPtr += 2; |
166 | aStartData = myPtr; |
167 | continue; |
168 | case '!': |
169 | if (myPtr[2] == '-' && myPtr[3] == '-') { |
170 | aState = STATE_COMMENT; |
171 | myPtr += 4; |
172 | } else if (TEXT_COMPARE (&myPtr[2], "DOCTYPE")) { |
173 | char ch = myPtr[9]; |
174 | if (ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r') |
175 | break; |
176 | aState = STATE_DOCTYPE; |
177 | myPtr += 10; |
178 | } else if (TEXT_COMPARE (&myPtr[2], "[CDATA[")) { |
179 | aState = STATE_CDATA; |
180 | myPtr += 9; |
181 | } else break; // ERROR |
182 | aStartData = myPtr; |
183 | continue; |
184 | default: |
185 | if (::isName (&myPtr[1], myEndPtr, aNameEnd)) { |
186 | aStartData = myPtr + 1; |
187 | myPtr = aNameEnd; |
188 | if (myPtr < myEndPtr) { |
189 | myElement = & LDOM_BasicElement::Create (aStartData, |
190 | myPtr - aStartData, |
191 | myDocument); |
192 | myLastChild = NULL; |
193 | aState = STATE_ATTRIBUTE_NAME; |
194 | aStartData = NULL; |
195 | }else |
196 | aState = STATE_ELEMENT; |
197 | continue; |
198 | } // otherwise ERROR |
199 | } // end of switch |
200 | myError = "Unknown XML object: "; |
201 | myError += TCollection_AsciiString ((const Standard_CString)myPtr, |
202 | XML_MIN_BUFFER); |
203 | return XML_UNKNOWN; |
204 | case '\0': |
205 | if (myEOF == Standard_True) continue; |
206 | default: |
207 | // Limitation: we do not treat '&' as special character |
208 | aPtr = (const char *) memchr (myPtr, '<', myEndPtr - myPtr); |
209 | if (aPtr) { |
210 | // The end of text field reached |
211 | theData.rdbuf()->sputn(myPtr, aPtr - myPtr); |
212 | myPtr = aPtr; |
213 | return XML_TEXT; |
214 | } |
215 | aState = STATE_TEXT; |
216 | aStartData = myPtr; |
217 | myPtr = myEndPtr; |
218 | } // end of checking in STATE_WAITING |
219 | continue; |
220 | |
221 | // Checking the characters in STATE_HEADER, seek for "?>" sequence |
222 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
223 | case STATE_HEADER: |
224 | aPtr = (const char *) memchr (aStartData, '?', (myEndPtr-1) - aStartData); |
225 | if (aPtr) { |
226 | // The end of XML declaration found |
227 | if (aPtr[1] != '>') { // ERROR |
228 | myError = "Character \'>\' is expected in the end of XML declaration"; |
229 | return XML_UNKNOWN; |
230 | } |
231 | // The XML declaration is retrieved |
232 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
233 | myPtr = aPtr + 2; |
234 | return XML_HEADER; |
235 | } |
236 | myPtr = myEndPtr - 1; |
237 | continue; |
238 | |
239 | // Checking the characters in STATE_DOCTYPE, seek for "]>" sequence |
240 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
241 | case STATE_DOCTYPE: |
242 | for (aPtr = aStartData; aPtr < myEndPtr-1; aPtr++) { |
243 | const int aChar = aPtr[0]; |
244 | if (aChar == '[') { |
245 | aState = STATE_DOCTYPE_MARKUP; |
246 | aStartData = &aPtr[1]; |
247 | goto state_doctype_markup; |
248 | } |
249 | if (aChar == '>') { |
250 | // The DOCTYPE declaration is retrieved |
251 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData - 1); |
252 | myPtr = aPtr + 1; |
253 | return XML_DOCTYPE; |
254 | } |
255 | } |
256 | myPtr = myEndPtr - 1; |
257 | continue; |
258 | |
259 | state_doctype_markup: |
260 | case STATE_DOCTYPE_MARKUP: |
261 | aPtr = (const char *) memchr (aStartData, ']', (myEndPtr-1) - aStartData); |
262 | if (aPtr) { |
263 | // The end of DOCTYPE declaration found |
264 | if (aPtr[1] != '>') { // ERROR |
265 | myError = |
266 | "Character \'>\' is expected in the end of DOCTYPE declaration"; |
267 | return XML_UNKNOWN; |
268 | } |
269 | // The DOCTYPE declaration is retrieved |
270 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
271 | myPtr = aPtr + 2; |
272 | return XML_DOCTYPE; |
273 | } |
274 | myPtr = myEndPtr - 1; |
275 | continue; |
276 | |
277 | // Checking the characters in STATE_COMMENT, seek for "-->" sequence |
278 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
279 | case STATE_COMMENT: |
280 | aPtr = aStartData; |
281 | while (1) { |
282 | aPtr = (const char *) memchr (aPtr, '-', (myEndPtr - 2) - aPtr); |
283 | if (aPtr == NULL) break; |
284 | if (aPtr[1] != '-') ++ aPtr; |
285 | else { |
286 | if (aPtr[2] != '>') { // ERROR |
287 | myError = "Character \'>\' is expected in the end of comment"; |
288 | return XML_UNKNOWN; |
289 | } |
290 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
291 | myPtr = aPtr + 3; |
292 | return XML_COMMENT; |
293 | } |
294 | } |
295 | myPtr = myEndPtr - 2; |
296 | continue; |
297 | |
298 | // Checking the characters in STATE_TEXT, seek for "<" |
299 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
300 | case STATE_TEXT: |
301 | aPtr = (const char *) memchr (aStartData, '<', myEndPtr - aStartData); |
302 | if (aPtr) { |
303 | // The end of text field reached |
304 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
305 | myPtr = aPtr; |
306 | return XML_TEXT; |
307 | } |
308 | myPtr = myEndPtr; |
309 | continue; |
310 | |
311 | // Checking the characters in STATE_CDATA, seek for "]]" |
312 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
313 | case STATE_CDATA: |
314 | aPtr = aStartData; |
315 | while (1) { |
316 | aPtr = (const char *) memchr (aPtr, ']', (myEndPtr - 1) - aStartData); |
317 | if (aPtr == NULL) break; |
318 | if (aPtr[1] != ']') { // ERROR |
319 | myError = "Characters \']]\' are expected in the end of CDATA"; |
320 | return XML_UNKNOWN; |
321 | } |
322 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
323 | myPtr = aPtr + 2; |
324 | return XML_CDATA; |
325 | } |
326 | myPtr = myEndPtr - 1; |
327 | continue; |
328 | |
329 | // Checking the characters in STATE_ELEMENT, seek the end of TagName |
330 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
331 | case STATE_ELEMENT: |
332 | if (::isName (myPtr, myEndPtr, aNameEnd) == Standard_False) |
333 | if (theData.Length() == 0 || aNameEnd != myPtr) { |
334 | myError = "Invalid tag name"; |
335 | return XML_UNKNOWN; |
336 | } |
337 | { |
338 | theData.rdbuf()->sputn(aStartData, aNameEnd - aStartData); |
339 | char* aDataString = (char *)theData.str(); |
340 | myElement = & LDOM_BasicElement::Create (aDataString, theData.Length(), |
341 | myDocument); |
342 | theData.Clear(); |
343 | myLastChild = NULL; |
344 | delete [] aDataString; |
345 | aState = STATE_ATTRIBUTE_NAME; |
346 | aStartData = NULL; |
347 | myPtr = aNameEnd; |
348 | continue; |
349 | } |
350 | // Parsing a single attribute (STATE_ATTRIBUTE) |
351 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
352 | case STATE_ATTRIBUTE_NAME: // attribute name |
353 | switch (myPtr[0]) { |
354 | case ' ' : |
355 | case '\t': |
356 | case '\n': |
357 | case '\r': |
358 | if (aStartData) goto attr_name; |
359 | ++ myPtr; |
360 | continue; |
361 | case '/' : |
362 | if (aStartData) |
363 | myError = "Inexpected end of attribute"; |
364 | else if (myPtr[1] != '>') |
365 | myError = "Improper element tag termination"; |
366 | else { |
367 | myPtr += 2; |
368 | #ifdef DEB |
369 | theData.Clear(); |
370 | theData << myElement->GetTagName(); |
371 | #endif |
372 | return XML_FULL_ELEMENT; |
373 | } |
374 | return XML_UNKNOWN; |
375 | case '>' : |
376 | if (aStartData) { |
377 | myError = "Inexpected end of attribute"; |
378 | return XML_UNKNOWN; |
379 | } |
380 | ++ myPtr; |
381 | #ifdef DEB |
382 | theData.Clear(); |
383 | theData << myElement->GetTagName(); |
384 | #endif |
385 | return XML_START_ELEMENT; |
386 | default : |
387 | if (::isName (myPtr, myEndPtr, aNameEnd) == Standard_False) |
388 | if (theData.Length() == 0 || aNameEnd != myPtr) { |
389 | myError = "Invalid attribute name"; |
390 | return XML_UNKNOWN; |
391 | } |
392 | if (aNameEnd >= myEndPtr) |
393 | aStartData = myPtr; |
394 | else { |
395 | if (theData.Length() == 0) |
396 | anAttrName = LDOMBasicString(myPtr, aNameEnd - myPtr, myDocument); |
397 | else { |
398 | theData.rdbuf()->sputn(myPtr, aNameEnd - myPtr); |
399 | attr_name: |
400 | char* aDataString = (char *)theData.str(); |
401 | theData.Clear(); |
402 | anAttrName = LDOMBasicString (aDataString, myDocument); |
403 | delete [] aDataString; |
404 | } |
405 | aStartData = NULL; |
406 | aState = STATE_ATTRIBUTE_EQUAL; |
407 | } |
408 | myPtr = aNameEnd; |
409 | continue; |
410 | } |
411 | case STATE_ATTRIBUTE_EQUAL: // attribute 'equal' sign |
412 | switch (myPtr[0]) { |
413 | case '=' : |
414 | aState = STATE_ATTRIBUTE_VALUE; |
415 | case ' ' : |
416 | case '\t': |
417 | case '\n': |
418 | case '\r': |
419 | ++ myPtr; |
420 | continue; |
421 | default: |
422 | myError = "Equal sign expected in attribute definition"; |
423 | return XML_UNKNOWN; |
424 | } |
425 | |
426 | case STATE_ATTRIBUTE_VALUE: // attribute value |
427 | switch (myPtr[0]) { |
428 | case ' ' : |
429 | case '\t': |
430 | case '\n': |
431 | case '\r': |
432 | if (aStartData == NULL) { |
433 | ++ myPtr; |
434 | continue; |
435 | default: |
436 | if (anAttDelimiter == '\0') { |
437 | myError = "Expected an attribute value"; |
438 | return XML_UNKNOWN; |
439 | case '\"': |
440 | case '\'': |
441 | if (aStartData == NULL) { |
442 | aStartData = &myPtr[1]; |
443 | anAttDelimiter = myPtr[0]; |
444 | } |
445 | } |
446 | } |
447 | // Limitation: we do not take into account that '<' and '&' |
448 | // are not allowed in attribute values |
449 | aPtr = (const char *) memchr (aStartData, anAttDelimiter, |
450 | myEndPtr - aStartData); |
451 | if (aPtr) { |
452 | (char&) aPtr[0] = '\0'; |
453 | anAttDelimiter = '\0'; |
454 | char * aDataString = (char *) aStartData; |
455 | const char * ePtr = aPtr; |
456 | |
457 | // Append the end of the string to previously taken data |
458 | if (theData.Length() > 0) { |
459 | theData.rdbuf()->sputn(aStartData, aPtr-aStartData); |
460 | aDataString = (char *)theData.str(); |
461 | ePtr = strchr (aDataString, '\0'); |
462 | } |
463 | |
464 | Standard_Integer aDataLen; |
465 | aDataString = LDOM_CharReference::Decode (aDataString, aDataLen); |
466 | if (IsDigit(aDataString[0])) { |
467 | if (getInteger (anAttrValue, aDataString, ePtr)) |
468 | anAttrValue = LDOMBasicString (aDataString,aDataLen,myDocument); |
469 | } else |
470 | anAttrValue = LDOMBasicString (aDataString, aDataLen, myDocument); |
471 | |
472 | if (theData.Length() > 0) { |
473 | theData.Clear(); |
474 | delete [] aDataString; |
475 | } |
476 | // Create an attribute |
477 | myLastChild = myElement -> AddAttribute (anAttrName, anAttrValue, |
478 | myDocument, myLastChild); |
479 | myPtr = aPtr + 1; |
480 | aStartData = NULL; |
481 | aState = STATE_ATTRIBUTE_NAME; |
482 | } else |
483 | myPtr = myEndPtr; |
484 | continue; |
485 | } |
486 | // Checking the characters in STATE_ELEMENT_END, seek for ">" |
487 | // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ |
488 | case STATE_ELEMENT_END: |
489 | aPtr = (const char *) memchr (aStartData, '>', myEndPtr - aStartData); |
490 | if (aPtr) { |
491 | // The end of the end-element markup |
492 | theData.rdbuf()->sputn(aStartData, aPtr - aStartData); |
493 | myPtr = aPtr + 1; |
494 | return XML_END_ELEMENT; |
495 | } |
496 | myPtr = myEndPtr; |
497 | continue; |
498 | } |
499 | } |
500 | if (aState != STATE_WAITING) { |
501 | myError = "Unexpected end of file"; |
502 | return XML_UNKNOWN; |
503 | } |
504 | return XML_EOF; |
505 | } |
506 | |
507 | //======================================================================= |
508 | //function : isName |
509 | //type : static |
510 | //purpose : Check if aString is a valid XML Name |
511 | //======================================================================= |
512 | |
513 | static Standard_Boolean isName (const char * aString, |
514 | const char * aStringEnd, |
515 | const char *& aNameEnd) |
516 | { |
517 | Standard_Boolean aResult; |
518 | int aCh = aString[0]; |
519 | if (IsAlphabetic(aCh) || aCh == '_' || aCh == ':') { |
520 | const char * aPtr = &aString[1]; |
521 | while (aPtr < aStringEnd) { |
522 | aCh = * aPtr; |
523 | switch (aCh) { |
524 | case ' ' : |
525 | case '\n': |
526 | case '\r': |
527 | case '\t': |
528 | case '=' : |
529 | case '\0': |
530 | case '/' : |
531 | case '>' : |
532 | aNameEnd = aPtr; |
533 | return Standard_True; |
534 | default: |
535 | if (IsAlphanumeric(aCh) == 0) { |
536 | aNameEnd = aPtr; |
537 | return Standard_False; |
538 | } |
539 | case '.' : |
540 | case '-' : |
541 | case '_' : |
542 | case ':' : |
543 | ++ aPtr; |
544 | } |
545 | } |
546 | aNameEnd = aPtr; |
547 | aResult = Standard_True; |
548 | } else { |
549 | aNameEnd = aString; |
550 | aResult = Standard_False; |
551 | } |
552 | return aResult; |
553 | } |
554 | |
555 | //======================================================================= |
556 | //function : getInteger |
557 | //purpose : Try to initialize theValue as Integer; return False on success |
558 | //======================================================================= |
559 | |
560 | Standard_Boolean LDOM_XmlReader::getInteger (LDOMBasicString& theValue, |
561 | const char * theStart, |
562 | const char * theEnd) |
563 | { |
564 | char * ptr; |
565 | errno = 0; |
566 | if (theEnd - theStart == 1 || theStart[0] != '0') |
567 | { |
568 | long aResult = strtol (theStart, &ptr, 10); |
569 | if (ptr == theEnd && errno == 0) |
570 | { |
571 | theValue = Standard_Integer(aResult); |
572 | return Standard_False; |
573 | } |
574 | } |
575 | return Standard_True; |
576 | } |