From 9433bd5d628399686605dba5ddef00fef835a54f Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine" Date: Mon, 8 Jan 2007 02:51:49 +0000 Subject: Updated copyright notice, reworked internal parsing (completely different strategy for performing text conversions (EOL, etc.) - lazy gaps, reworked character classes) git-svn-id: http://pugixml.googlecode.com/svn/trunk@27 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 651 +++++++++++++++++++++++++++++++++++++------------------- src/pugixml.hpp | 4 +- 2 files changed, 431 insertions(+), 224 deletions(-) (limited to 'src') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index de3a548..da53c66 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2,7 +2,9 @@ // // Pug Improved XML Parser - Version 0.2 // -------------------------------------------------------- -// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) +// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) +// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing +// conversion functions. // This work is based on the pugxml parser, which is: // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) // Released into the Public Domain. Use at your own risk. @@ -110,6 +112,43 @@ namespace const unsigned char BYTE_MASK_READ = 0x3F; const unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; } + + enum chartype + { + ct_parse_pcdata = 1, // \0, &, \r, < + ct_parse_attr = 2, // \0, &, \r, ', " + ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab + ct_space = 8, // \r, \n, space, tab + ct_parse_cdata = 16, // \0, ], >, \r + ct_parse_comment = 32 // \0, -, >, \r + + }; + + static unsigned char chartype_table[256] = + { + 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31 + 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 32, 0, 0, // 32-47 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 48, 0, // 48-63 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-79 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, // 80-95 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96-111 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112-127 + + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + bool is_chartype(char c, chartype ct) + { + return !!(chartype_table[static_cast(c)] & ct); + } } namespace pugi @@ -237,185 +276,341 @@ namespace pugi { xml_allocator& alloc; bool chartype_symbol_table[256]; - + bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; } - static bool chartype_space(char c) { return c < '!' && c > 0; } - static bool chartype_enter(char c) { return c == '<'; } - static bool chartype_leave(char c) { return c == '>'; } - static bool chartype_close(char c) { return c == '/'; } - static bool chartype_equals(char c) { return c == '='; } - static bool chartype_special(char c) { return c == '!'; } - static bool chartype_pi(char c) { return c == '?'; } - static bool chartype_dash(char c) { return c == '-'; } - static bool chartype_quote(char c) { return c == '"' || c == '\''; } - static bool chartype_lbracket(char c) { return c == '['; } - static bool chartype_rbracket(char c) { return c == ']'; } - - template static void strconv_t(char** s) + struct gap { - if (!s || !*s) return; - - if (!opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return; - - // Trim whitespaces - if (opt_wnorm) while (chartype_space(**s)) ++(*s); + char* end; + size_t size; - char* str = *s; + gap(): end(0), size(0) + { + } - // Skip usual symbols - if (opt_escape || opt_wnorm || opt_wconv || opt_eol) + // Push new gap, move s count bytes further (skipping the gap). + // Collapse previous gap. + void push(char*& s, size_t count) { - while (*str) + if (end) // there was a gap already; collapse it { - if (opt_escape && *str == '&') break; - if ((opt_wnorm || opt_wconv || opt_eol) && chartype_space(*str)) break; - - ++str; + // Move [old_gap_end, new_gap_start) to [old_gap_start, ...) + memmove(end - size, end, s - end); } + + s += count; // end of current gap + + // "merge" two gaps + end = s; + size += count; } + + // Collapse all gaps, return past-the-end pointer + char* flush(char* s) + { + if (end) + { + // Move [old_gap_end, current_pos) to [old_gap_start, ...) + memmove(end - size, end, s - end); - char* lastpos = str; - - if (!*str) return; + return s - size; + } + else return s; + } + }; + + static char* strconv_escape(char* s, gap& g) + { + char* stre = s + 1; - while (*str) + switch (*stre) { - if (*str == '&' && opt_escape) // & + case '#': // &#... { - char* stre = str + 1; + unsigned int ucsc = 0; + + ++stre; - switch (*stre) + if (*stre == 'x') // &#x... (hex code) { - case '#': // &#... + ++stre; + + while (*stre) { - unsigned int ucsc = 0; - - ++stre; + if (*stre >= '0' && *stre <= '9') + ucsc = 16 * ucsc + (*stre++ - '0'); + else if (*stre >= 'A' && *stre <= 'F') + ucsc = 16 * ucsc + (*stre++ - 'A' + 10); + else if (*stre >= 'a' && *stre <= 'f') + ucsc = 16 * ucsc + (*stre++ - 'a' + 10); + else break; + } - if (*stre == 'x') // &#x... (hex code) - { - ++stre; - - while (*stre) - { - if (*stre >= '0' && *stre <= '9') - ucsc = 16 * ucsc + (*stre++ - '0'); - else if (*stre >= 'A' && *stre <= 'F') - ucsc = 16 * ucsc + (*stre++ - 'A' + 10); - else if (*stre >= 'a' && *stre <= 'f') - ucsc = 16 * ucsc + (*stre++ - 'a' + 10); - else break; - } + if (*stre == ';') ++stre; + } + else // &#... (dec code) + { + while (*stre >= '0' && *stre <= '9') + ucsc = 10 * ucsc + (*stre++ - '0'); - if (*stre == ';') ++stre; - } - else // &#... (dec code) - { - while (*stre >= '0' && *stre <= '9') - ucsc = 10 * ucsc + (*stre++ - '0'); + if (*stre == ';') ++stre; + } - if (*stre == ';') ++stre; - } + s = strutf16_utf8(s, ucsc); + + g.push(s, stre - s); + return stre; + } + case 'a': // &a + { + ++stre; - str = stre; - lastpos = strutf16_utf8(lastpos, ucsc); - continue; - } - case 'a': // &a + if (*stre == 'm') // &am + { + if (*++stre == 'p' && *++stre == ';') // & { + *s++ = '&'; ++stre; - - if (*stre == 'm') // &am - { - if (*++stre == 'p' && *++stre == ';') // & - { - *lastpos++ = '&'; - str = ++stre; - continue; - } - } - else if (*stre == 'p') // &ap - { - if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // ' - { - *lastpos++ = '\''; - str = ++stre; - continue; - } - } - break; - } - case 'g': // &g - { - if (*++stre == 't' && *++stre == ';') // > - { - *lastpos++ = '>'; - str = ++stre; - continue; - } - break; - } - case 'l': // &l - { - if (*++stre == 't' && *++stre == ';') // < - { - *lastpos++ = '<'; - str = ++stre; - continue; - } - break; + + g.push(s, stre - s); + return stre; } - case 'q': // &q + } + else if (*stre == 'p') // &ap + { + if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // ' { - if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // " - { - *lastpos++ = '"'; - str = ++stre; - continue; - } - break; + *s++ = '\''; + ++stre; + + g.push(s, stre - s); + return stre; } } + break; } - else if (chartype_space(*str) && opt_wnorm) + case 'g': // &g { - *lastpos++ = ' '; - - while (chartype_space(*str)) ++str; - - continue; + if (*++stre == 't' && *++stre == ';') // > + { + *s++ = '>'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; } - else if (chartype_space(*str) && opt_wconv) + case 'l': // &l { - if (*str == 0x0d && *(str + 1) == 0x0a) ++str; - - ++str; - *lastpos++ = ' '; + if (*++stre == 't' && *++stre == ';') // < + { + *s++ = '<'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + case 'q': // &q + { + if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // " + { + *s++ = '"'; + ++stre; + + g.push(s, stre - s); + return stre; + } + break; + } + } + + return stre; + } - continue; + static char* strconv_comment(char* s) + { + if (!*s) return 0; + + gap g; + + while (true) + { + while (!is_chartype(*s, ct_parse_comment)) ++s; + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); } - else if (*str == 0x0d && !opt_wnorm && opt_eol) + else if (*s == '-' && *(s+1) == '-' && *(s+2) == '>') // comment ends here { - if (*(str + 1) == 0x0a) ++str; - ++str; - *lastpos++ = 0x0a; + *g.flush(s) = 0; + + return s + 3; + } + else if (*s == 0) + { + return 0; + } + else ++s; + } + } - continue; + static char* strconv_cdata(char* s) + { + if (!*s) return 0; + + gap g; + + while (true) + { + while (!is_chartype(*s, ct_parse_cdata)) ++s; + + if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); } + else if (*s == ']' && *(s+1) == ']' && *(s+2) == '>') // CDATA ends here + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == 0) + { + return 0; + } + else ++s; + } + } + + template static char* strconv_pcdata(char* s) + { + if (!*s) return 0; + + gap g; + + while (true) + { + while (!is_chartype(*s, ct_parse_pcdata)) ++s; - *lastpos++ = *str++; + if (opt_eol && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair + { + *s++ = '\n'; // replace first one with 0x0a + + if (*s == '\n') g.push(s, 1); + } + else if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (*s == '<') // PCDATA ends here + { + *g.flush(s) = 0; + + return s + 1; + } + else if (*s == 0) + { + return 0; + } + else ++s; } + } + + static char* strconv_pcdata(char* s, unsigned int opt_escape, unsigned int opt_eol) + { + if (opt_escape) + return opt_eol ? strconv_pcdata(s) : strconv_pcdata(s); + else + return opt_eol ? strconv_pcdata(s) : strconv_pcdata(s); + } + template static char* strconv_attr(char* s, char end_quote) + { + if (!*s) return 0; + + gap g; + + // Trim whitespaces if (opt_wnorm) { - do *lastpos-- = 0; - while (chartype_space(*lastpos)); + char* str = s; + + while (is_chartype(*str, ct_space)) ++str; + + if (str != s) + g.push(s, str - s); + } + + while (true) + { + while (!is_chartype(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s; + + if (opt_escape && *s == '&') + { + s = strconv_escape(s, g); + } + else if (opt_wnorm && is_chartype(*s, ct_space)) + { + *s++ = ' '; + + if (is_chartype(*s, ct_space)) + { + char* str = s + 1; + while (is_chartype(*str, ct_space)) ++str; + + g.push(s, str - s); + } + } + else if (opt_wconv && is_chartype(*s, ct_space)) + { + if (opt_eol) + { + if (*s == '\r') + { + *s++ = ' '; + + if (*s == '\n') g.push(s, 1); + } + else *s++ = ' '; + } + else *s++ = ' '; + } + else if (opt_eol && *s == '\r') + { + *s++ = '\n'; + + if (*s == '\n') g.push(s, 1); + } + else if (*s == end_quote) + { + char* str = g.flush(s); + + if (opt_wnorm) + { + do *str-- = 0; + while (is_chartype(*str, ct_space)); + } + else *str = 0; + + return s + 1; + } + else if (!*s) + { + return 0; + } + else ++s; } - else *lastpos = 0; } - static void strconv_setup(void (*&func)(char**), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol) + static void strconv_attr_setup(char* (*&func)(char*, char), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol) { if (opt_eol) { @@ -423,26 +618,26 @@ namespace pugi { if (opt_escape) { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } else { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } } else { if (opt_escape) { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } else { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } } } @@ -452,26 +647,26 @@ namespace pugi { if (opt_escape) { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } else { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } } else { if (opt_escape) { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } else { - if (opt_wnorm) func = &strconv_t; - else func = &strconv_t; + if (opt_wnorm) func = &strconv_attr; + else func = &strconv_attr; } } } @@ -517,7 +712,7 @@ namespace pugi } // Parser utilities. - #define SKIPWS() { while(chartype_space(*s)) ++s; if(*s==0) return s; } + #define SKIPWS() { while(is_chartype(*s, ct_space)) ++s; if(*s==0) return s; } #define OPTSET(OPT) ( optmsk & OPT ) #define PUSHNODE(TYPE) { cursor = append_node(cursor,TYPE); } #define POPNODE() { cursor = cursor->parent; } @@ -543,11 +738,9 @@ namespace pugi { if(!s || !xmldoc) return s; - void (*strconv_pcdata)(char**); - void (*strconv_attribute)(char**); + char* (*strconv_attribute)(char*, char); - strconv_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol)); - strconv_setup(strconv_pcdata, OPTSET(parse_escapes), false, false, OPTSET(parse_eol)); + strconv_attr_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol)); char ch = 0; // Current char, in cases where we must null-terminate before we test. xml_node_struct* cursor = xmldoc; // Tree node cursor. @@ -555,12 +748,12 @@ namespace pugi while(*s!=0) { LOC_SEARCH: // Obliviously search for next element. - SCANFOR(chartype_enter(*s)); // Find the next '<'. - if(chartype_enter(*s)) + SCANFOR(*s == '<'); // Find the next '<'. + if(*s == '<') { ++s; LOC_CLASSIFY: // What kind of element? - if(chartype_pi(*s)) // 'name = mark; - if (chartype_space(ch)) + if (is_chartype(ch, ct_space)) { SKIPWS(); @@ -581,7 +774,7 @@ namespace pugi } else mark = 0; - SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'. + SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'. ENDSEG(); cursor->value = mark; @@ -592,18 +785,18 @@ namespace pugi } else // Bad PI or parse_pi not set. { - SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'. + SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'. ++s; goto LOC_LEAVE; } } - else if(chartype_special(*s)) // 'value = s; // Save the offset. } - // Scan for terminating '-->'. - SCANFOR(chartype_dash(*s) && chartype_dash(*(s+1)) && chartype_leave(*(s+2))); + if (OPTSET(parse_eol) && OPTSET(parse_comments)) + { + s = strconv_comment(s); + + if (!s) return s; + } + else + { + // Scan for terminating '-->'. + SCANFOR(*s == '-' && *(s+1) == '-' && *(s+2) == '>'); + + if (OPTSET(parse_comments)) + *s = 0; // Zero-terminate this segment at the first terminating '-'. + + s += 2; // Step over the '\0-'. + } if (OPTSET(parse_comments)) { - *s = 0; // Zero-terminate this segment at the first terminating '-'. POPNODE(); // Pop since this is a standalone. } - s += 2; // Step over the '\0-'. goto LOC_LEAVE; // Look for any following PCDATA. } } - else if(chartype_lbracket(*s)) + else if(*s == '[') { // 'value = s; // Save the offset. - // Scan for terminating ']]>'. - SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2))); - ENDSEG(); // Zero-terminate this segment. if (OPTSET(parse_eol)) { - strconv_t(&cursor->value); + s = strconv_cdata(s); + + if (!s) return s; + } + else + { + // Scan for terminating ']]>'. + SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>'); + ENDSEG(); // Zero-terminate this segment. } POPNODE(); // Pop since this is a standalone. @@ -650,7 +860,7 @@ namespace pugi else // Flagged for discard, but we still have to scan for the terminator. { // Scan for terminating ']]>'. - SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2))); + SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>'); ++s; } ++s; // Step over the last ']'. @@ -663,27 +873,27 @@ namespace pugi ++s; SKIPWS(); // Eat any whitespace. LOC_DOCTYPE: - SCANWHILE(chartype_quote(*s) || chartype_lbracket(*s) || chartype_leave(*s)); - if(chartype_quote(*s)) // '...SYSTEM "..." + SCANWHILE(*s == '\'' || *s == '"' || *s == '[' || *s == '>'); + if(*s == '\'' || *s == '"') // '...SYSTEM "..." { ch = *s++; SCANFOR(*s == ch); ++s; goto LOC_DOCTYPE; } - if(chartype_lbracket(*s)) // '...[...' + if(*s == '[') // '...[...' { ++s; unsigned int bd = 1; // Bracket depth counter. while(*s!=0) // Loop till we're out of all brackets. { - if(chartype_rbracket(*s)) --bd; - else if(chartype_lbracket(*s)) ++bd; + if(*s == ']') --bd; + else if(*s == '[') ++bd; if(bd == 0) break; ++s; } // Note: 's' now points to end of DTD, i.e.: ']'. - SCANFOR(chartype_leave(*s)); + SCANFOR(*s == '>'); continue; } // Fall-through @@ -697,19 +907,19 @@ namespace pugi cursor->name = s; SCANWHILE(chartype_symbol(*s)); // Scan for a terminator. ENDSEG(); // Save char in 'ch', terminate & step over. - if (*s!=0 && chartype_close(ch)) // '' + SCANFOR(*s == '>'); // Scan for '>' POPNODE(); // Pop. goto LOC_LEAVE; } - else if(*s!=0 && !chartype_space(ch)) + else if(*s!=0 && !is_chartype(ch, ct_space)) { - if (!chartype_leave(ch)) SCANWHILE(!chartype_leave(*s)); + if (ch != '>') SCANWHILE(*s != '>'); if (!*s) return s; goto LOC_PCDATA; // No attributes, so scan for PCDATA. } - else if(*s!=0 && chartype_space(ch)) + else if(*s!=0 && is_chartype(ch, ct_space)) { SKIPWS(); // Eat any whitespace. LOC_ATTRIBUTE: @@ -719,34 +929,34 @@ namespace pugi a->name = s; // Save the offset. SCANWHILE(chartype_symbol(*s)); // Scan for a terminator. ENDSEG(); // Save char in 'ch', terminate & step over. - if(*s!=0 && chartype_space(ch)) SKIPWS(); // Eat any whitespace. - if(*s!=0 && (chartype_equals(ch) || chartype_equals(*s))) // '<... #=...' + if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace. + if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...' { - if(chartype_equals(*s)) ++s; + if(*s == '=') ++s; SKIPWS(); // Eat any whitespace. - if(chartype_quote(*s)) // '<... #="...' + if(*s == '\'' || *s == '"') // '<... #="...' { ch = *s; // Save quote char to avoid breaking on "''" -or- '""'. ++s; // Step over the quote. a->value = s; // Save the offset. - SCANFOR(*s == ch); // Scan for the terminating quote - ENDSEG(); // Save char in 'ch', terminate & step over. - strconv_attribute(&a->value); + s = strconv_attribute(s, ch); + + if (!s) return s; - if(chartype_leave(*s)) + if(*s == '>') { ++s; goto LOC_PCDATA; } - else if(chartype_close(*s)) + else if(*s == '/') { ++s; POPNODE(); SKIPWS(); // Eat any whitespace. goto LOC_LEAVE; } - else if(chartype_space(*s)) // This may indicate a following attribute. + else if(is_chartype(*s, ct_space)) // This may indicate a following attribute. { SKIPWS(); // Eat any whitespace. goto LOC_ATTRIBUTE; // Go scan for additional attributes. @@ -756,19 +966,19 @@ namespace pugi goto LOC_ATTRIBUTE; } - SCANWHILE(!chartype_leave(*s) && !chartype_close(*s)); + SCANWHILE(*s != '>' && *s != '/'); } LOC_LEAVE: - if(chartype_leave(*s)) // '...>' + if(*s == '>') // '...>' { ++s; // Step over the '>'. LOC_PCDATA: // '>...<' mark = s; // Save this offset while searching for a terminator. SKIPWS(); // Eat whitespace if no genuine PCDATA here. // We hit a '<...', with only whitespace, so don't bother storing anything. - if((mark == s || !OPTSET(parse_ws_pcdata)) && chartype_enter(*s)) + if((mark == s || !OPTSET(parse_ws_pcdata)) && *s == '<') { - if(chartype_close(*(s+1))) // 'value = s; // Save the offset. - } - - while (*s && !chartype_enter(*s)) ++s; // '...<' - - if (preserve) - { - if (*s) ENDSEG(); // Save char in 'ch', terminate & step over. - strconv_pcdata(&cursor->value); + s = strconv_pcdata(s, OPTSET(parse_escapes), OPTSET(parse_eol)); + + if (!s) return s; POPNODE(); // Pop since this is a standalone. } - - if (!*s) return s; - - if(chartype_enter(ch)) // Did we hit a '<...'? + else { - if(chartype_close(*s)) goto LOC_CLOSE; - else if(chartype_special(*s)) goto LOC_CLASSIFY; // We hit a '' + if(*s == '>') // '.../>' { POPNODE(); // Pop. goto LOC_LEAVE; @@ -819,7 +1024,7 @@ namespace pugi } } // Fall-through B. - else if(chartype_close(*s)) // '.../' + else if(*s == '/') // '.../' { LOC_CLOSE: ++s; @@ -868,7 +1073,7 @@ namespace pugi } else { - SCANFOR(chartype_leave(*s)); // '...>' + SCANFOR(*s == '>'); // '...>' POPNODE(); // Pop. } diff --git a/src/pugixml.hpp b/src/pugixml.hpp index a802e3d..b7ded37 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -2,7 +2,9 @@ // // Pug Improved XML Parser - Version 0.2 // -------------------------------------------------------- -// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) +// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) +// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing +// conversion functions. // This work is based on the pugxml parser, which is: // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net) // Released into the Public Domain. Use at your own risk. -- cgit v1.2.3