diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/pugixml.cpp | 651 | ||||
| -rw-r--r-- | src/pugixml.hpp | 4 | 
2 files changed, 431 insertions, 224 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index de3a548..da53c66 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2,7 +2,9 @@  //
  // Pug Improved XML Parser - Version 0.2
  // --------------------------------------------------------
 -// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 +// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 +// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
 +// conversion functions.
  // This work is based on the pugxml parser, which is:
  // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
  // Released into the Public Domain. Use at your own risk.
 @@ -110,6 +112,43 @@ namespace  		const unsigned char BYTE_MASK_READ = 0x3F;
  		const unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  	}
 +
 +	enum chartype
 +	{
 +		ct_parse_pcdata = 1,	// \0, &, \r, <
 +		ct_parse_attr = 2,		// \0, &, \r, ', "
 +		ct_parse_attr_ws = 4,	// \0, &, \r, ', ", \n, space, tab
 +		ct_space = 8,			// \r, \n, space, tab
 +		ct_parse_cdata = 16,	// \0, ], >, \r
 +		ct_parse_comment = 32	// \0, -, >, \r
 +		
 +	};
 +	
 +	static unsigned char chartype_table[256] =
 +	{
 +		55, 0, 0, 0, 0, 0, 0, 0,		0, 12, 12, 0, 0, 63, 0, 0,	// 0-15
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 16-31
 +		12, 0, 6, 0, 0, 0, 7, 6,		0, 0, 0, 0, 0, 32, 0, 0,	// 32-47
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 1, 0, 48, 0,	// 48-63
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 64-79
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 16, 0, 0,	// 80-95
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 96-111
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,		// 112-127
 +
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0,
 +		0, 0, 0, 0, 0, 0, 0, 0,			0, 0, 0, 0, 0, 0, 0, 0
 +	};
 +	
 +	bool is_chartype(char c, chartype ct)
 +	{
 +		return !!(chartype_table[static_cast<unsigned char>(c)] & ct);
 +	}
  }
  namespace pugi
 @@ -237,185 +276,341 @@ namespace pugi  	{
  		xml_allocator& alloc;
  		bool chartype_symbol_table[256];
 -
 +		
  		bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; }
 -		static bool chartype_space(char c) { return c < '!' && c > 0; }
 -		static bool chartype_enter(char c) { return c == '<'; }
 -		static bool chartype_leave(char c) { return c == '>'; }
 -		static bool chartype_close(char c)	{ return c == '/'; }
 -		static bool chartype_equals(char c) { return c == '='; }
 -		static bool chartype_special(char c) { return c == '!'; }
 -		static bool chartype_pi(char c) { return c == '?'; }
 -		static bool chartype_dash(char c) { return c == '-'; }
 -		static bool chartype_quote(char c) { return c == '"' || c == '\''; }
 -		static bool chartype_lbracket(char c) { return c == '['; }
 -		static bool chartype_rbracket(char c) { return c == ']'; }
 -
 -		template <bool opt_escape, bool opt_wnorm, bool opt_wconv, bool opt_eol> static void strconv_t(char** s)
 +		struct gap
  		{
 -			if (!s || !*s) return;
 -
 -			if (!opt_escape && !opt_wnorm && !opt_wconv && !opt_eol) return;
 -
 -			// Trim whitespaces
 -			if (opt_wnorm) while (chartype_space(**s)) ++(*s);
 +			char* end;
 +			size_t size;
 -			char* str = *s;
 +			gap(): end(0), size(0)
 +			{
 +			}
 -			// Skip usual symbols
 -			if (opt_escape || opt_wnorm || opt_wconv || opt_eol)
 +			// Push new gap, move s count bytes further (skipping the gap).
 +			// Collapse previous gap.
 +			void push(char*& s, size_t count)
  			{
 -				while (*str)
 +				if (end) // there was a gap already; collapse it
  				{
 -					if (opt_escape && *str == '&') break;
 -					if ((opt_wnorm || opt_wconv || opt_eol) && chartype_space(*str)) break;
 -
 -					++str;
 +					// Move [old_gap_end, new_gap_start) to [old_gap_start, ...)
 +					memmove(end - size, end, s - end);
  				}
 +				
 +				s += count; // end of current gap
 +				
 +				// "merge" two gaps
 +				end = s;
 +				size += count;
  			}
 +			
 +			// Collapse all gaps, return past-the-end pointer
 +			char* flush(char* s)
 +			{
 +				if (end)
 +				{
 +					// Move [old_gap_end, current_pos) to [old_gap_start, ...)
 +					memmove(end - size, end, s - end);
 -			char* lastpos = str;
 -
 -			if (!*str) return;
 +					return s - size;
 +				}
 +				else return s;
 +			}
 +		};
 +		
 +		static char* strconv_escape(char* s, gap& g)
 +		{
 +			char* stre = s + 1;
 -			while (*str)
 +			switch (*stre)
  			{
 -				if (*str == '&' && opt_escape)	// &
 +				case '#':	// &#...
  				{
 -					char* stre = str + 1;
 +					unsigned int ucsc = 0;
 +
 +					++stre;
 -					switch (*stre)
 +					if (*stre == 'x') // &#x... (hex code)
  					{
 -						case '#':	// &#...
 +						++stre;
 +						
 +						while (*stre)
  						{
 -							unsigned int ucsc = 0;
 -
 -							++stre;
 +							if (*stre >= '0' && *stre <= '9')
 +								ucsc = 16 * ucsc + (*stre++ - '0');
 +							else if (*stre >= 'A' && *stre <= 'F')
 +								ucsc = 16 * ucsc + (*stre++ - 'A' + 10);
 +							else if (*stre >= 'a' && *stre <= 'f')
 +								ucsc = 16 * ucsc + (*stre++ - 'a' + 10);
 +							else break;
 +						}
 -							if (*stre == 'x') // &#x... (hex code)
 -							{
 -								++stre;
 -								
 -								while (*stre)
 -								{
 -									if (*stre >= '0' && *stre <= '9')
 -										ucsc = 16 * ucsc + (*stre++ - '0');
 -									else if (*stre >= 'A' && *stre <= 'F')
 -										ucsc = 16 * ucsc + (*stre++ - 'A' + 10);
 -									else if (*stre >= 'a' && *stre <= 'f')
 -										ucsc = 16 * ucsc + (*stre++ - 'a' + 10);
 -									else break;
 -								}
 +						if (*stre == ';') ++stre;
 +					}
 +					else	// &#... (dec code)
 +					{
 +						while (*stre >= '0' && *stre <= '9')
 +							ucsc = 10 * ucsc + (*stre++ - '0');
 -								if (*stre == ';') ++stre;
 -							}
 -							else	// &#... (dec code)
 -							{
 -								while (*stre >= '0' && *stre <= '9')
 -									ucsc = 10 * ucsc + (*stre++ - '0');
 +						if (*stre == ';') ++stre;
 +					}
 -								if (*stre == ';') ++stre;
 -							}
 +					s = strutf16_utf8(s, ucsc);
 +					
 +					g.push(s, stre - s);
 +					return stre;
 +				}
 +				case 'a':	// &a
 +				{
 +					++stre;
 -							str = stre;
 -							lastpos = strutf16_utf8(lastpos, ucsc);
 -							continue;
 -						}
 -						case 'a':	// &a
 +					if (*stre == 'm') // &am
 +					{
 +						if (*++stre == 'p' && *++stre == ';') // &
  						{
 +							*s++ = '&';
  							++stre;
 -
 -							if (*stre == 'm') // &am
 -							{
 -								if (*++stre == 'p' && *++stre == ';') // &
 -								{
 -									*lastpos++ = '&';
 -									str = ++stre;
 -									continue;
 -								}
 -							}
 -							else if (*stre == 'p') // &ap
 -							{
 -								if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // '
 -								{
 -									*lastpos++ = '\'';
 -									str = ++stre;
 -									continue;
 -								}
 -							}
 -							break;
 -						}
 -						case 'g': // &g
 -						{
 -							if (*++stre == 't' && *++stre == ';') // >
 -							{
 -								*lastpos++ = '>';
 -								str = ++stre;
 -								continue;
 -							}
 -							break;
 -						}
 -						case 'l': // &l
 -						{
 -							if (*++stre == 't' && *++stre == ';') // <
 -							{
 -								*lastpos++ = '<';
 -								str = ++stre;
 -								continue;
 -							}
 -							break;
 +							
 +							g.push(s, stre - s);
 +							return stre;
  						}
 -						case 'q': // &q
 +					}
 +					else if (*stre == 'p') // &ap
 +					{
 +						if (*++stre == 'o' && *++stre == 's' && *++stre == ';') // '
  						{
 -							if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // "
 -							{
 -								*lastpos++ = '"';
 -								str = ++stre;
 -								continue;
 -							}
 -							break;
 +							*s++ = '\'';
 +							++stre;
 +
 +							g.push(s, stre - s);
 +							return stre;
  						}
  					}
 +					break;
  				}
 -				else if (chartype_space(*str) && opt_wnorm)
 +				case 'g': // &g
  				{
 -					*lastpos++ = ' ';
 -		
 -					while (chartype_space(*str)) ++str;
 -
 -					continue;
 +					if (*++stre == 't' && *++stre == ';') // >
 +					{
 +						*s++ = '>';
 +						++stre;
 +						
 +						g.push(s, stre - s);
 +						return stre;
 +					}
 +					break;
  				}
 -				else if (chartype_space(*str) && opt_wconv)
 +				case 'l': // &l
  				{
 -					if (*str == 0x0d && *(str + 1) == 0x0a) ++str;
 -
 -					++str;
 -					*lastpos++ = ' ';
 +					if (*++stre == 't' && *++stre == ';') // <
 +					{
 +						*s++ = '<';
 +						++stre;
 +						
 +						g.push(s, stre - s);
 +						return stre;
 +					}
 +					break;
 +				}
 +				case 'q': // &q
 +				{
 +					if (*++stre == 'u' && *++stre == 'o' && *++stre == 't' && *++stre == ';') // "
 +					{
 +						*s++ = '"';
 +						++stre;
 +						
 +						g.push(s, stre - s);
 +						return stre;
 +					}
 +					break;
 +				}
 +			}
 +			
 +			return stre;
 +		}
 -					continue;
 +		static char* strconv_comment(char* s)
 +		{
 +			if (!*s) return 0;
 +			
 +			gap g;
 +			
 +			while (true)
 +			{
 +				while (!is_chartype(*s, ct_parse_comment)) ++s;
 +				
 +				if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
 +				{
 +					*s++ = '\n'; // replace first one with 0x0a
 +					
 +					if (*s == '\n') g.push(s, 1);
  				}
 -				else if (*str == 0x0d && !opt_wnorm && opt_eol)
 +				else if (*s == '-' && *(s+1) == '-' && *(s+2) == '>') // comment ends here
  				{
 -					if (*(str + 1) == 0x0a) ++str;
 -					++str;
 -					*lastpos++ = 0x0a;
 +					*g.flush(s) = 0;
 +					
 +					return s + 3;
 +				}
 +				else if (*s == 0)
 +				{
 +					return 0;
 +				}
 +				else ++s;
 +			}
 +		}
 -					continue;
 +		static char* strconv_cdata(char* s)
 +		{
 +			if (!*s) return 0;
 +			
 +			gap g;
 +			
 +			while (true)
 +			{
 +				while (!is_chartype(*s, ct_parse_cdata)) ++s;
 +				
 +				if (*s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
 +				{
 +					*s++ = '\n'; // replace first one with 0x0a
 +					
 +					if (*s == '\n') g.push(s, 1);
  				}
 +				else if (*s == ']' && *(s+1) == ']' && *(s+2) == '>') // CDATA ends here
 +				{
 +					*g.flush(s) = 0;
 +					
 +					return s + 1;
 +				}
 +				else if (*s == 0)
 +				{
 +					return 0;
 +				}
 +				else ++s;
 +			}
 +		}
 +		
 +		template <bool opt_escape, bool opt_eol> static char* strconv_pcdata(char* s)
 +		{
 +			if (!*s) return 0;
 +
 +			gap g;
 +			
 +			while (true)
 +			{
 +				while (!is_chartype(*s, ct_parse_pcdata)) ++s;
 -				*lastpos++ = *str++;
 +				if (opt_eol && *s == '\r') // Either a single 0x0d or 0x0d 0x0a pair
 +				{
 +					*s++ = '\n'; // replace first one with 0x0a
 +					
 +					if (*s == '\n') g.push(s, 1);
 +				}
 +				else if (opt_escape && *s == '&')
 +				{
 +					s = strconv_escape(s, g);
 +				}
 +				else if (*s == '<') // PCDATA ends here
 +				{
 +					*g.flush(s) = 0;
 +					
 +					return s + 1;
 +				}
 +				else if (*s == 0)
 +				{
 +					return 0;
 +				}
 +				else ++s;
  			}
 +		}
 +
 +		static char* strconv_pcdata(char* s, unsigned int opt_escape, unsigned int opt_eol)
 +		{
 +			if (opt_escape)
 +				return opt_eol ? strconv_pcdata<true, true>(s) : strconv_pcdata<true, false>(s);
 +			else
 +				return opt_eol ? strconv_pcdata<false, true>(s) : strconv_pcdata<false, false>(s);
 +		}
 +		template <bool opt_escape, bool opt_wnorm, bool opt_wconv, bool opt_eol> static char* strconv_attr(char* s, char end_quote)
 +		{
 +			if (!*s) return 0;
 +			
 +			gap g;
 +
 +			// Trim whitespaces
  			if (opt_wnorm)
  			{
 -				do *lastpos-- = 0;
 -				while (chartype_space(*lastpos));
 +				char* str = s;
 +				
 +				while (is_chartype(*str, ct_space)) ++str;
 +				
 +				if (str != s)
 +					g.push(s, str - s);
 +			}
 +			
 +			while (true)
 +			{
 +				while (!is_chartype(*s, (opt_wnorm || opt_wconv) ? ct_parse_attr_ws : ct_parse_attr)) ++s;
 +				
 +				if (opt_escape && *s == '&')
 +				{
 +					s = strconv_escape(s, g);
 +				}
 +				else if (opt_wnorm && is_chartype(*s, ct_space))
 +				{
 +					*s++ = ' ';
 +		
 +					if (is_chartype(*s, ct_space))
 +					{
 +						char* str = s + 1;
 +						while (is_chartype(*str, ct_space)) ++str;
 +						
 +						g.push(s, str - s);
 +					}
 +				}
 +				else if (opt_wconv && is_chartype(*s, ct_space))
 +				{
 +					if (opt_eol)
 +					{
 +						if (*s == '\r')
 +						{
 +							*s++ = ' ';
 +					
 +							if (*s == '\n') g.push(s, 1);
 +						}
 +						else *s++ = ' ';
 +					}
 +					else *s++ = ' ';
 +				}
 +				else if (opt_eol && *s == '\r')
 +				{
 +					*s++ = '\n';
 +					
 +					if (*s == '\n') g.push(s, 1);
 +				}
 +				else if (*s == end_quote)
 +				{
 +					char* str = g.flush(s);
 +					
 +					if (opt_wnorm)
 +					{
 +						do *str-- = 0;
 +						while (is_chartype(*str, ct_space));
 +					}
 +					else *str = 0;
 +					
 +					return s + 1;
 +				}
 +				else if (!*s)
 +				{
 +					return 0;
 +				}
 +				else ++s;
  			}
 -			else *lastpos = 0;
  		}
 -		static void strconv_setup(void (*&func)(char**), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
 +		static void strconv_attr_setup(char* (*&func)(char*, char), unsigned int opt_escape, unsigned int opt_wnorm, unsigned int opt_wconv, unsigned int opt_eol)
  		{
  			if (opt_eol)
  			{
 @@ -423,26 +618,26 @@ namespace pugi  				{
  					if (opt_escape)
  					{
 -						if (opt_wnorm) func = &strconv_t<true, true, true, true>;
 -						else func = &strconv_t<true, false, true, true>;
 +						if (opt_wnorm) func = &strconv_attr<true, true, true, true>;
 +						else func = &strconv_attr<true, false, true, true>;
  					}
  					else
  					{
 -						if (opt_wnorm) func = &strconv_t<false, true, true, true>;
 -						else func = &strconv_t<false, false, true, true>;
 +						if (opt_wnorm) func = &strconv_attr<false, true, true, true>;
 +						else func = &strconv_attr<false, false, true, true>;
  					}
  				}
  				else
  				{
  					if (opt_escape)
  					{
 -						if (opt_wnorm) func = &strconv_t<true, true, false, true>;
 -						else func = &strconv_t<true, false, false, true>;
 +						if (opt_wnorm) func = &strconv_attr<true, true, false, true>;
 +						else func = &strconv_attr<true, false, false, true>;
  					}
  					else
  					{
 -						if (opt_wnorm) func = &strconv_t<false, true, false, true>;
 -						else func = &strconv_t<false, false, false, true>;
 +						if (opt_wnorm) func = &strconv_attr<false, true, false, true>;
 +						else func = &strconv_attr<false, false, false, true>;
  					}
  				}
  			}
 @@ -452,26 +647,26 @@ namespace pugi  				{
  					if (opt_escape)
  					{
 -						if (opt_wnorm) func = &strconv_t<true, true, true, false>;
 -						else func = &strconv_t<true, false, true, false>;
 +						if (opt_wnorm) func = &strconv_attr<true, true, true, false>;
 +						else func = &strconv_attr<true, false, true, false>;
  					}
  					else
  					{
 -						if (opt_wnorm) func = &strconv_t<false, true, true, false>;
 -						else func = &strconv_t<false, false, true, false>;
 +						if (opt_wnorm) func = &strconv_attr<false, true, true, false>;
 +						else func = &strconv_attr<false, false, true, false>;
  					}
  				}
  				else
  				{
  					if (opt_escape)
  					{
 -						if (opt_wnorm) func = &strconv_t<true, true, false, false>;
 -						else func = &strconv_t<true, false, false, false>;
 +						if (opt_wnorm) func = &strconv_attr<true, true, false, false>;
 +						else func = &strconv_attr<true, false, false, false>;
  					}
  					else
  					{
 -						if (opt_wnorm) func = &strconv_t<false, true, false, false>;
 -						else func = &strconv_t<false, false, false, false>;
 +						if (opt_wnorm) func = &strconv_attr<false, true, false, false>;
 +						else func = &strconv_attr<false, false, false, false>;
  					}
  				}
  			}
 @@ -517,7 +712,7 @@ namespace pugi  		}
  		// Parser utilities.
 -		#define SKIPWS()			{ while(chartype_space(*s)) ++s; if(*s==0) return s; }
 +		#define SKIPWS()			{ while(is_chartype(*s, ct_space)) ++s; if(*s==0) return s; }
  		#define OPTSET(OPT)			( optmsk & OPT )
  		#define PUSHNODE(TYPE)		{ cursor = append_node(cursor,TYPE); }
  		#define POPNODE()			{ cursor = cursor->parent; }
 @@ -543,11 +738,9 @@ namespace pugi  		{
  			if(!s || !xmldoc) return s;
 -			void (*strconv_pcdata)(char**);
 -			void (*strconv_attribute)(char**);
 +			char* (*strconv_attribute)(char*, char);
 -			strconv_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol));
 -			strconv_setup(strconv_pcdata, OPTSET(parse_escapes), false, false, OPTSET(parse_eol));
 +			strconv_attr_setup(strconv_attribute, OPTSET(parse_escapes), OPTSET(parse_wnorm_attribute), OPTSET(parse_wconv_attribute), OPTSET(parse_eol));
  			char ch = 0; // Current char, in cases where we must null-terminate before we test.
  			xml_node_struct* cursor = xmldoc; // Tree node cursor.
 @@ -555,12 +748,12 @@ namespace pugi  			while(*s!=0)
  			{
  			LOC_SEARCH: // Obliviously search for next element.
 -				SCANFOR(chartype_enter(*s)); // Find the next '<'.
 -				if(chartype_enter(*s))
 +				SCANFOR(*s == '<'); // Find the next '<'.
 +				if(*s == '<')
  				{
  					++s;
  				LOC_CLASSIFY: // What kind of element?
 -					if(chartype_pi(*s)) // '<?...'
 +					if(*s == '?') // '<?...'
  					{
  						++s;
  						if(chartype_symbol(*s) && OPTSET(parse_pi))
 @@ -573,7 +766,7 @@ namespace pugi  							cursor->name = mark;
 -							if (chartype_space(ch))
 +							if (is_chartype(ch, ct_space))
  							{
  								SKIPWS();
 @@ -581,7 +774,7 @@ namespace pugi  							}
  							else mark = 0;
 -							SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'.
 +							SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'.
  							ENDSEG();
  							cursor->value = mark;
 @@ -592,18 +785,18 @@ namespace pugi  						}
  						else // Bad PI or parse_pi not set.
  						{
 -							SCANFOR(chartype_pi(*s) && chartype_leave(*(s+1))); // Look for '?>'.
 +							SCANFOR(*s == '?' && *(s+1) == '>'); // Look for '?>'.
  							++s;
  							goto LOC_LEAVE;
  						}
  					}
 -					else if(chartype_special(*s)) // '<!...'
 +					else if(*s == '!') // '<!...'
  					{
  						++s;
 -						if(chartype_dash(*s)) // '<!-...'
 +						if(*s == '-') // '<!-...'
  						{
  							++s;
 -							if(chartype_dash(*s)) // '<!--...'
 +							if(*s == '-') // '<!--...'
  							{
  								++s;
 @@ -613,36 +806,53 @@ namespace pugi  									cursor->value = s; // Save the offset.
  								}
 -								// Scan for terminating '-->'.
 -								SCANFOR(chartype_dash(*s) && chartype_dash(*(s+1)) && chartype_leave(*(s+2)));
 +								if (OPTSET(parse_eol) && OPTSET(parse_comments))
 +								{
 +									s = strconv_comment(s);
 +									
 +									if (!s) return s;
 +								}
 +								else
 +								{
 +									// Scan for terminating '-->'.
 +									SCANFOR(*s == '-' && *(s+1) == '-' && *(s+2) == '>');
 +								
 +									if (OPTSET(parse_comments))
 +										*s = 0; // Zero-terminate this segment at the first terminating '-'.
 +									
 +									s += 2; // Step over the '\0-'.
 +								}
  								if (OPTSET(parse_comments))
  								{
 -									*s = 0; // Zero-terminate this segment at the first terminating '-'.
  									POPNODE(); // Pop since this is a standalone.
  								}
 -								s += 2; // Step over the '\0-'.
  								goto LOC_LEAVE; // Look for any following PCDATA.
  							}
  						}
 -						else if(chartype_lbracket(*s))
 +						else if(*s == '[')
  						{
  							// '<![CDATA[...'
 -							if(*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && chartype_lbracket(*++s))
 +							if(*++s=='C' && *++s=='D' && *++s=='A' && *++s=='T' && *++s=='A' && *++s == '[')
  							{
  								++s;
  								if(OPTSET(parse_cdata))
  								{
  									PUSHNODE(node_cdata); // Append a new node on the tree.
  									cursor->value = s; // Save the offset.
 -									// Scan for terminating ']]>'.
 -									SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2)));
 -									ENDSEG(); // Zero-terminate this segment.
  									if (OPTSET(parse_eol))
  									{
 -										strconv_t<false, false, false, true>(&cursor->value);
 +										s = strconv_cdata(s);
 +										
 +										if (!s) return s;
 +									}
 +									else
 +									{
 +										// Scan for terminating ']]>'.
 +										SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>');
 +										ENDSEG(); // Zero-terminate this segment.
  									}
  									POPNODE(); // Pop since this is a standalone.
 @@ -650,7 +860,7 @@ namespace pugi  								else // Flagged for discard, but we still have to scan for the terminator.
  								{
  									// Scan for terminating ']]>'.
 -									SCANFOR(chartype_rbracket(*s) && chartype_rbracket(*(s+1)) && chartype_leave(*(s+2)));
 +									SCANFOR(*s == ']' && *(s+1) == ']' && *(s+2) == '>');
  									++s;
  								}
  								++s; // Step over the last ']'.
 @@ -663,27 +873,27 @@ namespace pugi  							++s;
  							SKIPWS(); // Eat any whitespace.
  						LOC_DOCTYPE:
 -							SCANWHILE(chartype_quote(*s) || chartype_lbracket(*s) || chartype_leave(*s));
 -							if(chartype_quote(*s)) // '...SYSTEM "..."
 +							SCANWHILE(*s == '\'' || *s == '"' || *s == '[' || *s == '>');
 +							if(*s == '\'' || *s == '"') // '...SYSTEM "..."
  							{
  								ch = *s++;
  								SCANFOR(*s == ch);
  								++s;
  								goto LOC_DOCTYPE;
  							}
 -							if(chartype_lbracket(*s)) // '...[...'
 +							if(*s == '[') // '...[...'
  							{
  								++s;
  								unsigned int bd = 1; // Bracket depth counter.
  								while(*s!=0) // Loop till we're out of all brackets.
  								{
 -									if(chartype_rbracket(*s)) --bd;
 -									else if(chartype_lbracket(*s)) ++bd;
 +									if(*s == ']') --bd;
 +									else if(*s == '[') ++bd;
  									if(bd == 0) break;
  									++s;
  								}
  								// Note: 's' now points to end of DTD, i.e.: ']'.
 -								SCANFOR(chartype_leave(*s));
 +								SCANFOR(*s == '>');
  								continue;
  							}
  							// Fall-through
 @@ -697,19 +907,19 @@ namespace pugi  						cursor->name = s;
  						SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
  						ENDSEG(); // Save char in 'ch', terminate & step over.
 -						if (*s!=0 && chartype_close(ch)) // '</...'
 +						if (*s!=0 && ch == '/') // '</...'
  						{
 -							SCANFOR(chartype_leave(*s)); // Scan for '>'
 +							SCANFOR(*s == '>'); // Scan for '>'
  							POPNODE(); // Pop.
  							goto LOC_LEAVE;
  						}
 -						else if(*s!=0 && !chartype_space(ch))
 +						else if(*s!=0 && !is_chartype(ch, ct_space))
  						{
 -							if (!chartype_leave(ch)) SCANWHILE(!chartype_leave(*s));
 +							if (ch != '>') SCANWHILE(*s != '>');
  							if (!*s) return s;
  							goto LOC_PCDATA; // No attributes, so scan for PCDATA.
  						}
 -						else if(*s!=0 && chartype_space(ch))
 +						else if(*s!=0 && is_chartype(ch, ct_space))
  						{
  							SKIPWS(); // Eat any whitespace.
  						LOC_ATTRIBUTE:
 @@ -719,34 +929,34 @@ namespace pugi  								a->name = s; // Save the offset.
  								SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
  								ENDSEG(); // Save char in 'ch', terminate & step over.
 -								if(*s!=0 && chartype_space(ch)) SKIPWS(); // Eat any whitespace.
 -								if(*s!=0 && (chartype_equals(ch) || chartype_equals(*s))) // '<... #=...'
 +								if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace.
 +								if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...'
  								{
 -									if(chartype_equals(*s)) ++s;
 +									if(*s == '=') ++s;
  									SKIPWS(); // Eat any whitespace.
 -									if(chartype_quote(*s)) // '<... #="...'
 +									if(*s == '\'' || *s == '"') // '<... #="...'
  									{
  										ch = *s; // Save quote char to avoid breaking on "''" -or- '""'.
  										++s; // Step over the quote.
  										a->value = s; // Save the offset.
 -										SCANFOR(*s == ch); // Scan for the terminating quote
 -										ENDSEG(); // Save char in 'ch', terminate & step over.
 -										strconv_attribute(&a->value);
 +										s = strconv_attribute(s, ch);
 +										
 +										if (!s) return s;
 -										if(chartype_leave(*s))
 +										if(*s == '>')
  										{
  											++s;
  											goto LOC_PCDATA;
  										}
 -										else if(chartype_close(*s))
 +										else if(*s == '/')
  										{
  											++s;
  											POPNODE();
  											SKIPWS(); // Eat any whitespace.
  											goto LOC_LEAVE;
  										}
 -										else if(chartype_space(*s)) // This may indicate a following attribute.
 +										else if(is_chartype(*s, ct_space)) // This may indicate a following attribute.
  										{
  											SKIPWS(); // Eat any whitespace.
  											goto LOC_ATTRIBUTE; // Go scan for additional attributes.
 @@ -756,19 +966,19 @@ namespace pugi  								goto LOC_ATTRIBUTE;
  							}
 -							SCANWHILE(!chartype_leave(*s) && !chartype_close(*s));
 +							SCANWHILE(*s != '>' && *s != '/');
  						}
  					LOC_LEAVE:
 -						if(chartype_leave(*s)) // '...>'
 +						if(*s == '>') // '...>'
  						{
  							++s; // Step over the '>'.
  						LOC_PCDATA: // '>...<'
  							mark = s; // Save this offset while searching for a terminator.
  							SKIPWS(); // Eat whitespace if no genuine PCDATA here.
  							// We hit a '<...', with only whitespace, so don't bother storing anything.
 - 							if((mark == s || !OPTSET(parse_ws_pcdata)) && chartype_enter(*s))
 + 							if((mark == s || !OPTSET(parse_ws_pcdata)) && *s == '<')
  							{
 -								if(chartype_close(*(s+1))) // '</...'
 +								if(*(s+1) == '/') // '</...'
  								{
  								    ++s;
  									goto LOC_CLOSE;
 @@ -784,34 +994,29 @@ namespace pugi  							{
  								PUSHNODE(node_pcdata); // Append a new node on the tree.
  								cursor->value = s; // Save the offset.
 -							}
 -													
 -							while (*s && !chartype_enter(*s)) ++s; // '...<'
 -							
 -							if (preserve)
 -							{
 -								if (*s) ENDSEG(); // Save char in 'ch', terminate & step over.
 -								strconv_pcdata(&cursor->value);
 +								s = strconv_pcdata(s, OPTSET(parse_escapes), OPTSET(parse_eol));
 +								
 +								if (!s) return s;
  								POPNODE(); // Pop since this is a standalone.
  							}
 -
 -							if (!*s) return s;
 -
 -							if(chartype_enter(ch)) // Did we hit a '<...'?
 +							else
  							{
 -								if(chartype_close(*s)) goto LOC_CLOSE;
 -								else if(chartype_special(*s)) goto LOC_CLASSIFY; // We hit a '<!...'. We must test this here if we want comments intermixed w/PCDATA.
 -								else if(*s) goto LOC_CLASSIFY;
 -								else return s;
 +								SCANFOR(*s == '<'); // '...<'
  							}
 +
 +							// We're after '<...', otherwise we would not get here
 +							if(*s == '/') goto LOC_CLOSE;
 +							else if(*s == '!') goto LOC_CLASSIFY; // We hit a '<!...'. We must test this here if we want comments intermixed w/PCDATA.
 +							else if(*s) goto LOC_CLASSIFY;
 +							else return s;
  						}
  						// Fall-through A.
 -						else if(chartype_close(*s)) // '.../'
 +						else if(*s == '/') // '.../'
  						{
  							++s;
 -							if(chartype_leave(*s)) // '.../>'
 +							if(*s == '>') // '.../>'
  							{
  								POPNODE(); // Pop.
  								goto LOC_LEAVE;
 @@ -819,7 +1024,7 @@ namespace pugi  						}
  					}
  					// Fall-through B.
 -					else if(chartype_close(*s)) // '.../'
 +					else if(*s == '/') // '.../'
  					{
  					LOC_CLOSE:
  						++s;
 @@ -868,7 +1073,7 @@ namespace pugi  						}
  						else
  						{
 -							SCANFOR(chartype_leave(*s)); // '...>'
 +							SCANFOR(*s == '>'); // '...>'
  							POPNODE(); // Pop.
  						}
 diff --git a/src/pugixml.hpp b/src/pugixml.hpp index a802e3d..b7ded37 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -2,7 +2,9 @@  //
  // Pug Improved XML Parser - Version 0.2
  // --------------------------------------------------------
 -// Copyright (C) 2006, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 +// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
 +// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
 +// conversion functions.
  // This work is based on the pugxml parser, which is:
  // Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
  // Released into the Public Domain. Use at your own risk.
  | 
