diff options
| author | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-02-12 08:12:12 -0800 | 
|---|---|---|
| committer | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-02-12 08:12:12 -0800 | 
| commit | e94552c9ca883f8c4f2cead24355a60ecba0efb2 (patch) | |
| tree | 0aa1f9ed3d61c110d458f4c044920bd5998460fe | |
| parent | 00b4b0192f88392e80f1c504526c7e73f4d16ec7 (diff) | |
DOCTYPE parsing is now stackless
This prevents malformed input XML with very deeply nested DOCTYPE sections
from overflowing the stack and crashing the parser.
Fixes #29.
| -rw-r--r-- | src/pugixml.cpp | 37 | ||||
| -rw-r--r-- | tests/test_parse_doctype.cpp | 40 | 
2 files changed, 62 insertions, 15 deletions
| diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 265337a..0f696ab 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2357,23 +2357,28 @@ PUGI__NS_BEGIN  		char_t* parse_doctype_ignore(char_t* s)  		{ +			size_t depth = 0; +  			assert(s[0] == '<' && s[1] == '!' && s[2] == '['); -			s++; +			s += 3;  			while (*s)  			{  				if (s[0] == '<' && s[1] == '!' && s[2] == '[')  				{  					// nested ignore section -					s = parse_doctype_ignore(s); -					if (!s) return s; +					s += 3; +					depth++;  				}  				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')  				{  					// ignore section end  					s += 3; -					return s; +					if (depth == 0) +						return s; + +					depth--;  				}  				else s++;  			} @@ -2381,10 +2386,12 @@ PUGI__NS_BEGIN  			PUGI__THROW_ERROR(status_bad_doctype, s);  		} -		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel) +		char_t* parse_doctype_group(char_t* s, char_t endch)  		{ +			size_t depth = 0; +  			assert((s[0] == '<' || s[0] == 0) && s[1] == '!'); -			s++; +			s += 2;  			while (*s)  			{ @@ -2399,12 +2406,8 @@ PUGI__NS_BEGIN  					else  					{  						// some control group -						s = parse_doctype_group(s, endch, false); -						if (!s) return s; - -						// skip > -						assert(*s == '>'); -						s++; +						s += 2; +						depth++;  					}  				}  				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') @@ -2415,12 +2418,16 @@ PUGI__NS_BEGIN  				}  				else if (*s == '>')  				{ -					return s; +					if (depth == 0) +						return s; + +					depth--; +					s++;  				}  				else s++;  			} -			if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s); +			if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);  			return s;  		} @@ -2512,7 +2519,7 @@ PUGI__NS_BEGIN  				char_t* mark = s + 9; -				s = parse_doctype_group(s, endch, true); +				s = parse_doctype_group(s, endch);  				if (!s) return s;  				assert((*s == 0 && endch == '>') || *s == '>'); diff --git 
a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp index 14268f6..646ebbf 100644 --- a/tests/test_parse_doctype.cpp +++ b/tests/test_parse_doctype.cpp @@ -322,3 +322,43 @@ TEST(parse_doctype_error_ignore)  	CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE[")).status == status_bad_doctype);  	CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE["), parse_doctype).status == status_bad_doctype);  } + +TEST(parse_doctype_stackless_group) +{ +	std::basic_string<char_t> str; +	// nesting depth meant to be deep enough to crash a recursive DOCTYPE parser (issue #29) +	int count = 100000; + +	str += STR("<!DOCTYPE "); + +	for (int i = 0; i < count; ++i) +		str += STR("<!G "); + +	for (int j = 0; j < count; ++j) +		str += STR(">"); + +	str += STR(">"); + +	xml_document doc; +	CHECK(doc.load_string(str.c_str(), parse_fragment)); +} + +TEST(parse_doctype_stackless_ignore) +{ +	std::basic_string<char_t> str; +	// nesting depth meant to be deep enough to crash a recursive DOCTYPE parser (issue #29) +	int count = 100000; + +	str += STR("<!DOCTYPE "); + +	for (int i = 0; i < count; ++i) +		str += STR("<![IGNORE[ "); + +	for (int j = 0; j < count; ++j) +		str += STR("]]>"); + +	str += STR(">"); + +	xml_document doc; +	CHECK(doc.load_string(str.c_str(), parse_fragment)); +} | 
