DOCTYPE parsing is now stackless

This prevents malformed input XML with very deeply recursive DOCTYPE sections from crashing the parser. Fixes #29.
author: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2015-02-12 08:12:12 -0800
committer: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2015-02-12 08:12:12 -0800
commit: e94552c9ca883f8c4f2cead24355a60ecba0efb2 (patch)
tree: 0aa1f9ed3d61c110d458f4c044920bd5998460fe
parent: 00b4b0192f88392e80f1c504526c7e73f4d16ec7 (diff)
2 files changed, 62 insertions, 15 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 265337a..0f696ab 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -2357,23 +2357,28 @@ PUGI__NS_BEGIN
 
 		char_t* parse_doctype_ignore(char_t* s)
 		{
+			size_t depth = 0;
+
 			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
-			s++;
+			s += 3;
 
 			while (*s)
 			{
 				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
 				{
 					// nested ignore section
-					s = parse_doctype_ignore(s);
-					if (!s) return s;
+					s += 3;
+					depth++;
 				}
 				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
 				{
 					// ignore section end
 					s += 3;
 
-					return s;
+					if (depth == 0)
+						return s;
+
+					depth--;
 				}
 				else s++;
 			}
@@ -2381,10 +2386,12 @@ PUGI__NS_BEGIN
 			PUGI__THROW_ERROR(status_bad_doctype, s);
 		}
 
-		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
+		char_t* parse_doctype_group(char_t* s, char_t endch)
 		{
+			size_t depth = 0;
+
 			assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
-			s++;
+			s += 2;
 
 			while (*s)
 			{
@@ -2399,12 +2406,8 @@ PUGI__NS_BEGIN
 					else
 					{
 						// some control group
-						s = parse_doctype_group(s, endch, false);
-						if (!s) return s;
-
-						// skip >
-						assert(*s == '>');
-						s++;
+						s += 2;
+						depth++;
 					}
 				}
 				else if (s[0] == '<' || s[0] == '"' || s[0] == '\'')
@@ -2415,12 +2418,16 @@ PUGI__NS_BEGIN
 				}
 				else if (*s == '>')
 				{
-					return s;
+					if (depth == 0)
+						return s;
+
+					depth--;
+					s++;
 				}
 				else s++;
 			}
 
-			if (!toplevel || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
+			if (depth != 0 || endch != '>') PUGI__THROW_ERROR(status_bad_doctype, s);
 
 			return s;
 		}
@@ -2512,7 +2519,7 @@ PUGI__NS_BEGIN
 
 				char_t* mark = s + 9;
 
-				s = parse_doctype_group(s, endch, true);
+				s = parse_doctype_group(s, endch);
 				if (!s) return s;
 
 				assert((*s == 0 && endch == '>') || *s == '>');
diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp
index 14268f6..646ebbf 100644
--- a/tests/test_parse_doctype.cpp
+++ b/tests/test_parse_doctype.cpp
@@ -322,3 +322,43 @@ TEST(parse_doctype_error_ignore)
 	CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE[")).status == status_bad_doctype);
 	CHECK(doc.load_string(STR("<!DOCTYPE root [ <![IGNORE[ <![INCLUDE["), parse_doctype).status == status_bad_doctype);
 }
+
+TEST(parse_doctype_stackless_group)
+{
+	std::basic_string<char_t> str;
+
+	int count = 100000; // very deep group nesting; must be parsed without recursion
+
+	str += STR("<!DOCTYPE "); // STR() keeps the literal's character type equal to char_t
+
+	for (int i = 0; i < count; ++i)
+		str += STR("<!G ");
+
+	for (int j = 0; j < count; ++j)
+		str += STR(">");
+
+	str += STR(">");
+
+	xml_document doc;
+	CHECK(doc.load_string(str.c_str(), parse_fragment));
+}
+
+TEST(parse_doctype_stackless_ignore)
+{
+	std::basic_string<char_t> str;
+
+	int count = 100000; // very deep ignore-section nesting; must be parsed without recursion
+
+	str += STR("<!DOCTYPE "); // STR() keeps the literal's character type equal to char_t
+
+	for (int i = 0; i < count; ++i)
+		str += STR("<![IGNORE[ ");
+
+	for (int j = 0; j < count; ++j)
+		str += STR("]]>");
+
+	str += STR(">");
+
+	xml_document doc;
+	CHECK(doc.load_string(str.c_str(), parse_fragment));
+}
author	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2015-02-12 08:12:12 -0800
committer	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2015-02-12 08:12:12 -0800
commit	e94552c9ca883f8c4f2cead24355a60ecba0efb2 (patch)
tree	0aa1f9ed3d61c110d458f4c044920bd5998460fe
parent	00b4b0192f88392e80f1c504526c7e73f4d16ec7 (diff)