diff options
| -rw-r--r-- | src/pugixml.cpp | 152 | ||||
| -rw-r--r-- | tests/test_parse.cpp | 29 | ||||
| -rw-r--r-- | tests/test_parse_doctype.cpp | 92 | 
3 files changed, 206 insertions, 67 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index aa10bbc..dcb25f7 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1739,6 +1739,116 @@ namespace  		{
  		}
 +		// DOCTYPE consists of nested sections of the following possible types:
 +		// <!-- ... -->, <? ... ?>, "...", '...'
 +		// <![...]]>
 +		// <!...>
 +		// First group can not contain nested groups
 +		// Second group can contain nested groups of the same type
 +		// Third group can contain all other groups
 +
 +		// Skips one section of the first (non-nesting) group starting at s:
 +		// a quoted string, a <? ... ?> section or a <!-- ... --> comment.
 +		// On success s is advanced just past the section terminator and status_ok is reported.
 +		// Any other input at s, or a section that is still open when the terminating null
 +		// character is reached, is reported as status_bad_doctype.
 +		// buffer_start is not read directly here (presumably THROW_ERROR uses it to compute the error offset).
 +		xml_parse_result parse_doctype_primitive(char_t*& s, char_t* buffer_start)
 +		{
 +			if (*s == '"' || *s == '\'')
 +			{
 +				// quoted string
 +				char_t ch = *s++;
 +				SCANFOR(*s == ch);
 +				if (!*s) THROW_ERROR(status_bad_doctype, s);
 +
 +				// skip the closing quote
 +				s++;
 +			}
 +			else if (s[0] == '<' && s[1] == '?')
 +			{
 +				// <? ... ?>
 +				s += 2;
 +				SCANFOR(s[0] == '?' && s[1] == '>'); // no need for ENDSWITH because ?> can't terminate proper doctype
 +				if (!*s) THROW_ERROR(status_bad_doctype, s);
 +
 +				// skip "?>"
 +				s += 2;
 +			}
 +			else if (s[0] == '<' && s[1] == '!' && s[2] == '-' && s[3] == '-')
 +			{
 +				// <!-- ... -->
 +				s += 4;
 +				SCANFOR(s[0] == '-' && s[1] == '-' && s[2] == '>'); // no need for ENDSWITH because --> can't terminate proper doctype
 +				if (!*s) THROW_ERROR(status_bad_doctype, s);
 +
 +				// skip exactly the three characters of "-->"; advancing further would swallow the
 +				// character that follows the comment, which may be the terminating null
 +				s += 3;
 +			}
 +			else THROW_ERROR(status_bad_doctype, s);
 +
 +			THROW_ERROR(status_ok, s);
 +		}
 +
 +		// Skips a <![...]]> section starting at s, together with any <![...]]> sections nested inside it.
 +		// On success s is advanced just past the ]]> that closes the outermost section and status_ok is reported.
 +		// Reaching the terminating null character before that point is reported as status_bad_doctype.
 +		// Nesting is tracked with a counter rather than recursion, so deeply nested input
 +		// can not exhaust the call stack.
 +		// buffer_start is not read directly here (presumably THROW_ERROR uses it to compute the error offset).
 +		xml_parse_result parse_doctype_ignore(char_t*& s, char_t* buffer_start)
 +		{
 +			assert(s[0] == '<' && s[1] == '!' && s[2] == '[');
 +			s += 3;
 +
 +			// number of nested sections currently open (the outermost section is not counted)
 +			unsigned int depth = 0;
 +
 +			while (*s)
 +			{
 +				if (s[0] == '<' && s[1] == '!' && s[2] == '[')
 +				{
 +					// nested ignore section
 +					s += 3;
 +					depth++;
 +				}
 +				else if (s[0] == ']' && s[1] == ']' && s[2] == '>')
 +				{
 +					// ignore section end
 +					s += 3;
 +
 +					// the outermost section has been closed
 +					if (depth == 0) THROW_ERROR(status_ok, s);
 +
 +					depth--;
 +				}
 +				else s++;
 +			}
 +
 +			THROW_ERROR(status_bad_doctype, s);
 +		}
 +
 +		// Skips a <!...> group starting at s, together with every section nested inside it.
 +		// On success s is advanced just past the '>' that closes the group and status_ok is reported.
 +		// A failure reported by a nested section is returned unchanged.
 +		// toplevel is true only for the outermost call; nested groups are parsed with toplevel == false.
 +		// buffer_start is not read directly here (presumably THROW_ERROR uses it to compute the error offset).
 +		xml_parse_result parse_doctype(char_t*& s, char_t* buffer_start, char_t endch, bool toplevel)
 +		{
 +			assert(s[0] == '<' && s[1] == '!');
 +			++s;
 +
 +			for (;;)
 +			{
 +				const char_t head = *s;
 +
 +				// terminating null character; handled after the loop
 +				if (!head) break;
 +
 +				if (head == '>')
 +				{
 +					// end of the current group
 +					++s;
 +
 +					THROW_ERROR(status_ok, s);
 +				}
 +
 +				if (head == '<' && s[1] == '!' && s[2] != '-')
 +				{
 +					// <![ starts an ignore section, any other <! (comments excluded) starts a nested control group
 +					xml_parse_result res = (s[2] == '[') ? parse_doctype_ignore(s, buffer_start) : parse_doctype(s, buffer_start, endch, false);
 +
 +					if (!res) return res;
 +				}
 +				else if (head == '<' || head == '"' || head == '\'')
 +				{
 +					// unknown tag (forbidden), or some primitive group
 +					xml_parse_result res = parse_doctype_primitive(s, buffer_start);
 +
 +					if (!res) return res;
 +				}
 +				else
 +				{
 +					// any other character is skipped
 +					++s;
 +				}
 +			}
 +
 +			// The input ended before the group was closed. This is accepted only for the outermost group
 +			// when endch is '>' (presumably the closing '>' was the last character of the buffer and was
 +			// replaced by the terminating null - confirm against the caller).
 +			if (toplevel && endch == '>') THROW_ERROR(status_ok, s);
 +
 +			THROW_ERROR(status_bad_doctype, s);
 +		}
 +
  		xml_parse_result parse_exclamation(char_t*& ref_s, xml_node_struct* cursor, unsigned int optmsk, char_t* buffer_start, char_t endch)
  		{
  			// load into registers
 @@ -1831,47 +1941,13 @@ namespace  			}
  			else if (s[0] == 'D' && s[1] == 'O' && s[2] == 'C' && s[3] == 'T' && s[4] == 'Y' && s[5] == 'P' && ENDSWITH(s[6], 'E'))
  			{
 -				if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s); 
 +				if (s[6] != 'E') THROW_ERROR(status_bad_doctype, s);
 -			LOC_DOCTYPE:
 -				SCANFOR(*s == '\'' || *s == '"' || *s == '[' || *s == '>');
 -				if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
 +				s -= 2;
 -				if (*s == '\'' || *s == '"') // '...SYSTEM "..."
 -				{
 -					ch = *s++;
 -					SCANFOR(*s == ch);
 -					if (*s == 0 && endch != '>') THROW_ERROR(status_bad_doctype, s);
 +				xml_parse_result res = parse_doctype(s, buffer_start, endch, true);
 -					s += (*s != 0);
 -					goto LOC_DOCTYPE;
 -				}
 -
 -				if(*s == '[') // '...[...'
 -				{
 -					++s;
 -					unsigned int bd = 1; // Bracket depth counter.
 -					while (*s!=0) // Loop till we're out of all brackets.
 -					{
 -						if (*s == ']') --bd;
 -						else if (*s == '[') ++bd;
 -						if (bd == 0) break;
 -						++s;
 -					}
 -
 -					if (bd != 0) THROW_ERROR(status_bad_doctype, s);
 -				}
 -
 -				SCANFOR(*s == '>');
 -
 -				if (*s == 0)
 -				{
 -					if (endch != '>') THROW_ERROR(status_bad_doctype, s);
 -				}
 -				else
 -				{
 -					++s;
 -				}
 +				if (!res) return res;
  			}
  			else if (*s == 0 && endch == '-') THROW_ERROR(status_bad_comment, s);
  			else if (*s == 0 && endch == '[') THROW_ERROR(status_bad_cdata, s);
 diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 0719e5d..fb0dd23 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -497,35 +497,6 @@ TEST(parse_declaration_error)  	CHECK(doc.load(STR("<?xml version='1?>"), parse_minimal | parse_declaration).status == status_bad_attribute);
  }
 -TEST(parse_doctype_skip)
 -{
 -	xml_document doc;
 -	CHECK(doc.load(STR("<!DOCTYPE doc>")) && !doc.first_child());
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo'>")) && !doc.first_child());
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM \"foo\">")) && !doc.first_child());
 -	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo\" 'bar'>")) && !doc.first_child());
 -	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo'\">")) && !doc.first_child());
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]>")) && !doc.first_child());
 -
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]><node/>")));
 -	CHECK_NODE(doc, STR("<node />"));
 -}
 -
 -TEST(parse_doctype_error)
 -{
 -	xml_document doc;
 -	CHECK(doc.load(STR("<!DOCTYPE")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM \"foo")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo\" 'bar")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc PUBLIC \"foo'\"")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]")).status == status_bad_doctype);
 -	CHECK(doc.load(STR("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>] ")).status == status_bad_doctype);
 -}
 -
  TEST(parse_empty)
  {
  	xml_document doc;
 diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp new file mode 100644 index 0000000..35015ff --- /dev/null +++ b/tests/test_parse_doctype.cpp @@ -0,0 +1,92 @@ +#include "common.hpp"
 +
 +#include <string>
 +
 +// Returns true if decl is accepted by the parser in every tested context:
 +// standalone, surrounded by text, and surrounded by sibling elements.
 +// In the standalone and text cases the loaded document must have no children;
 +// in the element cases the document is compared against the surrounding elements only.
 +// All literals go through STR so that the concatenations also compile when char_t is not char.
 +bool test_doctype_wf(const std::basic_string<char_t>& decl)
 +{
 +	xml_document doc;
 +
 +	// standalone
 +	if (!doc.load(decl.c_str()) || doc.first_child()) return false;
 +
 +	// pcdata pre/postfix
 +	if (!doc.load((STR("a") + decl).c_str()) || doc.first_child()) return false;
 +	if (!doc.load((decl + STR("b")).c_str()) || doc.first_child()) return false;
 +	if (!doc.load((STR("a") + decl + STR("b")).c_str()) || doc.first_child()) return false;
 +
 +	// node pre/postfix
 +	if (!doc.load((STR("<nodea/>") + decl).c_str()) || !test_node(doc, STR("<nodea />"), STR(""), format_raw)) return false;
 +	if (!doc.load((decl + STR("<nodeb/>")).c_str()) || !test_node(doc, STR("<nodeb />"), STR(""), format_raw)) return false;
 +	if (!doc.load((STR("<nodea/>") + decl + STR("<nodeb/>")).c_str()) || !test_node(doc, STR("<nodea /><nodeb />"), STR(""), format_raw)) return false;
 +
 +	return true;
 +}
 +
 +// Returns true if loading decl fails with status_bad_doctype in every tested context:
 +// standalone, followed by text, and followed by an element.
 +// All literals go through STR so that the concatenations also compile when char_t is not char.
 +bool test_doctype_nwf(const std::basic_string<char_t>& decl)
 +{
 +	xml_document doc;
 +
 +	// standalone
 +	if (doc.load(decl.c_str()).status != status_bad_doctype) return false;
 +
 +	// pcdata postfix
 +	if (doc.load((decl + STR("b")).c_str()).status != status_bad_doctype) return false;
 +
 +	// node postfix
 +	if (doc.load((decl + STR("<nodeb/>")).c_str()).status != status_bad_doctype) return false;
 +
 +	return true;
 +}
 +
 +#define TEST_DOCTYPE_WF(contents) CHECK(test_doctype_wf(STR(contents)))
 +#define TEST_DOCTYPE_NWF(contents) CHECK(test_doctype_nwf(STR(contents)))
 +
 +TEST(parse_doctype_skip) // well-formed declarations: each one must pass every check in test_doctype_wf
 +{
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM 'foo'>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM \"foo\">");
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc PUBLIC \"foo\" 'bar'>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc PUBLIC \"foo'\">"); // a single quote inside a double-quoted string
 +	TEST_DOCTYPE_WF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]>");
 +}
 +
 +TEST(parse_doctype_error) // declarations that end before they are closed: each one must fail with status_bad_doctype (see test_doctype_nwf)
 +{
 +	TEST_DOCTYPE_NWF("<!DOCTYPE");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM \"foo");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc PUBLIC \"foo\" 'bar");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc PUBLIC \"foo'\"");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>]");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE doc SYSTEM 'foo' [<!ELEMENT foo 'ANY'>] ");
 +}
 +
 +// Examples from W3C recommendations
 +TEST(parse_doctype_w3c_wf) // each declaration must pass every check in test_doctype_wf
 +{
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting SYSTEM \"hello.dtd\">");
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)> ]>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ATTLIST list type    (bullets|ordered|glossary)  \"ordered\"> <!ATTLIST form method  CDATA   #FIXED \"POST\"> ]>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY % draft 'INCLUDE' > <!ENTITY % final 'IGNORE' > <![%draft;[ <!ELEMENT book (comments*, title, body, supplements?)> ]]> <![%final;[ <!ELEMENT book (title, body, supplements?)> ]]>]>"); // contains <![...]]> sections
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY open-hatch PUBLIC \"-//Textuality//TEXT Standard open-hatch boilerplate//EN\" \"http://www.textuality.com/boilerplate/OpenHatch.xml\"> ]>");
 +	TEST_DOCTYPE_WF("<!DOCTYPE greeting [ <!ENTITY EndAttr \"27'\" > ]>"); // a single quote inside a double-quoted string
 +}
 +
 +TEST(parse_doctype_w3c_nwf) // unterminated declarations: each one must fail with status_bad_doctype (see test_doctype_nwf)
 +{
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting SYSTEM \"hello.dtd>"); // the quoted string is never closed
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting SYSTEM");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)> ]");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA)>");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ELEMENT greeting (#PCDATA");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ ");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type    (bullets|ordered|glossary)  \"ordered\"> ]");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type    (bullets|ordered|glossary)  \"ordered\">");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type    (bullets|ordered|glossary)  \"orde");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ATTLIST list type    (bullets|ordered|glossary) ");
 +	TEST_DOCTYPE_NWF("<!DOCTYPE greeting [ <!ENTITY open-hatch PUBLIC \"-//Textuality//TEXT Standard open-hatch boilerplate//EN\" \"http://www.textuality.com/boilerplate/OpenHatch.x");
 +}
  | 
