diff options
| -rw-r--r-- | src/pugixml.cpp | 78 | ||||
| -rw-r--r-- | src/pugixml.hpp | 8 | ||||
| -rw-r--r-- | tests/test_document.cpp | 135 | ||||
| -rw-r--r-- | tests/test_dom_modify.cpp | 17 | ||||
| -rw-r--r-- | tests/test_memory.cpp | 2 | ||||
| -rw-r--r-- | tests/test_parse.cpp | 194 | ||||
| -rw-r--r-- | tests/test_parse_doctype.cpp | 16 | ||||
| -rw-r--r-- | tests/test_write.cpp | 15 | 
8 files changed, 363 insertions, 102 deletions
| diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 926458e..1893125 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -2199,7 +2199,7 @@ PUGI__NS_BEGIN  		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)  		{ -			assert(s[0] == '<' && s[1] == '!'); +			assert((s[0] == '<' || s[0] == 0) && s[1] == '!');  			s++;  			while (*s) @@ -2331,6 +2331,9 @@ PUGI__NS_BEGIN  				s = parse_doctype_group(s, endch, true);  				if (!s) return s; +				assert((*s == 0 && endch == '>') || *s == '>'); +				if (*s) *s++ = 0; +  				if (PUGI__OPTSET(parse_doctype))  				{  					while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark; @@ -2339,9 +2342,6 @@ PUGI__NS_BEGIN  					cursor->value = mark; -					assert((*s == 0 && endch == '>') || *s == '>'); -					if (*s) *s++ = 0; -  					PUGI__POPNODE();  				}  			} @@ -2629,7 +2629,7 @@ PUGI__NS_BEGIN  					PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here. -					if (*s == '<') +					if (*s == '<' || !*s)  					{  						// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one  						assert(mark != s); @@ -2640,13 +2640,13 @@ PUGI__NS_BEGIN  						}  						else if (PUGI__OPTSET(parse_ws_pcdata_single))  						{ -							if (s[1] != '/' || cursor->first_child) continue; +							if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;  						}  					}  					s = mark; -					if (cursor->parent) +					if (cursor->parent || PUGI__OPTSET(parse_fragment))  					{  						PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.  						cursor->value = s; // Save the offset. @@ -2676,14 +2676,43 @@ PUGI__NS_BEGIN  			return s;  		} +	#ifdef PUGIXML_WCHAR_MODE +		static char_t* parse_skip_bom(char_t* s) +		{ +			return (s[0] == 0xfeff) ? s + 1 : s; +		} +	#else +		static char_t* parse_skip_bom(char_t* s) +		{ +			return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s; +		} +	#endif + +		static bool has_element_node_siblings(xml_node_struct* node) +		{ +			while (node) +			{ +				xml_node_type type = static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1); +				if (type == node_element) return true; + +				node = node->next_sibling; +			} + +			return false; +		} +  		static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)  		{  			// allocator object is a part of document object  			xml_allocator& alloc = *static_cast<xml_allocator*>(xmldoc);  			// early-out for empty documents -			if (length == 0) return make_parse_result(status_ok); +			if (length == 0) +				return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); +			// get last child of the root before parsing +			xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0; +	  			// create parser on stack  			xml_parser parser(alloc); @@ -2691,24 +2720,35 @@ PUGI__NS_BEGIN  			char_t endch = buffer[length - 1];  			buffer[length - 1] = 0; +			// skip BOM to make sure it does not end up as part of parse output +			char_t* buffer_data = parse_skip_bom(buffer); +  			// perform actual parsing -			parser.parse_tree(buffer, root, optmsk, endch); +			parser.parse_tree(buffer_data, root, optmsk, endch); + +			// update allocator state +			alloc = parser.alloc;  			xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);  			assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length); -			// roll back offset if it occurs on a null terminator in the source buffer -			if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0) -				result.offset--; +			if (result) +			{ +				// since we removed last character, we have to handle the only possible false positive (stray <) +				if (endch == '<') +					return make_parse_result(status_unrecognized_tag, length - 1); -			// update allocator state -			alloc = parser.alloc; +				// check if there are any element nodes parsed +				xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child; -			// since we removed last character, we have to handle the only possible false positive -			if (result && endch == '<') +				if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) +					return make_parse_result(status_no_document_element, length - 1); +			} +			else  			{ -				// there's no possible well-formed document with < at the end -				return make_parse_result(status_unrecognized_tag, length - 1); +				// roll back offset if it occurs on a null terminator in the source buffer +				if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0) +					result.offset--;  			}  			return result; @@ -5469,6 +5509,8 @@ namespace pugi  		case status_append_invalid_root: return "Unable to append nodes: root is not an element or document"; +		case status_no_document_element: return "No document element found"; +  		default: return "Unknown error";  		}  	} diff --git a/src/pugixml.hpp b/src/pugixml.hpp index e19a4a3..e5009fe 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -151,6 +151,10 @@ namespace pugi  	// This flag is off by default; turning it on may result in slower parsing and more memory consumption.  	const unsigned int parse_ws_pcdata_single = 0x0400; +	// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document +	// is a valid document. This flag is off by default. +	const unsigned int parse_fragment = 0x0800; +  	// The default parsing mode.  	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,  	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. @@ -880,7 +884,9 @@ namespace pugi  		status_bad_end_element,		// Parsing error occurred while parsing end element tag  		status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag) -		status_append_invalid_root	// Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) +		status_append_invalid_root,	// Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer) + +		status_no_document_element	// Parsing resulted in a document without element nodes  	};  	// Parsing result diff --git a/tests/test_document.cpp b/tests/test_document.cpp index 3ac8bf8..e6c7b00 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -249,7 +249,7 @@ TEST(document_load_file_empty)  {  	pugi::xml_document doc; -	CHECK(doc.load_file("tests/data/empty.xml")); +	CHECK(doc.load_file("tests/data/empty.xml").status == status_no_document_element);  	CHECK(!doc.first_child());  } @@ -907,16 +907,52 @@ TEST(document_load_buffer_empty)  		xml_encoding encoding = encodings[i];  		xml_document doc; -		CHECK(doc.load_buffer(buffer, 0, parse_default, encoding) && !doc.first_child()); -		CHECK(doc.load_buffer(0, 0, parse_default, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer(buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); +		CHECK(doc.load_buffer(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); -		CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding) && !doc.first_child()); -		CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer_inplace(buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); +		CHECK(doc.load_buffer_inplace(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child());  		void* own_buffer = pugi::get_memory_allocation_function()(1); -		CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding) && !doc.first_child()); -		CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); +		CHECK(doc.load_buffer_inplace_own(0, 0, parse_default, encoding).status == status_no_document_element && !doc.first_child()); +	} +} + +TEST(document_load_buffer_empty_fragment) +{ +	xml_encoding encodings[] = +	{ +		encoding_auto, +		encoding_utf8, +		encoding_utf16_le, +		encoding_utf16_be, +		encoding_utf16, +		encoding_utf32_le, +		encoding_utf32_be, +		encoding_utf32, +		encoding_wchar, +        encoding_latin1 +	}; + +	char buffer[1]; + +	for (unsigned int i = 0; i < sizeof(encodings) / sizeof(encodings[0]); ++i) +	{ +		xml_encoding encoding = encodings[i]; + +		xml_document doc; +		CHECK(doc.load_buffer(buffer, 0, parse_fragment, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer(0, 0, parse_fragment, encoding) && !doc.first_child()); + +		CHECK(doc.load_buffer_inplace(buffer, 0, parse_fragment, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer_inplace(0, 0, parse_fragment, encoding) && !doc.first_child()); + +		void* own_buffer = pugi::get_memory_allocation_function()(1); + +		CHECK(doc.load_buffer_inplace_own(own_buffer, 0, parse_fragment, encoding) && !doc.first_child()); +		CHECK(doc.load_buffer_inplace_own(0, 0, parse_fragment, encoding) && !doc.first_child());  	}  } @@ -933,13 +969,27 @@ TEST(document_progressive_truncation)  	{  		char* truncated_data = buffer + original_size - i; -		memcpy(truncated_data, original_data, i); +		// default flags +		{ +			memcpy(truncated_data, original_data, i); -		xml_document doc; -		bool result = doc.load_buffer_inplace(truncated_data, i); +			xml_document doc; +			bool result = doc.load_buffer_inplace(truncated_data, i); + +			// only eof is parseable +			CHECK((i >= 3325) ? result : !result); +		} + +		// fragment mode +		{ +			memcpy(truncated_data, original_data, i); + +			xml_document doc; +			bool result = doc.load_buffer_inplace(truncated_data, i, parse_default | parse_fragment); -		// some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof -		CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? result : !result); +			// some truncate locations are parseable - those that come after declaration, declaration + doctype, declaration + doctype + comment and eof +			CHECK(((i - 21) < 3 || (i - 66) < 3 || (i - 95) < 3 || i >= 3325) ? result : !result); +		}  	}  	delete[] buffer; @@ -953,12 +1003,29 @@ TEST(document_load_buffer_short)  	xml_document doc; -	CHECK(doc.load_buffer(data, 4)); -	CHECK(doc.load_buffer(data + 1, 3)); -	CHECK(doc.load_buffer(data + 2, 2)); -	CHECK(doc.load_buffer(data + 3, 1)); -	CHECK(doc.load_buffer(data + 4, 0)); -	CHECK(doc.load_buffer(0, 0)); +	CHECK(doc.load_buffer(data, 4).status == status_no_document_element); +	CHECK(doc.load_buffer(data + 1, 3).status == status_no_document_element); +	CHECK(doc.load_buffer(data + 2, 2).status == status_no_document_element); +	CHECK(doc.load_buffer(data + 3, 1).status == status_no_document_element); +	CHECK(doc.load_buffer(data + 4, 0).status == status_no_document_element); +	CHECK(doc.load_buffer(0, 0).status == status_no_document_element); + +	delete[] data; +} + +TEST(document_load_buffer_short_fragment) +{ +	char* data = new char[4]; +	memcpy(data, "abcd", 4); + +	xml_document doc; + +	CHECK(doc.load_buffer(data, 4, parse_fragment) && test_string_equal(doc.text().get(), STR("abcd"))); +	CHECK(doc.load_buffer(data + 1, 3, parse_fragment) && test_string_equal(doc.text().get(), STR("bcd"))); +	CHECK(doc.load_buffer(data + 2, 2, parse_fragment) && test_string_equal(doc.text().get(), STR("cd"))); +	CHECK(doc.load_buffer(data + 3, 1, parse_fragment) && test_string_equal(doc.text().get(), STR("d"))); +	CHECK(doc.load_buffer(data + 4, 0, parse_fragment) && !doc.first_child()); +	CHECK(doc.load_buffer(0, 0, parse_fragment) && !doc.first_child());  	delete[] data;  } @@ -970,12 +1037,12 @@ TEST(document_load_buffer_inplace_short)  	xml_document doc; -	CHECK(doc.load_buffer_inplace(data, 4)); -	CHECK(doc.load_buffer_inplace(data + 1, 3)); -	CHECK(doc.load_buffer_inplace(data + 2, 2)); -	CHECK(doc.load_buffer_inplace(data + 3, 1)); -	CHECK(doc.load_buffer_inplace(data + 4, 0)); -	CHECK(doc.load_buffer_inplace(0, 0)); +	CHECK(doc.load_buffer_inplace(data, 4).status == status_no_document_element); +	CHECK(doc.load_buffer_inplace(data + 1, 3).status == status_no_document_element); +	CHECK(doc.load_buffer_inplace(data + 2, 2).status == status_no_document_element); +	CHECK(doc.load_buffer_inplace(data + 3, 1).status == status_no_document_element); +	CHECK(doc.load_buffer_inplace(data + 4, 0).status == status_no_document_element); +	CHECK(doc.load_buffer_inplace(0, 0).status == status_no_document_element);  	delete[] data;  } @@ -1006,7 +1073,7 @@ TEST_XML_FLAGS(document_element, "<?xml version='1.0'?><node><child/></node><!--      CHECK(doc.document_element() == doc.child(STR("node")));  } -TEST_XML_FLAGS(document_element_absent, "<!---->", parse_comments) +TEST_XML_FLAGS(document_element_absent, "<!---->", parse_comments | parse_fragment)  {      CHECK(doc.document_element() == xml_node());  } @@ -1070,16 +1137,6 @@ TEST_XML(document_reset_copy_self, "<node><child/></node>")      CHECK_NODE(doc, STR(""));  } -struct document_data_t -{ -    xml_encoding encoding; - -    const unsigned char* data; -    size_t size; -}; - -#include <stdio.h> -  TEST(document_load_buffer_utf_truncated)  {  	const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; @@ -1088,6 +1145,14 @@ TEST(document_load_buffer_utf_truncated)  	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'};  	const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; +	struct document_data_t +	{ +	    xml_encoding encoding; + +	    const unsigned char* data; +	    size_t size; +	}; +  	const document_data_t data[] =  	{  		{ encoding_utf8, utf8, sizeof(utf8) }, diff --git a/tests/test_dom_modify.cpp b/tests/test_dom_modify.cpp index c7a3989..c0f156b 100644 --- a/tests/test_dom_modify.cpp +++ b/tests/test_dom_modify.cpp @@ -1057,3 +1057,20 @@ TEST(dom_node_append_buffer_out_of_memory_buffer)  	CHECK(doc.append_buffer(data, sizeof(data)).status == status_out_of_memory);  	CHECK(!doc.first_child());  } + +TEST_XML(dom_node_append_buffer_fragment, "<node />") +{ +	xml_node node = doc.child(STR("node")); + +	CHECK(node.append_buffer("1", 1).status == status_no_document_element); +	CHECK_NODE(doc, STR("<node>1</node>")); + +	CHECK(node.append_buffer("2", 1, parse_fragment)); +	CHECK_NODE(doc, STR("<node>12</node>")); + +	CHECK(node.append_buffer("3", 1).status == status_no_document_element); +	CHECK_NODE(doc, STR("<node>123</node>")); + +	CHECK(node.append_buffer("4", 1, parse_fragment)); +	CHECK_NODE(doc, STR("<node>1234</node>")); +} diff --git a/tests/test_memory.cpp b/tests/test_memory.cpp index a37b91e..32d395b 100644 --- a/tests/test_memory.cpp +++ b/tests/test_memory.cpp @@ -119,7 +119,7 @@ TEST(memory_large_allocations)  		CHECK(allocate_count == deallocate_count + 1); // only one live page left (it waits for new allocations)  		char buffer; -		CHECK(doc.load_buffer_inplace(&buffer, 0, parse_default, get_native_encoding())); +		CHECK(doc.load_buffer_inplace(&buffer, 0, parse_fragment, get_native_encoding()));  		CHECK(allocate_count == deallocate_count); // no live pages left  	} diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index c165a65..6d9d4cc 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -1,10 +1,12 @@  #include "common.hpp" +#include "writer_string.hpp" +  TEST(parse_pi_skip)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_declaration}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_declaration};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -21,7 +23,7 @@ TEST(parse_pi_skip)  TEST(parse_pi_parse)  {  	xml_document doc; -	CHECK(doc.load(STR("<?pi1?><?pi2 value?>"), parse_minimal | parse_pi)); +	CHECK(doc.load(STR("<?pi1?><?pi2 value?>"), parse_fragment | parse_pi));  	xml_node pi1 = doc.first_child();  	xml_node pi2 = doc.last_child(); @@ -38,7 +40,7 @@ TEST(parse_pi_parse)  TEST(parse_pi_parse_spaces)  {  	xml_document doc; -	CHECK(doc.load(STR("<?target  \r\n\t  value ?>"), parse_minimal | parse_pi)); +	CHECK(doc.load(STR("<?target  \r\n\t  value ?>"), parse_fragment | parse_pi));  	xml_node pi = doc.first_child(); @@ -51,7 +53,7 @@ TEST(parse_pi_error)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_pi}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_pi};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -81,22 +83,22 @@ TEST(parse_pi_error)  		CHECK(doc.load(STR("<?name&?"), flags).status == status_bad_pi);  	} -	CHECK(doc.load(STR("<?xx#?>"), parse_minimal | parse_pi).status == status_bad_pi); -	CHECK(doc.load(STR("<?name&?>"), parse_minimal | parse_pi).status == status_bad_pi); -	CHECK(doc.load(STR("<?name& x?>"), parse_minimal | parse_pi).status == status_bad_pi); +	CHECK(doc.load(STR("<?xx#?>"), parse_fragment | parse_pi).status == status_bad_pi); +	CHECK(doc.load(STR("<?name&?>"), parse_fragment | parse_pi).status == status_bad_pi); +	CHECK(doc.load(STR("<?name& x?>"), parse_fragment | parse_pi).status == status_bad_pi);  }  TEST(parse_comments_skip)  {  	xml_document doc; -	CHECK(doc.load(STR("<!----><!--value-->"), parse_minimal)); +	CHECK(doc.load(STR("<!----><!--value-->"), parse_fragment));  	CHECK(!doc.first_child());  }  TEST(parse_comments_parse)  {  	xml_document doc; -	CHECK(doc.load(STR("<!----><!--value-->"), parse_minimal | parse_comments)); +	CHECK(doc.load(STR("<!----><!--value-->"), parse_fragment | parse_comments));  	xml_node c1 = doc.first_child();  	xml_node c2 = doc.last_child(); @@ -113,7 +115,7 @@ TEST(parse_comments_parse)  TEST(parse_comments_parse_no_eol)  {  	xml_document doc; -	CHECK(doc.load(STR("<!--\r\rval1\rval2\r\nval3\nval4\r\r-->"), parse_minimal | parse_comments)); +	CHECK(doc.load(STR("<!--\r\rval1\rval2\r\nval3\nval4\r\r-->"), parse_fragment | parse_comments));  	xml_node c = doc.first_child();  	CHECK(c.type() == node_comment); @@ -123,7 +125,7 @@ TEST(parse_comments_parse_no_eol)  TEST(parse_comments_parse_eol)  {  	xml_document doc; -	CHECK(doc.load(STR("<!--\r\rval1\rval2\r\nval3\nval4\r\r-->"), parse_minimal | parse_comments | parse_eol)); +	CHECK(doc.load(STR("<!--\r\rval1\rval2\r\nval3\nval4\r\r-->"), parse_fragment | parse_comments | parse_eol));  	xml_node c = doc.first_child();  	CHECK(c.type() == node_comment); @@ -134,7 +136,7 @@ TEST(parse_comments_error)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_comments, parse_minimal | parse_comments | parse_eol}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_comments, parse_fragment | parse_comments | parse_eol};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -152,21 +154,21 @@ TEST(parse_comments_error)  TEST(parse_cdata_skip)  {  	xml_document doc; -	CHECK(doc.load(STR("<![CDATA[]]><![CDATA[value]]>"), parse_minimal)); +	CHECK(doc.load(STR("<![CDATA[]]><![CDATA[value]]>"), parse_fragment));  	CHECK(!doc.first_child());  }  TEST(parse_cdata_skip_contents)  {  	xml_document doc; -	CHECK(doc.load(STR("<node><![CDATA[]]>hello<![CDATA[value]]>, world!</node>"), parse_minimal)); +	CHECK(doc.load(STR("<node><![CDATA[]]>hello<![CDATA[value]]>, world!</node>"), parse_fragment));  	CHECK_NODE(doc, STR("<node>hello, world!</node>"));  }  TEST(parse_cdata_parse)  {  	xml_document doc; -	CHECK(doc.load(STR("<![CDATA[]]><![CDATA[value]]>"), parse_minimal | parse_cdata)); +	CHECK(doc.load(STR("<![CDATA[]]><![CDATA[value]]>"), parse_fragment | parse_cdata));  	xml_node c1 = doc.first_child();  	xml_node c2 = doc.last_child(); @@ -183,7 +185,7 @@ TEST(parse_cdata_parse)  TEST(parse_cdata_parse_no_eol)  {  	xml_document doc; -	CHECK(doc.load(STR("<![CDATA[\r\rval1\rval2\r\nval3\nval4\r\r]]>"), parse_minimal | parse_cdata)); +	CHECK(doc.load(STR("<![CDATA[\r\rval1\rval2\r\nval3\nval4\r\r]]>"), parse_fragment | parse_cdata));  	xml_node c = doc.first_child();  	CHECK(c.type() == node_cdata); @@ -193,7 +195,7 @@ TEST(parse_cdata_parse_no_eol)  TEST(parse_cdata_parse_eol)  {  	xml_document doc; -	CHECK(doc.load(STR("<![CDATA[\r\rval1\rval2\r\nval3\nval4\r\r]]>"), parse_minimal | parse_cdata | parse_eol)); +	CHECK(doc.load(STR("<![CDATA[\r\rval1\rval2\r\nval3\nval4\r\r]]>"), parse_fragment | parse_cdata | parse_eol));  	xml_node c = doc.first_child();  	CHECK(c.type() == node_cdata); @@ -204,7 +206,7 @@ TEST(parse_cdata_error)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_cdata, parse_minimal | parse_cdata | parse_eol}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_cdata, parse_fragment | parse_cdata | parse_eol};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -229,7 +231,7 @@ TEST(parse_cdata_error)  TEST(parse_ws_pcdata_skip)  {  	xml_document doc; -	CHECK(doc.load(STR("  "), parse_minimal)); +	CHECK(doc.load(STR("  "), parse_fragment));  	CHECK(!doc.first_child());  	CHECK(doc.load(STR("<root>  <node>  </node>  </root>"), parse_minimal)); @@ -286,8 +288,6 @@ TEST(parse_ws_pcdata_permutations)      test_data_t test_data[] =      {          // external pcdata should be discarded (whitespace or not) -        {7, STR("ext1"), STR(""), 1}, -        {7, STR("    "), STR(""), 1},          {7, STR("ext1<node/>"), STR("<node />"), 2},          {7, STR("ext1<node/>ext2"), STR("<node />"), 2},          {7, STR(" <node/>"), STR("<node />"), 2}, @@ -314,11 +314,13 @@ TEST(parse_ws_pcdata_permutations)          {4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3},          // error case: terminate PCDATA in the middle          {7, STR("<node>abcdef"), STR("<node>abcdef</node>"), -3}, -        {7, STR("<node>      "), STR("<node>      </node>"), -3}, +        {5, STR("<node>      "), STR("<node />"), -2}, +        {2, STR("<node>      "), STR("<node>      </node>"), -3},          // error case: terminate PCDATA as early as possible          {7, STR("<node>"), STR("<node />"), -2},          {7, STR("<node>a"), STR("<node>a</node>"), -3}, -        {7, STR("<node> "), STR("<node> </node>"), -3}, +        {5, STR("<node> "), STR("<node />"), -2}, +        {2, STR("<node> "), STR("<node> </node>"), -3},      };      for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) @@ -342,6 +344,57 @@ TEST(parse_ws_pcdata_permutations)      }  } +TEST(parse_ws_pcdata_fragment_permutations) +{ +    struct test_data_t +    { +        unsigned int mask; // 1 = default flags, 2 = parse_ws_pcdata, 4 = parse_ws_pcdata_single +        const pugi::char_t* source; +        const pugi::char_t* result; +        int nodes; // negative if parsing should fail +    }; + +    test_data_t test_data[] = +    { +        // external pcdata should be preserved +        {7, STR("ext1"), STR("ext1"), 2}, +        {5, STR("    "), STR(""), 1}, +        {2, STR("    "), STR("    "), 2}, +        {7, STR("ext1<node/>"), STR("ext1<node />"), 3}, +        {7, STR("<node/>ext2"), STR("<node />ext2"), 3}, +        {7, STR("ext1<node/>ext2"), STR("ext1<node />ext2"), 4}, +        {7, STR("ext1<node1/>ext2<node2/>ext3"), STR("ext1<node1 />ext2<node2 />ext3"), 6}, +        {5, STR(" <node/>"), STR("<node />"), 2}, +        {2, STR(" <node/>"), STR(" <node />"), 3}, +        {5, STR("<node/> "), STR("<node />"), 2}, +        {2, STR("<node/> "), STR("<node /> "), 3}, +        {5, STR(" <node/> "), STR("<node />"), 2}, +        {2, STR(" <node/> "), STR(" <node /> "), 4}, +        {5, STR(" <node1/> <node2/> "), STR("<node1 /><node2 />"), 3}, +        {2, STR(" <node1/> <node2/> "), STR(" <node1 /> <node2 /> "), 6}, +    }; + +    for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) +    { +        const test_data_t& td = test_data[i]; + +        for (int flag = 0; flag < 3; ++flag) +        { +            if (td.mask & (1 << flag)) +            { +                unsigned int flags[] = {parse_default, parse_default | parse_ws_pcdata, parse_default | parse_ws_pcdata_single}; + +                xml_document doc; +                CHECK((td.nodes > 0) == doc.load(td.source, flags[flag] | parse_fragment)); +                CHECK_NODE(doc, td.result); + +                int nodes = get_tree_node_count(doc); +                CHECK((td.nodes < 0 ? -td.nodes : td.nodes) == nodes); +            } +        } +    } +} +  TEST(parse_pcdata_no_eol)  {  	xml_document doc; @@ -685,14 +738,14 @@ TEST(parse_tag_error)  TEST(parse_declaration_cases)  {  	xml_document doc; -	CHECK(doc.load(STR("<?xml?><?xmL?><?xMl?><?xML?><?Xml?><?XmL?><?XMl?><?XML?>"), parse_minimal | parse_pi)); +	CHECK(doc.load(STR("<?xml?><?xmL?><?xMl?><?xML?><?Xml?><?XmL?><?XMl?><?XML?>"), parse_fragment | parse_pi));  	CHECK(!doc.first_child());  }  TEST(parse_declaration_attr_cases)  {  	xml_document doc; -	CHECK(doc.load(STR("<?xml ?><?xmL ?><?xMl ?><?xML ?><?Xml ?><?XmL ?><?XMl ?><?XML ?>"), parse_minimal | parse_pi)); +	CHECK(doc.load(STR("<?xml ?><?xmL ?><?xMl ?><?xML ?><?Xml ?><?XmL ?><?XMl ?><?XML ?>"), parse_fragment | parse_pi));  	CHECK(!doc.first_child());  } @@ -700,7 +753,7 @@ TEST(parse_declaration_skip)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_pi}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_pi};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -717,7 +770,7 @@ TEST(parse_declaration_skip)  TEST(parse_declaration_parse)  {  	xml_document doc; -	CHECK(doc.load(STR("<?xml?><?xml version='1.0'?>"), parse_minimal | parse_declaration)); +	CHECK(doc.load(STR("<?xml?><?xml version='1.0'?>"), parse_fragment | parse_declaration));  	xml_node d1 = doc.first_child();  	xml_node d2 = doc.last_child(); @@ -734,7 +787,7 @@ TEST(parse_declaration_error)  {  	xml_document doc; -	unsigned int flag_sets[] = {parse_minimal, parse_minimal | parse_declaration}; +	unsigned int flag_sets[] = {parse_fragment, parse_fragment | parse_declaration};  	for (unsigned int i = 0; i < sizeof(flag_sets) / sizeof(flag_sets[0]); ++i)  	{ @@ -746,14 +799,15 @@ TEST(parse_declaration_error)  		CHECK(doc.load(STR("<?xml version='1>"), flags).status == status_bad_pi);  	} -	CHECK(doc.load(STR("<?xml version='1?>"), parse_minimal | parse_declaration).status == status_bad_attribute); -	CHECK(doc.load(STR("<foo><?xml version='1'?></foo>"), parse_minimal | parse_declaration).status == status_bad_pi); +	CHECK(doc.load(STR("<?xml version='1?>"), parse_fragment | parse_declaration).status == status_bad_attribute); +	CHECK(doc.load(STR("<foo><?xml version='1'?></foo>"), parse_fragment | parse_declaration).status == status_bad_pi);  }  TEST(parse_empty)  {  	xml_document doc; -	CHECK(doc.load(STR("")) && !doc.first_child()); +	CHECK(doc.load(STR("")).status == status_no_document_element && !doc.first_child()); +	CHECK(doc.load(STR(""), parse_fragment) && !doc.first_child());  }  TEST(parse_out_of_memory) @@ -843,3 +897,81 @@ TEST(parse_result_default)  	CHECK(result.offset == 0);  	CHECK(result.encoding == encoding_auto);  } + +TEST(parse_bom_fragment) +{ +	struct test_data_t +	{ +		xml_encoding encoding; +		const char* data; +		size_t size; +		const char_t* text; +	}; + +	const test_data_t data[] = +	{ +		{ encoding_utf8, "\xef\xbb\xbf", 3, STR("") }, +		{ encoding_utf8, "\xef\xbb\xbftest", 7, STR("test") }, +		{ encoding_utf16_be, "\xfe\xff", 2, STR("") }, +		{ encoding_utf16_be, "\xfe\xff\x00t\x00o\x00s\x00t", 10, STR("tost") }, +		{ encoding_utf16_le, "\xff\xfe", 2, STR("") }, +		{ encoding_utf16_le, "\xff\xfet\x00o\x00s\x00t\x00", 10, STR("tost") }, +		{ encoding_utf32_be, "\x00\x00\xfe\xff", 4, STR("") }, +		{ encoding_utf32_be, "\x00\x00\xfe\xff\x00\x00\x00t\x00\x00\x00o\x00\x00\x00s\x00\x00\x00t", 20, STR("tost") }, +		{ encoding_utf32_le, "\xff\xfe\x00\x00", 4, STR("") }, +		{ encoding_utf32_le, "\xff\xfe\x00\x00t\x00\x00\x00o\x00\x00\x00s\x00\x00\x00t\x00\x00\x00", 20, STR("tost") }, +	}; + +	for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) +	{ +		xml_document doc; +		CHECK(doc.load_buffer(data[i].data, data[i].size, parse_fragment, data[i].encoding)); +		CHECK_STRING(doc.text().get(), data[i].text); +		CHECK(save_narrow(doc, format_no_declaration | format_raw | format_write_bom, data[i].encoding) == std::string(data[i].data, data[i].size)); +	} +} + +TEST(parse_bom_fragment_invalid_utf8) +{ +	xml_document doc; + +	CHECK(doc.load_buffer("\xef\xbb\xbb", 3, parse_fragment, encoding_utf8)); + +	const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE +	CHECK(value[0] == wchar_cast(0xfefb) && value[1] == 0); +#else +	CHECK_STRING(value, "\xef\xbb\xbb"); +#endif +} + +TEST(parse_bom_fragment_invalid_utf16) +{ +	xml_document doc; + +	CHECK(doc.load_buffer("\xff\xfe", 2, parse_fragment, encoding_utf16_be)); + +	const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE +	CHECK(value[0] == wchar_cast(0xfffe) && value[1] == 0); +#else +	CHECK_STRING(value, "\xef\xbf\xbe"); +#endif +} + +TEST(parse_bom_fragment_invalid_utf32) +{ +	xml_document doc; + +	CHECK(doc.load_buffer("\xff\xff\x00\x00", 4, parse_fragment, encoding_utf32_le)); + +	const char_t* value = doc.text().get(); + +#ifdef PUGIXML_WCHAR_MODE +	CHECK(value[0] == wchar_cast(0xffff) && value[1] == 0); +#else +	CHECK_STRING(value, "\xef\xbf\xbf"); +#endif +} diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp index d7a3726..8976890 100644 --- a/tests/test_parse_doctype.cpp +++ b/tests/test_parse_doctype.cpp @@ -20,7 +20,7 @@ static xml_parse_result load_concat(xml_document& doc, const char_t* a, const ch  	strcat(buffer, c);  #endif -	return doc.load(buffer); +	return doc.load(buffer, parse_fragment);  }  static bool test_doctype_wf(const char_t* decl) @@ -31,9 +31,9 @@ static bool test_doctype_wf(const char_t* decl)  	if (!load_concat(doc, decl) || !doc.first_child().empty()) return false;  	// pcdata pre/postfix -	if (!load_concat(doc, STR("a"), decl) || !doc.first_child().empty()) return false; -	if (!load_concat(doc, decl, STR("b")) || !doc.first_child().empty()) return false; -	if (!load_concat(doc, STR("a"), decl, STR("b")) || !doc.first_child().empty()) return false; +	if (!load_concat(doc, STR("a"), decl) || !test_node(doc, STR("a"), STR(""), format_raw)) return false; +	if (!load_concat(doc, decl, STR("b")) || !test_node(doc, STR("b"), STR(""), format_raw)) return false; +	if (!load_concat(doc, STR("a"), decl, STR("b")) || !test_node(doc, STR("ab"), STR(""), format_raw)) return false;  	// node pre/postfix  	if (!load_concat(doc, STR("<nodea/>"), decl) || !test_node(doc, STR("<nodea />"), STR(""), format_raw)) return false; @@ -41,7 +41,7 @@ static bool test_doctype_wf(const char_t* decl)  	if (!load_concat(doc, STR("<nodea/>"), decl, STR("<nodeb/>")) || !test_node(doc, STR("<nodea /><nodeb />"), STR(""), format_raw)) return false;      // check load-store contents preservation -    CHECK(doc.load(decl, parse_doctype)); +    CHECK(doc.load(decl, parse_doctype | parse_fragment));      CHECK_NODE(doc, decl);  	return true; @@ -281,8 +281,8 @@ TEST(parse_doctype_xmlconf_oasis_1)      // not actually a doctype :)      xml_document doc; -    CHECK(doc.load(STR("<!--a <!DOCTYPE <?- ]]>-<[ CDATA [ \"- -'- -<doc>--> <!---->"), parse_full) && doc.first_child().type() == node_comment && doc.last_child().type() == node_comment && doc.first_child().next_sibling() == doc.last_child()); -	CHECK(doc.load(STR("<?xmla <!DOCTYPE <[ CDATA [</doc> &a%b&#c?>"), parse_full) && doc.first_child().type() == node_pi && doc.first_child() == doc.last_child()); +    CHECK(doc.load(STR("<!--a <!DOCTYPE <?- ]]>-<[ CDATA [ \"- -'- -<doc>--> <!---->"), parse_full | parse_fragment) && doc.first_child().type() == node_comment && doc.last_child().type() == node_comment && doc.first_child().next_sibling() == doc.last_child()); +	CHECK(doc.load(STR("<?xmla <!DOCTYPE <[ CDATA [</doc> &a%b&#c?>"), parse_full | parse_fragment) && doc.first_child().type() == node_pi && doc.first_child() == doc.last_child());  }  TEST(parse_doctype_xmlconf_xmltest_1) @@ -299,7 +299,7 @@ TEST(parse_doctype_xmlconf_xmltest_1)  	TEST_DOCTYPE_WF("<!DOCTYPE doc [ <!ELEMENT doc (#PCDATA)> <!ENTITY e \"<![CDATA[Tim & Michael]]>\"> ]>");  } -TEST_XML_FLAGS(parse_doctype_value, "<!DOCTYPE doc [ <!ELEMENT doc (#PCDATA)> <!ENTITY e \"<![CDATA[Tim & Michael]]>\"> ]>", parse_minimal | parse_doctype) +TEST_XML_FLAGS(parse_doctype_value, "<!DOCTYPE doc [ <!ELEMENT doc (#PCDATA)> <!ENTITY e \"<![CDATA[Tim & Michael]]>\"> ]>", parse_fragment | parse_doctype)  {      xml_node n = doc.first_child(); diff --git a/tests/test_write.cpp b/tests/test_write.cpp index de6f03d..465d111 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -25,19 +25,19 @@ TEST_XML(write_pcdata, "<node attr='1'><child><sub/>text</child></node>")  	CHECK_NODE_EX(doc, STR("<node attr=\"1\">\n\t<child>\n\t\t<sub />\n\t\ttext\n\t</child>\n</node>\n"), STR("\t"), format_indent);  } -TEST_XML(write_cdata, "<![CDATA[value]]>") +TEST_XML_FLAGS(write_cdata, "<![CDATA[value]]>", parse_cdata | parse_fragment)  {  	CHECK_NODE(doc, STR("<![CDATA[value]]>"));  	CHECK_NODE_EX(doc, STR("<![CDATA[value]]>\n"), STR(""), 0);  } -TEST_XML(write_cdata_empty, "<![CDATA[]]>") +TEST_XML_FLAGS(write_cdata_empty, "<![CDATA[]]>", parse_cdata | parse_fragment)  {  	CHECK_NODE(doc, STR("<![CDATA[]]>"));  	CHECK_NODE_EX(doc, STR("<![CDATA[]]>\n"), STR(""), 0);  } -TEST_XML(write_cdata_escape, "<![CDATA[value]]>") +TEST_XML_FLAGS(write_cdata_escape, "<![CDATA[value]]>", parse_cdata | parse_fragment)  {  	CHECK_NODE(doc, STR("<![CDATA[value]]>")); @@ -51,26 +51,25 @@ TEST_XML(write_cdata_inner, "<node><![CDATA[value]]></node>")  	CHECK_NODE_EX(doc, STR("<node><![CDATA[value]]></node>\n"), STR(""), 0);  } - -TEST_XML_FLAGS(write_comment, "<!--text-->", parse_default | parse_comments) +TEST_XML_FLAGS(write_comment, "<!--text-->", parse_comments | parse_fragment)  {  	CHECK_NODE(doc, STR("<!--text-->"));  	CHECK_NODE_EX(doc, STR("<!--text-->\n"), STR(""), 0);  } -TEST_XML_FLAGS(write_pi, "<?name value?>", parse_default | parse_pi) +TEST_XML_FLAGS(write_pi, "<?name value?>", parse_pi | parse_fragment)  {  	CHECK_NODE(doc, STR("<?name value?>"));  	CHECK_NODE_EX(doc, STR("<?name value?>\n"), STR(""), 0);  } -TEST_XML_FLAGS(write_declaration, "<?xml version='2.0'?>", parse_default | parse_declaration) +TEST_XML_FLAGS(write_declaration, "<?xml version='2.0'?>", parse_declaration | parse_fragment)  {  	CHECK_NODE(doc, STR("<?xml version=\"2.0\"?>"));  	CHECK_NODE_EX(doc, STR("<?xml version=\"2.0\"?>\n"), STR(""), 0);  } -TEST_XML_FLAGS(write_doctype, "<!DOCTYPE id [ foo ]>", parse_default | parse_doctype) +TEST_XML_FLAGS(write_doctype, "<!DOCTYPE id [ foo ]>", parse_doctype | parse_fragment)  {  	CHECK_NODE(doc, STR("<!DOCTYPE id [ foo ]>"));  	CHECK_NODE_EX(doc, STR("<!DOCTYPE id [ foo ]>\n"), STR(""), 0); | 
