Implement document fragment parsing.

Introduce a notable behavior change in default parsing mode: documents without a document element node are now considered invalid. This is technically a breaking change; however, the number of documents it affects is very small, all parsed data still persists, and the lack of this check results in very confusing behavior in a number of cases. In order to be able to parse documents without an element node, a fragment parsing flag is introduced. Parsing a buffer in fragment mode treats the buffer as a fragment of a valid XML document. As a consequence, top-level PCDATA is added to the tree; additionally, there are no restrictions on the number of nodes -- so documents without a document element are considered valid. Due to the way parsing works internally, load_buffer_inplace occasionally cannot preserve the document contents if it is parsed in fragment mode. While unfortunate, this problem is fundamental; since the use case is relatively obscure, hopefully documenting this shortcoming will be enough. git-svn-id: https://pugixml.googlecode.com/svn/trunk@980 99668b35-9821-0410-8761-19e4c4f06640
author: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-11 06:45:27 +0000
committer: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-02-11 06:45:27 +0000
commit: 47c15ad949eb6589ee14d208444b4e759a611143 (patch)
tree: 35822cba8d2d3c6e5384c960ff8ea503bf3cf235 /src
parent: 5fa25a878aa472530cfa981d374d6e9fe4e12c7c (diff)
2 files changed, 67 insertions, 19 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 926458e..1893125 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -2199,7 +2199,7 @@ PUGI__NS_BEGIN
 
 		char_t* parse_doctype_group(char_t* s, char_t endch, bool toplevel)
 		{
-			assert(s[0] == '<' && s[1] == '!');
+			assert((s[0] == '<' || s[0] == 0) && s[1] == '!');
 			s++;
 
 			while (*s)
@@ -2331,6 +2331,9 @@ PUGI__NS_BEGIN
 				s = parse_doctype_group(s, endch, true);
 				if (!s) return s;
 
+				assert((*s == 0 && endch == '>') || *s == '>');
+				if (*s) *s++ = 0;
+
 				if (PUGI__OPTSET(parse_doctype))
 				{
 					while (PUGI__IS_CHARTYPE(*mark, ct_space)) ++mark;
@@ -2339,9 +2342,6 @@ PUGI__NS_BEGIN
 
 					cursor->value = mark;
 
-					assert((*s == 0 && endch == '>') || *s == '>');
-					if (*s) *s++ = 0;
-
 					PUGI__POPNODE();
 				}
 			}
@@ -2629,7 +2629,7 @@ PUGI__NS_BEGIN
 
 					PUGI__SKIPWS(); // Eat whitespace if no genuine PCDATA here.
 
-					if (*s == '<')
+					if (*s == '<' || !*s)
 					{
 						// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
 						assert(mark != s);
@@ -2640,13 +2640,13 @@ PUGI__NS_BEGIN
 						}
 						else if (PUGI__OPTSET(parse_ws_pcdata_single))
 						{
-							if (s[1] != '/' || cursor->first_child) continue;
+							if (s[0] != '<' || s[1] != '/' || cursor->first_child) continue;
 						}
 					}
 
 					s = mark;
 							
-					if (cursor->parent)
+					if (cursor->parent || PUGI__OPTSET(parse_fragment))
 					{
 						PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
 						cursor->value = s; // Save the offset.
@@ -2676,14 +2676,43 @@ PUGI__NS_BEGIN
 			return s;
 		}
 
+	#ifdef PUGIXML_WCHAR_MODE
+		static char_t* parse_skip_bom(char_t* s)
+		{
+			return (s[0] == 0xfeff) ? s + 1 : s;
+		}
+	#else
+		static char_t* parse_skip_bom(char_t* s)
+		{
+			return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s;
+		}
+	#endif
+
+		static bool has_element_node_siblings(xml_node_struct* node)
+		{
+			while (node)
+			{
+				xml_node_type type = static_cast<xml_node_type>((node->header & impl::xml_memory_page_type_mask) + 1);
+				if (type == node_element) return true;
+
+				node = node->next_sibling;
+			}
+
+			return false;
+		}
+
 		static xml_parse_result parse(char_t* buffer, size_t length, xml_document_struct* xmldoc, xml_node_struct* root, unsigned int optmsk)
 		{
 			// allocator object is a part of document object
 			xml_allocator& alloc = *static_cast<xml_allocator*>(xmldoc);
 
 			// early-out for empty documents
-			if (length == 0) return make_parse_result(status_ok);
+			if (length == 0)
+				return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element);
 
+			// get last child of the root before parsing
+			xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0;
+	
 			// create parser on stack
 			xml_parser parser(alloc);
 
@@ -2691,24 +2720,35 @@ PUGI__NS_BEGIN
 			char_t endch = buffer[length - 1];
 			buffer[length - 1] = 0;
 			
+			// skip BOM to make sure it does not end up as part of parse output
+			char_t* buffer_data = parse_skip_bom(buffer);
+
 			// perform actual parsing
-			parser.parse_tree(buffer, root, optmsk, endch);
+			parser.parse_tree(buffer_data, root, optmsk, endch);
+
+			// update allocator state
+			alloc = parser.alloc;
 
 			xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0);
 			assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length);
 
-			// roll back offset if it occurs on a null terminator in the source buffer
-			if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
-				result.offset--;
+			if (result)
+			{
+				// since we removed last character, we have to handle the only possible false positive (stray <)
+				if (endch == '<')
+					return make_parse_result(status_unrecognized_tag, length - 1);
 
-			// update allocator state
-			alloc = parser.alloc;
+				// check if there are any element nodes parsed
+				xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child;
 
-			// since we removed last character, we have to handle the only possible false positive
-			if (result && endch == '<')
+				if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed))
+					return make_parse_result(status_no_document_element, length - 1);
+			}
+			else
 			{
-				// there's no possible well-formed document with < at the end
-				return make_parse_result(status_unrecognized_tag, length - 1);
+				// roll back offset if it occurs on a null terminator in the source buffer
+				if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0)
+					result.offset--;
 			}
 
 			return result;
@@ -5469,6 +5509,8 @@ namespace pugi
 
 		case status_append_invalid_root: return "Unable to append nodes: root is not an element or document";
 
+		case status_no_document_element: return "No document element found";
+
 		default: return "Unknown error";
 		}
 	}
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index e19a4a3..e5009fe 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -151,6 +151,10 @@ namespace pugi
 	// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
 	const unsigned int parse_ws_pcdata_single = 0x0400;
 
+	// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
+	// is a valid document. This flag is off by default.
+	const unsigned int parse_fragment = 0x0800;
+
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
 	// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.
@@ -880,7 +884,9 @@ namespace pugi
 		status_bad_end_element,		// Parsing error occurred while parsing end element tag
 		status_end_element_mismatch,// There was a mismatch of start-end tags (closing tag had incorrect name, some tag was not closed or there was an excessive closing tag)
 
-		status_append_invalid_root	// Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+		status_append_invalid_root,	// Unable to append nodes since root type is not node_element or node_document (exclusive to xml_node::append_buffer)
+
+		status_no_document_element	// Parsing resulted in a document without element nodes
 	};
 
 	// Parsing result
author	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-02-11 06:45:27 +0000
committer	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-02-11 06:45:27 +0000
commit	47c15ad949eb6589ee14d208444b4e759a611143 (patch)
tree	35822cba8d2d3c6e5384c960ff8ea503bf3cf235 /src
parent	5fa25a878aa472530cfa981d374d6e9fe4e12c7c (diff)