Implement copyless copy

Now copying nodes or attributes does not copy names/values if the source strings are in the document buffer and the source and destination belong to the same document. As a result, several nodes can now share the same string in the document buffer - to support this we 'taint' both source and destination with a special 'shared' bit. Tainting disables offset_debug() and fast-path document order comparison; it also prevents strcpy_insitu from reusing the document buffer memory for the copied node. The downsides include slower XPath queries in some (rare) cases and slightly higher memory consumption in some (rare) cases. XPath queries can execute slower if a lot of old nodes were copied to new nodes *and* a query only touches old nodes (so it used to benefit a lot from the fast comparison path) *and* a query produces unsorted node sets that need to be sorted later (both are relatively rare). Higher memory consumption is possible if a lot of nodes were copied and all nodes (both new and old) have their contents modified 'in place' -- previously we could modify the old node in place and the new node required one allocation on copy, and now both nodes have to have their data allocated during modification. This should also be rare. On the bright side, in a lot of cases copying of string data can be avoided - this makes the copy much faster and the document now occupies less memory. For example, some uses of append_buffer are now actually slower compared to building up a document by copying a template from the same document and modifying the copy slightly. In one of the internal benchmarks copying is now 4x faster (the difference can be more dramatic with more string contents and less markup). git-svn-id: https://pugixml.googlecode.com/svn/trunk@1032 99668b35-9821-0410-8761-19e4c4f06640
author: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-10-01 07:02:52 +0000
committer: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> 2014-10-01 07:02:52 +0000
commit: febe4f0209f86225ebeedfb0874feb3cb96e7c89 (patch)
tree: 88e590e12b404bdfcc09f8e822889647c8d005a5
parent: 89d19df43df8e7962bcf896a528d3214a3c28bbc (diff)
1 files changed, 44 insertions, 29 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 8e61182..3979eb9 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1654,13 +1654,15 @@ PUGI__NS_BEGIN
 	}
 #endif
 
-	inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
+	inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
 	{
-		assert(target);
+		// never reuse shared memory
+		if (header & xml_memory_page_name_or_value_shared_mask) return false;
+
 		size_t target_length = strlength(target);
 
 		// always reuse document buffer memory if possible
-		if (!allocated) return target_length >= length;
+		if ((header & header_mask) == 0) return target_length >= length;
 
 		// reuse heap memory if waste is not too great
 		const size_t reuse_threshold = 32;
@@ -1687,7 +1689,7 @@ PUGI__NS_BEGIN
 
 			return true;
 		}
-		else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
+		else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
 		{
 			// we can reuse old buffer, so just copy the new data (including zero terminator)
 			memcpy(dest, source, (source_length + 1) * sizeof(char_t));
@@ -3605,42 +3607,55 @@ PUGI__NS_BEGIN
 		return true;
 	}
 
-	PUGI__FN void node_copy_contents(xml_node dest, const xml_node source)
+	PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
 	{
-		assert(dest.type() == source.type());
-
-		switch (source.type())
-		{
-		case node_element:
-		case node_declaration:
+		if (source)
 		{
-			dest.set_name(source.name());
+			if (alloc && (source_header & header_mask) == 0)
+			{
+				dest = source;
 
-			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
-				dest.append_attribute(a.name()).set_value(a.value());
-			break;
+				// since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared
+				header |= xml_memory_page_name_or_value_shared_mask;
+				source_header |= xml_memory_page_name_or_value_shared_mask;
+			}
+			else
+				strcpy_insitu(dest, header, header_mask, source);
 		}
+	}
 
-		case node_pcdata:
-		case node_cdata:
-		case node_comment:
-		case node_doctype:
-			dest.set_value(source.value());
-			break;
+	PUGI__FN void node_copy_contents(xml_allocator* alloc, xml_node dest, const xml_node source)
+	{
+		assert(dest.type() == source.type());
 
-		case node_pi:
-			dest.set_name(source.name());
-			dest.set_value(source.value());
-			break;
+		xml_node_struct* dn = dest.internal_object();
+		xml_node_struct* sn = source.internal_object();
 
-		default:
-			assert(!"Invalid node type");
+		node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, alloc);
+		node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, alloc);
+
+		for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute)
+		{
+			xml_attribute_struct* da = impl::append_new_attribute(dn, impl::get_allocator(dn));
+
+			node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, alloc);
+			node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, alloc);
 		}
 	}
 
+	PUGI__FN xml_allocator* node_get_shared_allocator(const xml_node lhs, const xml_node rhs)
+	{
+		xml_allocator& la = impl::get_allocator(lhs.internal_object());
+		xml_allocator& ra = impl::get_allocator(rhs.internal_object());
+
+		return (&la == &ra) ? &la : 0;
+	}
+
 	PUGI__FN void node_copy_tree(xml_node dest, const xml_node source)
 	{
-		node_copy_contents(dest, source);
+		xml_allocator* alloc = node_get_shared_allocator(dest, source);
+
+		node_copy_contents(alloc, dest, source);
 
 		xml_node destit = dest;
 		xml_node sourceit = source.first_child();
@@ -3651,7 +3666,7 @@ PUGI__NS_BEGIN
 			{
 				xml_node copy = destit.append_child(sourceit.type());
 
-				node_copy_contents(copy, sourceit);
+				node_copy_contents(alloc, copy, sourceit);
 
 				if (sourceit.first_child())
 				{
author	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-10-01 07:02:52 +0000
committer	Arseny Kapoulkine <arseny.kapoulkine@gmail.com>	2014-10-01 07:02:52 +0000
commit	febe4f0209f86225ebeedfb0874feb3cb96e7c89 (patch)
tree	88e590e12b404bdfcc09f8e822889647c8d005a5
parent	89d19df43df8e7962bcf896a528d3214a3c28bbc (diff)