diff options
author | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2014-10-01 07:02:52 +0000 |
---|---|---|
committer | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2014-10-01 07:02:52 +0000 |
commit | febe4f0209f86225ebeedfb0874feb3cb96e7c89 (patch) | |
tree | 88e590e12b404bdfcc09f8e822889647c8d005a5 | |
parent | 89d19df43df8e7962bcf896a528d3214a3c28bbc (diff) |
Implement copyless copy
Now copying nodes or attributes does not copy names/values if the source
strings are in a document buffer. As a result, several nodes can now share
the same string in the document buffer; to support this we 'taint' both
source and destination with a special 'shared' bit.
Tainting disables offset_debug() and fast-path document order comparison;
it also prevents strcpy_insitu from reusing the document buffer memory for
the copied node.
The downsides include slower XPath queries in some (rare) cases and
slightly higher memory consumption in some (rare) cases.
XPath queries can execute slower if a lot of old nodes were copied to new
nodes *and* a query only touches old nodes (so it used to benefit a lot
from fast comparison path) *and* a query produces unsorted node sets that
need to be sorted later (both are relatively rare).
Higher memory consumption is possible if a lot of nodes were copied and
all nodes (both new and old) have their contents modified 'in place' --
previously we could modify the old node in place and the new node required
one allocation on copy, and now both nodes have to have their data
allocated during modification. This should also be rare.
On the bright side, in a lot of cases copying of string data can be
avoided - this makes the copy much faster and the document now occupies
less memory. For example, some uses of append_buffer are now actually slower
than building up a document by copying a template from the same
document and modifying the copy slightly.
In one of the internal benchmarks copying is now 4x faster (the difference
can be more dramatic with more string contents and less markup).
git-svn-id: https://pugixml.googlecode.com/svn/trunk@1032 99668b35-9821-0410-8761-19e4c4f06640
-rw-r--r-- | src/pugixml.cpp | 73 |
1 files changed, 44 insertions, 29 deletions
```diff
diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 8e61182..3979eb9 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1654,13 +1654,15 @@ PUGI__NS_BEGIN
 	}
 #endif
 
-	inline bool strcpy_insitu_allow(size_t length, uintptr_t allocated, char_t* target)
+	inline bool strcpy_insitu_allow(size_t length, uintptr_t header, uintptr_t header_mask, char_t* target)
 	{
-		assert(target);
+		// never reuse shared memory
+		if (header & xml_memory_page_name_or_value_shared_mask) return false;
+
 		size_t target_length = strlength(target);
 
 		// always reuse document buffer memory if possible
-		if (!allocated) return target_length >= length;
+		if ((header & header_mask) == 0) return target_length >= length;
 
 		// reuse heap memory if waste is not too great
 		const size_t reuse_threshold = 32;
@@ -1687,7 +1689,7 @@ PUGI__NS_BEGIN
 
 			return true;
 		}
-		else if (dest && strcpy_insitu_allow(source_length, header & header_mask, dest))
+		else if (dest && strcpy_insitu_allow(source_length, header, header_mask, dest))
 		{
 			// we can reuse old buffer, so just copy the new data (including zero terminator)
 			memcpy(dest, source, (source_length + 1) * sizeof(char_t));
@@ -3605,42 +3607,55 @@ PUGI__NS_BEGIN
 		return true;
 	}
 
-	PUGI__FN void node_copy_contents(xml_node dest, const xml_node source)
+	PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc)
 	{
-		assert(dest.type() == source.type());
-
-		switch (source.type())
-		{
-		case node_element:
-		case node_declaration:
+		if (source)
 		{
-			dest.set_name(source.name());
+			if (alloc && (source_header & header_mask) == 0)
+			{
+				dest = source;
 
-			for (xml_attribute a = source.first_attribute(); a; a = a.next_attribute())
-				dest.append_attribute(a.name()).set_value(a.value());
-			break;
+				// since strcpy_insitu can reuse document buffer memory we need to mark both source and dest as shared
+				header |= xml_memory_page_name_or_value_shared_mask;
+				source_header |= xml_memory_page_name_or_value_shared_mask;
+			}
+			else
+				strcpy_insitu(dest, header, header_mask, source);
 		}
+	}
 
-		case node_pcdata:
-		case node_cdata:
-		case node_comment:
-		case node_doctype:
-			dest.set_value(source.value());
-			break;
+	PUGI__FN void node_copy_contents(xml_allocator* alloc, xml_node dest, const xml_node source)
+	{
+		assert(dest.type() == source.type());
 
-		case node_pi:
-			dest.set_name(source.name());
-			dest.set_value(source.value());
-			break;
+		xml_node_struct* dn = dest.internal_object();
+		xml_node_struct* sn = source.internal_object();
 
-		default:
-			assert(!"Invalid node type");
+		node_copy_string(dn->name, dn->header, xml_memory_page_name_allocated_mask, sn->name, sn->header, alloc);
+		node_copy_string(dn->value, dn->header, xml_memory_page_value_allocated_mask, sn->value, sn->header, alloc);
+
+		for (xml_attribute_struct* sa = sn->first_attribute; sa; sa = sa->next_attribute)
+		{
+			xml_attribute_struct* da = impl::append_new_attribute(dn, impl::get_allocator(dn));
+
+			node_copy_string(da->name, da->header, xml_memory_page_name_allocated_mask, sa->name, sa->header, alloc);
+			node_copy_string(da->value, da->header, xml_memory_page_value_allocated_mask, sa->value, sa->header, alloc);
 		}
 	}
 
+	PUGI__FN xml_allocator* node_get_shared_allocator(const xml_node lhs, const xml_node rhs)
+	{
+		xml_allocator& la = impl::get_allocator(lhs.internal_object());
+		xml_allocator& ra = impl::get_allocator(rhs.internal_object());
+
+		return (&la == &ra) ? &la : 0;
+	}
+
 	PUGI__FN void node_copy_tree(xml_node dest, const xml_node source)
 	{
-		node_copy_contents(dest, source);
+		xml_allocator* alloc = node_get_shared_allocator(dest, source);
+
+		node_copy_contents(alloc, dest, source);
 
 		xml_node destit = dest;
 		xml_node sourceit = source.first_child();
@@ -3651,7 +3666,7 @@ PUGI__NS_BEGIN
 			{
 				xml_node copy = destit.append_child(sourceit.type());
 
-				node_copy_contents(copy, sourceit);
+				node_copy_contents(alloc, copy, sourceit);
 
 				if (sourceit.first_child())
 				{
```