From 2874f6f21dc22efab1a2884fe463c5461955a225 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:37:26 -0800 Subject: Add initial support for parse_embed_pcdata When this flag is true, PCDATA value is saved to the parent element instead of allocating a new node. This prevents some documents from round-tripping since it loses information, but can provide a significant memory reduction and parsing speedup for some documents. --- src/pugixml.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 35c0d8e..de87dcf 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3360,13 +3360,22 @@ PUGI__NS_BEGIN if (cursor->parent || PUGI__OPTSET(parse_fragment)) { - PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. - cursor->value = s; // Save the offset. + if (!PUGI__OPTSET(parse_embed_pcdata)) + { + PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. + + cursor->value = s; // Save the offset. + + PUGI__POPNODE(); // Pop since this is a standalone. + } + else + { + if (cursor->parent && !cursor->value) + cursor->value = s; // Save the offset. + } s = strconv_pcdata(s); - PUGI__POPNODE(); // Pop since this is a standalone. - if (!*s) break; } else -- cgit v1.2.3 From 8b01f8923c047edf904c766c59ac359b807e7643 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:40:56 -0800 Subject: Support xml_node::child_value/text for parse_embed_pcdata --- src/pugixml.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index de87dcf..8c5b9e1 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -5464,6 +5464,10 @@ namespace pugi if (impl::is_text_node(i) && i->value) return i->value; + // element nodes can have value if parse_embed_pcdata was used + if (PUGI__NODETYPE(_root) == node_element && _root->value) + return _root->value; + return PUGIXML_TEXT(""); } @@ -6211,6 +6215,10 @@ namespace pugi if (impl::is_text_node(node)) return node; + // element nodes can have value if parse_embed_pcdata was used + if (PUGI__NODETYPE(_root) == node_element && _root->value) + return _root; + return 0; } -- cgit v1.2.3 From 85d8b225f2276333001dc0f96179bfef012277ae Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Fri, 8 Jan 2016 08:41:38 -0800 Subject: Support XPath string value for parse_embed_pcdata --- src/pugixml.cpp | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 8c5b9e1..c018359 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -7653,6 +7653,10 @@ PUGI__NS_BEGIN { xpath_string result; + // element nodes can have value if parse_embed_pcdata was used + if (n.value()[0]) + result.append(xpath_string::from_const(n.value()), alloc); + xml_node cur = n.first_child(); while (cur && cur != n) -- cgit v1.2.3 From df2a0ad28b24681fdc39275b5260132b0a3e6918 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sat, 9 Jan 2016 17:46:42 -0800 Subject: Implement output support for embedded PCDATA values This is a bit awkward since preserving correct indentation structure requires a bit of extra work, and the closing tag has to be written by _start function to correctly process the rest of the tree. --- src/pugixml.cpp | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index c018359..90c677e 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -4018,17 +4018,40 @@ PUGI__NS_BEGIN if (node->first_attribute) node_output_attributes(writer, node, indent, indent_length, flags, depth); - if (!node->first_child) + // element nodes can have value if parse_embed_pcdata was used + if (!node->value) { - writer.write(' ', '/', '>'); + if (!node->first_child) + { + writer.write(' ', '/', '>'); - return false; + return false; + } + else + { + writer.write('>'); + + return true; + } } else { writer.write('>'); - return true; + text_output(writer, node->value, ctx_special_pcdata, flags); + + if (!node->first_child) + { + writer.write('<', '/'); + writer.write_string(name); + writer.write('>'); + + return false; + } + else + { + return true; + } } } @@ -4136,6 +4159,10 @@ PUGI__NS_BEGIN if (node_output_start(writer, node, indent, indent_length, flags, depth)) { + // element nodes can have value if parse_embed_pcdata was used + if (node->value) + indent_flags = 0; + node = node->first_child; depth++; continue; -- cgit v1.2.3 From bcddf36559c4293d34fd275a4d392982fae94998 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:01:44 -0800 Subject: Only save first PCDATA contents in the element This change fixes an important ordering issue - if element node has a PCDATA child *after* other elements, it's impossible to tell which order the children were in. Since the goal of PCDATA embedding is to save memory when it's the only child, only apply the optimization to the first child. This seems to fix all roundtripping issues so the only caveat is that the DOM structure is different. --- src/pugixml.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 90c677e..f447e97 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3360,7 +3360,11 @@ PUGI__NS_BEGIN if (cursor->parent || PUGI__OPTSET(parse_fragment)) { - if (!PUGI__OPTSET(parse_embed_pcdata)) + if (PUGI__OPTSET(parse_embed_pcdata) && cursor->parent && !cursor->first_child && !cursor->value) + { + cursor->value = s; // Save the offset. + } + else { PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree. @@ -3368,11 +3372,6 @@ PUGI__NS_BEGIN PUGI__POPNODE(); // Pop since this is a standalone. } - else - { - if (cursor->parent && !cursor->value) - cursor->value = s; // Save the offset. - } s = strconv_pcdata(s); -- cgit v1.2.3 From 4f3be7616729cbf0c8768caf861331d710d457a8 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Tue, 12 Jan 2016 20:41:37 -0800 Subject: Preserve order semantics for child_value/text when using parse_embed_pcdata The performance cost is probably negligible and this means we treat embedded value as the first child consistently. --- src/pugixml.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'src/pugixml.cpp') diff --git a/src/pugixml.cpp b/src/pugixml.cpp index f447e97..158a24d 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -5486,14 +5486,14 @@ namespace pugi { if (!_root) return PUGIXML_TEXT(""); - for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) - if (impl::is_text_node(i) && i->value) - return i->value; - // element nodes can have value if parse_embed_pcdata was used if (PUGI__NODETYPE(_root) == node_element && _root->value) return _root->value; + for (xml_node_struct* i = _root->first_child; i; i = i->next_sibling) + if (impl::is_text_node(i) && i->value) + return i->value; + return PUGIXML_TEXT(""); } @@ -6237,14 +6237,14 @@ namespace pugi { if (!_root || impl::is_text_node(_root)) return _root; - for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) - if (impl::is_text_node(node)) - return node; - // element nodes can have value if parse_embed_pcdata was used if (PUGI__NODETYPE(_root) == node_element && _root->value) return _root; + for (xml_node_struct* node = _root->first_child; node; node = node->next_sibling) + if (impl::is_text_node(node)) + return node; + return 0; } -- cgit v1.2.3