From 1b8b87904b0618f853345619e7ee2656cab80113 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Mon, 20 Oct 2014 01:00:48 +0000 Subject: XPath: Introduce _first/_any set evaluation modes Sometimes when evaluating the node set we don't need the entire set and only need the first element in docorder or any element. In the absence of iterator support we can still use this information to short-circuit traversals. This does not have any effect on straightforward node collection queries, but frequently improves performance of complex queries with predicates etc. XMark benchmark gets 15x faster with some queries enjoying 100x speedup on 10 Mb dataset due to a significant complexity improvement. git-svn-id: https://pugixml.googlecode.com/svn/trunk@1067 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 190 +++++++++++++++++++++++++++++---------------- tests/test_xpath.cpp | 22 ++++++ tests/test_xpath_paths.cpp | 22 ++++++ 3 files changed, 167 insertions(+), 67 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 99cdfc0..e9ab09d 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -8270,6 +8270,13 @@ PUGI__NS_BEGIN nodetest_all_in_namespace }; + enum nodeset_eval_t + { + nodeset_eval_all, + nodeset_eval_any, + nodeset_eval_first + }; + template struct axis_to_type { static const axis_t axis; @@ -8334,8 +8341,8 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ls = lhs->eval_node_set(c, stack); - xpath_node_set_raw rs = rhs->eval_node_set(c, stack); + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) @@ -8363,7 +8370,7 @@ PUGI__NS_BEGIN xpath_allocator_capture cr(stack.result); double l = lhs->eval_number(c, stack); - xpath_node_set_raw rs = rhs->eval_node_set(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { @@ -8380,7 +8387,7 @@ PUGI__NS_BEGIN xpath_allocator_capture cr(stack.result); xpath_string l = lhs->eval_string(c, stack); - xpath_node_set_raw rs = rhs->eval_node_set(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { @@ -8408,8 +8415,8 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ls = lhs->eval_node_set(c, stack); - xpath_node_set_raw rs = rhs->eval_node_set(c, stack); + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) { @@ -8433,7 +8440,7 @@ PUGI__NS_BEGIN xpath_allocator_capture cr(stack.result); double l = lhs->eval_number(c, stack); - xpath_node_set_raw rs = rhs->eval_node_set(c, stack); + xpath_node_set_raw rs = rhs->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* ri = rs.begin(); ri != rs.end(); ++ri) { @@ -8449,7 +8456,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ls = lhs->eval_node_set(c, stack); + xpath_node_set_raw ls = lhs->eval_node_set(c, stack, nodeset_eval_all); double r = rhs->eval_number(c, stack); for (const xpath_node* li = ls.begin(); li != ls.end(); ++li) @@ -8469,7 +8476,7 @@ PUGI__NS_BEGIN } } - void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) + static void apply_predicate(xpath_node_set_raw& ns, size_t first, xpath_ast_node* expr, const xpath_stack& stack) { assert(ns.size() >= first); @@ -8524,12 +8531,18 @@ PUGI__NS_BEGIN case nodetest_type_node: case nodetest_all: if (is_xpath_attribute(name)) + { ns.push_back(xpath_node(a, parent), alloc); + return true; + } break; case nodetest_all_in_namespace: if (starts_with(name, _data.nodetest) && is_xpath_attribute(name)) + { ns.push_back(xpath_node(a, parent), alloc); + return true; + } break; default: @@ -8539,57 +8552,80 @@ PUGI__NS_BEGIN return false; } - void step_push(xpath_node_set_raw& ns, const xml_node n, xpath_allocator* alloc) + bool step_push(xpath_node_set_raw& ns, const xml_node n, xpath_allocator* alloc) { - if (!n) return; + if (!n) return false; switch (_test) { case nodetest_name: if (n.type() == node_element && strequal(n.name(), _data.nodetest)) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_type_node: ns.push_back(n, alloc); - break; + return true; case nodetest_type_comment: if (n.type() == node_comment) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_type_text: if (n.type() == node_pcdata || n.type() == node_cdata) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_type_pi: if (n.type() == node_pi) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_pi: if (n.type() == node_pi && strequal(n.name(), _data.nodetest)) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_all: if (n.type() == node_element) + { ns.push_back(n, alloc); + return true; + } break; case nodetest_all_in_namespace: if (n.type() == node_element && starts_with(n.name(), _data.nodetest)) + { ns.push_back(n, alloc); + return true; + } break; default: assert(!"Unknown axis"); - } + } + + return false; } - template void step_fill(xpath_node_set_raw& ns, const xml_node n, xpath_allocator* alloc, T) + template void step_fill(xpath_node_set_raw& ns, const xml_node n, xpath_allocator* alloc, bool once, T) { const axis_t axis = T::axis; @@ -8598,8 +8634,8 @@ PUGI__NS_BEGIN case axis_attribute: { for (xml_attribute a = n.first_attribute(); a; a = a.next_attribute()) - if (step_push(ns, a, n, alloc)) - break; // step_push returns true if it found a unique attribute + if (step_push(ns, a, n, alloc) & once) + return; break; } @@ -8607,7 +8643,8 @@ PUGI__NS_BEGIN case axis_child: { for (xml_node c = n.first_child(); c; c = c.next_sibling()) - step_push(ns, c, alloc); + if (step_push(ns, c, alloc) & once) + return; break; } @@ -8616,13 +8653,15 @@ PUGI__NS_BEGIN case axis_descendant_or_self: { if (axis == axis_descendant_or_self) - step_push(ns, n, alloc); + if (step_push(ns, n, alloc) & once) + return; xml_node cur = n.first_child(); while (cur && cur != n) { - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; if (cur.first_child()) cur = cur.first_child(); @@ -8643,7 +8682,8 @@ PUGI__NS_BEGIN case axis_following_sibling: { for (xml_node c = n.next_sibling(); c; c = c.next_sibling()) - step_push(ns, c, alloc); + if (step_push(ns, c, alloc) & once) + return; break; } @@ -8651,7 +8691,8 @@ PUGI__NS_BEGIN case axis_preceding_sibling: { for (xml_node c = n.previous_sibling(); c; c = c.previous_sibling()) - step_push(ns, c, alloc); + if (step_push(ns, c, alloc) & once) + return; break; } @@ -8666,7 +8707,8 @@ PUGI__NS_BEGIN for (;;) { - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; if (cur.first_child()) cur = cur.first_child(); @@ -8698,7 +8740,8 @@ PUGI__NS_BEGIN else { // leaf node, can't be ancestor - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; if (cur.previous_sibling()) cur = cur.previous_sibling(); @@ -8709,7 +8752,9 @@ PUGI__NS_BEGIN cur = cur.parent(); if (!cur) break; - if (!node_is_ancestor(cur, n)) step_push(ns, cur, alloc); + if (!node_is_ancestor(cur, n)) + if (step_push(ns, cur, alloc) & once) + return; } while (!cur.previous_sibling()); @@ -8727,13 +8772,15 @@ PUGI__NS_BEGIN case axis_ancestor_or_self: { if (axis == axis_ancestor_or_self) - step_push(ns, n, alloc); + if (step_push(ns, n, alloc) & once) + return; xml_node cur = n.parent(); while (cur) { - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; cur = cur.parent(); } @@ -8760,7 +8807,7 @@ PUGI__NS_BEGIN } } - template void step_fill(xpath_node_set_raw& ns, const xml_attribute a, const xml_node p, xpath_allocator* alloc, T v) + template void step_fill(xpath_node_set_raw& ns, const xml_attribute a, const xml_node p, xpath_allocator* alloc, bool once, T v) { const axis_t axis = T::axis; @@ -8770,13 +8817,15 @@ PUGI__NS_BEGIN case axis_ancestor_or_self: { if (axis == axis_ancestor_or_self && _test == nodetest_type_node) // reject attributes based on principal node type test - step_push(ns, a, p, alloc); + if (step_push(ns, a, p, alloc) & once) + return; xml_node cur = p; while (cur) { - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; cur = cur.parent(); } @@ -8811,7 +8860,8 @@ PUGI__NS_BEGIN if (!cur) break; } - step_push(ns, cur, alloc); + if (step_push(ns, cur, alloc) & once) + return; } break; @@ -8827,7 +8877,7 @@ PUGI__NS_BEGIN case axis_preceding: { // preceding:: axis does not include attribute nodes and attribute ancestors (they are the same as parent's ancestors), so we can reuse node preceding - step_fill(ns, p, alloc, v); + step_fill(ns, p, alloc, once, v); break; } @@ -8835,18 +8885,24 @@ PUGI__NS_BEGIN assert(!"Unimplemented axis"); } } - - template xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, T v) + + template xpath_node_set_raw step_do(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval, T v) { const axis_t axis = T::axis; - bool attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); + bool axis_has_attributes = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_descendant_or_self || axis == axis_following || axis == axis_parent || axis == axis_preceding || axis == axis_self); + bool axis_reverse = (axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling); + + bool once = + (axis == axis_attribute && _test == nodetest_name) + ? true + : !_right && (axis_reverse ? eval == nodeset_eval_any : eval != nodeset_eval_all); xpath_node_set_raw ns; - ns.set_type((axis == axis_ancestor || axis == axis_ancestor_or_self || axis == axis_preceding || axis == axis_preceding_sibling) ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted); + ns.set_type(axis_reverse ? xpath_node_set::type_sorted_reverse : xpath_node_set::type_sorted); if (_left) { - xpath_node_set_raw s = _left->eval_node_set(c, stack); + xpath_node_set_raw s = _left->eval_node_set(c, stack, nodeset_eval_all); // self axis preserves the original order if (axis == axis_self) ns.set_type(s.type()); @@ -8859,9 +8915,9 @@ PUGI__NS_BEGIN if (axis != axis_self && size != 0) ns.set_type(xpath_node_set::type_unsorted); if (it->node()) - step_fill(ns, it->node(), stack.result, v); - else if (attributes) - step_fill(ns, it->attribute(), it->parent(), stack.result, v); + step_fill(ns, it->node(), stack.result, once, v); + else if (axis_has_attributes) + step_fill(ns, it->attribute(), it->parent(), stack.result, once, v); apply_predicates(ns, size, stack); } @@ -8869,9 +8925,9 @@ PUGI__NS_BEGIN else { if (c.n.node()) - step_fill(ns, c.n.node(), stack.result, v); - else if (attributes) - step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, v); + step_fill(ns, c.n.node(), stack.result, once, v); + else if (axis_has_attributes) + step_fill(ns, c.n.attribute(), c.n.parent(), stack.result, once, v); apply_predicates(ns, 0, stack); } @@ -9054,7 +9110,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - return !eval_node_set(c, stack).empty(); + return !eval_node_set(c, stack, nodeset_eval_any).empty(); } default: @@ -9100,7 +9156,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - return static_cast(_left->eval_node_set(c, stack).size()); + return static_cast(_left->eval_node_set(c, stack, nodeset_eval_all).size()); } case ast_func_string_length_0: @@ -9133,7 +9189,7 @@ PUGI__NS_BEGIN double r = 0; - xpath_node_set_raw ns = _left->eval_node_set(c, stack); + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_all); for (const xpath_node* it = ns.begin(); it != ns.end(); ++it) { @@ -9269,7 +9325,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ns = _left->eval_node_set(c, stack); + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); xpath_node na = ns.first(); return xpath_string::from_const(local_name(na)); @@ -9286,7 +9342,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ns = _left->eval_node_set(c, stack); + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); xpath_node na = ns.first(); return xpath_string::from_const(qualified_name(na)); @@ -9303,7 +9359,7 @@ PUGI__NS_BEGIN { xpath_allocator_capture cr(stack.result); - xpath_node_set_raw ns = _left->eval_node_set(c, stack); + xpath_node_set_raw ns = _left->eval_node_set(c, stack, nodeset_eval_first); xpath_node na = ns.first(); return xpath_string::from_const(namespace_uri(na)); @@ -9466,7 +9522,7 @@ PUGI__NS_BEGIN xpath_stack swapped_stack = {stack.temp, stack.result}; - xpath_node_set_raw ns = eval_node_set(c, swapped_stack); + xpath_node_set_raw ns = eval_node_set(c, swapped_stack, nodeset_eval_first); return ns.empty() ? xpath_string() : string_value(ns.first(), stack.result); } @@ -9478,7 +9534,7 @@ PUGI__NS_BEGIN } } - xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack) + xpath_node_set_raw eval_node_set(const xpath_context& c, const xpath_stack& stack, nodeset_eval_t eval) { switch (_type) { @@ -9488,8 +9544,8 @@ PUGI__NS_BEGIN xpath_stack swapped_stack = {stack.temp, stack.result}; - xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack); - xpath_node_set_raw rs = _right->eval_node_set(c, stack); + xpath_node_set_raw ls = _left->eval_node_set(c, swapped_stack, eval); + xpath_node_set_raw rs = _right->eval_node_set(c, stack, eval); // we can optimize merging two sorted sets, but this is a very rare operation, so don't bother rs.set_type(xpath_node_set::type_unsorted); @@ -9503,7 +9559,7 @@ PUGI__NS_BEGIN case ast_filter: case ast_filter_posinv: { - xpath_node_set_raw set = _left->eval_node_set(c, stack); + xpath_node_set_raw set = _left->eval_node_set(c, stack, nodeset_eval_all); // either expression is a number or it contains position() call; sort by document order if (_type == ast_filter) set.sort_do(); @@ -9521,44 +9577,44 @@ PUGI__NS_BEGIN switch (_axis) { case axis_ancestor: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_ancestor_or_self: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_attribute: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_child: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_descendant: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_descendant_or_self: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_following: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_following_sibling: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_namespace: // namespaced axis is not supported return xpath_node_set_raw(); case axis_parent: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_preceding: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_preceding_sibling: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); case axis_self: - return step_do(c, stack, axis_to_type()); + return step_do(c, stack, eval, axis_to_type()); default: assert(!"Unknown axis"); @@ -11111,7 +11167,7 @@ namespace pugi if (setjmp(sd.error_handler)) return xpath_node_set(); #endif - impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack); + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_all); return xpath_node_set(r.begin(), r.end(), r.type()); } @@ -11128,7 +11184,7 @@ namespace pugi if (setjmp(sd.error_handler)) return xpath_node(); #endif - impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack); + impl::xpath_node_set_raw r = root->eval_node_set(c, sd.stack, impl::nodeset_eval_first); return r.first(); } diff --git a/tests/test_xpath.cpp b/tests/test_xpath.cpp index f7da258..2143376 100644 --- a/tests/test_xpath.cpp +++ b/tests/test_xpath.cpp @@ -122,6 +122,28 @@ TEST_XML(xpath_sort_attributes, "") xpath_node_set_tester(reverse_sorted, "reverse sorted order failed") % 5 % 4 % 3; } +TEST(xpath_sort_random_medium) +{ + xml_document doc; + load_document_copy(doc, STR("") + STR("testtest") + STR("testtest") + STR("testtest") + STR("")); + + xpath_node_set ns = doc.select_nodes(STR("//node() | //@*")); + + std::vector nsv(ns.begin(), ns.end()); + std::random_shuffle(nsv.begin(), nsv.end()); + + xpath_node_set copy(&nsv[0], &nsv[0] + nsv.size()); + copy.sort(); + + xpath_node_set_tester tester(copy, "sorted order failed"); + + for (unsigned int i = 2; i < 39; ++i) tester % i; +} + TEST(xpath_sort_random_large) { xml_document doc; diff --git a/tests/test_xpath_paths.cpp b/tests/test_xpath_paths.cpp index b6f53c7..d791cdf 100644 --- a/tests/test_xpath_paths.cpp +++ b/tests/test_xpath_paths.cpp @@ -594,4 +594,26 @@ TEST_XML(xpath_paths_optimize_compare_attribute, " CHECK_XPATH_NODESET(doc, STR("node[@xmlns = '3']")); } +TEST_XML(xpath_paths_optimize_step_once, "") +{ + CHECK_XPATH_BOOLEAN(doc, STR("node//para2/following::*"), true); + CHECK_XPATH_BOOLEAN(doc, STR("node//para6/following::*"), false); + + CHECK_XPATH_STRING(doc, STR("name(node//para2/following::*)"), STR("para3")); + CHECK_XPATH_STRING(doc, STR("name(node//para6/following::*)"), STR("")); + + CHECK_XPATH_BOOLEAN(doc, STR("node//para1/preceding::*"), false); + CHECK_XPATH_BOOLEAN(doc, STR("node//para6/preceding::*"), true); + + CHECK_XPATH_STRING(doc, STR("name(node//para1/preceding::*)"), STR("")); + CHECK_XPATH_STRING(doc, STR("name(node//para6/preceding::*)"), STR("para1")); + + CHECK_XPATH_BOOLEAN(doc, STR("node//para6/preceding::para4"), true); + + CHECK_XPATH_BOOLEAN(doc, STR("//@attr5/ancestor-or-self::*"), true); + CHECK_XPATH_BOOLEAN(doc, STR("//@attr5/ancestor::*"), true); + + CHECK_XPATH_BOOLEAN(doc, STR("//@attr5/following::para6"), true); + CHECK_XPATH_STRING(doc, STR("name(//@attr5/following::para6)"), STR("para6")); +} #endif -- cgit v1.2.3