From 4fe55906fac64cb3c58468f3f02a1a24a3a0213f Mon Sep 17 00:00:00 2001 From: "arseny.kapoulkine@gmail.com" Date: Sun, 18 Nov 2012 01:11:50 +0000 Subject: XPath stack optimization: Rewrite part of the recursive descent parser to precedence climbing to reduce stack usage git-svn-id: http://pugixml.googlecode.com/svn/trunk@931 99668b35-9821-0410-8761-19e4c4f06640 --- src/pugixml.cpp | 209 ++++++++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 120 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index ffccdd7..4e8200b 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -7279,6 +7279,7 @@ PUGI__NS_BEGIN enum ast_type_t { + ast_unknown, ast_op_or, // left or right ast_op_and, // left and right ast_op_equal, // left = right @@ -9338,7 +9339,9 @@ PUGI__NS_BEGIN // | FilterExpr // | FilterExpr '/' RelativeLocationPath // | FilterExpr '//' RelativeLocationPath - xpath_ast_node* parse_path_expression() + // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr + // UnaryExpr ::= UnionExpr | '-' UnaryExpr + xpath_ast_node* parse_path_or_unary_expression() { // Clarification. // PathExpr begins with either LocationPath or FilterExpr. @@ -9384,170 +9387,136 @@ PUGI__NS_BEGIN return n; } - else return parse_location_path(); - } - - // UnionExpr ::= PathExpr | UnionExpr '|' PathExpr - xpath_ast_node* parse_union_expression() - { - xpath_ast_node* n = parse_path_expression(); - - while (_lexer.current() == lex_union) + else if (_lexer.current() == lex_minus) { _lexer.next(); - xpath_ast_node* expr = parse_union_expression(); + // precedence 7+ - only parses union expressions + xpath_ast_node* expr = parse_expression_rec(parse_path_or_unary_expression(), 7); - if (n->rettype() != xpath_type_node_set || expr->rettype() != xpath_type_node_set) - throw_error("Union operator has to be applied to node sets"); - - n = new (alloc_node()) xpath_ast_node(ast_op_union, xpath_type_node_set, n, expr); + return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr); } - - return n; + else + return parse_location_path(); } - // UnaryExpr ::= UnionExpr | '-' UnaryExpr - xpath_ast_node* parse_unary_expression() + struct binary_op_t { - if (_lexer.current() == lex_minus) - { - _lexer.next(); - - xpath_ast_node* expr = parse_unary_expression(); + ast_type_t asttype; + xpath_value_type rettype; + int precedence; - return new (alloc_node()) xpath_ast_node(ast_op_negate, xpath_type_number, expr); + binary_op_t(): asttype(ast_unknown), rettype(xpath_type_none), precedence(0) + { } - else return parse_union_expression(); - } - - // MultiplicativeExpr ::= UnaryExpr - // | MultiplicativeExpr '*' UnaryExpr - // | MultiplicativeExpr 'div' UnaryExpr - // | MultiplicativeExpr 'mod' UnaryExpr - xpath_ast_node* parse_multiplicative_expression() - { - xpath_ast_node* n = parse_unary_expression(); - while (_lexer.current() == lex_multiply || (_lexer.current() == lex_string && - (_lexer.contents() == PUGIXML_TEXT("mod") || _lexer.contents() == PUGIXML_TEXT("div")))) + binary_op_t(ast_type_t asttype_, xpath_value_type rettype_, int precedence_): asttype(asttype_), rettype(rettype_), precedence(precedence_) { - ast_type_t op = _lexer.current() == lex_multiply ? ast_op_multiply : - _lexer.contents().begin[0] == 'd' ? ast_op_divide : ast_op_mod; - _lexer.next(); - - xpath_ast_node* expr = parse_unary_expression(); - - n = new (alloc_node()) xpath_ast_node(op, xpath_type_number, n, expr); } - return n; - } - - // AdditiveExpr ::= MultiplicativeExpr - // | AdditiveExpr '+' MultiplicativeExpr - // | AdditiveExpr '-' MultiplicativeExpr - xpath_ast_node* parse_additive_expression() - { - xpath_ast_node* n = parse_multiplicative_expression(); - - while (_lexer.current() == lex_plus || _lexer.current() == lex_minus) + static binary_op_t parse(xpath_lexer& lexer) { - lexeme_t l = _lexer.current(); + switch (lexer.current()) + { + case lex_string: + if (lexer.contents() == PUGIXML_TEXT("or")) + return binary_op_t(ast_op_or, xpath_type_boolean, 1); + else if (lexer.contents() == PUGIXML_TEXT("and")) + return binary_op_t(ast_op_and, xpath_type_boolean, 2); + else if (lexer.contents() == PUGIXML_TEXT("div")) + return binary_op_t(ast_op_divide, xpath_type_number, 6); + else if (lexer.contents() == PUGIXML_TEXT("mod")) + return binary_op_t(ast_op_mod, xpath_type_number, 6); + else + return binary_op_t(); - _lexer.next(); + case lex_equal: + return binary_op_t(ast_op_equal, xpath_type_boolean, 3); - xpath_ast_node* expr = parse_multiplicative_expression(); + case lex_not_equal: + return binary_op_t(ast_op_not_equal, xpath_type_boolean, 3); - n = new (alloc_node()) xpath_ast_node(l == lex_plus ? ast_op_add : ast_op_subtract, xpath_type_number, n, expr); - } + case lex_less: + return binary_op_t(ast_op_less, xpath_type_boolean, 4); - return n; - } + case lex_greater: + return binary_op_t(ast_op_greater, xpath_type_boolean, 4); - // RelationalExpr ::= AdditiveExpr - // | RelationalExpr '<' AdditiveExpr - // | RelationalExpr '>' AdditiveExpr - // | RelationalExpr '<=' AdditiveExpr - // | RelationalExpr '>=' AdditiveExpr - xpath_ast_node* parse_relational_expression() - { - xpath_ast_node* n = parse_additive_expression(); + case lex_less_or_equal: + return binary_op_t(ast_op_less_or_equal, xpath_type_boolean, 4); - while (_lexer.current() == lex_less || _lexer.current() == lex_less_or_equal || - _lexer.current() == lex_greater || _lexer.current() == lex_greater_or_equal) - { - lexeme_t l = _lexer.current(); - _lexer.next(); - - xpath_ast_node* expr = parse_additive_expression(); + case lex_greater_or_equal: + return binary_op_t(ast_op_greater_or_equal, xpath_type_boolean, 4); - n = new (alloc_node()) xpath_ast_node(l == lex_less ? ast_op_less : l == lex_greater ? ast_op_greater : - l == lex_less_or_equal ? ast_op_less_or_equal : ast_op_greater_or_equal, xpath_type_boolean, n, expr); - } + case lex_plus: + return binary_op_t(ast_op_add, xpath_type_number, 5); - return n; - } - - // EqualityExpr ::= RelationalExpr - // | EqualityExpr '=' RelationalExpr - // | EqualityExpr '!=' RelationalExpr - xpath_ast_node* parse_equality_expression() - { - xpath_ast_node* n = parse_relational_expression(); - - while (_lexer.current() == lex_equal || _lexer.current() == lex_not_equal) - { - lexeme_t l = _lexer.current(); + case lex_minus: + return binary_op_t(ast_op_subtract, xpath_type_number, 5); - _lexer.next(); + case lex_multiply: + return binary_op_t(ast_op_multiply, xpath_type_number, 6); - xpath_ast_node* expr = parse_relational_expression(); + case lex_union: + return binary_op_t(ast_op_union, xpath_type_node_set, 7); - n = new (alloc_node()) xpath_ast_node(l == lex_equal ? ast_op_equal : ast_op_not_equal, xpath_type_boolean, n, expr); + default: + return binary_op_t(); + } } + }; - return n; - } - - // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr - xpath_ast_node* parse_and_expression() + xpath_ast_node* parse_expression_rec(xpath_ast_node* lhs, int limit) { - xpath_ast_node* n = parse_equality_expression(); + binary_op_t op = binary_op_t::parse(_lexer); - while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("and")) + while (op.asttype != ast_unknown && op.precedence >= limit) { _lexer.next(); - xpath_ast_node* expr = parse_equality_expression(); + xpath_ast_node* rhs = parse_path_or_unary_expression(); - n = new (alloc_node()) xpath_ast_node(ast_op_and, xpath_type_boolean, n, expr); - } + binary_op_t nextop = binary_op_t::parse(_lexer); - return n; - } + while (nextop.asttype != ast_unknown && nextop.precedence > op.precedence) + { + rhs = parse_expression_rec(rhs, nextop.precedence); - // OrExpr ::= AndExpr | OrExpr 'or' AndExpr - xpath_ast_node* parse_or_expression() - { - xpath_ast_node* n = parse_and_expression(); + nextop = binary_op_t::parse(_lexer); + } - while (_lexer.current() == lex_string && _lexer.contents() == PUGIXML_TEXT("or")) - { - _lexer.next(); + if (op.asttype == ast_op_union && (lhs->rettype() != xpath_type_node_set || rhs->rettype() != xpath_type_node_set)) + throw_error("Union operator has to be applied to node sets"); - xpath_ast_node* expr = parse_and_expression(); + lhs = new (alloc_node()) xpath_ast_node(op.asttype, op.rettype, lhs, rhs); - n = new (alloc_node()) xpath_ast_node(ast_op_or, xpath_type_boolean, n, expr); + op = binary_op_t::parse(_lexer); } - return n; + return lhs; } - + // Expr ::= OrExpr + // OrExpr ::= AndExpr | OrExpr 'or' AndExpr + // AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr + // EqualityExpr ::= RelationalExpr + // | EqualityExpr '=' RelationalExpr + // | EqualityExpr '!=' RelationalExpr + // RelationalExpr ::= AdditiveExpr + // | RelationalExpr '<' AdditiveExpr + // | RelationalExpr '>' AdditiveExpr + // | RelationalExpr '<=' AdditiveExpr + // | RelationalExpr '>=' AdditiveExpr + // AdditiveExpr ::= MultiplicativeExpr + // | AdditiveExpr '+' MultiplicativeExpr + // | AdditiveExpr '-' MultiplicativeExpr + // MultiplicativeExpr ::= UnaryExpr + // | MultiplicativeExpr '*' UnaryExpr + // | MultiplicativeExpr 'div' UnaryExpr + // | MultiplicativeExpr 'mod' UnaryExpr xpath_ast_node* parse_expression() { - return parse_or_expression(); + return parse_expression_rec(parse_path_or_unary_expression(), 0); } xpath_parser(const char_t* query, xpath_variable_set* variables, xpath_allocator* alloc, xpath_parse_result* result): _alloc(alloc), _lexer(query), _query(query), _variables(variables), _result(result) -- cgit v1.2.3