diff options
author | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2011-12-09 05:24:07 +0000 |
---|---|---|
committer | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2011-12-09 05:24:07 +0000 |
commit | 1b87d3dcbf3ce5a6384826740dc65cf4917a1555 (patch) | |
tree | c55392226c9a12d28eef2d14a67bc79c8ae3b893 | |
parent | fbfd2ae25a5382e6c1fba3290c656ceb0db15d5b (diff) |
Introduced parse_ws_pcdata_single flag: only parses whitespace-only PCDATA if it's the only child of the parent node (middle ground between default flags and parse_ws_pcdata)
git-svn-id: http://pugixml.googlecode.com/svn/trunk@825 99668b35-9821-0410-8761-19e4c4f06640
-rw-r--r-- | src/pugixml.cpp | 20 | ||||
-rw-r--r-- | src/pugixml.hpp | 5 | ||||
-rw-r--r-- | tests/test_parse.cpp | 79 |
3 files changed, 99 insertions, 5 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 3680fc5..a3c6abd 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1898,7 +1898,7 @@ namespace // Parser utilities. #define SKIPWS() { while (IS_CHARTYPE(*s, ct_space)) ++s; } - #define OPTSET(OPT) ( optmsk & OPT ) + #define OPTSET(OPT) ( optmsk & (OPT) ) #define PUSHNODE(TYPE) { cursor = append_node(cursor, alloc, TYPE); if (!cursor) THROW_ERROR(status_out_of_memory, s); } #define POPNODE() { cursor = cursor->parent; } #define SCANFOR(X) { while (*s != 0 && !(X)) ++s; } @@ -2402,10 +2402,20 @@ namespace SKIPWS(); // Eat whitespace if no genuine PCDATA here. - if ((!OPTSET(parse_ws_pcdata) || mark == s) && (*s == '<' || !*s)) - { - continue; - } + if (*s == '<') + { + // We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one + assert(mark != s); + + if (!OPTSET(parse_ws_pcdata | parse_ws_pcdata_single)) + { + continue; + } + else if (OPTSET(parse_ws_pcdata_single)) + { + if (s[1] != '/' || cursor->first_child) continue; + } + } s = mark; diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 1826b45..11bf279 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -164,6 +164,11 @@ namespace pugi // This flag determines if document type declaration (node_doctype) is added to the DOM tree. This flag is off by default. const unsigned int parse_doctype = 0x0200; + // This flag determines if plain character data (node_pcdata) that is the only child of the parent node and that consists only + // of whitespace is added to the DOM tree. + // This flag is off by default; turning it on may result in slower parsing and more memory consumption. + const unsigned int parse_ws_pcdata_single = 0x0400; + // The default parsing mode. // Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded, // End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules. diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index d8064b2..2f66db9 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -263,6 +263,85 @@ TEST(parse_ws_pcdata_parse) CHECK_STRING(c2.first_child().value(), STR(" ")); } +static int get_tree_node_count(xml_node n) +{ + int result = 1; + + for (xml_node c = n.first_child(); c; c = c.next_sibling()) + result += get_tree_node_count(c); + + return result; +} + +TEST(parse_ws_pcdata_permutations) +{ + struct test_data_t + { + unsigned int mask; // 1 = default flags, 2 = parse_ws_pcdata, 4 = parse_ws_pcdata_single + const pugi::char_t* source; + const pugi::char_t* result; + int nodes; // negative if parsing should fail + }; + + test_data_t test_data[] = + { + // external pcdata should be discarded (whitespace or not) + {7, STR("ext1"), STR(""), 1}, + {7, STR(" "), STR(""), 1}, + {7, STR("ext1<node/>"), STR("<node />"), 2}, + {7, STR("ext1<node/>ext2"), STR("<node />"), 2}, + {7, STR(" <node/>"), STR("<node />"), 2}, + {7, STR("<node/> "), STR("<node />"), 2}, + {7, STR(" <node/> "), STR("<node />"), 2}, + // inner pcdata should be preserved + {7, STR("<node>inner</node>"), STR("<node>inner</node>"), 3}, + {7, STR("<node>inner1<child/>inner2</node>"), STR("<node>inner1<child />inner2</node>"), 5}, + {7, STR("<node>inner1<child>deep</child>inner2</node>"), STR("<node>inner1<child>deep</child>inner2</node>"), 6}, + // empty pcdata nodes should never be created + {7, STR("<node>inner1<child></child>inner2</node>"), STR("<node>inner1<child />inner2</node>"), 5}, + {7, STR("<node><child></child>inner2</node>"), STR("<node><child />inner2</node>"), 4}, + {7, STR("<node>inner1<child></child></node>"), STR("<node>inner1<child /></node>"), 4}, + {7, STR("<node><child></child></node>"), STR("<node><child /></node>"), 3}, + // comments, pi or other nodes should not cause pcdata creation either + {7, STR("<node><!----><child><?pi?></child><![CDATA[x]]></node>"), STR("<node><child /><![CDATA[x]]></node>"), 4}, + // leading/trailing pcdata whitespace should be preserved (note: this will change if parse_ws_pcdata_trim is introduced) + {7, STR("<node>\t \tinner1<child> deep </child>\t\ninner2\n\t</node>"), STR("<node>\t \tinner1<child> deep </child>\t\ninner2\n\t</node>"), 6}, + // whitespace-only pcdata preservation depends on the parsing mode + {1, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node><child /><child><deep /></child></node>"), 5}, + {2, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t\n\t</node>"), 13}, + {4, STR("<node>\n\t<child> </child>\n\t<child> <deep> </deep> </child>\n\t<!---->\n\t</node>"), STR("<node><child> </child><child><deep> </deep></child></node>"), 7}, + // current implementation of parse_ws_pcdata_single has an unfortunate bug; reproduce it here + {4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3}, + // error case: terminate PCDATA in the middle + {7, STR("<node>abcdef"), STR("<node>abcde</node>"), -3}, + {7, STR("<node> "), STR("<node> </node>"), -3}, + // error case: terminate PCDATA as early as possible + {7, STR("<node>"), STR("<node />"), -2}, + {7, STR("<node>a"), STR("<node />"), -2}, + {7, STR("<node> "), STR("<node />"), -2}, + }; + + for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) + { + const test_data_t& td = test_data[i]; + + for (int flag = 0; flag < 3; ++flag) + { + if (td.mask & (1 << flag)) + { + unsigned int flags[] = {parse_default, parse_default | parse_ws_pcdata, parse_default | parse_ws_pcdata_single}; + + xml_document doc; + CHECK((td.nodes > 0) == doc.load(td.source, flags[flag])); + CHECK_NODE(doc, td.result); + + int nodes = get_tree_node_count(doc); + CHECK((td.nodes < 0 ? -td.nodes : td.nodes) == nodes); + } + } + } +} + TEST(parse_pcdata_no_eol) { xml_document doc; |