diff options
| author | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2014-02-10 16:57:04 +0000 | 
|---|---|---|
| committer | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2014-02-10 16:57:04 +0000 | 
| commit | 79fb68ac4177206e063f8f29113abbe82ac49698 (patch) | |
| tree | c9f07c8d3b6bc82c944c8c16c72f50bf374e7b6d /tests | |
| parent | 9ba26b94c74a03ac937a5d5972f8f12a2916f301 (diff) | |
Use a null-terminated buffer for parsing as often as possible.
Parsing used to work on a non-null-terminated buffer, inserting a fake null terminator to increase performance.
This makes it impossible to implement fragment parsing that preserves PCDATA contents, because the fake terminator
overwrites the last character of the input (as witnessed by some boundary-condition tests that actually depended on this behavior).
Since almost all uses result in us allocating an internal buffer anyway, the new policy is to make sure all buffers
that are allocated by pugixml are null-terminated - the only exception now is external calls to load_buffer_inplace
that don't trigger encoding conversion.
git-svn-id: https://pugixml.googlecode.com/svn/trunk@977 99668b35-9821-0410-8761-19e4c4f06640
Diffstat (limited to 'tests')
| -rw-r--r-- | tests/test_document.cpp | 101 | ||||
| -rw-r--r-- | tests/test_parse.cpp | 10 | 
2 files changed, 106 insertions, 5 deletions
| diff --git a/tests/test_document.cpp b/tests/test_document.cpp index 7adc2a1..adc4bdb 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -1069,3 +1069,104 @@ TEST_XML(document_reset_copy_self, "<node><child/></node>")      CHECK(!doc.first_child());      CHECK_NODE(doc, STR(""));  } + +struct document_data_t +{ +    xml_encoding encoding; + +    const unsigned char* data; +    size_t size; +}; + +#include <stdio.h> + +TEST(document_load_buffer_utf_truncated) +{ +	const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; +	const unsigned char utf16_be[] = {0, '<', 0x20, 0xac, 0, '/', 0, '>'}; +	const unsigned char utf16_le[] = {'<', 0, 0xac, 0x20, '/', 0, '>', 0}; +	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; +	const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; + +	const document_data_t data[] = +	{ +		{ encoding_utf8, utf8, sizeof(utf8) }, +		{ encoding_utf16_be, utf16_be, sizeof(utf16_be) }, +		{ encoding_utf16_le, utf16_le, sizeof(utf16_le) }, +		{ encoding_utf32_be, utf32_be, sizeof(utf32_be) }, +		{ encoding_utf32_le, utf32_le, sizeof(utf32_le) }, +	}; + +	for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) +	{ +		const document_data_t& d = data[i]; + +		for (size_t j = 0; j <= d.size; ++j) +		{ +			char* buffer = new char[j]; +			memcpy(buffer, d.data, j); + +			xml_document doc; +			xml_parse_result res = doc.load_buffer(buffer, j, parse_default, d.encoding); + +			if (j == d.size) +			{ +				CHECK(res); + +				const char_t* name = doc.first_child().name(); + +			#ifdef PUGIXML_WCHAR_MODE +				CHECK(name[0] == 0x20ac && name[1] == 0); +			#else +				CHECK_STRING(name, "\xe2\x82\xac"); +			#endif +			} +			else +			{ +				CHECK(!res || !doc.first_child()); +			} + +			delete[] buffer; +		} +	} +} + +#ifndef PUGIXML_NO_STL +TEST(document_load_stream_truncated) +{ +	const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 
0, 0, '/', 0, 0, 0, '>'}; + +	for (size_t i = 0; i <= sizeof(utf32_be); ++i) +	{ +		std::string prefix(reinterpret_cast<const char*>(utf32_be), i); +		std::istringstream iss(prefix); + +		xml_document doc; +		xml_parse_result res = doc.load(iss); + +		if (i == sizeof(utf32_be)) +		{ +			CHECK(res); +		} +		else +		{ +			CHECK(!res || !doc.first_child()); + +			if (i < 8) +			{ +				CHECK(!doc.first_child()); +			} +			else +			{ +				const char_t* name = doc.first_child().name(); + +			#ifdef PUGIXML_WCHAR_MODE +				CHECK(name[0] == 0x20ac && name[1] == 0); +			#else +				CHECK_STRING(name, "\xe2\x82\xac"); +			#endif +			} +		} +	} +} +#endif
\ No newline at end of file diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 9a8bdf1..c165a65 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -313,12 +313,12 @@ TEST(parse_ws_pcdata_permutations)          // current implementation of parse_ws_pcdata_single has an unfortunate bug; reproduce it here          {4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3},          // error case: terminate PCDATA in the middle -        {7, STR("<node>abcdef"), STR("<node>abcde</node>"), -3}, -        {7, STR("<node>      "), STR("<node>     </node>"), -3}, +        {7, STR("<node>abcdef"), STR("<node>abcdef</node>"), -3}, +        {7, STR("<node>      "), STR("<node>      </node>"), -3},          // error case: terminate PCDATA as early as possible          {7, STR("<node>"), STR("<node />"), -2}, -        {7, STR("<node>a"), STR("<node />"), -2}, -        {7, STR("<node> "), STR("<node />"), -2}, +        {7, STR("<node>a"), STR("<node>a</node>"), -3}, +        {7, STR("<node> "), STR("<node> </node>"), -3},      };      for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) @@ -805,7 +805,7 @@ TEST(parse_error_offset)  	CHECK_OFFSET("<3d/>", parse_default, status_unrecognized_tag, 1);  	CHECK_OFFSET(" <3d/>", parse_default, status_unrecognized_tag, 2); -	CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 2); +	CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 1);  	CHECK_OFFSET("<?pi", parse_default, status_bad_pi, 3);  	CHECK_OFFSET("<?pi", parse_default | parse_pi, status_bad_pi, 3); | 
