diff options
-rw-r--r-- | src/pugixml.cpp | 235 | ||||
-rw-r--r-- | tests/test_document.cpp | 101 | ||||
-rw-r--r-- | tests/test_parse.cpp | 10 |
3 files changed, 261 insertions, 85 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 7c965ce..926458e 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -1182,22 +1182,25 @@ PUGI__NS_BEGIN PUGI__FN bool get_mutable_buffer(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) { + size_t length = size / sizeof(char_t); + if (is_mutable) { out_buffer = static_cast<char_t*>(const_cast<void*>(contents)); + out_length = length; } else { - void* buffer = xml_memory::allocate(size > 0 ? size : 1); + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); if (!buffer) return false; - memcpy(buffer, contents, size); + memcpy(buffer, contents, length * sizeof(char_t)); + buffer[length] = 0; - out_buffer = static_cast<char_t*>(buffer); + out_buffer = buffer; + out_length = length + 1; } - out_length = size / sizeof(char_t); - return true; } @@ -1211,20 +1214,28 @@ PUGI__NS_BEGIN PUGI__FN bool convert_buffer_endian_swap(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) { const char_t* data = static_cast<const char_t*>(contents); - + size_t length = size / sizeof(char_t); + if (is_mutable) { - out_buffer = const_cast<char_t*>(data); + char_t* buffer = const_cast<char_t*>(data); + + convert_wchar_endian_swap(buffer, data, length); + + out_buffer = buffer; + out_length = length; } else { - out_buffer = static_cast<char_t*>(xml_memory::allocate(size > 0 ? size : 1)); - if (!out_buffer) return false; - } + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; - out_length = size / sizeof(char_t); + convert_wchar_endian_swap(buffer, data, length); + buffer[length] = 0; - convert_wchar_endian_swap(out_buffer, data, out_length); + out_buffer = buffer; + out_length = length + 1; + } return true; } @@ -1232,20 +1243,24 @@ PUGI__NS_BEGIN PUGI__FN bool convert_buffer_utf8(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) { const uint8_t* data = static_cast<const uint8_t*>(contents); + size_t data_length = size; // first pass: get length in wchar_t units - out_length = utf_decoder<wchar_counter>::decode_utf8_block(data, size, 0); + size_t length = utf_decoder<wchar_counter>::decode_utf8_block(data, data_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert utf8 input to wchar_t - wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer); - wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_utf8_block(data, size, out_begin); + wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer); + wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_utf8_block(data, data_length, obegin); - assert(out_end == out_begin + out_length); - (void)!out_end; + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1253,21 +1268,24 @@ PUGI__NS_BEGIN template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) { const uint16_t* data = static_cast<const uint16_t*>(contents); - size_t length = size / sizeof(uint16_t); + size_t data_length = size / sizeof(uint16_t); // first pass: get length in wchar_t units - out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, length, 0); + size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf16_block(data, data_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert utf16 input to wchar_t - wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer); - wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, length, out_begin); + wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer); + wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf16_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; - assert(out_end == out_begin + out_length); - (void)!out_end; + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1275,21 +1293,24 @@ PUGI__NS_BEGIN template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) { const uint32_t* data = static_cast<const uint32_t*>(contents); - size_t length = size / sizeof(uint32_t); + size_t data_length = size / sizeof(uint32_t); // first pass: get length in wchar_t units - out_length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, length, 0); + size_t length = utf_decoder<wchar_counter, opt_swap>::decode_utf32_block(data, data_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert utf32 input to wchar_t - wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer); - wchar_writer::value_type out_end = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, length, out_begin); + wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer); + wchar_writer::value_type oend = utf_decoder<wchar_writer, opt_swap>::decode_utf32_block(data, data_length, obegin); - assert(out_end == out_begin + out_length); - (void)!out_end; + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1297,20 +1318,24 @@ PUGI__NS_BEGIN PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size) { const uint8_t* data = static_cast<const uint8_t*>(contents); + size_t data_length = size; // get length in wchar_t units - out_length = size; + size_t length = data_length; // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // convert latin1 input to wchar_t - wchar_writer::value_type out_begin = reinterpret_cast<wchar_writer::value_type>(out_buffer); - wchar_writer::value_type out_end = utf_decoder<wchar_writer>::decode_latin1_block(data, size, out_begin); + wchar_writer::value_type obegin = reinterpret_cast<wchar_writer::value_type>(buffer); + wchar_writer::value_type oend = utf_decoder<wchar_writer>::decode_latin1_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; - assert(out_end == out_begin + out_length); - (void)!out_end; + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1359,21 +1384,24 @@ PUGI__NS_BEGIN template <typename opt_swap> PUGI__FN bool convert_buffer_utf16(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) { const uint16_t* data = static_cast<const uint16_t*>(contents); - size_t length = size / sizeof(uint16_t); + size_t data_length = size / sizeof(uint16_t); // first pass: get length in utf8 units - out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, length, 0); + size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf16_block(data, data_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert utf16 input to utf8 - uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer); - uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, length, out_begin); + uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer); + uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf16_block(data, data_length, obegin); + + assert(oend == obegin + length); + *oend = 0; - assert(out_end == out_begin + out_length); - (void)!out_end; + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1381,21 +1409,24 @@ PUGI__NS_BEGIN template <typename opt_swap> PUGI__FN bool convert_buffer_utf32(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, opt_swap) { const uint32_t* data = static_cast<const uint32_t*>(contents); - size_t length = size / sizeof(uint32_t); + size_t data_length = size / sizeof(uint32_t); // first pass: get length in utf8 units - out_length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, length, 0); + size_t length = utf_decoder<utf8_counter, opt_swap>::decode_utf32_block(data, data_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert utf32 input to utf8 - uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer); - uint8_t* out_end = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, length, out_begin); + uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer); + uint8_t* oend = utf_decoder<utf8_writer, opt_swap>::decode_utf32_block(data, data_length, obegin); - assert(out_end == out_begin + out_length); - (void)!out_end; + assert(oend == obegin + length); + *oend = 0; + + out_buffer = buffer; + out_length = length + 1; return true; } @@ -1412,32 +1443,36 @@ PUGI__NS_BEGIN PUGI__FN bool convert_buffer_latin1(char_t*& out_buffer, size_t& out_length, const void* contents, size_t size, bool is_mutable) { const uint8_t* data = static_cast<const uint8_t*>(contents); + size_t data_length = size; // get size of prefix that does not need utf8 conversion - size_t prefix_length = get_latin1_7bit_prefix_length(data, size); - assert(prefix_length <= size); + size_t prefix_length = get_latin1_7bit_prefix_length(data, data_length); + assert(prefix_length <= data_length); const uint8_t* postfix = data + prefix_length; - size_t postfix_length = size - prefix_length; + size_t postfix_length = data_length - prefix_length; // if no conversion is needed, just return the original buffer if (postfix_length == 0) return get_mutable_buffer(out_buffer, out_length, contents, size, is_mutable); // first pass: get length in utf8 units - out_length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0); + size_t length = prefix_length + utf_decoder<utf8_counter>::decode_latin1_block(postfix, postfix_length, 0); // allocate buffer of suitable length - out_buffer = static_cast<char_t*>(xml_memory::allocate((out_length > 0 ? out_length : 1) * sizeof(char_t))); - if (!out_buffer) return false; + char_t* buffer = static_cast<char_t*>(xml_memory::allocate((length + 1) * sizeof(char_t))); + if (!buffer) return false; // second pass: convert latin1 input to utf8 - memcpy(out_buffer, data, prefix_length); + memcpy(buffer, data, prefix_length); + + uint8_t* obegin = reinterpret_cast<uint8_t*>(buffer); + uint8_t* oend = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, obegin + prefix_length); - uint8_t* out_begin = reinterpret_cast<uint8_t*>(out_buffer); - uint8_t* out_end = utf_decoder<utf8_writer>::decode_latin1_block(postfix, postfix_length, out_begin + prefix_length); + assert(oend == obegin + length); + *oend = 0; - assert(out_end == out_begin + out_length); - (void)!out_end; + out_buffer = buffer; + out_length = length + 1; return true; } @@ -2182,6 +2217,10 @@ PUGI__NS_BEGIN // some control group s = parse_doctype_group(s, endch, false); if (!s) return s; + + // skip > + assert(*s == '>'); + s++; } } else if (s[0] == '<' || s[0] == '"' || s[0] == '\'') @@ -2192,8 +2231,6 @@ PUGI__NS_BEGIN } else if (*s == '>') { - s++; - return s; } else s++; @@ -2302,8 +2339,8 @@ PUGI__NS_BEGIN cursor->value = mark; - assert((s[0] == 0 && endch == '>') || s[-1] == '>'); - s[*s == 0 ? 0 : -1] = 0; + assert((*s == 0 && endch == '>') || *s == '>'); + if (*s) *s++ = 0; PUGI__POPNODE(); } @@ -2660,6 +2697,10 @@ PUGI__NS_BEGIN xml_parse_result result = make_parse_result(parser.error_status, parser.error_offset ? parser.error_offset - buffer : 0); assert(result.offset >= 0 && static_cast<size_t>(result.offset) <= length); + // roll back offset if it occurs on a null terminator in the source buffer + if (result.offset > 0 && static_cast<size_t>(result.offset) == length - 1 && endch == 0) + result.offset--; + // update allocator state alloc = parser.alloc; @@ -2667,7 +2708,7 @@ PUGI__NS_BEGIN if (result && endch == '<') { // there's no possible well-formed document with < at the end - return make_parse_result(status_unrecognized_tag, length); + return make_parse_result(status_unrecognized_tag, length - 1); } return result; @@ -3530,6 +3571,30 @@ PUGI__NS_BEGIN return status_ok; } + PUGI__FN size_t zero_terminate_buffer(void* buffer, size_t size, xml_encoding encoding) + { + // We only need to zero-terminate if encoding conversion does not do it for us + #ifdef PUGIXML_WCHAR_MODE + xml_encoding wchar_encoding = get_wchar_encoding(); + + if (encoding == wchar_encoding || need_endian_swap_utf(encoding, wchar_encoding)) + { + size_t length = size / sizeof(char_t); + + static_cast<char_t*>(buffer)[length] = 0; + return (length + 1) * sizeof(char_t); + } + #else + if (encoding == encoding_utf8) + { + static_cast<char*>(buffer)[size] = 0; + return size + 1; + } + #endif + + return size; + } + PUGI__FN xml_parse_result load_file_impl(xml_document& doc, FILE* file, unsigned int options, xml_encoding encoding) { if (!file) return make_parse_result(status_file_not_found); @@ -3544,8 +3609,10 @@ PUGI__NS_BEGIN return make_parse_result(size_status); } + size_t max_suffix_size = sizeof(char_t); + // allocate buffer for the whole file - char* contents = static_cast<char*>(xml_memory::allocate(size > 0 ? size : 1)); + char* contents = static_cast<char*>(xml_memory::allocate(size + max_suffix_size)); if (!contents) { @@ -3562,8 +3629,10 @@ PUGI__NS_BEGIN xml_memory::deallocate(contents); return make_parse_result(status_io_error); } + + xml_encoding real_encoding = get_buffer_encoding(encoding, contents, size); - return doc.load_buffer_inplace_own(contents, size, options, encoding); + return doc.load_buffer_inplace_own(contents, zero_terminate_buffer(contents, size, real_encoding), options, real_encoding); } #ifndef PUGIXML_NO_STL @@ -3629,8 +3698,10 @@ PUGI__NS_BEGIN total += chunk->size; } + size_t max_suffix_size = sizeof(char_t); + // copy chunk list to a contiguous buffer - char* buffer = static_cast<char*>(xml_memory::allocate(total)); + char* buffer = static_cast<char*>(xml_memory::allocate(total + max_suffix_size)); if (!buffer) return status_out_of_memory; char* write = buffer; @@ -3666,8 +3737,10 @@ PUGI__NS_BEGIN if (static_cast<std::streamsize>(read_length) != length || length < 0) return status_out_of_memory; + size_t max_suffix_size = sizeof(char_t); + // read stream data into memory (guard against stream exceptions with buffer holder) - buffer_holder buffer(xml_memory::allocate((read_length > 0 ? read_length : 1) * sizeof(T)), xml_memory::deallocate); + buffer_holder buffer(xml_memory::allocate(read_length * sizeof(T) + max_suffix_size), xml_memory::deallocate); if (!buffer.data) return status_out_of_memory; stream.read(static_cast<T*>(buffer.data), static_cast<std::streamsize>(read_length)); @@ -3678,7 +3751,7 @@ PUGI__NS_BEGIN // return buffer size_t actual_length = static_cast<size_t>(stream.gcount()); assert(actual_length <= read_length); - + *out_buffer = buffer.release(); *out_size = actual_length * sizeof(T); @@ -3705,7 +3778,9 @@ PUGI__NS_BEGIN if (status != status_ok) return make_parse_result(status); - return doc.load_buffer_inplace_own(buffer, size, options, encoding); + xml_encoding real_encoding = get_buffer_encoding(encoding, buffer, size); + + return doc.load_buffer_inplace_own(buffer, zero_terminate_buffer(buffer, size, real_encoding), options, real_encoding); } #endif diff --git a/tests/test_document.cpp b/tests/test_document.cpp index 7adc2a1..adc4bdb 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -1069,3 +1069,104 @@ TEST_XML(document_reset_copy_self, "<node><child/></node>") CHECK(!doc.first_child()); CHECK_NODE(doc, STR("")); } + +struct document_data_t +{ + xml_encoding encoding; + + const unsigned char* data; + size_t size; +}; + +#include <stdio.h> + +TEST(document_load_buffer_utf_truncated) +{ + const unsigned char utf8[] = {'<', 0xe2, 0x82, 0xac, '/', '>'}; + const unsigned char utf16_be[] = {0, '<', 0x20, 0xac, 0, '/', 0, '>'}; + const unsigned char utf16_le[] = {'<', 0, 0xac, 0x20, '/', 0, '>', 0}; + const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; + const unsigned char utf32_le[] = {'<', 0, 0, 0, 0xac, 0x20, 0, 0, '/', 0, 0, 0, '>', 0, 0, 0}; + + const document_data_t data[] = + { + { encoding_utf8, utf8, sizeof(utf8) }, + { encoding_utf16_be, utf16_be, sizeof(utf16_be) }, + { encoding_utf16_le, utf16_le, sizeof(utf16_le) }, + { encoding_utf32_be, utf32_be, sizeof(utf32_be) }, + { encoding_utf32_le, utf32_le, sizeof(utf32_le) }, + }; + + for (size_t i = 0; i < sizeof(data) / sizeof(data[0]); ++i) + { + const document_data_t& d = data[i]; + + for (size_t j = 0; j <= d.size; ++j) + { + char* buffer = new char[j]; + memcpy(buffer, d.data, j); + + xml_document doc; + xml_parse_result res = doc.load_buffer(buffer, j, parse_default, d.encoding); + + if (j == d.size) + { + CHECK(res); + + const char_t* name = doc.first_child().name(); + + #ifdef PUGIXML_WCHAR_MODE + CHECK(name[0] == 0x20ac && name[1] == 0); + #else + CHECK_STRING(name, "\xe2\x82\xac"); + #endif + } + else + { + CHECK(!res || !doc.first_child()); + } + + delete[] buffer; + } + } +} + +#ifndef PUGIXML_NO_STL +TEST(document_load_stream_truncated) +{ + const unsigned char utf32_be[] = {0, 0, 0, '<', 0, 0, 0x20, 0xac, 0, 0, 0, '/', 0, 0, 0, '>'}; + + for (size_t i = 0; i <= sizeof(utf32_be); ++i) + { + std::string prefix(reinterpret_cast<const char*>(utf32_be), i); + std::istringstream iss(prefix); + + xml_document doc; + xml_parse_result res = doc.load(iss); + + if (i == sizeof(utf32_be)) + { + CHECK(res); + } + else + { + CHECK(!res || !doc.first_child()); + + if (i < 8) + { + CHECK(!doc.first_child()); + } + else + { + const char_t* name = doc.first_child().name(); + + #ifdef PUGIXML_WCHAR_MODE + CHECK(name[0] == 0x20ac && name[1] == 0); + #else + CHECK_STRING(name, "\xe2\x82\xac"); + #endif + } + } + } +} +#endif
\ No newline at end of file diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 9a8bdf1..c165a65 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -313,12 +313,12 @@ TEST(parse_ws_pcdata_permutations) // current implementation of parse_ws_pcdata_single has an unfortunate bug; reproduce it here {4, STR("<node>\t\t<!---->\n\n</node>"), STR("<node>\n\n</node>"), 3}, // error case: terminate PCDATA in the middle - {7, STR("<node>abcdef"), STR("<node>abcde</node>"), -3}, - {7, STR("<node> "), STR("<node> </node>"), -3}, + {7, STR("<node>abcdef"), STR("<node>abcdef</node>"), -3}, + {7, STR("<node> "), STR("<node> </node>"), -3}, // error case: terminate PCDATA as early as possible {7, STR("<node>"), STR("<node />"), -2}, - {7, STR("<node>a"), STR("<node />"), -2}, - {7, STR("<node> "), STR("<node />"), -2}, + {7, STR("<node>a"), STR("<node>a</node>"), -3}, + {7, STR("<node> "), STR("<node> </node>"), -3}, }; for (size_t i = 0; i < sizeof(test_data) / sizeof(test_data[0]); ++i) @@ -805,7 +805,7 @@ TEST(parse_error_offset) CHECK_OFFSET("<3d/>", parse_default, status_unrecognized_tag, 1); CHECK_OFFSET(" <3d/>", parse_default, status_unrecognized_tag, 2); - CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 2); + CHECK_OFFSET(" <", parse_default, status_unrecognized_tag, 1); CHECK_OFFSET("<?pi", parse_default, status_bad_pi, 3); CHECK_OFFSET("<?pi", parse_default | parse_pi, status_bad_pi, 3); |