From 45a1b743358f041cee6b590f4f419b3a3c4eb9b8 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sun, 5 Oct 2014 22:21:38 -0700 Subject: Initial compact storage prototype The storage uses one hash table for fallbacks and simple difference encoding for node and string pointers. This is a work in progress implementation - while node pointers seem to work properly, string encoding is inefficient and parent pointers could use more tuning. No performance or compatibility work has been done either. --- src/pugixml.cpp | 523 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 496 insertions(+), 27 deletions(-) diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 58191ae..c0a2980 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -145,6 +145,13 @@ PUGI__NS_BEGIN PUGI__NS_END #endif +// Micro compact helper +#ifdef PUGIXML_COMPACT +# define PUGI__COMPACT(p) ((p).get()) +#else +# define PUGI__COMPACT(p) (p) +#endif + // Memory allocation PUGI__NS_BEGIN PUGI__FN void* default_allocate(size_t size) @@ -293,6 +300,11 @@ PUGI__NS_BEGIN result->busy_size = 0; result->freed_size = 0; + #ifdef PUGIXML_COMPACT + result->compact_string_base = 0; + result->compact_parent = 0; + #endif + return result; } @@ -303,6 +315,11 @@ PUGI__NS_BEGIN size_t busy_size; size_t freed_size; + + #ifdef PUGIXML_COMPACT + char_t* compact_string_base; + void* compact_parent; + #endif }; struct xml_memory_string_header @@ -315,6 +332,9 @@ PUGI__NS_BEGIN { xml_allocator(xml_memory_page* root): _root(root), _busy_size(root->busy_size) { + #ifdef PUGIXML_COMPACT + _hash = 0; + #endif } xml_memory_page* allocate_page(size_t data_size) @@ -446,6 +466,10 @@ PUGI__NS_BEGIN xml_memory_page* _root; size_t _busy_size; + + #ifdef PUGIXML_COMPACT + class compact_hash_table* _hash; + #endif }; PUGI__FN_NO_INLINE void* xml_allocator::allocate_memory_oob(size_t size, xml_memory_page*& out_page) @@ -488,6 +512,429 @@ PUGI__NS_BEGIN } PUGI__NS_END +#ifdef PUGIXML_COMPACT +size_t pugi_compact_stats[128]; + +PUGI__NS_BEGIN + class compact_hash_table + { + public: + compact_hash_table(): _items(0), _capacity(0), _count(0) + { + } + + void clear() + { + if (_items) + { + xml_memory::deallocate(_items); + _items = 0; + _capacity = 0; + _count = 0; + } + } + + void** find(const void* key) + { + assert(key); + + if (_capacity == 0) return 0; + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) + { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == key) + return &probe_item.value; + + if (probe_item.key == 0) + return 0; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + void** insert(const void* key, size_t tag) + { + assert(key); + + if (_count >= _capacity * 3 / 4) + rehash(); + + size_t hashmod = _capacity - 1; + size_t bucket = hash(key) & hashmod; + + for (size_t probe = 0; probe <= hashmod; ++probe) + { + item_t& probe_item = _items[bucket]; + + if (probe_item.key == 0) + { + if (tag) + pugi_compact_stats[tag]++; + + probe_item.key = key; + _count++; + return &probe_item.value; + } + + if (probe_item.key == key) + return &probe_item.value; + + // hash collision, quadratic probing + bucket = (bucket + probe + 1) & hashmod; + } + + assert(!"Hash table is full"); + return 0; + } + + private: + struct item_t + { + const void* key; + void* value; + }; + + item_t* _items; + size_t _capacity; + + size_t _count; + + void rehash() + { + compact_hash_table rt; + rt._capacity = (_capacity == 0) ? 256 : _capacity * 2; + rt._items = static_cast(xml_memory::allocate(sizeof(item_t) * rt._capacity)); + + assert(rt._items); + + memset(rt._items, 0, sizeof(item_t) * rt._capacity); + + for (size_t i = 0; i < _capacity; ++i) + if (_items[i].key) + *rt.insert(_items[i].key, 0) = _items[i].value; + + if (_items) + xml_memory::deallocate(_items); + + _capacity = rt._capacity; + _items = rt._items; + } + + // http://burtleburtle.net/bob/hash/integer.html + static unsigned int hash(const void* key) + { + unsigned int a = static_cast(reinterpret_cast(key)); + + a = (a ^ 61) ^ (a >> 16); + a = a + (a << 3); + a = a ^ (a >> 4); + a = a * 0x27d4eb2d; + a = a ^ (a >> 15); + + return a; + } + }; + + typedef uint32_t compact_alignment; + + class compact_header + { + public: + compact_header(xml_memory_page* page, unsigned int flags) + { + ptrdiff_t page_offset = reinterpret_cast(this) - reinterpret_cast(page); + assert(page_offset >= 0 && page_offset < (1 << 16)); + + this->page0 = static_cast(page_offset); + this->page1 = static_cast(page_offset >> 8); + this->flags = static_cast(flags); + } + + void operator&=(unsigned int modflags) + { + flags &= modflags; + } + + void operator|=(unsigned int modflags) + { + flags |= modflags; + } + + operator uintptr_t const() const + { + return reinterpret_cast(get_page()) | flags; + } + + xml_memory_page* get_page() const + { + unsigned int page_offset = page0 + (page1 << 8); + + return const_cast(reinterpret_cast(reinterpret_cast(this) - page_offset)); + } + + private: + unsigned char page0, page1; + unsigned char flags; + }; + + xml_memory_page* compact_get_page(const void* object, int header_offset) + { + const compact_header* header = reinterpret_cast(static_cast(object) - header_offset); + + return header->get_page(); + } + + template class compact_pointer + { + public: + compact_pointer(): _data(0) + { + } + + void operator=(const compact_pointer& rhs) + { + *this = rhs.get(); + } + + void operator=(T* value_) + { + if (value_) + { + compact_alignment* base = get_base(); + compact_alignment* value = reinterpret_cast(value_); + + if (direction > 0) + { + if (value >= base && value <= base + 253) + _data = static_cast((value - base) + 1); + else + { + *compact_get_page(this, header_offset)->allocator->_hash->insert(this, tag) = value; + + _data = 255; + } + } + else if (direction < 0) + { + if (value <= base && value >= base - 252) + _data = static_cast((base - value) + 1); + else + { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (page->compact_parent == 0) + { + page->compact_parent = value; + _data = 254; + } + else if (page->compact_parent == value) + { + _data = 254; + } + else + { + *page->allocator->_hash->insert(this, tag) = value; + _data = 255; + } + } + } + else + { + if (value >= base - 126 && value <= base + 127) + _data = static_cast((value - base) + 127); + else + { + *compact_get_page(this, header_offset)->allocator->_hash->insert(this, tag) = value; + + _data = 255; + } + } + } + else + _data = 0; + } + + operator T* const() const + { + if (_data) + { + if (_data == 255) + return static_cast(*compact_get_page(this, header_offset)->allocator->_hash->find(this)); + else if (direction < 0 && _data == 254) + return static_cast(compact_get_page(this, header_offset)->compact_parent); + else + { + compact_alignment* base = get_base(); + + if (direction > 0) + return reinterpret_cast(base + (_data - 1)); + else if (direction < 0) + return reinterpret_cast(base - (_data - 1)); + else + return reinterpret_cast(base + (_data - 127)); + } + } + else + return 0; + } + + T* operator->() const + { + return operator T* const(); + } + + T* get() const + { + return operator T* const(); + } + + private: + unsigned char _data; + + compact_alignment* get_base() const + { + if (direction > 0) + return reinterpret_cast((reinterpret_cast(this) + sizeof(compact_alignment)) & ~(sizeof(compact_alignment) - 1)); + else + return reinterpret_cast(reinterpret_cast(this) & ~(sizeof(compact_alignment) - 1)); + } + }; + + template class compact_string + { + public: + compact_string(): _data(0) + { + } + + void operator=(const compact_string& rhs) + { + *this = rhs.get(); + } + + void operator=(char_t* value) + { + if (value) + { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (page->compact_string_base == 0) + { + page->compact_string_base = value; + + _data = 1; + } + else + { + ptrdiff_t offset = value - page->compact_string_base; + + if (offset >= 0 && offset <= 65533) + _data = static_cast(offset + 1); + else + { + *page->allocator->_hash->insert(this, tag) = value; + + _data = 65535; + } + } + } + else + _data = 0; + } + + operator char_t* const() const + { + if (_data) + { + xml_memory_page* page = compact_get_page(this, header_offset); + + if (_data < 65535) + { + return page->compact_string_base + (_data - 1); + } + else + { + return static_cast(*page->allocator->_hash->find(this)); + } + } + else + return 0; + } + + char_t* get() const + { + return operator char_t* const(); + } + + private: + unsigned short _data; + }; + +PUGI__NS_END +#endif + +#ifdef PUGIXML_COMPACT +namespace pugi +{ + /// A 'name=value' XML attribute structure. + struct xml_attribute_struct + { + /// Default ctor + xml_attribute_struct(impl::xml_memory_page* page): header(page, 0) + { + PUGI__STATIC_ASSERT(sizeof(xml_attribute_struct) == 12); + + pugi_compact_stats[10]++; + } + + impl::compact_header header; + + unsigned char padding[3]; + + impl::compact_string<6, /*tag*/11> name; ///< Pointer to attribute name. + impl::compact_string<8, /*tag*/12> value; ///< Pointer to attribute value. + + impl::compact_pointer prev_attribute_c; ///< Previous attribute (cyclic list) + impl::compact_pointer next_attribute; ///< Next attribute + }; + + /// An XML document tree node. + struct xml_node_struct + { + /// Default ctor + /// \param type - node type + xml_node_struct(impl::xml_memory_page* page, xml_node_type type): header(page, type - 1) + { + PUGI__STATIC_ASSERT(sizeof(xml_node_struct) == 12); + + pugi_compact_stats[20]++; + } + + impl::compact_header header; + + impl::compact_pointer parent; ///< Pointer to parent + + impl::compact_string<4, /*tag*/21> name; ///< Pointer to element name. + impl::compact_string<6, /*tag*/22> value; ///< Pointer to any associated string data. + + impl::compact_pointer first_child; ///< First child + + impl::compact_pointer prev_sibling_c; ///< Left brother (cyclic list) + impl::compact_pointer next_sibling; ///< Right brother + + impl::compact_pointer first_attribute; ///< First attribute + }; +} +#else namespace pugi { /// A 'name=value' XML attribute structure. @@ -531,6 +978,7 @@ namespace pugi xml_attribute_struct* first_attribute; ///< First attribute }; } +#endif PUGI__NS_BEGIN struct xml_extra_buffer @@ -543,11 +991,18 @@ PUGI__NS_BEGIN { xml_document_struct(xml_memory_page* page): xml_node_struct(page, node_document), xml_allocator(page), buffer(0), extra_buffers(0) { + #ifdef PUGIXML_COMPACT + _hash = &hash; + #endif } const char_t* buffer; xml_extra_buffer* extra_buffers; + + #ifdef PUGIXML_COMPACT + compact_hash_table hash; + #endif }; inline xml_allocator& get_allocator(const xml_node_struct* node) @@ -1679,7 +2134,8 @@ PUGI__NS_BEGIN return target_length >= length && (target_length < reuse_threshold || target_length - length < target_length / 2); } - PUGI__FN bool strcpy_insitu(char_t*& dest, uintptr_t& header, uintptr_t header_mask, const char_t* source) + template + PUGI__FN bool strcpy_insitu(String& dest, Header& header, uintptr_t header_mask, const char_t* source) { assert(header); @@ -2826,7 +3282,7 @@ PUGI__NS_BEGIN return make_parse_result(PUGI__OPTSET(parse_fragment) ? status_ok : status_no_document_element); // get last child of the root before parsing - xml_node_struct* last_root_child = root->first_child ? root->first_child->prev_sibling_c : 0; + xml_node_struct* last_root_child = root->first_child ? PUGI__COMPACT(root->first_child->prev_sibling_c) : 0; // create parser on stack xml_parser parser(alloc_); @@ -2854,7 +3310,7 @@ PUGI__NS_BEGIN return make_parse_result(status_unrecognized_tag, length - 1); // check if there are any element nodes parsed - xml_node_struct* first_root_child_parsed = last_root_child ? last_root_child->next_sibling : root->first_child; + xml_node_struct* first_root_child_parsed = last_root_child ? PUGI__COMPACT(last_root_child->next_sibling) : root->first_child; if (!PUGI__OPTSET(parse_fragment) && !has_element_node_siblings(first_root_child_parsed)) return make_parse_result(status_no_document_element, length - 1); @@ -3612,7 +4068,8 @@ PUGI__NS_BEGIN return true; } - PUGI__FN void node_copy_string(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char_t* source, uintptr_t& source_header, xml_allocator* alloc) + template + PUGI__FN void node_copy_string(String& dest, Header& header, uintptr_t header_mask, char_t* source, Header& source_header, xml_allocator* alloc) { if (source) { @@ -3813,7 +4270,8 @@ PUGI__NS_BEGIN #endif // set value with conversion functions - PUGI__FN bool set_value_buffer(char_t*& dest, uintptr_t& header, uintptr_t header_mask, char (&buf)[128]) + template + PUGI__FN bool set_value_buffer(String& dest, Header& header, uintptr_t header_mask, char (&buf)[128]) { #ifdef PUGIXML_WCHAR_MODE char_t wbuf[128]; @@ -3825,7 +4283,8 @@ PUGI__NS_BEGIN #endif } - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, int value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, int value) { char buf[128]; sprintf(buf, "%d", value); @@ -3833,7 +4292,8 @@ PUGI__NS_BEGIN return set_value_buffer(dest, header, header_mask, buf); } - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned int value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned int value) { char buf[128]; sprintf(buf, "%u", value); @@ -3841,7 +4301,8 @@ PUGI__NS_BEGIN return set_value_buffer(dest, header, header_mask, buf); } - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, double value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, double value) { char buf[128]; sprintf(buf, "%g", value); @@ -3849,13 +4310,15 @@ PUGI__NS_BEGIN return set_value_buffer(dest, header, header_mask, buf); } - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, bool value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, bool value) { return strcpy_insitu(dest, header, header_mask, value ? PUGIXML_TEXT("true") : PUGIXML_TEXT("false")); } #ifdef PUGIXML_HAS_LONG_LONG - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, long long value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, long long value) { char buf[128]; sprintf(buf, "%lld", value); @@ -3863,7 +4326,8 @@ PUGI__NS_BEGIN return set_value_buffer(dest, header, header_mask, buf); } - PUGI__FN bool set_value_convert(char_t*& dest, uintptr_t& header, uintptr_t header_mask, unsigned long long value) + template + PUGI__FN bool set_value_convert(String& dest, Header& header, uintptr_t header_mask, unsigned long long value) { char buf[128]; sprintf(buf, "%llu", value); @@ -4348,38 +4812,38 @@ namespace pugi PUGI__FN int xml_attribute::as_int(int def) const { - return impl::get_value_int(_attr ? _attr->value : 0, def); + return impl::get_value_int(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } PUGI__FN unsigned int xml_attribute::as_uint(unsigned int def) const { - return impl::get_value_uint(_attr ? _attr->value : 0, def); + return impl::get_value_uint(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } PUGI__FN double xml_attribute::as_double(double def) const { - return impl::get_value_double(_attr ? _attr->value : 0, def); + return impl::get_value_double(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } PUGI__FN float xml_attribute::as_float(float def) const { - return impl::get_value_float(_attr ? _attr->value : 0, def); + return impl::get_value_float(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } PUGI__FN bool xml_attribute::as_bool(bool def) const { - return impl::get_value_bool(_attr ? _attr->value : 0, def); + return impl::get_value_bool(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } #ifdef PUGIXML_HAS_LONG_LONG PUGI__FN long long xml_attribute::as_llong(long long def) const { - return impl::get_value_llong(_attr ? _attr->value : 0, def); + return impl::get_value_llong(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } PUGI__FN unsigned long long xml_attribute::as_ullong(unsigned long long def) const { - return impl::get_value_ullong(_attr ? _attr->value : 0, def); + return impl::get_value_ullong(_attr ? PUGI__COMPACT(_attr->value) : 0, def); } #endif @@ -4546,7 +5010,7 @@ namespace pugi PUGI__FN xml_node::iterator xml_node::begin() const { - return iterator(_root ? _root->first_child : 0, _root); + return iterator(_root ? PUGI__COMPACT(_root->first_child) : 0, _root); } PUGI__FN xml_node::iterator xml_node::end() const @@ -4556,7 +5020,7 @@ namespace pugi PUGI__FN xml_node::attribute_iterator xml_node::attributes_begin() const { - return attribute_iterator(_root ? _root->first_attribute : 0, _root); + return attribute_iterator(_root ? PUGI__COMPACT(_root->first_attribute) : 0, _root); } PUGI__FN xml_node::attribute_iterator xml_node::attributes_end() const @@ -5452,35 +5916,35 @@ namespace pugi { xml_node_struct* d = _data(); - return impl::get_value_int(d ? d->value : 0, def); + return impl::get_value_int(d ? PUGI__COMPACT(d->value) : 0, def); } PUGI__FN unsigned int xml_text::as_uint(unsigned int def) const { xml_node_struct* d = _data(); - return impl::get_value_uint(d ? d->value : 0, def); + return impl::get_value_uint(d ? PUGI__COMPACT(d->value) : 0, def); } PUGI__FN double xml_text::as_double(double def) const { xml_node_struct* d = _data(); - return impl::get_value_double(d ? d->value : 0, def); + return impl::get_value_double(d ? PUGI__COMPACT(d->value) : 0, def); } PUGI__FN float xml_text::as_float(float def) const { xml_node_struct* d = _data(); - return impl::get_value_float(d ? d->value : 0, def); + return impl::get_value_float(d ? PUGI__COMPACT(d->value) : 0, def); } PUGI__FN bool xml_text::as_bool(bool def) const { xml_node_struct* d = _data(); - return impl::get_value_bool(d ? d->value : 0, def); + return impl::get_value_bool(d ? PUGI__COMPACT(d->value) : 0, def); } #ifdef PUGIXML_HAS_LONG_LONG @@ -5488,14 +5952,14 @@ namespace pugi { xml_node_struct* d = _data(); - return impl::get_value_llong(d ? d->value : 0, def); + return impl::get_value_llong(d ? PUGI__COMPACT(d->value) : 0, def); } PUGI__FN unsigned long long xml_text::as_ullong(unsigned long long def) const { xml_node_struct* d = _data(); - return impl::get_value_ullong(d ? d->value : 0, def); + return impl::get_value_ullong(d ? PUGI__COMPACT(d->value) : 0, def); } #endif @@ -5924,6 +6388,11 @@ namespace pugi page = next; } + #ifdef PUGIXML_COMPACT + // destroy hash table + static_cast(_root)->hash.clear(); + #endif + _root = 0; } -- cgit v1.2.3