diff options
author | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-09-20 10:37:46 -0700 |
---|---|---|
committer | Arseny Kapoulkine <arseny.kapoulkine@gmail.com> | 2015-09-20 10:43:38 -0700 |
commit | ec0c9c5561785299d0c03ed05fe94e0031ba6487 (patch) | |
tree | d419f86bde522652621677dd3b45c0c54241cfaa /src | |
parent | bda55c818c0936fe5e41e381794018f0fc60b1a3 (diff) |
Implement custom string to integer conversion
This makes conversion significantly faster and removes more CRT dependencies;
in particular, to support long long, pugixml only requires the type itself (and
the division operator...).
The new implementation is up to 3x faster on short decimal numbers.
Note that, unlike the old implementation, the new implementation correctly handles
overflow and underflow and clamps the value to the representable range. This
means that there are some behavior changes - e.g. previously as_uint on "-1"
would return UINT_MAX instead of 0.
In addition to CRT issues, on platforms with a 64-bit long the old implementation
incorrectly truncated from long to int or unsigned int, so even if the CRT clamped
the values, the result would have been incorrect.
Diffstat (limited to 'src')
-rw-r--r-- | src/pugixml.cpp | 111 |
1 files changed, 63 insertions, 48 deletions
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 413e342..2df5394 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -20,6 +20,7 @@ #include <stdio.h> #include <string.h> #include <assert.h> +#include <limits.h> #ifdef PUGIXML_WCHAR_MODE # include <wchar.h> @@ -4429,39 +4430,81 @@ PUGI__NS_BEGIN } // get value with conversion functions - PUGI__FN int get_integer_base(const char_t* value) + template <typename U> U string_to_integer(const char_t* value, U minneg, U maxpos) { + U result = 0; const char_t* s = value; while (PUGI__IS_CHARTYPE(*s, ct_space)) s++; - if (*s == '-') - s++; + bool negative = (*s == '-'); + + s += negative; + + bool overflow = false; + + if (s[0] == '0' && (s[1] | ' ') == 'x') + { + s += 2; + + const char_t* start = s; + + for (;;) + { + if (static_cast<unsigned>(*s - '0') < 10) + result = result * 16 + (*s - '0'); + else if (static_cast<unsigned>((*s | ' ') - 'a') < 6) + result = result * 16 + ((*s | ' ') - 'a' + 10); + else + break; + + s++; + } + + size_t digits = static_cast<size_t>(s - start); + + overflow = digits > sizeof(U) * 2; + } + else + { + const char_t* start = s; + + for (;;) + { + if (static_cast<unsigned>(*s - '0') < 10) + result = result * 10 + (*s - '0'); + else + break; + + s++; + } + + size_t digits = static_cast<size_t>(s - start); + + PUGI__STATIC_ASSERT(sizeof(U) == 8 || sizeof(U) == 4 || sizeof(U) == 2); - return (s[0] == '0' && (s[1] | ' ') == 'x') ? 16 : 10; + const size_t max_digits10 = sizeof(U) == 8 ? 20 : sizeof(U) == 4 ? 10 : 5; + const char max_lead = sizeof(U) == 8 ? '1' : sizeof(U) == 4 ? '4' : '6'; + const size_t high_bit = sizeof(U) * 8 - 1; + + overflow = digits >= max_digits10 && !(digits == max_digits10 && (*start < max_lead || (*start == max_lead && result >> high_bit))); + } + + if (negative) + return (overflow || result > minneg) ? 0 - minneg : 0 - result; + else + return (overflow || result > maxpos) ? 
maxpos : result; } PUGI__FN int get_value_int(const char_t* value) { - int base = get_integer_base(value); - - #ifdef PUGIXML_WCHAR_MODE - return static_cast<int>(wcstol(value, 0, base)); - #else - return static_cast<int>(strtol(value, 0, base)); - #endif + return string_to_integer<unsigned int>(value, INT_MIN, INT_MAX); } PUGI__FN unsigned int get_value_uint(const char_t* value) { - int base = get_integer_base(value); - - #ifdef PUGIXML_WCHAR_MODE - return static_cast<unsigned int>(wcstoul(value, 0, base)); - #else - return static_cast<unsigned int>(strtoul(value, 0, base)); - #endif + return string_to_integer<unsigned int>(value, 0, UINT_MAX); } PUGI__FN double get_value_double(const char_t* value) @@ -4494,40 +4537,12 @@ PUGI__NS_BEGIN #ifdef PUGIXML_HAS_LONG_LONG PUGI__FN long long get_value_llong(const char_t* value) { - int base = get_integer_base(value); - - #ifdef PUGIXML_WCHAR_MODE - #ifdef PUGI__MSVC_CRT_VERSION - return _wcstoi64(value, 0, base); - #else - return wcstoll(value, 0, base); - #endif - #else - #ifdef PUGI__MSVC_CRT_VERSION - return _strtoi64(value, 0, base); - #else - return strtoll(value, 0, base); - #endif - #endif + return string_to_integer<unsigned long long>(value, LLONG_MIN, LLONG_MAX); } PUGI__FN unsigned long long get_value_ullong(const char_t* value) { - int base = get_integer_base(value); - - #ifdef PUGIXML_WCHAR_MODE - #ifdef PUGI__MSVC_CRT_VERSION - return _wcstoui64(value, 0, base); - #else - return wcstoull(value, 0, base); - #endif - #else - #ifdef PUGI__MSVC_CRT_VERSION - return _strtoui64(value, 0, base); - #else - return strtoull(value, 0, base); - #endif - #endif + return string_to_integer<unsigned long long>(value, 0, ULLONG_MAX); } #endif |