diff options
| author | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2010-05-25 16:50:03 +0000 | 
|---|---|---|
| committer | arseny.kapoulkine <arseny.kapoulkine@99668b35-9821-0410-8761-19e4c4f06640> | 2010-05-25 16:50:03 +0000 | 
| commit | 6e2521830f255c6b35ac302363a241274a336c1c (patch) | |
| tree | c118920d4eac5dcccf31a4d9405be3a958324682 /src/pugixml.cpp | |
| parent | 1046fe1f7bdcc21506b1c4b6a3dc4ce88c77dcae (diff) | |
Optimized utf8 decoding
git-svn-id: http://pugixml.googlecode.com/svn/trunk@447 99668b35-9821-0410-8761-19e4c4f06640
Diffstat (limited to 'src/pugixml.cpp')
| -rw-r--r-- | src/pugixml.cpp | 29 | 
1 file changed, 23 insertions, 6 deletions
| diff --git a/src/pugixml.cpp b/src/pugixml.cpp index d404652..bf5253f 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -796,9 +796,7 @@ namespace pugi  		{
  			const char8_t utf8_byte_mask = 0x3f;
 -			const char8_t* end = data + size;
 -
 -			while (data < end)
 +			while (size)
  			{
  				char8_t lead = *data;
 @@ -807,29 +805,48 @@ namespace pugi  				{
  					result = Traits::low(result, lead);
  					data += 1;
 +					size -= 1;
 +
 +					// process aligned single-byte (ascii) blocks
 +					if ((reinterpret_cast<uintptr_t>(data) & 3) == 0)
 +					{
 +						while (size >= 4 && (*reinterpret_cast<const char32_t*>(data) & 0x80808080) == 0)
 +						{
 +							result = Traits::low(result, data[0]);
 +							result = Traits::low(result, data[1]);
 +							result = Traits::low(result, data[2]);
 +							result = Traits::low(result, data[3]);
 +							data += 4;
 +							size -= 4;
 +						}
 +					}
  				}
  				// 110xxxxx -> U+0080..U+07FF
 -				else if ((unsigned)(lead - 0xC0) < 0x20 && (end - data) > 1 && (data[1] & 0xc0) == 0x80)
 +				else if ((unsigned)(lead - 0xC0) < 0x20 && size >= 2 && (data[1] & 0xc0) == 0x80)
  				{
  					result = Traits::low(result, ((lead & ~0xC0) << 6) | (data[1] & utf8_byte_mask));
  					data += 2;
 +					size -= 2;
  				}
  				// 1110xxxx -> U+0800-U+FFFF
 -				else if ((unsigned)(lead - 0xE0) < 0x10 && (end - data) > 2 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
 +				else if ((unsigned)(lead - 0xE0) < 0x10 && size >= 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80)
  				{
  					result = Traits::low(result, ((lead & ~0xE0) << 12) | ((data[1] & utf8_byte_mask) << 6) | (data[2] & utf8_byte_mask));
  					data += 3;
 +					size -= 3;
  				}
  				// 11110xxx -> U+10000..U+10FFFF
 -				else if ((unsigned)(lead - 0xF0) < 0x08 && (end - data) > 3 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
 +				else if ((unsigned)(lead - 0xF0) < 0x08 && size >= 4 && (data[1] & 0xc0) == 0x80 && (data[2] & 0xc0) == 0x80 && (data[3] & 0xc0) == 0x80)
  				{
  					result = Traits::high(result, ((lead & ~0xF0) << 18) | ((data[1] & utf8_byte_mask) << 12) | ((data[2] & utf8_byte_mask) << 6) | (data[3] & utf8_byte_mask));
  					data += 4;
 +					size -= 4;
  				}
  				// 10xxxxxx or 11111xxx -> invalid
  				else
  				{
  					data += 1;
 +					size -= 1;
  				}
  			}
 | 
