diff options
-rw-r--r-- | docs/index.html | 14 | ||||
-rw-r--r-- | src/pugixml.cpp | 62 | ||||
-rw-r--r-- | src/pugixml.hpp | 2 |
3 files changed, 35 insertions, 43 deletions
diff --git a/docs/index.html b/docs/index.html index 4066dc4..c843bdb 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ <h2>Introduction</h2>
<p><i>pugixml</i> is just another XML parser. This is a successor to
<a href="http://www.codeproject.com/soap/pugxml.asp">pugxml</a> (well, to be honest, the only part
-that is left as is is wildcard matching code, the rest was either heavily refactored or rewritten
+that is left as-is is the wildcard matching code; the rest was either heavily refactored or rewritten
from scratch). The main features (call it USP) are:</p>
<ul>
@@ -59,7 +59,7 @@ mode, with the exception of DTD related issues and XML namespaces)</li> like <i>expat</i> will; it will try to recover the state even if meeting an error (like finding matching
tags for closing ones); it will parse files with data in wrong encoding; and so on)</li>
<li>clean interface (a heavily refactored pugxml's one)</li>
-<li>more or less unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
+<li>more or less Unicode-aware (actually, it assumes UTF-8 encoding of the input data, though
it will readily work with ANSI - no UTF-16 for now (see <a href="#Future_work">Future work</a>), with
helper conversion functions (UTF-8 <-> UTF-16/32 (whatever is the default for std::wstring & wchar_t)))</li>
<li>fully standard compliant code (approved by <a href="http://www.comeaucomputing.com/tryitout/">Comeau</a>
@@ -238,16 +238,16 @@ be just skipped:</p> <ul>
<li>If <b>parse_pi</b> is on, then processing instructions (<b><? ... ?></b>) are put into DOM
-tree (with node type <b>node_pi</b>, otherwise they are discarded. Note that for now the prolog
+tree (with node type <b>node_pi</b>), otherwise they are discarded. Note that for now the prolog
(<?xml ... ?>) is parsed as a processing instruction.
<br>Default value: off
<br>In W3C mode: on</li>
<li>If <b>parse_comments</b> is on, then comments (<b><!-- ... --></b>) are put into DOM
-tree (with node type <b>node_comment</b>, otherwise they are discarded.
+tree (with node type <b>node_comment</b>), otherwise they are discarded.
<br>Default value: off
<br>In W3C mode: on</li>
<li>If <b>parse_cdata</b> is on, then the content of CDATA section (<b><![CDATA[ ... ]]></b>)
-is put into DOM tree (with node type <b>node_cdata</b>, otherwise it is discarded.
+is put into DOM tree (with node type <b>node_cdata</b>), otherwise it is discarded.
<br>Default value: on
<br>In W3C mode: on</li>
<li>If <b>parse_ws_pcdata</b> is off, then the content of PCDATA section (it's the plain text
@@ -282,7 +282,7 @@ and for attribute values (replacing &lt; with <, &#x4c; with L, etc.). <li>If <b>parse_wnorm_attribute</b> is on, then the whitespace normalisation is done for attribute
values (this includes replacing any space-like character by a space character, converting sequences of
spaces into a single space and trimming of leading/trailing spaces)
-<br>Default value: on
+<br>Default value: off
<br>In W3C mode: off</li>
<li>If <b>parse_wconv_attribute</b> is on, then the whitespace conversion is done for attribute
values (this is a subset of whitespace normalization, and includes only replacing space-like characters
@@ -324,7 +324,7 @@ These are: <p>A couple of words on flag usage. The parsing options are just a set of bits, with each bit corresponding
to one flag. You can turn the flag on by OR-ing the options value with this flag's constant:
<pre>
- parse_w3c | parse_wnorm_pcdata
+ parse_w3c | parse_wnorm_attribute
</pre>
or turn the flag off by AND-ing the options value with the NEGation of this flag's constant:
<pre>
diff --git a/src/pugixml.cpp b/src/pugixml.cpp index da53c66..cf74eed 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -3,8 +3,6 @@ // Pug Improved XML Parser - Version 0.2
// --------------------------------------------------------
// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
-// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
-// conversion functions.
// This work is based on the pugxml parser, which is:
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
// Released into the Public Domain. Use at your own risk.
@@ -120,29 +118,30 @@ namespace ct_parse_attr_ws = 4, // \0, &, \r, ', ", \n, space, tab
ct_space = 8, // \r, \n, space, tab
ct_parse_cdata = 16, // \0, ], >, \r
- ct_parse_comment = 32 // \0, -, >, \r
+ ct_parse_comment = 32, // \0, -, >, \r
+ ct_symbol = 64 // Any symbol > 127, a-z, A-Z, 0-9, _, :, -, .
};
-
+
static unsigned char chartype_table[256] =
{
- 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
- 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 32, 0, 0, // 32-47
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 48, 0, // 48-63
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 64-79
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 0, 0, // 80-95
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 96-111
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 112-127
-
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ 55, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 0, 63, 0, 0, // 0-15
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 16-31
+ 12, 0, 6, 0, 0, 0, 7, 6, 0, 0, 0, 0, 0, 96, 64, 0, // 32-47
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 1, 0, 48, 0, // 48-63
+ 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 64-79
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 16, 0, 64, // 80-95
+ 0, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, // 96-111
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 0, 0, 0, 0, 0, // 112-127
+
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};
bool is_chartype(char c, chartype ct)
@@ -275,9 +274,6 @@ namespace pugi struct xml_parser_impl
{
xml_allocator& alloc;
- bool chartype_symbol_table[256];
-
- bool chartype_symbol(char c) const { return chartype_symbol_table[(unsigned char)c]; }
struct gap
{
@@ -724,8 +720,6 @@ namespace pugi {
for (unsigned int c = 0; c < 256; ++c)
{
- chartype_symbol_table[c] = c > 127 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
- (c >= '0' && c <= '9') || c == '_' || c == ':' || c == '-' || c == '.';
}
}
@@ -756,10 +750,10 @@ namespace pugi if(*s == '?') // '<?...'
{
++s;
- if(chartype_symbol(*s) && OPTSET(parse_pi))
+ if(is_chartype(*s, ct_symbol) && OPTSET(parse_pi))
{
mark = s;
- SCANWHILE(chartype_symbol(*s)); // Read PI target
+ SCANWHILE(is_chartype(*s, ct_symbol)); // Read PI target
ENDSEG();
PUSHNODE(node_pi); // Append a new node on the tree.
@@ -900,12 +894,12 @@ namespace pugi continue;
}
}
- else if(chartype_symbol(*s)) // '<#...'
+ else if(is_chartype(*s, ct_symbol)) // '<#...'
{
cursor = append_node(cursor); // Append a new node to the tree.
cursor->name = s;
- SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
+ SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
ENDSEG(); // Save char in 'ch', terminate & step over.
if (*s!=0 && ch == '/') // '</...'
{
@@ -923,11 +917,11 @@ namespace pugi {
SKIPWS(); // Eat any whitespace.
LOC_ATTRIBUTE:
- if(chartype_symbol(*s)) // <... #...
+ if(is_chartype(*s, ct_symbol)) // <... #...
{
xml_attribute_struct* a = append_attribute(cursor); // Make space for this attribute.
a->name = s; // Save the offset.
- SCANWHILE(chartype_symbol(*s)); // Scan for a terminator.
+ SCANWHILE(is_chartype(*s, ct_symbol)); // Scan for a terminator.
ENDSEG(); // Save char in 'ch', terminate & step over.
if(*s!=0 && is_chartype(ch, ct_space)) SKIPWS(); // Eat any whitespace.
if(*s!=0 && (ch == '=' || *s == '=')) // '<... #=...'
@@ -1040,7 +1034,7 @@ namespace pugi if (name)
{
- while (*tagname && chartype_symbol(*tagname))
+ while (*tagname && is_chartype(*tagname, ct_symbol))
{
if (*tagname++ != *name++) goto TAG_NEXTMATCH;
}
@@ -1063,7 +1057,7 @@ namespace pugi char* name = cursor->name;
if (!name) return s;
- while (*s && chartype_symbol(*s))
+ while (*s && is_chartype(*s, ct_symbol))
{
if (*s++ != *name++) return s;
}
diff --git a/src/pugixml.hpp b/src/pugixml.hpp index b7ded37..de7939e 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -3,8 +3,6 @@ // Pug Improved XML Parser - Version 0.2
// --------------------------------------------------------
// Copyright (C) 2006-2007, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
-// Thanks to Palvelev Artyom (cppguru@mail.ru) for hints about optimizing
-// conversion functions.
// This work is based on the pugxml parser, which is:
// Copyright (C) 2003, by Kristen Wegner (kristen@tima.net)
// Released into the Public Domain. Use at your own risk.
|