From 0a747e6c1aba8218bda284477701044328fc50bb Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine <arseny.kapoulkine@gmail.com>
Date: Tue, 25 Feb 2014 03:41:54 +0000
Subject: Add parse_trim_pcdata parse option.

git-svn-id: https://pugixml.googlecode.com/svn/trunk@987 99668b35-9821-0410-8761-19e4c4f06640
---
 src/pugixml.cpp | 45 ++++++++++++++++++++++++++++++++-------------
 src/pugixml.hpp |  5 ++++-
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/src/pugixml.cpp b/src/pugixml.cpp
index 634192a..754f92f 100644
--- a/src/pugixml.cpp
+++ b/src/pugixml.cpp
@@ -1875,19 +1875,27 @@ PUGI__NS_BEGIN
 	
 	typedef char_t* (*strconv_pcdata_t)(char_t*);
 		
-	template <typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
+	template <typename opt_trim, typename opt_eol, typename opt_escape> struct strconv_pcdata_impl
 	{
 		static char_t* parse(char_t* s)
 		{
 			gap g;
-			
+
+			char_t* begin = s;
+
 			while (true)
 			{
 				while (!PUGI__IS_CHARTYPE(*s, ct_parse_pcdata)) ++s;
 					
 				if (*s == '<') // PCDATA ends here
 				{
-					*g.flush(s) = 0;
+					char_t* end = g.flush(s);
+
+					if (opt_trim::value)
+						while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+							--end;
+
+					*end = 0;
 					
 					return s + 1;
 				}
@@ -1903,8 +1911,14 @@ PUGI__NS_BEGIN
 				}
 				else if (*s == 0)
 				{
-					*g.flush(s) = 0;
-					
+					char_t* end = g.flush(s);
+
+					if (opt_trim::value)
+						while (end > begin && PUGI__IS_CHARTYPE(end[-1], ct_space))
+							--end;
+
+					*end = 0;
+
 					return s;
 				}
 				else ++s;
@@ -1914,14 +1928,18 @@ PUGI__NS_BEGIN
 	
 	PUGI__FN strconv_pcdata_t get_strconv_pcdata(unsigned int optmask)
 	{
-		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20);
+		PUGI__STATIC_ASSERT(parse_escapes == 0x10 && parse_eol == 0x20 && parse_trim_pcdata == 0x0800);
 
-		switch ((optmask >> 4) & 3) // get bitmask for flags (eol escapes)
+		switch (((optmask >> 4) & 3) | ((optmask >> 9) & 4)) // get bitmask for flags (eol escapes trim)
 		{
-		case 0: return strconv_pcdata_impl<opt_false, opt_false>::parse;
-		case 1: return strconv_pcdata_impl<opt_false, opt_true>::parse;
-		case 2: return strconv_pcdata_impl<opt_true, opt_false>::parse;
-		case 3: return strconv_pcdata_impl<opt_true, opt_true>::parse;
+		case 0: return strconv_pcdata_impl<opt_false, opt_false, opt_false>::parse;
+		case 1: return strconv_pcdata_impl<opt_false, opt_false, opt_true>::parse;
+		case 2: return strconv_pcdata_impl<opt_false, opt_true, opt_false>::parse;
+		case 3: return strconv_pcdata_impl<opt_false, opt_true, opt_true>::parse;
+		case 4: return strconv_pcdata_impl<opt_true, opt_false, opt_false>::parse;
+		case 5: return strconv_pcdata_impl<opt_true, opt_false, opt_true>::parse;
+		case 6: return strconv_pcdata_impl<opt_true, opt_true, opt_false>::parse;
+		case 7: return strconv_pcdata_impl<opt_true, opt_true, opt_true>::parse;
 		default: assert(false); return 0; // should not get here
 		}
 	}
@@ -2636,7 +2654,7 @@ PUGI__NS_BEGIN
 						// We skipped some whitespace characters because otherwise we would take the tag branch instead of PCDATA one
 						assert(mark != s);
 
-						if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single))
+						if (!PUGI__OPTSET(parse_ws_pcdata | parse_ws_pcdata_single) || PUGI__OPTSET(parse_trim_pcdata))
 						{
 							continue;
 						}
@@ -2646,7 +2664,8 @@ PUGI__NS_BEGIN
 						}
 					}
 
-					s = mark;
+					if (!PUGI__OPTSET(parse_trim_pcdata))
+						s = mark;
 							
 					if (cursor->parent || PUGI__OPTSET(parse_fragment))
 					{
diff --git a/src/pugixml.hpp b/src/pugixml.hpp
index e5009fe..b912127 100644
--- a/src/pugixml.hpp
+++ b/src/pugixml.hpp
@@ -151,9 +151,12 @@ namespace pugi
 	// This flag is off by default; turning it on may result in slower parsing and more memory consumption.
 	const unsigned int parse_ws_pcdata_single = 0x0400;
 
+	// This flag determines if leading and trailing whitespace is to be removed from plain character data. This flag is off by default.
+	const unsigned int parse_trim_pcdata = 0x0800;
+
 	// This flag determines if plain character data that does not have a parent node is added to the DOM tree, and if an empty document
 	// is a valid document. This flag is off by default.
-	const unsigned int parse_fragment = 0x0800;
+	const unsigned int parse_fragment = 0x1000;
 
 	// The default parsing mode.
 	// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
-- 
cgit v1.2.3