Add initial support for parse_embed_pcdata

When this flag is true, PCDATA value is saved to the parent element instead of
allocating a new node.

This prevents some documents from round-tripping since it loses information,
but can provide a significant memory reduction and parsing speedup for some
documents.
This commit is contained in:
Arseny Kapoulkine
2016-01-08 08:37:26 -08:00
parent ad3b492c1a
commit 2874f6f21d
2 changed files with 18 additions and 4 deletions

View File

@@ -3360,13 +3360,22 @@ PUGI__NS_BEGIN
if (cursor->parent || PUGI__OPTSET(parse_fragment))
{
PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
if (!PUGI__OPTSET(parse_embed_pcdata))
{
PUGI__PUSHNODE(node_pcdata); // Append a new node on the tree.
cursor->value = s; // Save the offset.
PUGI__POPNODE(); // Pop since this is a standalone.
}
else
{
if (cursor->parent && !cursor->value)
cursor->value = s; // Save the offset.
}
s = strconv_pcdata(s);
PUGI__POPNODE(); // Pop since this is a standalone.
if (!*s) break;
}
else

View File

@@ -158,6 +158,11 @@ namespace pugi
// is a valid document. This flag is off by default.
const unsigned int parse_fragment = 0x1000;
// This flag determines if plain character data is stored in the parent element's value. This significantly changes the structure of
// the document and does not allow some documents to round-trip; this flag is only recommended for parsing documents with a lot of
// PCDATA nodes in a very memory-constrained environment. This flag is off by default.
const unsigned int parse_embed_pcdata = 0x2000;
// The default parsing mode.
// Elements, PCDATA and CDATA sections are added to the DOM tree, character/reference entities are expanded,
// End-of-Line characters are normalized, attribute values are normalized using CDATA normalization rules.