|
Added
Link Here
|
| 1 |
From 85b1792e37b131e7a51af98a37f92472e8de5f3f Mon Sep 17 00:00:00 2001 |
| 2 |
From: Nick Wellnhofer <wellnhofer@aevum.de> |
| 3 |
Date: Tue, 18 May 2021 20:08:28 +0200 |
| 4 |
Subject: [PATCH] Work around lxml API abuse |
| 5 |
|
| 6 |
Make xmlNodeDumpOutput and htmlNodeDumpFormatOutput work with corrupted |
| 7 |
parent pointers. This used to work with the old recursive code, but the |
| 8 |
non-recursive rewrite required parent pointers to be set correctly. |
| 9 |
|
| 10 |
Unfortunately, lxml relies on the old behavior and passes subtrees with |
| 11 |
a corrupted structure. Fall back to a recursive function call if an |
| 12 |
invalid parent pointer is detected. |
| 13 |
|
| 14 |
Fixes #255. |
| 15 |
--- |
| 16 |
HTMLtree.c | 46 ++++++++++++++++++++++++++++------------------ |
| 17 |
xmlsave.c | 31 +++++++++++++++++++++---------- |
| 18 |
2 files changed, 49 insertions(+), 28 deletions(-) |
| 19 |
|
| 20 |
diff --git a/HTMLtree.c b/HTMLtree.c |
| 21 |
index 24434d45..bdd639c7 100644 |
| 22 |
--- HTMLtree.c.orig |
| 23 |
+++ HTMLtree.c |
| 24 |
@@ -744,7 +744,7 @@ void |
| 25 |
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 26 |
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED, |
| 27 |
int format) { |
| 28 |
- xmlNodePtr root; |
| 29 |
+ xmlNodePtr root, parent; |
| 30 |
xmlAttrPtr attr; |
| 31 |
const htmlElemDesc * info; |
| 32 |
|
| 33 |
@@ -755,6 +755,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 34 |
} |
| 35 |
|
| 36 |
root = cur; |
| 37 |
+ parent = cur->parent; |
| 38 |
while (1) { |
| 39 |
switch (cur->type) { |
| 40 |
case XML_HTML_DOCUMENT_NODE: |
| 41 |
@@ -762,13 +763,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 42 |
if (((xmlDocPtr) cur)->intSubset != NULL) { |
| 43 |
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL); |
| 44 |
} |
| 45 |
- if (cur->children != NULL) { |
| 46 |
+ /* Always validate cur->parent when descending. */ |
| 47 |
+ if ((cur->parent == parent) && (cur->children != NULL)) { |
| 48 |
+ parent = cur; |
| 49 |
cur = cur->children; |
| 50 |
continue; |
| 51 |
} |
| 52 |
break; |
| 53 |
|
| 54 |
case XML_ELEMENT_NODE: |
| 55 |
+ /* |
| 56 |
+ * Some users like lxml are known to pass nodes with a corrupted |
| 57 |
+ * tree structure. Fall back to a recursive call to handle this |
| 58 |
+ * case. |
| 59 |
+ */ |
| 60 |
+ if ((cur->parent != parent) && (cur->children != NULL)) { |
| 61 |
+ htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); |
| 62 |
+ break; |
| 63 |
+ } |
| 64 |
+ |
| 65 |
/* |
| 66 |
* Get specific HTML info for that node. |
| 67 |
*/ |
| 68 |
@@ -817,6 +830,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 69 |
(cur->name != NULL) && |
| 70 |
(cur->name[0] != 'p')) /* p, pre, param */ |
| 71 |
xmlOutputBufferWriteString(buf, "\n"); |
| 72 |
+ parent = cur; |
| 73 |
cur = cur->children; |
| 74 |
continue; |
| 75 |
} |
| 76 |
@@ -825,9 +839,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 77 |
(info != NULL) && (!info->isinline)) { |
| 78 |
if ((cur->next->type != HTML_TEXT_NODE) && |
| 79 |
(cur->next->type != HTML_ENTITY_REF_NODE) && |
| 80 |
- (cur->parent != NULL) && |
| 81 |
- (cur->parent->name != NULL) && |
| 82 |
- (cur->parent->name[0] != 'p')) /* p, pre, param */ |
| 83 |
+ (parent != NULL) && |
| 84 |
+ (parent->name != NULL) && |
| 85 |
+ (parent->name[0] != 'p')) /* p, pre, param */ |
| 86 |
xmlOutputBufferWriteString(buf, "\n"); |
| 87 |
} |
| 88 |
|
| 89 |
@@ -842,9 +856,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 90 |
break; |
| 91 |
if (((cur->name == (const xmlChar *)xmlStringText) || |
| 92 |
(cur->name != (const xmlChar *)xmlStringTextNoenc)) && |
| 93 |
- ((cur->parent == NULL) || |
| 94 |
- ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && |
| 95 |
- (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { |
| 96 |
+ ((parent == NULL) || |
| 97 |
+ ((xmlStrcasecmp(parent->name, BAD_CAST "script")) && |
| 98 |
+ (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) { |
| 99 |
xmlChar *buffer; |
| 100 |
|
| 101 |
buffer = xmlEncodeEntitiesReentrant(doc, cur->content); |
| 102 |
@@ -902,13 +916,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 103 |
break; |
| 104 |
} |
| 105 |
|
| 106 |
- /* |
| 107 |
- * The parent should never be NULL here but we want to handle |
| 108 |
- * corrupted documents gracefully. |
| 109 |
- */ |
| 110 |
- if (cur->parent == NULL) |
| 111 |
- return; |
| 112 |
- cur = cur->parent; |
| 113 |
+ cur = parent; |
| 114 |
+ /* cur->parent was validated when descending. */ |
| 115 |
+ parent = cur->parent; |
| 116 |
|
| 117 |
if ((cur->type == XML_HTML_DOCUMENT_NODE) || |
| 118 |
(cur->type == XML_DOCUMENT_NODE)) { |
| 119 |
@@ -939,9 +949,9 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, |
| 120 |
(cur->next != NULL)) { |
| 121 |
if ((cur->next->type != HTML_TEXT_NODE) && |
| 122 |
(cur->next->type != HTML_ENTITY_REF_NODE) && |
| 123 |
- (cur->parent != NULL) && |
| 124 |
- (cur->parent->name != NULL) && |
| 125 |
- (cur->parent->name[0] != 'p')) /* p, pre, param */ |
| 126 |
+ (parent != NULL) && |
| 127 |
+ (parent->name != NULL) && |
| 128 |
+ (parent->name[0] != 'p')) /* p, pre, param */ |
| 129 |
xmlOutputBufferWriteString(buf, "\n"); |
| 130 |
} |
| 131 |
} |
| 132 |
diff --git a/xmlsave.c b/xmlsave.c |
| 133 |
index 61a40459..aedbd5e7 100644 |
| 134 |
--- xmlsave.c.orig |
| 135 |
+++ xmlsave.c |
| 136 |
@@ -847,7 +847,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 137 |
static void |
| 138 |
xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 139 |
int format = ctxt->format; |
| 140 |
- xmlNodePtr tmp, root, unformattedNode = NULL; |
| 141 |
+ xmlNodePtr tmp, root, unformattedNode = NULL, parent; |
| 142 |
xmlAttrPtr attr; |
| 143 |
xmlChar *start, *end; |
| 144 |
xmlOutputBufferPtr buf; |
| 145 |
@@ -856,6 +856,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 146 |
buf = ctxt->buf; |
| 147 |
|
| 148 |
root = cur; |
| 149 |
+ parent = cur->parent; |
| 150 |
while (1) { |
| 151 |
switch (cur->type) { |
| 152 |
case XML_DOCUMENT_NODE: |
| 153 |
@@ -868,7 +869,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 154 |
break; |
| 155 |
|
| 156 |
case XML_DOCUMENT_FRAG_NODE: |
| 157 |
- if (cur->children != NULL) { |
| 158 |
+ /* Always validate cur->parent when descending. */ |
| 159 |
+ if ((cur->parent == parent) && (cur->children != NULL)) { |
| 160 |
+ parent = cur; |
| 161 |
cur = cur->children; |
| 162 |
continue; |
| 163 |
} |
| 164 |
@@ -887,7 +890,18 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 165 |
break; |
| 166 |
|
| 167 |
case XML_ELEMENT_NODE: |
| 168 |
- if ((cur != root) && (ctxt->format == 1) && (xmlIndentTreeOutput)) |
| 169 |
+ /* |
| 170 |
+ * Some users like lxml are known to pass nodes with a corrupted |
| 171 |
+ * tree structure. Fall back to a recursive call to handle this |
| 172 |
+ * case. |
| 173 |
+ */ |
| 174 |
+ if ((cur->parent != parent) && (cur->children != NULL)) { |
| 175 |
+ xmlNodeDumpOutputInternal(ctxt, cur); |
| 176 |
+ break; |
| 177 |
+ } |
| 178 |
+ |
| 179 |
+ if ((ctxt->level > 0) && (ctxt->format == 1) && |
| 180 |
+ (xmlIndentTreeOutput)) |
| 181 |
xmlOutputBufferWrite(buf, ctxt->indent_size * |
| 182 |
(ctxt->level > ctxt->indent_nr ? |
| 183 |
ctxt->indent_nr : ctxt->level), |
| 184 |
@@ -942,6 +956,7 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 185 |
xmlOutputBufferWrite(buf, 1, ">"); |
| 186 |
if (ctxt->format == 1) xmlOutputBufferWrite(buf, 1, "\n"); |
| 187 |
if (ctxt->level >= 0) ctxt->level++; |
| 188 |
+ parent = cur; |
| 189 |
cur = cur->children; |
| 190 |
continue; |
| 191 |
} |
| 192 |
@@ -1058,13 +1073,9 @@ xmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) { |
| 193 |
break; |
| 194 |
} |
| 195 |
|
| 196 |
- /* |
| 197 |
- * The parent should never be NULL here but we want to handle |
| 198 |
- * corrupted documents gracefully. |
| 199 |
- */ |
| 200 |
- if (cur->parent == NULL) |
| 201 |
- return; |
| 202 |
- cur = cur->parent; |
| 203 |
+ cur = parent; |
| 204 |
+ /* cur->parent was validated when descending. */ |
| 205 |
+ parent = cur->parent; |
| 206 |
|
| 207 |
if (cur->type == XML_ELEMENT_NODE) { |
| 208 |
if (ctxt->level > 0) ctxt->level--; |
| 209 |
-- |
| 210 |
GitLab |
| 211 |
|