Merge lp:~facelessuser/beautifulsoup/html5lib-fix into lp:beautifulsoup

Proposed by Isaac Muse
Status: Merged
Merged at revision: 483
Proposed branch: lp:~facelessuser/beautifulsoup/html5lib-fix
Merge into: lp:beautifulsoup
Diff against target: 361 lines (+267/-47)
2 files modified
bs4/__init__.py (+102/-47)
bs4/testing.py (+165/-0)
To merge this branch: bzr merge lp:~facelessuser/beautifulsoup/html5lib-fix
Reviewer Review Type Date Requested Status
Leonard Richardson Pending
Review via email: mp+361282@code.launchpad.net

Commit message

Ensure html5lib code always has proper internal linkage

Description of the change

This should hopefully prevent html5lib from building trees with detached elements. The added test, without the code changes, already passed in both lxml and html.parser, but could not pass in html5lib. The html that is being tested is from the demonstration script, which I realize was not meant to be parsed as one HTML file, but it worked well as a way to stress the linkage logic. It exposed that html5lib could still end up with detached element linkage.

With the added changes to the main code, all libraries (even html5lib) **should** build valid trees at all times. The test should also notify if it somehow gets broken in the future as well. Hopefully this saves everyone more debug work in the future (knock on wood).

To post a comment you must log in.
483. By Isaac Muse <email address hidden>

Remove dead line of code

Revision history for this message
Isaac Muse (facelessuser) wrote :

Bug (#1809910) detailing the in depth analysis of why this change was made as been linked to this merge request.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bs4/__init__.py'
2--- bs4/__init__.py 2018-12-24 14:54:10 +0000
3+++ bs4/__init__.py 2018-12-26 22:11:55 +0000
4@@ -442,53 +442,108 @@
5 self._most_recent_element = o
6 parent.contents.append(o)
7
8- if parent.next_sibling is not None:
9- # This node is being inserted into an element that has
10- # already been parsed. Deal with any dangling references.
11- index = len(parent.contents)-1
12- while index >= 0:
13- if parent.contents[index] is o:
14- break
15- index -= 1
16- else:
17- raise ValueError(
18- "Error building tree: supposedly %r was inserted "
19- "into %r after the fact, but I don't see it!" % (
20- o, parent
21- )
22- )
23- if index == 0:
24- previous_element = parent
25- previous_sibling = None
26- else:
27- previous_element = previous_sibling = parent.contents[index-1]
28- previous = previous_element
29- while isinstance(previous, Tag):
30- if previous.contents:
31- previous.next_element = previous.contents[0]
32- previous = previous.contents[-1]
33- else:
34- break
35- previous_element = previous
36-
37- if index == len(parent.contents)-1:
38- next_element = parent.next_sibling
39- next_sibling = None
40- else:
41- next_element = next_sibling = parent.contents[index+1]
42-
43- o.previous_element = previous_element
44- if previous_element is not None:
45- previous_element.next_element = o
46- o.next_element = next_element
47- if next_element is not None:
48- next_element.previous_element = o
49- o.next_sibling = next_sibling
50- if next_sibling is not None:
51- next_sibling.previous_sibling = o
52- o.previous_sibling = previous_sibling
53- if previous_sibling is not None:
54- previous_sibling.next_sibling = o
55+ # Check if we are inserting into an already parsed node.
56+ if parent.next_element is not None:
57+
58+ # Check that links are proper across tag parent boundaries
59+ child = self._linkage_fixer(parent)
60+
61+ def _linkage_fixer(self, el, _recursive_call=False):
62+ """Make sure linkage of this fragment is sound."""
63+ descendant = None
64+
65+ # If element is document element,
66+ # it should have no previous element, previous sibling, or next sibling.
67+ if el.parent is None:
68+ if el.previous_element is not None:
69+ el.previous_element = None
70+ if el.previous_sibling is not None:
71+ el.previous_element = None
72+ if el.next_sibling is not None:
73+ el.next_sibling = None
74+
75+ idx = 0
76+ child = None
77+ last_child = None
78+ last_idx = len(el.contents) - 1
79+ for child in el.contents:
80+ descendant = None
81+
82+ # Parent should link next element to their first child
83+ # That child should have no previous sibling
84+ if idx == 0:
85+ if el.parent is not None:
86+ if el.next_element is not child:
87+ el.next_element = child
88+
89+ if child.previous_element is not el:
90+ child.previous_element = el
91+
92+ if child.previous_sibling is not None:
93+ child.previous_sibling = None
94+
95+ # If not the first child, previous index should link as sibling to last index.
96+ # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
97+ else:
98+ if child.previous_sibling is not el.contents[idx - 1]:
99+ child.previous_sibling = el.contents[idx - 1]
100+ if el.contents[idx - 1].next_sibling is not child:
101+ el.contents[idx - 1].next_sibling = child
102+
103+ if last_child is not None:
104+ if child.previous_element is not last_child:
105+ child.previous_element = last_child
106+ if last_child.next_element is not child:
107+ last_child.next_element = child
108+
109+ # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
110+ if isinstance(child, Tag) and child.contents:
111+ descendant = self._linkage_fixer(child, True)
112+ # A bubbled up descendant should have no next siblings
113+ # as it is last in its content list.
114+ if descendant.next_sibling is not None:
115+ descendant.next_sibling = None
116+
117+ # Mark last child as either the bubbled up descendant or the current child
118+ if descendant is not None:
119+ last_child = descendant
120+ else:
121+ last_child = child
122+
123+ # If last child in list, there are no next siblings
124+ if idx == last_idx:
125+ if child.next_sibling is not None:
126+ child.next_sibling = None
127+ idx += 1
128+
129+ # The child to return is either the last descendant (if available)
130+ # or the last processed child (if any). If neither is available,
131+ # the parent element is its own last descendant.
132+ child = descendant if descendant is not None else child
133+ if child is None:
134+ child = el
135+
136+ # If not a recursive call, we are done processing this element.
137+ # As the final step, link last descendant. It should be linked
138+ # to the parent's next sibling (if found), else walk up the chain
139+ # and find a parent with a sibling.
140+ if not _recursive_call and child is not None:
141+ child.next_element = None
142+ target = el
143+ while True:
144+ if target is None:
145+ break
146+ elif target.next_sibling is not None:
147+ child.next_element = target.next_sibling
148+ target.next_sibling.previous_element = child
149+ break
150+ target = target.parent
151+
152+ # We are done, so nothing to return
153+ return None
154+ else:
155+ # Return the child to the recursive caller
156+ return child
157
158 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
159 """Pops the tag stack up to and including the most recent
160
161=== modified file 'bs4/testing.py'
162--- bs4/testing.py 2018-07-28 20:58:23 +0000
163+++ bs4/testing.py 2018-12-26 22:11:55 +0000
164@@ -17,11 +17,48 @@
165 ContentMetaAttributeValue,
166 Doctype,
167 SoupStrainer,
168+ Tag
169 )
170
171 from bs4.builder import HTMLParserTreeBuilder
172 default_builder = HTMLParserTreeBuilder
173
174+BAD_DOCUMENT = u"""A bare string
175+<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
176+<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
177+<div><![CDATA[A CDATA section where it doesn't belong]]></div>
178+<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
179+<div>A <meta> tag</div>
180+<div>A <br> tag that supposedly has contents.</br></div>
181+<div>AT&T</div>
182+<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
183+<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
184+<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
185+<div><a href="http://example.com/</a> that attribute value never got closed</div>
186+<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
187+<! This document starts with a bogus declaration ><div>a</div>
188+<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
189+<div>This document ends with <!an incomplete declaration
190+<div><a style={height:21px;}>That attribute value was bogus</a></div>
191+<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
192+<div><table><td nowrap>That boolean attribute had no value</td></table></div>
193+<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
194+<div>This document ends before the entity finishes: &gt
195+<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
196+<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
197+<div><table><tr><td>Here's a table</td></tr></table></div>
198+<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
199+<div>This tag contains nothing but whitespace: <b> </b></div>
200+<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
201+<div><table><div>This table contains bare markup</div></table></div>
202+<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
203+<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
204+<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
205+<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
206+<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
207+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
208+"""
209+
210
211 class SoupTest(unittest.TestCase):
212
213@@ -60,6 +97,121 @@
214 self.assertEqual(earlier, e.previous_element)
215 earlier = e
216
217+ def linkage_validator(self, el, _recursive_call=False):
218+ """Ensure proper linkage throughout the document."""
219+ descendant = None
220+ # Document element should have no previous element or previous sibling.
221+ # It also shouldn't have a next sibling.
222+ if el.parent is None:
223+ assert el.previous_element is None,\
224+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
225+ el, el.previous_element, None
226+ )
227+ assert el.previous_sibling is None,\
228+ "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
229+ el, el.previous_sibling, None
230+ )
231+ assert el.next_sibling is None,\
232+ "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
233+ el, el.next_sibling, None
234+ )
235+
236+ idx = 0
237+ child = None
238+ last_child = None
239+ last_idx = len(el.contents) - 1
240+ for child in el.contents:
241+ descendant = None
242+
243+ # Parent should link next element to their first child
244+ # That child should have no previous sibling
245+ if idx == 0:
246+ if el.parent is not None:
247+ assert el.next_element is child,\
248+ "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
249+ el, el.next_element, child
250+ )
251+ assert child.previous_element is el,\
252+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
253+ child, child.previous_element, el
254+ )
255+ assert child.previous_sibling is None,\
256+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
257+ child, child.previous_sibling, None
258+ )
259+
260+ # If not the first child, previous index should link as sibling to this index
261+ # Previous element should match the last index or the last bubbled up descendant
262+ else:
263+ assert child.previous_sibling is el.contents[idx - 1],\
264+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
265+ child, child.previous_sibling, el.contents[idx - 1]
266+ )
267+ assert el.contents[idx - 1].next_sibling is child,\
268+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
269+ el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
270+ )
271+
272+ if last_child is not None:
273+ assert child.previous_element is last_child,\
274+ "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
275+ child, child.previous_element, last_child, child.parent.contents
276+ )
277+ assert last_child.next_element is child,\
278+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
279+ last_child, last_child.next_element, child
280+ )
281+
282+ if isinstance(child, Tag) and child.contents:
283+ descendant = self.linkage_validator(child, True)
284+ # A bubbled up descendant should have no next siblings
285+ assert descendant.next_sibling is None,\
286+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
287+ descendant, descendant.next_sibling, None
288+ )
289+
290+ # Mark last child as either the bubbled up descendant or the current child
291+ if descendant is not None:
292+ last_child = descendant
293+ else:
294+ last_child = child
295+
296+ # If last child, there are non next siblings
297+ if idx == last_idx:
298+ assert child.next_sibling is None,\
299+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
300+ child, child.next_sibling, None
301+ )
302+ idx += 1
303+
304+ child = descendant if descendant is not None else child
305+ if child is None:
306+ child = el
307+
308+ if not _recursive_call and child is not None:
309+ target = el
310+ while True:
311+ if target is None:
312+ assert child.next_element is None, \
313+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
314+ child, child.next_element, None
315+ )
316+ break
317+ elif target.next_sibling is not None:
318+ assert child.next_element is target.next_sibling, \
319+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
320+ child, child.next_element, target.next_sibling
321+ )
322+ break
323+ target = target.parent
324+
325+ # We are done, so nothing to return
326+ return None
327+ else:
328+ # Return the child to the recursive caller
329+ return child
330+
331+
332 class HTMLTreeBuilderSmokeTest(object):
333
334 """A basic test of a treebuilder's competence.
335@@ -615,6 +767,13 @@
336 data.a['foo'] = 'bar'
337 self.assertEqual('<a foo="bar">text</a>', data.a.decode())
338
339+ def test_worst_case(self):
340+ """Test the worst case (currently) for linking issues."""
341+
342+ soup = self.soup(BAD_DOCUMENT)
343+ self.linkage_validator(soup)
344+
345+
346 class XMLTreeBuilderSmokeTest(object):
347
348 def test_pickle_and_unpickle_identity(self):
349@@ -761,6 +920,12 @@
350 # The two tags have the same namespace prefix.
351 self.assertEqual(tag.prefix, duplicate.prefix)
352
353+ def test_worst_case(self):
354+ """Test the worst case (currently) for linking issues."""
355+
356+ soup = self.soup(BAD_DOCUMENT)
357+ self.linkage_validator(soup)
358+
359
360 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
361 """Smoke test for a tree builder that supports HTML5."""

Subscribers

People subscribed via source and target branches

to status/vote changes: