Merge lp:~facelessuser/beautifulsoup/html5lib-fix into lp:beautifulsoup

Proposed by Isaac Muse
Status: Merged
Merged at revision: 483
Proposed branch: lp:~facelessuser/beautifulsoup/html5lib-fix
Merge into: lp:beautifulsoup
Diff against target: 361 lines (+267/-47)
2 files modified
bs4/__init__.py (+102/-47)
bs4/testing.py (+165/-0)
To merge this branch: bzr merge lp:~facelessuser/beautifulsoup/html5lib-fix
Reviewer Review Type Date Requested Status
Leonard Richardson Pending
Review via email: mp+361282@code.launchpad.net

Commit message

Ensure html5lib code always has proper internal linkage

Description of the change

This should hopefully prevent html5lib from building trees with detached elements. The added test, without the code changes, already passed in both lxml and html.parser, but could not pass in html5lib. The html that is being tested is from the demonstration script, which I realize was not meant to be parsed as one HTML file, but it worked well as a way to stress the linkage logic. It exposed that html5lib could still end up with detached element linkage.

With the added changes to the main code, all libraries (even html5lib) **should** build valid trees at all times. The test should also notify if it somehow gets broken in the future as well. Hopefully this saves everyone more debug work in the future (knock on wood).

To post a comment you must log in.
483. By Isaac Muse <email address hidden>

Remove dead line of code

Revision history for this message
Isaac Muse (facelessuser) wrote :

Bug (#1809910) detailing the in depth analysis of why this change was made as been linked to this merge request.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'bs4/__init__.py'
--- bs4/__init__.py 2018-12-24 14:54:10 +0000
+++ bs4/__init__.py 2018-12-26 22:11:55 +0000
@@ -442,53 +442,108 @@
442 self._most_recent_element = o442 self._most_recent_element = o
443 parent.contents.append(o)443 parent.contents.append(o)
444444
445 if parent.next_sibling is not None:445 # Check if we are inserting into an already parsed node.
446 # This node is being inserted into an element that has446 if parent.next_element is not None:
447 # already been parsed. Deal with any dangling references.447
448 index = len(parent.contents)-1448 # Check that links are proper across tag parent boundaries
449 while index >= 0:449 child = self._linkage_fixer(parent)
450 if parent.contents[index] is o:450
451 break451 def _linkage_fixer(self, el, _recursive_call=False):
452 index -= 1452 """Make sure linkage of this fragment is sound."""
453 else:453 descendant = None
454 raise ValueError(454
455 "Error building tree: supposedly %r was inserted "455 # If element is document element,
456 "into %r after the fact, but I don't see it!" % (456 # it should have no previous element, previous sibling, or next sibling.
457 o, parent457 if el.parent is None:
458 )458 if el.previous_element is not None:
459 )459 el.previous_element = None
460 if index == 0:460 if el.previous_sibling is not None:
461 previous_element = parent461 el.previous_element = None
462 previous_sibling = None462 if el.next_sibling is not None:
463 else:463 el.next_sibling = None
464 previous_element = previous_sibling = parent.contents[index-1]464
465 previous = previous_element465 idx = 0
466 while isinstance(previous, Tag):466 child = None
467 if previous.contents:467 last_child = None
468 previous.next_element = previous.contents[0]468 last_idx = len(el.contents) - 1
469 previous = previous.contents[-1]469 for child in el.contents:
470 else:470 descendant = None
471 break471
472 previous_element = previous472 # Parent should link next element to their first child
473473 # That child should have no previous sibling
474 if index == len(parent.contents)-1:474 if idx == 0:
475 next_element = parent.next_sibling475 if el.parent is not None:
476 next_sibling = None476 if el.next_element is not child:
477 else:477 el.next_element = child
478 next_element = next_sibling = parent.contents[index+1]478
479479 if child.previous_element is not el:
480 o.previous_element = previous_element480 child.previous_element = el
481 if previous_element is not None:481
482 previous_element.next_element = o482 if child.previous_sibling is not None:
483 o.next_element = next_element483 child.previous_sibling = None
484 if next_element is not None:484
485 next_element.previous_element = o485 # If not the first child, previous index should link as sibling to last index.
486 o.next_sibling = next_sibling486 # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
487 if next_sibling is not None:487 else:
488 next_sibling.previous_sibling = o488 if child.previous_sibling is not el.contents[idx - 1]:
489 o.previous_sibling = previous_sibling489 child.previous_sibling = el.contents[idx - 1]
490 if previous_sibling is not None:490 if el.contents[idx - 1].next_sibling is not child:
491 previous_sibling.next_sibling = o491 el.contents[idx - 1].next_sibling = child
492
493 if last_child is not None:
494 if child.previous_element is not last_child:
495 child.previous_element = last_child
496 if last_child.next_element is not child:
497 last_child.next_element = child
498
499 # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
500 if isinstance(child, Tag) and child.contents:
501 descendant = self._linkage_fixer(child, True)
502 # A bubbled up descendant should have no next siblings
503 # as it is last in its content list.
504 if descendant.next_sibling is not None:
505 descendant.next_sibling = None
506
507 # Mark last child as either the bubbled up descendant or the current child
508 if descendant is not None:
509 last_child = descendant
510 else:
511 last_child = child
512
513 # If last child in list, there are no next siblings
514 if idx == last_idx:
515 if child.next_sibling is not None:
516 child.next_sibling = None
517 idx += 1
518
519 # The child to return is either the last descendant (if available)
520 # or the last processed child (if any). If neither is available,
521 # the parent element is its own last descendant.
522 child = descendant if descendant is not None else child
523 if child is None:
524 child = el
525
526 # If not a recursive call, we are done processing this element.
527 # As the final step, link last descendant. It should be linked
528 # to the parent's next sibling (if found), else walk up the chain
529 # and find a parent with a sibling.
530 if not _recursive_call and child is not None:
531 child.next_element = None
532 target = el
533 while True:
534 if target is None:
535 break
536 elif target.next_sibling is not None:
537 child.next_element = target.next_sibling
538 target.next_sibling.previous_element = child
539 break
540 target = target.parent
541
542 # We are done, so nothing to return
543 return None
544 else:
545 # Return the child to the recursive caller
546 return child
492547
493 def _popToTag(self, name, nsprefix=None, inclusivePop=True):548 def _popToTag(self, name, nsprefix=None, inclusivePop=True):
494 """Pops the tag stack up to and including the most recent549 """Pops the tag stack up to and including the most recent
495550
=== modified file 'bs4/testing.py'
--- bs4/testing.py 2018-07-28 20:58:23 +0000
+++ bs4/testing.py 2018-12-26 22:11:55 +0000
@@ -17,11 +17,48 @@
17 ContentMetaAttributeValue,17 ContentMetaAttributeValue,
18 Doctype,18 Doctype,
19 SoupStrainer,19 SoupStrainer,
20 Tag
20)21)
2122
22from bs4.builder import HTMLParserTreeBuilder23from bs4.builder import HTMLParserTreeBuilder
23default_builder = HTMLParserTreeBuilder24default_builder = HTMLParserTreeBuilder
2425
26BAD_DOCUMENT = u"""A bare string
27<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
28<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
29<div><![CDATA[A CDATA section where it doesn't belong]]></div>
30<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
31<div>A <meta> tag</div>
32<div>A <br> tag that supposedly has contents.</br></div>
33<div>AT&T</div>
34<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
35<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
36<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
37<div><a href="http://example.com/</a> that attribute value never got closed</div>
38<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
39<! This document starts with a bogus declaration ><div>a</div>
40<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
41<div>This document ends with <!an incomplete declaration
42<div><a style={height:21px;}>That attribute value was bogus</a></div>
43<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
44<div><table><td nowrap>That boolean attribute had no value</td></table></div>
45<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
46<div>This document ends before the entity finishes: &gt
47<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
48<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
49<div><table><tr><td>Here's a table</td></tr></table></div>
50<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
51<div>This tag contains nothing but whitespace: <b> </b></div>
52<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
53<div><table><div>This table contains bare markup</div></table></div>
54<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
55<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
56<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
57<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
58<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
59<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
60"""
61
2562
26class SoupTest(unittest.TestCase):63class SoupTest(unittest.TestCase):
2764
@@ -60,6 +97,121 @@
60 self.assertEqual(earlier, e.previous_element)97 self.assertEqual(earlier, e.previous_element)
61 earlier = e98 earlier = e
6299
100 def linkage_validator(self, el, _recursive_call=False):
101 """Ensure proper linkage throughout the document."""
102 descendant = None
103 # Document element should have no previous element or previous sibling.
104 # It also shouldn't have a next sibling.
105 if el.parent is None:
106 assert el.previous_element is None,\
107 "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
108 el, el.previous_element, None
109 )
110 assert el.previous_sibling is None,\
111 "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
112 el, el.previous_sibling, None
113 )
114 assert el.next_sibling is None,\
115 "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
116 el, el.next_sibling, None
117 )
118
119 idx = 0
120 child = None
121 last_child = None
122 last_idx = len(el.contents) - 1
123 for child in el.contents:
124 descendant = None
125
126 # Parent should link next element to their first child
127 # That child should have no previous sibling
128 if idx == 0:
129 if el.parent is not None:
130 assert el.next_element is child,\
131 "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
132 el, el.next_element, child
133 )
134 assert child.previous_element is el,\
135 "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
136 child, child.previous_element, el
137 )
138 assert child.previous_sibling is None,\
139 "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
140 child, child.previous_sibling, None
141 )
142
143 # If not the first child, previous index should link as sibling to this index
144 # Previous element should match the last index or the last bubbled up descendant
145 else:
146 assert child.previous_sibling is el.contents[idx - 1],\
147 "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
148 child, child.previous_sibling, el.contents[idx - 1]
149 )
150 assert el.contents[idx - 1].next_sibling is child,\
151 "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
152 el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
153 )
154
155 if last_child is not None:
156 assert child.previous_element is last_child,\
157 "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
158 child, child.previous_element, last_child, child.parent.contents
159 )
160 assert last_child.next_element is child,\
161 "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
162 last_child, last_child.next_element, child
163 )
164
165 if isinstance(child, Tag) and child.contents:
166 descendant = self.linkage_validator(child, True)
167 # A bubbled up descendant should have no next siblings
168 assert descendant.next_sibling is None,\
169 "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
170 descendant, descendant.next_sibling, None
171 )
172
173 # Mark last child as either the bubbled up descendant or the current child
174 if descendant is not None:
175 last_child = descendant
176 else:
177 last_child = child
178
179 # If last child, there are non next siblings
180 if idx == last_idx:
181 assert child.next_sibling is None,\
182 "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
183 child, child.next_sibling, None
184 )
185 idx += 1
186
187 child = descendant if descendant is not None else child
188 if child is None:
189 child = el
190
191 if not _recursive_call and child is not None:
192 target = el
193 while True:
194 if target is None:
195 assert child.next_element is None, \
196 "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
197 child, child.next_element, None
198 )
199 break
200 elif target.next_sibling is not None:
201 assert child.next_element is target.next_sibling, \
202 "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
203 child, child.next_element, target.next_sibling
204 )
205 break
206 target = target.parent
207
208 # We are done, so nothing to return
209 return None
210 else:
211 # Return the child to the recursive caller
212 return child
213
214
63class HTMLTreeBuilderSmokeTest(object):215class HTMLTreeBuilderSmokeTest(object):
64216
65 """A basic test of a treebuilder's competence.217 """A basic test of a treebuilder's competence.
@@ -615,6 +767,13 @@
615 data.a['foo'] = 'bar'767 data.a['foo'] = 'bar'
616 self.assertEqual('<a foo="bar">text</a>', data.a.decode())768 self.assertEqual('<a foo="bar">text</a>', data.a.decode())
617769
770 def test_worst_case(self):
771 """Test the worst case (currently) for linking issues."""
772
773 soup = self.soup(BAD_DOCUMENT)
774 self.linkage_validator(soup)
775
776
618class XMLTreeBuilderSmokeTest(object):777class XMLTreeBuilderSmokeTest(object):
619778
620 def test_pickle_and_unpickle_identity(self):779 def test_pickle_and_unpickle_identity(self):
@@ -761,6 +920,12 @@
761 # The two tags have the same namespace prefix.920 # The two tags have the same namespace prefix.
762 self.assertEqual(tag.prefix, duplicate.prefix)921 self.assertEqual(tag.prefix, duplicate.prefix)
763922
923 def test_worst_case(self):
924 """Test the worst case (currently) for linking issues."""
925
926 soup = self.soup(BAD_DOCUMENT)
927 self.linkage_validator(soup)
928
764929
765class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):930class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
766 """Smoke test for a tree builder that supports HTML5."""931 """Smoke test for a tree builder that supports HTML5."""

Subscribers

People subscribed via source and target branches

to status/vote changes: