Merge lp:~geoffers/beautifulsoup/html5lib-tests into lp:beautifulsoup

Proposed by Geoffrey Sneddon
Status: Merged
Merged at revision: 436
Proposed branch: lp:~geoffers/beautifulsoup/html5lib-tests
Merge into: lp:beautifulsoup
Diff against target: 153 lines (+74/-8)
2 files modified
bs4/builder/_html5lib.py (+69/-8)
bs4/tests/test_html5lib.py (+5/-0)
To merge this branch: bzr merge lp:~geoffers/beautifulsoup/html5lib-tests
Reviewer Review Type Date Requested Status
Leonard Richardson Approve
Review via email: mp+279911@code.launchpad.net

Description of the change

This makes TreeBuilderForHtml5lib strictly follow the html5lib API allowing it to be used directly from html5lib; this was done primarily as it allows one to run the html5lib testsuite against the BS4 tree builder (currently failing 216 tests).

The second commit then fixes those 216 tests by fixing foster parenting: appendChild wasn't setting the Element.parent, leading the walking up the tree to find the last table to fail. That then working, it became apparent that insertText with insertBefore=True fails, so that too was fixed.

To post a comment you must log in.
Revision history for this message
Thomas Güttler (hv-tbz-pariv) wrote :

Is there a reason why this nice patch was not integrated yet?

We have the Element.parent in beautifulsoup.

Revision history for this message
Leonard Richardson (leonardr) wrote :

Geoffrey,

Thanks for this excellent branch. I've merged it as revision 436. My apologies for not getting to this earlier; my work on Beautiful Soup mainly revolves around the bug tracker and I generally don't check the merge requests.

I don't know if this affects your work on html5lib, but since you filed this merge request I've fixed two bugs in Beautiful Soup that only showed up when the html5lib tree builder was in use. Adding similar tests to the html5lib test suite might be useful in finding errors in other code that uses html5lib.

The test methods I added to verify that my fixes worked are test_reparented_markup_containing_identical_whitespace_nodes and
test_reparented_markup_containing_children, both in tests/test_html5lib.py.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'bs4/builder/_html5lib.py'
2--- bs4/builder/_html5lib.py 2015-09-28 23:53:43 +0000
3+++ bs4/builder/_html5lib.py 2015-12-08 16:07:14 +0000
4@@ -4,6 +4,7 @@
5
6 from pdb import set_trace
7 import warnings
8+import re
9 from bs4.builder import (
10 PERMISSIVE,
11 HTML,
12@@ -15,7 +16,10 @@
13 whitespace_re,
14 )
15 import html5lib
16-from html5lib.constants import namespaces
17+from html5lib.constants import (
18+ namespaces,
19+ prefixes,
20+ )
21 from bs4.element import (
22 Comment,
23 Doctype,
24@@ -59,7 +63,7 @@
25
26 def create_treebuilder(self, namespaceHTMLElements):
27 self.underlying_builder = TreeBuilderForHtml5lib(
28- self.soup, namespaceHTMLElements)
29+ namespaceHTMLElements, self.soup)
30 return self.underlying_builder
31
32 def test_fragment_to_document(self, fragment):
33@@ -69,8 +73,12 @@
34
35 class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
36
37- def __init__(self, soup, namespaceHTMLElements):
38- self.soup = soup
39+ def __init__(self, namespaceHTMLElements, soup=None):
40+ if soup:
41+ self.soup = soup
42+ else:
43+ from bs4 import BeautifulSoup
44+ self.soup = BeautifulSoup("", "html.parser")
45 super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
46
47 def documentClass(self):
48@@ -93,7 +101,8 @@
49 return TextNode(Comment(data), self.soup)
50
51 def fragmentClass(self):
52- self.soup = BeautifulSoup("")
53+ from bs4 import BeautifulSoup
54+ self.soup = BeautifulSoup("", "html.parser")
55 self.soup.name = "[document_fragment]"
56 return Element(self.soup, self.soup, None)
57
58@@ -107,6 +116,56 @@
59 def getFragment(self):
60 return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
61
62+ def testSerializer(self, element):
63+ from bs4 import BeautifulSoup
64+ rv = []
65+ doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
66+
67+ def serializeElement(element, indent=0):
68+ if isinstance(element, BeautifulSoup):
69+ pass
70+ if isinstance(element, Doctype):
71+ m = doctype_re.match(element)
72+ if m:
73+ name = m.group(1)
74+ if m.lastindex > 1:
75+ publicId = m.group(2) or ""
76+ systemId = m.group(3) or m.group(4) or ""
77+ rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
78+ (' ' * indent, name, publicId, systemId))
79+ else:
80+ rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
81+ else:
82+ rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
83+ elif isinstance(element, Comment):
84+ rv.append("|%s<!-- %s -->" % (' ' * indent, element))
85+ elif isinstance(element, NavigableString):
86+ rv.append("|%s\"%s\"" % (' ' * indent, element))
87+ else:
88+ if element.namespace:
89+ name = "%s %s" % (prefixes[element.namespace],
90+ element.name)
91+ else:
92+ name = element.name
93+ rv.append("|%s<%s>" % (' ' * indent, name))
94+ if element.attrs:
95+ attributes = []
96+ for name, value in element.attrs.items():
97+ if isinstance(name, NamespacedAttribute):
98+ name = "%s %s" % (prefixes[name.namespace], name.name)
99+ if isinstance(value, list):
100+ value = " ".join(value)
101+ attributes.append((name, value))
102+
103+ for name, value in sorted(attributes):
104+ rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
105+ indent += 2
106+ for child in element.children:
107+ serializeElement(child, indent)
108+ serializeElement(element, 0)
109+
110+ return "\n".join(rv)
111+
112 class AttrList(object):
113 def __init__(self, element):
114 self.element = element
115@@ -158,8 +217,10 @@
116 child = node
117 elif node.element.__class__ == NavigableString:
118 string_child = child = node.element
119+ node.parent = self
120 else:
121 child = node.element
122+ node.parent = self
123
124 if not isinstance(child, basestring) and child.parent is not None:
125 node.element.extract()
126@@ -224,11 +285,11 @@
127 attributes = property(getAttributes, setAttributes)
128
129 def insertText(self, data, insertBefore=None):
130+ text = TextNode(self.soup.new_string(data), self.soup)
131 if insertBefore:
132- text = TextNode(self.soup.new_string(data), self.soup)
133- self.insertBefore(data, insertBefore)
134+ self.insertBefore(text, insertBefore)
135 else:
136- self.appendChild(data)
137+ self.appendChild(text)
138
139 def insertBefore(self, node, refNode):
140 index = self.element.index(refNode.element)
141
142=== modified file 'bs4/tests/test_html5lib.py'
143--- bs4/tests/test_html5lib.py 2015-09-28 23:53:43 +0000
144+++ bs4/tests/test_html5lib.py 2015-12-08 16:07:14 +0000
145@@ -96,3 +96,8 @@
146 a1, a2 = soup.find_all('a')
147 self.assertEqual(a1, a2)
148 assert a1 is not a2
149+
150+ def test_foster_parenting(self):
151+ markup = b"""<table><td></tbody>A"""
152+ soup = self.soup(markup)
153+ self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())

Subscribers

People subscribed via source and target branches

to status/vote changes: