1
=== modified file 'bs4/__init__.py'
2
--- bs4/__init__.py	2018-12-24 14:54:10 +0000
3
+++ bs4/__init__.py	2018-12-26 22:11:55 +0000
4
@@ -442,53 +442,108 @@
5
442
        self._most_recent_element = o
442
        self._most_recent_element = o
6
443
        parent.contents.append(o)
443
        parent.contents.append(o)
7
444
444
55
445
        if parent.next_sibling is not None:
445
        # Check if we are inserting into an already parsed node.
56
446
            # This node is being inserted into an element that has
446
        if parent.next_element is not None:
57
447
            # already been parsed. Deal with any dangling references.
447
58
448
            index = len(parent.contents)-1
448
            # Check that links are proper across tag parent boundaries
59
449
            while index >= 0:
449
            child = self._linkage_fixer(parent)
60
450
                if parent.contents[index] is o:
450
61
451
                    break
451
    def _linkage_fixer(self, el, _recursive_call=False):
62
452
                index -= 1
452
        """Make sure linkage of this fragment is sound."""
63
453
            else:
453
        descendant = None
64
454
                raise ValueError(
454
65
455
                    "Error building tree: supposedly %r was inserted "
455
        # If element is document element,
66
456
                    "into %r after the fact, but I don't see it!" % (
456
        # it should have no previous element, previous sibling, or next sibling.
67
457
                        o, parent
457
        if el.parent is None:
68
458
                    )
458
            if el.previous_element is not None:
69
459
                )
459
                el.previous_element = None
70
460
            if index == 0:
460
            if el.previous_sibling is not None:
71
461
                previous_element = parent
461
                el.previous_element = None
72
462
                previous_sibling = None
462
            if el.next_sibling is not None:
73
463
            else:
463
                el.next_sibling = None
74
464
                previous_element = previous_sibling = parent.contents[index-1]
464
75
465
                previous = previous_element
465
        idx = 0
76
466
                while isinstance(previous, Tag):
466
        child = None
77
467
                    if previous.contents:
467
        last_child = None
78
468
                        previous.next_element = previous.contents[0]
468
        last_idx = len(el.contents) - 1
79
469
                        previous = previous.contents[-1]
469
        for child in el.contents:
80
470
                    else:
470
            descendant = None
81
471
                        break
471
82
472
                previous_element = previous
472
            # Parent should link next element to their first child
83
473
473
            # That child should have no previous sibling
84
474
            if index == len(parent.contents)-1:
474
            if idx == 0:
85
475
                next_element = parent.next_sibling
475
                if el.parent is not None:
86
476
                next_sibling = None
476
                    if el.next_element is not child:
87
477
            else:
477
                        el.next_element = child
88
478
                next_element = next_sibling = parent.contents[index+1]
478
89
479
479
                    if child.previous_element is not el:
90
480
            o.previous_element = previous_element
480
                        child.previous_element = el
91
481
            if previous_element is not None:
481
92
482
                previous_element.next_element = o
482
                    if child.previous_sibling is not None:
93
483
            o.next_element = next_element
483
                        child.previous_sibling = None
94
484
            if next_element is not None:
484
95
485
                next_element.previous_element = o
485
            # If not the first child, previous index should link as sibling to last index.
96
486
            o.next_sibling = next_sibling
486
            # Previous element should match the last index or the last bubbled up descendant (of a Tag sibling).
97
487
            if next_sibling is not None:
487
            else:
98
488
                next_sibling.previous_sibling = o
488
                if child.previous_sibling is not el.contents[idx - 1]:
99
489
            o.previous_sibling = previous_sibling
489
                    child.previous_sibling = el.contents[idx - 1]
100
490
            if previous_sibling is not None:
490
                if el.contents[idx - 1].next_sibling is not child:
101
491
                previous_sibling.next_sibling = o
491
                    el.contents[idx - 1].next_sibling = child
102
492
103
493
                if last_child is not None:
104
494
                    if child.previous_element is not last_child:
105
495
                        child.previous_element = last_child
106
496
                    if last_child.next_element is not child:
107
497
                        last_child.next_element = child
108
498
109
499
            # This index is a tag, dig deeper for a "last descendant" fixing linkage along the way
110
500
            if isinstance(child, Tag) and child.contents:
111
501
                descendant = self._linkage_fixer(child, True)
112
502
                # A bubbled up descendant should have no next siblings
113
503
                # as it is last in its content list.
114
504
                if descendant.next_sibling is not None:
115
505
                    descendant.next_sibling = None
116
506
117
507
            # Mark last child as either the bubbled up descendant or the current child
118
508
            if descendant is not None:
119
509
                last_child = descendant
120
510
            else:
121
511
                last_child = child
122
512
123
513
            # If last child in list, there are no next siblings
124
514
            if idx == last_idx:
125
515
                if child.next_sibling is not None:
126
516
                    child.next_sibling = None
127
517
            idx += 1
128
518
129
519
        # The child to return is either the last descendant (if available)
130
520
        # or the last processed child (if any). If neither is available,
131
521
        # the parent element is its own last descendant.
132
522
        child = descendant if descendant is not None else child
133
523
        if child is None:
134
524
            child = el
135
525
136
526
        # If not a recursive call, we are done processing this element.
137
527
        # As the final step, link last descendant. It should be linked
138
528
        # to the parent's next sibling (if found), else walk up the chain
139
529
        # and find a parent with a sibling.
140
530
        if not _recursive_call and child is not None:
141
531
            child.next_element = None
142
532
            target = el
143
533
            while True:
144
534
                if target is None:
145
535
                    break
146
536
                elif target.next_sibling is not None:
147
537
                    child.next_element = target.next_sibling
148
538
                    target.next_sibling.previous_element = child
149
539
                    break
150
540
                target = target.parent
151
541
152
542
            # We are done, so nothing to return
153
543
            return None
154
544
        else:
155
545
            # Return the child to the recursive caller
156
546
            return child
157
492
547
158
493
    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
548
    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
159
494
        """Pops the tag stack up to and including the most recent
549
        """Pops the tag stack up to and including the most recent
160
495
550
161
=== modified file 'bs4/testing.py'
162
--- bs4/testing.py	2018-07-28 20:58:23 +0000
163
+++ bs4/testing.py	2018-12-26 22:11:55 +0000
164
@@ -17,11 +17,48 @@
165
17
    ContentMetaAttributeValue,
17
    ContentMetaAttributeValue,
166
18
    Doctype,
18
    Doctype,
167
19
    SoupStrainer,
19
    SoupStrainer,
168
20
    Tag
169
20
)
21
)
170
21
22
171
22
from bs4.builder import HTMLParserTreeBuilder
23
from bs4.builder import HTMLParserTreeBuilder
172
23
default_builder = HTMLParserTreeBuilder
24
default_builder = HTMLParserTreeBuilder
173
24
25
174
26
BAD_DOCUMENT = u"""A bare string
175
27
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
176
28
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
177
29
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
178
30
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
179
31
<div>A <meta> tag</div>
180
32
<div>A <br> tag that supposedly has contents.</br></div>
181
33
<div>AT&T</div>
182
34
<div><textarea>Within a textarea, markup like <b> tags and <&<&amp; should be treated as literal</textarea></div>
183
35
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
184
36
<div>This numeric entity is missing the final semicolon: <x t="pi&#241ata"></div>
185
37
<div><a href="http://example.com/</a> that attribute value never got closed</div>
186
38
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
187
39
<! This document starts with a bogus declaration ><div>a</div>
188
40
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
189
41
<div>This document ends with <!an incomplete declaration
190
42
<div><a style={height:21px;}>That attribute value was bogus</a></div>
191
43
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
192
44
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
193
45
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
194
46
<div>This document ends before the entity finishes: &gt
195
47
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
196
48
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
197
49
<div><table><tr><td>Here's a table</td></tr></table></div>
198
50
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
199
51
<div>This tag contains nothing but whitespace: <b>    </b></div>
200
52
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
201
53
<div><table><div>This table contains bare markup</div></table></div>
202
54
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n   <a href="link2">This link is closed.</a>\n  </div>\n</div></div>
203
55
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
204
56
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
205
57
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
206
58
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
207
59
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
208
60
"""
209
61
210
25
62
211
26
class SoupTest(unittest.TestCase):
63
class SoupTest(unittest.TestCase):
212
27
64
213
@@ -60,6 +97,121 @@
214
60
                self.assertEqual(earlier, e.previous_element)
97
                self.assertEqual(earlier, e.previous_element)
215
61
            earlier = e
98
            earlier = e
216
62
99
217
100
    def linkage_validator(self, el, _recursive_call=False):
        """Assert that every link in the subtree under `el` is consistent.

        Checks the element and sibling references of `el` and of
        everything in `el.contents`, recursing into each Tag that has
        contents. Returns None for a top-level call; a recursive call
        returns the last descendant of `el` (or `el` itself when it
        has no contents).
        """
        # An element without a parent is the document element: nothing
        # comes before it and it has no siblings.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        child = None
        descendant = None
        tail = None
        final_position = len(el.contents) - 1
        for position, child in enumerate(el.contents):
            descendant = None

            if position > 0:
                # A later child is the sibling of the entry before it.
                before = el.contents[position - 1]
                assert child.previous_sibling is before,\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, before
                    )
                assert before.next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        before, before.next_sibling, child
                    )

                # In element order it follows the previous entry, or
                # the last descendant bubbled up from that entry.
                if tail is not None:
                    assert child.previous_element is tail,\
                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                            child, child.previous_element, tail, child.parent.contents
                        )
                    assert tail.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            tail, tail.next_element, child
                        )
            elif el.parent is not None:
                # The first child directly follows its parent and has
                # no previous sibling.
                assert el.next_element is child,\
                   "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                        el, el.next_element, child
                    )
                assert child.previous_element is el,\
                   "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                        child, child.previous_element, el
                    )
                assert child.previous_sibling is None,\
                   "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                        child, child.previous_sibling, None
                    )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # The bubbled up descendant is last in its own contents
                # list, so it has no next sibling.
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # The next child has to follow the bubbled up descendant
            # if there is one, otherwise this child.
            tail = child if descendant is None else descendant

            # The last entry in the contents list has no next sibling.
            if position == final_position:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )

        # Pick the last descendant, else the last child, else el.
        if descendant is not None:
            child = descendant
        if child is None:
            child = el

        if _recursive_call or child is None:
            # Return the last descendant to the recursive caller.
            return child

        # Top-level call: the last descendant must lead to the next
        # sibling of el or of its nearest ancestor that has one, and
        # to nothing when no such sibling exists.
        anchor = el
        while anchor is not None and anchor.next_sibling is None:
            anchor = anchor.parent
        expected = None if anchor is None else anchor.next_sibling
        assert child.next_element is expected, \
            "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                child, child.next_element, expected
            )

        # We are done, so nothing to return.
        return None
330
213
331
214
332
63
class HTMLTreeBuilderSmokeTest(object):
215
class HTMLTreeBuilderSmokeTest(object):
333
64
216
334
65
    """A basic test of a treebuilder's competence.
217
    """A basic test of a treebuilder's competence.
335
@@ -615,6 +767,13 @@
336
615
        data.a['foo'] = 'bar'
767
        data.a['foo'] = 'bar'
337
616
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
768
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
338
617
769
339
770
    def test_worst_case(self):
        """Parse BAD_DOCUMENT and validate the linkage of the resulting tree."""
        # linkage_validator raises AssertionError on the first bad link.
        self.linkage_validator(self.soup(BAD_DOCUMENT))
344
775
345
776
346
618
class XMLTreeBuilderSmokeTest(object):
777
class XMLTreeBuilderSmokeTest(object):
347
619
778
348
620
    def test_pickle_and_unpickle_identity(self):
779
    def test_pickle_and_unpickle_identity(self):
349
@@ -761,6 +920,12 @@
350
761
        # The two tags have the same namespace prefix.
920
        # The two tags have the same namespace prefix.
351
762
        self.assertEqual(tag.prefix, duplicate.prefix)
921
        self.assertEqual(tag.prefix, duplicate.prefix)
352
763
922
353
923
    def test_worst_case(self):
        """Parse BAD_DOCUMENT and validate the linkage of the resulting tree."""
        # linkage_validator raises AssertionError on the first bad link.
        parsed = self.soup(BAD_DOCUMENT)
        self.linkage_validator(parsed)
358
928
359
764
929
360
765
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
930
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
361
766
    """Smoke test for a tree builder that supports HTML5."""
931
    """Smoke test for a tree builder that supports HTML5."""
Reviewer	Review Type	Date Requested	Status
Leonard Richardson		2018-12-25	Pending
Review via email: mp+361282@code.launchpad.net