Merge lp:~mitya57/ubuntu/vivid/beautifulsoup4/tests into lp:ubuntu/vivid/beautifulsoup4
- Vivid (15.04)
- tests
- Merge into vivid
Proposed by
Dmitry Shachnev
Status: | Merged |
---|---|
Merged at revision: | 17 |
Proposed branch: | lp:~mitya57/ubuntu/vivid/beautifulsoup4/tests |
Merge into: | lp:ubuntu/vivid/beautifulsoup4 |
Diff against target: |
511 lines (+465/-2) 6 files modified
.pc/applied-patches (+1/-0) .pc/fix-chardet-failure/bs4/tests/test_soup.py (+434/-0) bs4/tests/test_soup.py (+2/-2) debian/changelog (+6/-0) debian/patches/fix-chardet-failure (+21/-0) debian/patches/series (+1/-0) |
To merge this branch: | bzr merge lp:~mitya57/ubuntu/vivid/beautifulsoup4/tests |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Stefano Rivera | Pending | ||
Ubuntu branches | Pending | ||
Review via email: mp+241778@code.launchpad.net |
Commit message
Description of the change
Currently the autopkgtests fail on Jenkins; this branch will fix them.
This only happens when python3-chardet is installed. It does not happen on Debian's CI because python3-chardet is only a recommendation, not a dependency.
Submitted an MP against the upstream branch as well, with an explanation: https://code.launchpad.net/~mitya57/beautifulsoup/tests/+merge/241832
To post a comment you must log in.
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file '.pc/applied-patches' |
2 | --- .pc/applied-patches 2014-10-16 22:57:41 +0000 |
3 | +++ .pc/applied-patches 2014-11-14 17:22:47 +0000 |
4 | @@ -1,1 +1,2 @@ |
5 | python3.4-warnings |
6 | +fix-chardet-failure |
7 | |
8 | === added directory '.pc/fix-chardet-failure' |
9 | === added directory '.pc/fix-chardet-failure/bs4' |
10 | === added directory '.pc/fix-chardet-failure/bs4/tests' |
11 | === added file '.pc/fix-chardet-failure/bs4/tests/test_soup.py' |
12 | --- .pc/fix-chardet-failure/bs4/tests/test_soup.py 1970-01-01 00:00:00 +0000 |
13 | +++ .pc/fix-chardet-failure/bs4/tests/test_soup.py 2014-11-14 17:22:47 +0000 |
14 | @@ -0,0 +1,434 @@ |
15 | +# -*- coding: utf-8 -*- |
16 | +"""Tests of Beautiful Soup as a whole.""" |
17 | + |
18 | +import logging |
19 | +import unittest |
20 | +import sys |
21 | +import tempfile |
22 | + |
23 | +from bs4 import ( |
24 | + BeautifulSoup, |
25 | + BeautifulStoneSoup, |
26 | +) |
27 | +from bs4.element import ( |
28 | + CharsetMetaAttributeValue, |
29 | + ContentMetaAttributeValue, |
30 | + SoupStrainer, |
31 | + NamespacedAttribute, |
32 | + ) |
33 | +import bs4.dammit |
34 | +from bs4.dammit import ( |
35 | + EntitySubstitution, |
36 | + UnicodeDammit, |
37 | +) |
38 | +from bs4.testing import ( |
39 | + SoupTest, |
40 | + skipIf, |
41 | +) |
42 | +import warnings |
43 | + |
44 | +try: |
45 | + from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML |
46 | + LXML_PRESENT = True |
47 | +except ImportError, e: |
48 | + LXML_PRESENT = False |
49 | + |
50 | +PYTHON_2_PRE_2_7 = (sys.version_info < (2,7)) |
51 | +PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) |
52 | + |
53 | +class TestConstructor(SoupTest): |
54 | + |
55 | + def test_short_unicode_input(self): |
56 | + data = u"<h1>éé</h1>" |
57 | + soup = self.soup(data) |
58 | + self.assertEqual(u"éé", soup.h1.string) |
59 | + |
60 | + def test_embedded_null(self): |
61 | + data = u"<h1>foo\0bar</h1>" |
62 | + soup = self.soup(data) |
63 | + self.assertEqual(u"foo\0bar", soup.h1.string) |
64 | + |
65 | + |
66 | +class TestDeprecatedConstructorArguments(SoupTest): |
67 | + |
68 | + def test_parseOnlyThese_renamed_to_parse_only(self): |
69 | + with warnings.catch_warnings(record=True) as w: |
70 | + soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b")) |
71 | + msg = str(w[0].message) |
72 | + self.assertTrue("parseOnlyThese" in msg) |
73 | + self.assertTrue("parse_only" in msg) |
74 | + self.assertEqual(b"<b></b>", soup.encode()) |
75 | + |
76 | + def test_fromEncoding_renamed_to_from_encoding(self): |
77 | + with warnings.catch_warnings(record=True) as w: |
78 | + utf8 = b"\xc3\xa9" |
79 | + soup = self.soup(utf8, fromEncoding="utf8") |
80 | + msg = str(w[0].message) |
81 | + self.assertTrue("fromEncoding" in msg) |
82 | + self.assertTrue("from_encoding" in msg) |
83 | + self.assertEqual("utf8", soup.original_encoding) |
84 | + |
85 | + def test_unrecognized_keyword_argument(self): |
86 | + self.assertRaises( |
87 | + TypeError, self.soup, "<a>", no_such_argument=True) |
88 | + |
89 | +class TestWarnings(SoupTest): |
90 | + |
91 | + def test_disk_file_warning(self): |
92 | + filehandle = tempfile.NamedTemporaryFile() |
93 | + filename = filehandle.name |
94 | + try: |
95 | + with warnings.catch_warnings(record=True) as w: |
96 | + soup = self.soup(filename) |
97 | + msg = str(w[0].message) |
98 | + self.assertTrue("looks like a filename" in msg) |
99 | + finally: |
100 | + filehandle.close() |
101 | + |
102 | + # The file no longer exists, so Beautiful Soup will no longer issue the warning. |
103 | + with warnings.catch_warnings(record=True) as w: |
104 | + soup = self.soup(filename) |
105 | + self.assertEqual(0, len(w)) |
106 | + |
107 | + def test_url_warning(self): |
108 | + with warnings.catch_warnings(record=True) as w: |
109 | + soup = self.soup("http://www.crummy.com/") |
110 | + msg = str(w[0].message) |
111 | + self.assertTrue("looks like a URL" in msg) |
112 | + |
113 | + with warnings.catch_warnings(record=True) as w: |
114 | + soup = self.soup("http://www.crummy.com/ is great") |
115 | + self.assertEqual(0, len(w)) |
116 | + |
117 | +class TestSelectiveParsing(SoupTest): |
118 | + |
119 | + def test_parse_with_soupstrainer(self): |
120 | + markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>" |
121 | + strainer = SoupStrainer("b") |
122 | + soup = self.soup(markup, parse_only=strainer) |
123 | + self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>") |
124 | + |
125 | + |
126 | +class TestEntitySubstitution(unittest.TestCase): |
127 | + """Standalone tests of the EntitySubstitution class.""" |
128 | + def setUp(self): |
129 | + self.sub = EntitySubstitution |
130 | + |
131 | + def test_simple_html_substitution(self): |
132 | + # Unicode characters corresponding to named HTML entites |
133 | + # are substituted, and no others. |
134 | + s = u"foo\u2200\N{SNOWMAN}\u00f5bar" |
135 | + self.assertEqual(self.sub.substitute_html(s), |
136 | + u"foo∀\N{SNOWMAN}õbar") |
137 | + |
138 | + def test_smart_quote_substitution(self): |
139 | + # MS smart quotes are a common source of frustration, so we |
140 | + # give them a special test. |
141 | + quotes = b"\x91\x92foo\x93\x94" |
142 | + dammit = UnicodeDammit(quotes) |
143 | + self.assertEqual(self.sub.substitute_html(dammit.markup), |
144 | + "‘’foo“”") |
145 | + |
146 | + def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self): |
147 | + s = 'Welcome to "my bar"' |
148 | + self.assertEqual(self.sub.substitute_xml(s, False), s) |
149 | + |
150 | + def test_xml_attribute_quoting_normally_uses_double_quotes(self): |
151 | + self.assertEqual(self.sub.substitute_xml("Welcome", True), |
152 | + '"Welcome"') |
153 | + self.assertEqual(self.sub.substitute_xml("Bob's Bar", True), |
154 | + '"Bob\'s Bar"') |
155 | + |
156 | + def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self): |
157 | + s = 'Welcome to "my bar"' |
158 | + self.assertEqual(self.sub.substitute_xml(s, True), |
159 | + "'Welcome to \"my bar\"'") |
160 | + |
161 | + def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self): |
162 | + s = 'Welcome to "Bob\'s Bar"' |
163 | + self.assertEqual( |
164 | + self.sub.substitute_xml(s, True), |
165 | + '"Welcome to "Bob\'s Bar""') |
166 | + |
167 | + def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self): |
168 | + quoted = 'Welcome to "Bob\'s Bar"' |
169 | + self.assertEqual(self.sub.substitute_xml(quoted), quoted) |
170 | + |
171 | + def test_xml_quoting_handles_angle_brackets(self): |
172 | + self.assertEqual( |
173 | + self.sub.substitute_xml("foo<bar>"), |
174 | + "foo<bar>") |
175 | + |
176 | + def test_xml_quoting_handles_ampersands(self): |
177 | + self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T") |
178 | + |
179 | + def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self): |
180 | + self.assertEqual( |
181 | + self.sub.substitute_xml("ÁT&T"), |
182 | + "&Aacute;T&T") |
183 | + |
184 | + def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self): |
185 | + self.assertEqual( |
186 | + self.sub.substitute_xml_containing_entities("ÁT&T"), |
187 | + "ÁT&T") |
188 | + |
189 | + def test_quotes_not_html_substituted(self): |
190 | + """There's no need to do this except inside attribute values.""" |
191 | + text = 'Bob\'s "bar"' |
192 | + self.assertEqual(self.sub.substitute_html(text), text) |
193 | + |
194 | + |
195 | +class TestEncodingConversion(SoupTest): |
196 | + # Test Beautiful Soup's ability to decode and encode from various |
197 | + # encodings. |
198 | + |
199 | + def setUp(self): |
200 | + super(TestEncodingConversion, self).setUp() |
201 | + self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>' |
202 | + self.utf8_data = self.unicode_data.encode("utf-8") |
203 | + # Just so you know what it looks like. |
204 | + self.assertEqual( |
205 | + self.utf8_data, |
206 | + b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>') |
207 | + |
208 | + def test_ascii_in_unicode_out(self): |
209 | + # ASCII input is converted to Unicode. The original_encoding |
210 | + # attribute is set to 'utf-8', a superset of ASCII. |
211 | + chardet = bs4.dammit.chardet_dammit |
212 | + logging.disable(logging.WARNING) |
213 | + try: |
214 | + def noop(str): |
215 | + return None |
216 | + # Disable chardet, which will realize that the ASCII is ASCII. |
217 | + bs4.dammit.chardet_dammit = noop |
218 | + ascii = b"<foo>a</foo>" |
219 | + soup_from_ascii = self.soup(ascii) |
220 | + unicode_output = soup_from_ascii.decode() |
221 | + self.assertTrue(isinstance(unicode_output, unicode)) |
222 | + self.assertEqual(unicode_output, self.document_for(ascii.decode())) |
223 | + self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") |
224 | + finally: |
225 | + logging.disable(logging.NOTSET) |
226 | + bs4.dammit.chardet_dammit = chardet |
227 | + |
228 | + def test_unicode_in_unicode_out(self): |
229 | + # Unicode input is left alone. The original_encoding attribute |
230 | + # is not set. |
231 | + soup_from_unicode = self.soup(self.unicode_data) |
232 | + self.assertEqual(soup_from_unicode.decode(), self.unicode_data) |
233 | + self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') |
234 | + self.assertEqual(soup_from_unicode.original_encoding, None) |
235 | + |
236 | + def test_utf8_in_unicode_out(self): |
237 | + # UTF-8 input is converted to Unicode. The original_encoding |
238 | + # attribute is set. |
239 | + soup_from_utf8 = self.soup(self.utf8_data) |
240 | + self.assertEqual(soup_from_utf8.decode(), self.unicode_data) |
241 | + self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!') |
242 | + |
243 | + def test_utf8_out(self): |
244 | + # The internal data structures can be encoded as UTF-8. |
245 | + soup_from_unicode = self.soup(self.unicode_data) |
246 | + self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data) |
247 | + |
248 | + @skipIf( |
249 | + PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2, |
250 | + "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.") |
251 | + def test_attribute_name_containing_unicode_characters(self): |
252 | + markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>' |
253 | + self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8")) |
254 | + |
255 | +class TestUnicodeDammit(unittest.TestCase): |
256 | + """Standalone tests of UnicodeDammit.""" |
257 | + |
258 | + def test_unicode_input(self): |
259 | + markup = u"I'm already Unicode! \N{SNOWMAN}" |
260 | + dammit = UnicodeDammit(markup) |
261 | + self.assertEqual(dammit.unicode_markup, markup) |
262 | + |
263 | + def test_smart_quotes_to_unicode(self): |
264 | + markup = b"<foo>\x91\x92\x93\x94</foo>" |
265 | + dammit = UnicodeDammit(markup) |
266 | + self.assertEqual( |
267 | + dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>") |
268 | + |
269 | + def test_smart_quotes_to_xml_entities(self): |
270 | + markup = b"<foo>\x91\x92\x93\x94</foo>" |
271 | + dammit = UnicodeDammit(markup, smart_quotes_to="xml") |
272 | + self.assertEqual( |
273 | + dammit.unicode_markup, "<foo>‘’“”</foo>") |
274 | + |
275 | + def test_smart_quotes_to_html_entities(self): |
276 | + markup = b"<foo>\x91\x92\x93\x94</foo>" |
277 | + dammit = UnicodeDammit(markup, smart_quotes_to="html") |
278 | + self.assertEqual( |
279 | + dammit.unicode_markup, "<foo>‘’“”</foo>") |
280 | + |
281 | + def test_smart_quotes_to_ascii(self): |
282 | + markup = b"<foo>\x91\x92\x93\x94</foo>" |
283 | + dammit = UnicodeDammit(markup, smart_quotes_to="ascii") |
284 | + self.assertEqual( |
285 | + dammit.unicode_markup, """<foo>''""</foo>""") |
286 | + |
287 | + def test_detect_utf8(self): |
288 | + utf8 = b"\xc3\xa9" |
289 | + dammit = UnicodeDammit(utf8) |
290 | + self.assertEqual(dammit.unicode_markup, u'\xe9') |
291 | + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
292 | + |
293 | + def test_convert_hebrew(self): |
294 | + hebrew = b"\xed\xe5\xec\xf9" |
295 | + dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) |
296 | + self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') |
297 | + self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9') |
298 | + |
299 | + def test_dont_see_smart_quotes_where_there_are_none(self): |
300 | + utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" |
301 | + dammit = UnicodeDammit(utf_8) |
302 | + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
303 | + self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8) |
304 | + |
305 | + def test_ignore_inappropriate_codecs(self): |
306 | + utf8_data = u"Räksmörgås".encode("utf-8") |
307 | + dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) |
308 | + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
309 | + |
310 | + def test_ignore_invalid_codecs(self): |
311 | + utf8_data = u"Räksmörgås".encode("utf-8") |
312 | + for bad_encoding in ['.utf8', '...', 'utF---16.!']: |
313 | + dammit = UnicodeDammit(utf8_data, [bad_encoding]) |
314 | + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
315 | + |
316 | + def test_detect_html5_style_meta_tag(self): |
317 | + |
318 | + for data in ( |
319 | + b'<html><meta charset="euc-jp" /></html>', |
320 | + b"<html><meta charset='euc-jp' /></html>", |
321 | + b"<html><meta charset=euc-jp /></html>", |
322 | + b"<html><meta charset=euc-jp/></html>"): |
323 | + dammit = UnicodeDammit(data, is_html=True) |
324 | + self.assertEqual( |
325 | + "euc-jp", dammit.original_encoding) |
326 | + |
327 | + def test_last_ditch_entity_replacement(self): |
328 | + # This is a UTF-8 document that contains bytestrings |
329 | + # completely incompatible with UTF-8 (ie. encoded with some other |
330 | + # encoding). |
331 | + # |
332 | + # Since there is no consistent encoding for the document, |
333 | + # Unicode, Dammit will eventually encode the document as UTF-8 |
334 | + # and encode the incompatible characters as REPLACEMENT |
335 | + # CHARACTER. |
336 | + # |
337 | + # If chardet is installed, it will detect that the document |
338 | + # can be converted into ISO-8859-1 without errors. This happens |
339 | + # to be the wrong encoding, but it is a consistent encoding, so the |
340 | + # code we're testing here won't run. |
341 | + # |
342 | + # So we temporarily disable chardet if it's present. |
343 | + doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> |
344 | +<html><b>\330\250\330\252\330\261</b> |
345 | +<i>\310\322\321\220\312\321\355\344</i></html>""" |
346 | + chardet = bs4.dammit.chardet_dammit |
347 | + logging.disable(logging.WARNING) |
348 | + try: |
349 | + def noop(str): |
350 | + return None |
351 | + bs4.dammit.chardet_dammit = noop |
352 | + dammit = UnicodeDammit(doc) |
353 | + self.assertEqual(True, dammit.contains_replacement_characters) |
354 | + self.assertTrue(u"\ufffd" in dammit.unicode_markup) |
355 | + |
356 | + soup = BeautifulSoup(doc, "html.parser") |
357 | + self.assertTrue(soup.contains_replacement_characters) |
358 | + finally: |
359 | + logging.disable(logging.NOTSET) |
360 | + bs4.dammit.chardet_dammit = chardet |
361 | + |
362 | + def test_byte_order_mark_removed(self): |
363 | + # A document written in UTF-16LE will have its byte order marker stripped. |
364 | + data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' |
365 | + dammit = UnicodeDammit(data) |
366 | + self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) |
367 | + self.assertEqual("utf-16le", dammit.original_encoding) |
368 | + |
369 | + def test_detwingle(self): |
370 | + # Here's a UTF8 document. |
371 | + utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8") |
372 | + |
373 | + # Here's a Windows-1252 document. |
374 | + windows_1252 = ( |
375 | + u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!" |
376 | + u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252") |
377 | + |
378 | + # Through some unholy alchemy, they've been stuck together. |
379 | + doc = utf8 + windows_1252 + utf8 |
380 | + |
381 | + # The document can't be turned into UTF-8: |
382 | + self.assertRaises(UnicodeDecodeError, doc.decode, "utf8") |
383 | + |
384 | + # Unicode, Dammit thinks the whole document is Windows-1252, |
385 | + # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃" |
386 | + |
387 | + # But if we run it through fix_embedded_windows_1252, it's fixed: |
388 | + |
389 | + fixed = UnicodeDammit.detwingle(doc) |
390 | + self.assertEqual( |
391 | + u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8")) |
392 | + |
393 | + def test_detwingle_ignores_multibyte_characters(self): |
394 | + # Each of these characters has a UTF-8 representation ending |
395 | + # in \x93. \x93 is a smart quote if interpreted as |
396 | + # Windows-1252. But our code knows to skip over multibyte |
397 | + # UTF-8 characters, so they'll survive the process unscathed. |
398 | + for tricky_unicode_char in ( |
399 | + u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93' |
400 | + u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93' |
401 | + u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one. |
402 | + ): |
403 | + input = tricky_unicode_char.encode("utf8") |
404 | + self.assertTrue(input.endswith(b'\x93')) |
405 | + output = UnicodeDammit.detwingle(input) |
406 | + self.assertEqual(output, input) |
407 | + |
408 | +class TestNamedspacedAttribute(SoupTest): |
409 | + |
410 | + def test_name_may_be_none(self): |
411 | + a = NamespacedAttribute("xmlns", None) |
412 | + self.assertEqual(a, "xmlns") |
413 | + |
414 | + def test_attribute_is_equivalent_to_colon_separated_string(self): |
415 | + a = NamespacedAttribute("a", "b") |
416 | + self.assertEqual("a:b", a) |
417 | + |
418 | + def test_attributes_are_equivalent_if_prefix_and_name_identical(self): |
419 | + a = NamespacedAttribute("a", "b", "c") |
420 | + b = NamespacedAttribute("a", "b", "c") |
421 | + self.assertEqual(a, b) |
422 | + |
423 | + # The actual namespace is not considered. |
424 | + c = NamespacedAttribute("a", "b", None) |
425 | + self.assertEqual(a, c) |
426 | + |
427 | + # But name and prefix are important. |
428 | + d = NamespacedAttribute("a", "z", "c") |
429 | + self.assertNotEqual(a, d) |
430 | + |
431 | + e = NamespacedAttribute("z", "b", "c") |
432 | + self.assertNotEqual(a, e) |
433 | + |
434 | + |
435 | +class TestAttributeValueWithCharsetSubstitution(unittest.TestCase): |
436 | + |
437 | + def test_content_meta_attribute_value(self): |
438 | + value = CharsetMetaAttributeValue("euc-jp") |
439 | + self.assertEqual("euc-jp", value) |
440 | + self.assertEqual("euc-jp", value.original_value) |
441 | + self.assertEqual("utf8", value.encode("utf8")) |
442 | + |
443 | + |
444 | + def test_content_meta_attribute_value(self): |
445 | + value = ContentMetaAttributeValue("text/html; charset=euc-jp") |
446 | + self.assertEqual("text/html; charset=euc-jp", value) |
447 | + self.assertEqual("text/html; charset=euc-jp", value.original_value) |
448 | + self.assertEqual("text/html; charset=utf8", value.encode("utf8")) |
449 | |
450 | === modified file 'bs4/tests/test_soup.py' |
451 | --- bs4/tests/test_soup.py 2014-05-03 14:19:04 +0000 |
452 | +++ bs4/tests/test_soup.py 2014-11-14 17:22:47 +0000 |
453 | @@ -271,9 +271,9 @@ |
454 | dammit.unicode_markup, """<foo>''""</foo>""") |
455 | |
456 | def test_detect_utf8(self): |
457 | - utf8 = b"\xc3\xa9" |
458 | + utf8 = b"\xc3\xa9\xc3\xa9" |
459 | dammit = UnicodeDammit(utf8) |
460 | - self.assertEqual(dammit.unicode_markup, u'\xe9') |
461 | + self.assertEqual(dammit.unicode_markup, u'\xe9\xe9') |
462 | self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
463 | |
464 | def test_convert_hebrew(self): |
465 | |
466 | === modified file 'debian/changelog' |
467 | --- debian/changelog 2014-10-26 09:32:48 +0000 |
468 | +++ debian/changelog 2014-11-14 17:22:47 +0000 |
469 | @@ -1,3 +1,9 @@ |
470 | +beautifulsoup4 (4.3.2-2ubuntu2) vivid; urgency=medium |
471 | + |
472 | + * Work around tests failure when chardet is installed. |
473 | + |
474 | + -- Dmitry Shachnev <mitya57@ubuntu.com> Fri, 14 Nov 2014 13:49:28 +0300 |
475 | + |
476 | beautifulsoup4 (4.3.2-2ubuntu1) vivid; urgency=medium |
477 | |
478 | * Merge from Debian unstable. Remaining changes: |
479 | |
480 | === added file 'debian/patches/fix-chardet-failure' |
481 | --- debian/patches/fix-chardet-failure 1970-01-01 00:00:00 +0000 |
482 | +++ debian/patches/fix-chardet-failure 2014-11-14 17:22:47 +0000 |
483 | @@ -0,0 +1,21 @@ |
484 | +Description: fix tests failure when chardet is used |
485 | + This only happens when chardet is installed (and thus used for |
486 | + encoding detection). |
487 | +Author: Dmitry Shachnev <mitya57@ubuntu.com> |
488 | +Forwarded: yes, https://code.launchpad.net/~mitya57/beautifulsoup/tests/+merge/241832 |
489 | +Last-Update: 2014-11-14 |
490 | + |
491 | +--- a/bs4/tests/test_soup.py |
492 | ++++ b/bs4/tests/test_soup.py |
493 | +@@ -271,9 +271,9 @@ |
494 | + dammit.unicode_markup, """<foo>''""</foo>""") |
495 | + |
496 | + def test_detect_utf8(self): |
497 | +- utf8 = b"\xc3\xa9" |
498 | ++ utf8 = b"\xc3\xa9\xc3\xa9" |
499 | + dammit = UnicodeDammit(utf8) |
500 | +- self.assertEqual(dammit.unicode_markup, u'\xe9') |
501 | ++ self.assertEqual(dammit.unicode_markup, u'\xe9\xe9') |
502 | + self.assertEqual(dammit.original_encoding.lower(), 'utf-8') |
503 | + |
504 | + def test_convert_hebrew(self): |
505 | |
506 | === modified file 'debian/patches/series' |
507 | --- debian/patches/series 2014-10-16 22:57:41 +0000 |
508 | +++ debian/patches/series 2014-11-14 17:22:47 +0000 |
509 | @@ -1,1 +1,2 @@ |
510 | python3.4-warnings |
511 | +fix-chardet-failure |