Merge lp:~cjwatson/wadllib/fix-mime-encoding into lp:wadllib

Proposed by Colin Watson
Status: Merged
Merged at revision: 34
Proposed branch: lp:~cjwatson/wadllib/fix-mime-encoding
Merge into: lp:wadllib
Diff against target: 442 lines (+208/-82)
8 files modified
README.txt (+1/-1)
setup.py (+0/-8)
src/wadllib/NEWS.txt (+10/-0)
src/wadllib/README.txt (+58/-11)
src/wadllib/application.py (+114/-47)
src/wadllib/tests/test_docs.py (+15/-7)
tox.ini (+10/-0)
versions.cfg (+0/-8)
To merge this branch: bzr merge lp:~cjwatson/wadllib/fix-mime-encoding
Reviewer Review Type Date Requested Status
Simon Davy (community) Approve
LAZR Developers Pending
Review via email: mp+348955@code.launchpad.net

Commit message

Reimplement MIME multipart/form-data encoding without binary corruption bugs.

Description of the change

I tried to persuade the standard library to do what I wanted, but getting it to work right across all Python versions was too hard.

To post a comment you must log in.
Revision history for this message
Pierre Equoy (pieq) wrote :

Hi,

is there an easy way to test this change to upload a binary attachment to the Staging instance of Launchpad?

37. By Colin Watson

Test a few more variations of binary data.

Revision history for this message
Colin Watson (cjwatson) wrote :

You just need to check out this branch and install it in a virtualenv using pip.

Anyway, I've tested this myself against a development instance of Launchpad, explicitly testing the example @pieq gave in the linked bug as well as things like files ending with an \r byte, and everything now seems to round-trip successfully.

Revision history for this message
Simon Davy (bloodearnest) wrote :

LGTM, I agree that it's small enough in inline, so to speak.

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'README.txt'
2--- README.txt 2012-01-18 20:55:11 +0000
3+++ README.txt 2018-07-13 10:07:36 +0000
4@@ -1,6 +1,6 @@
5 Navigate HTTP resources using WADL files as guides.
6
7-wadllib should work with Python >= 2.4.
8+wadllib should work with Python >= 2.6.
9
10 ..
11 Copyright (C) 2008-2009 Canonical Ltd.
12
13=== modified file 'setup.py'
14--- setup.py 2014-06-24 17:54:15 +0000
15+++ setup.py 2018-07-13 10:07:36 +0000
16@@ -44,14 +44,6 @@
17 'setuptools',
18 'lazr.uri',
19 ]
20-if sys.version < '2.5':
21- # This module is in the standard library as of Python 2.5. In Python 3,
22- # it doesn't work when fetched separately.
23- install_requires.append('elementtree')
24-if sys.version < '2.6':
25- # This module is in the standard library as of Python 2.6. In Python 3,
26- # it doesn't work when fetched separately.
27- install_requires.extend('simplejson')
28
29 setup(
30 name='wadllib',
31
32=== modified file 'src/wadllib/NEWS.txt'
33--- src/wadllib/NEWS.txt 2013-02-25 18:53:09 +0000
34+++ src/wadllib/NEWS.txt 2018-07-13 10:07:36 +0000
35@@ -2,6 +2,16 @@
36 NEWS for wadllib
37 ================
38
39+1.3.3
40+=====
41+
42+- Drop support for Python < 2.6.
43+- Add tox testing support.
44+- Implement a subset of MIME multipart/form-data encoding locally rather
45+ than using the standard library's email module, which doesn't have good
46+ handling of binary parts and corrupts bytes in them that look like line
47+ endings in various ways depending on the Python version. [bug=1729754]
48+
49 1.3.2 (2013-02-25)
50 ==================
51
52
53=== modified file 'src/wadllib/README.txt'
54--- src/wadllib/README.txt 2015-02-11 04:19:08 +0000
55+++ src/wadllib/README.txt 2018-07-13 10:07:36 +0000
56@@ -295,10 +295,7 @@
57 to another object. The 'link' property of a parameter lets you examine
58 links before following them.
59
60- >>> try:
61- ... import simplejson as json
62- ... except ImportError:
63- ... import json
64+ >>> import json
65 >>> links_wadl = application_for('links-wadl.xml')
66 >>> service_root = links_wadl.get_resource_by_path('')
67 >>> representation = json.dumps(
68@@ -568,13 +565,63 @@
69 ... "http://www.example.com/", binary_stream)
70 >>> service_root = binary_wadl.get_resource_by_path('')
71
72- >>> method = service_root.get_method('post', 'multipart/form-data')
73- >>> media_type, doc = method.build_representation(
74- ... text_field="text", binary_field="\x01\x02")
75- >>> print(media_type)
76- multipart/form-data; boundary=...
77- >>> b'\x01\x02' in doc
78- True
79+Define a helper that processes the representation the same way
80+zope.publisher would.
81+
82+ >>> import cgi
83+ >>> import io
84+ >>> def assert_message_parts(media_type, doc, expected):
85+ ... if sys.version_info[0] == 3 and sys.version_info[1] < 3:
86+ ... # We can't do much due to https://bugs.python.org/issue18013.
87+ ... for value in expected:
88+ ... if not isinstance(value, bytes):
89+ ... value = value.encode('UTF-8')
90+ ... assert value in doc
91+ ... return
92+ ... environ = {
93+ ... 'REQUEST_METHOD': 'POST',
94+ ... 'CONTENT_TYPE': media_type,
95+ ... 'CONTENT_LENGTH': str(len(doc)),
96+ ... }
97+ ... kwargs = (
98+ ... {'encoding': 'UTF-8'} if sys.version_info[0] >= 3 else {})
99+ ... fs = cgi.FieldStorage(
100+ ... fp=io.BytesIO(doc), environ=environ, keep_blank_values=1,
101+ ... **kwargs)
102+ ... values = []
103+ ... def append_values(fields):
104+ ... for field in fields:
105+ ... if field.list:
106+ ... append_values(field.list)
107+ ... else:
108+ ... values.append(field.value)
109+ ... append_values(fs.list)
110+ ... assert values == expected, (
111+ ... 'Expected %s, got %s' % (expected, values))
112+
113+ >>> method = service_root.get_method('post', 'multipart/form-data')
114+ >>> media_type, doc = method.build_representation(
115+ ... text_field="text", binary_field=b"\x01\x02\r\x81\r")
116+ >>> print(media_type)
117+ multipart/form-data; boundary=...
118+ >>> assert_message_parts(media_type, doc, ['text', b'\x01\x02\r\x81\r'])
119+
120+ >>> method = service_root.get_method('post', 'multipart/form-data')
121+ >>> media_type, doc = method.build_representation(
122+ ... text_field="text\n", binary_field=b"\x01\x02\r\x81\n\r")
123+ >>> print(media_type)
124+ multipart/form-data; boundary=...
125+ >>> assert_message_parts(
126+ ... media_type, doc, ['text\r\n', b'\x01\x02\r\x81\n\r'])
127+
128+ >>> method = service_root.get_method('post', 'multipart/form-data')
129+ >>> media_type, doc = method.build_representation(
130+ ... text_field="text\r\nmore\r\n",
131+ ... binary_field=b"\x01\x02\r\n\x81\r\x82\n")
132+ >>> print(media_type)
133+ multipart/form-data; boundary=...
134+ >>> assert_message_parts(
135+ ... media_type, doc, ['text\r\nmore\r\n', b'\x01\x02\r\n\x81\r\x82\n'])
136
137 >>> method = service_root.get_method('post', 'text/unknown')
138 >>> method.build_representation(field="value")
139
140=== modified file 'src/wadllib/application.py'
141--- src/wadllib/application.py 2015-02-11 03:56:31 +0000
142+++ src/wadllib/application.py 2018-07-13 10:07:36 +0000
143@@ -1,4 +1,4 @@
144-# Copyright 2008-2013 Canonical Ltd. All rights reserved.
145+# Copyright 2008-2018 Canonical Ltd. All rights reserved.
146
147 # This file is part of wadllib.
148 #
149@@ -41,38 +41,20 @@
150 'WADLError',
151 ]
152
153-try:
154- # Make sure to try the Python 2 form first; we don't want to use
155- # io.StringIO in Python 2, because it is strict about only accepting
156- # unicode input.
157- from cStringIO import StringIO as BytesIO
158-except ImportError:
159- from io import BytesIO
160 import datetime
161-try:
162- from email.mime.multipart import MIMEMultipart
163- from email.mime.nonmultipart import MIMENonMultipart
164-except ImportError:
165- # We must be on 2.4.
166- from email.MIMEMultipart import MIMEMultipart
167- from email.MIMENonMultipart import MIMENonMultipart
168-
169+from email.utils import quote
170+import io
171+import json
172+import random
173+import re
174+import sys
175 import time
176 try:
177 from urllib.parse import urlencode
178 except ImportError:
179 from urllib import urlencode
180-try:
181- import simplejson as json
182-except ImportError:
183- import json
184-try:
185- import xml.etree.cElementTree as ET
186-except ImportError:
187- try:
188- import cElementTree as ET
189- except ImportError:
190- import elementtree.ElementTree as ET
191+import xml.etree.cElementTree as ET
192+
193 from lazr.uri import URI, merge
194
195 from wadllib import (
196@@ -823,6 +805,105 @@
197 """The media type of the representation described here."""
198 return self.resolve_definition().tag.attrib['mediaType']
199
200+ def _make_boundary(self, all_parts):
201+ """Make a random boundary that does not appear in `all_parts`."""
202+ _width = len(repr(sys.maxsize - 1))
203+ _fmt = '%%0%dd' % _width
204+ token = random.randrange(sys.maxsize)
205+ boundary = ('=' * 15) + (_fmt % token) + '=='
206+ if all_parts is None:
207+ return boundary
208+ b = boundary
209+ counter = 0
210+ while True:
211+ pattern = ('^--' + re.escape(b) + '(--)?$').encode('ascii')
212+ if not re.search(pattern, all_parts, flags=re.MULTILINE):
213+ break
214+ b = boundary + '.' + str(counter)
215+ counter += 1
216+ return b
217+
218+ def _write_headers(self, buf, headers):
219+ """Write MIME headers to a file object."""
220+ for key, value in headers:
221+ buf.write(key.encode('UTF-8'))
222+ buf.write(b': ')
223+ buf.write(value.encode('UTF-8'))
224+ buf.write(b'\r\n')
225+ buf.write(b'\r\n')
226+
227+ def _write_boundary(self, buf, boundary, closing=False):
228+ """Write a multipart boundary to a file object."""
229+ buf.write(b'--')
230+ buf.write(boundary.encode('UTF-8'))
231+ if closing:
232+ buf.write(b'--')
233+ buf.write(b'\r\n')
234+
235+ def _generate_multipart_form(self, parts):
236+ """Generate a multipart/form-data message.
237+
238+ This is very loosely based on the email module in the Python standard
239+ library. However, that module doesn't really support directly embedding
240+ binary data in a form: various versions of Python have mangled line
241+ separators in different ways, and none of them get it quite right.
242+ Since we only need a tiny subset of MIME here, it's easier to implement
243+ it ourselves.
244+
245+ :return: a tuple of two elements: the Content-Type of the message, and
246+ the entire encoded message as a byte string.
247+ """
248+ # Generate the subparts first so that we can calculate a safe boundary.
249+ encoded_parts = []
250+ for is_binary, name, value in parts:
251+ buf = io.BytesIO()
252+ if is_binary:
253+ ctype = 'application/octet-stream'
254+ # RFC 7578 says that the filename parameter isn't mandatory
255+ # in our case, but without it cgi.FieldStorage tries to
256+ # decode as text on Python 3.
257+ cdisp = 'form-data; name="%s"; filename="%s"' % (
258+ quote(name), quote(name))
259+ else:
260+ ctype = 'text/plain; charset="utf-8"'
261+ cdisp = 'form-data; name="%s"' % quote(name)
262+ self._write_headers(buf, [
263+ ('MIME-Version', '1.0'),
264+ ('Content-Type', ctype),
265+ ('Content-Disposition', cdisp),
266+ ])
267+ if is_binary:
268+ if not isinstance(value, bytes):
269+ raise TypeError('bytes payload expected: %s' % type(value))
270+ buf.write(value)
271+ else:
272+ if not isinstance(value, str):
273+ raise TypeError('str payload expected: %s' % type(value))
274+ lines = re.split(r'\r\n|\r|\n', value)
275+ for line in lines[:-1]:
276+ buf.write(line.encode('UTF-8'))
277+ buf.write(b'\r\n')
278+ buf.write(lines[-1].encode('UTF-8'))
279+ encoded_parts.append(buf.getvalue())
280+
281+ # Create a suitable boundary.
282+ boundary = self._make_boundary(b'\r\n'.join(encoded_parts))
283+
284+ # Now we can write the multipart headers, followed by all the parts.
285+ buf = io.BytesIO()
286+ ctype = 'multipart/form-data; boundary="%s"' % quote(boundary)
287+ self._write_headers(buf, [
288+ ('MIME-Version', '1.0'),
289+ ('Content-Type', ctype),
290+ ])
291+ for encoded_part in encoded_parts:
292+ self._write_boundary(buf, boundary)
293+ buf.write(encoded_part)
294+ buf.write(b'\r\n')
295+ self._write_boundary(buf, boundary, closing=True)
296+
297+ return ctype, buf.getvalue()
298+
299 def bind(self, param_values, **kw_param_values):
300 """Bind the definition to parameter values, creating a document.
301
302@@ -836,29 +917,13 @@
303 if media_type == 'application/x-www-form-urlencoded':
304 doc = urlencode(sorted(validated_values.items()))
305 elif media_type == 'multipart/form-data':
306- outer = MIMEMultipart()
307- outer.set_type('multipart/form-data')
308+ parts = []
309 missing = object()
310 for param in params:
311 value = validated_values.get(param.name, missing)
312 if value is not missing:
313- if param.type == 'binary':
314- maintype, subtype = 'application', 'octet-stream'
315- params = {}
316- else:
317- maintype, subtype = 'text', 'plain'
318- params = {'charset' : 'utf-8'}
319- inner = MIMENonMultipart(maintype, subtype, **params)
320- inner.set_payload(value)
321- inner['Content-Disposition'] = (
322- 'form-data; name="%s"' % param.name)
323- outer.attach(inner)
324- if hasattr(outer, "as_bytes"):
325- doc = outer.as_bytes()
326- else:
327- doc = outer.as_string(unixfrom=False)
328- media_type = (outer.get_content_type() +
329- '; boundary="%s"' % outer.get_boundary())
330+ parts.append((param.type == 'binary', param.name, value))
331+ media_type, doc = self._generate_multipart_form(parts)
332 elif media_type == 'application/json':
333 doc = json.dumps(validated_values)
334 else:
335@@ -1104,7 +1169,9 @@
336
337 def _from_string(self, markup):
338 """Turns markup into a document."""
339- return self._from_stream(BytesIO(markup))
340+ if not isinstance(markup, bytes):
341+ markup = markup.encode("UTF-8")
342+ return self._from_stream(io.BytesIO(markup))
343
344 def get_resource_type(self, resource_type_url):
345 """Retrieve a resource type by the URL of its description."""
346
347=== modified file 'src/wadllib/tests/test_docs.py'
348--- src/wadllib/tests/test_docs.py 2009-03-20 20:30:07 +0000
349+++ src/wadllib/tests/test_docs.py 2018-07-13 10:07:36 +0000
350@@ -1,4 +1,4 @@
351-# Copyright 2009 Canonical Ltd. All rights reserved.
352+# Copyright 2009-2018 Canonical Ltd. All rights reserved.
353 #
354 # This file is part of wadllib
355 #
356@@ -16,16 +16,20 @@
357
358 """Test harness."""
359
360+from __future__ import absolute_import, print_function
361+
362 __metaclass__ = type
363 __all__ = [
364- 'additional_tests',
365+ 'load_tests',
366 ]
367
368+import __future__
369 import atexit
370 import doctest
371 import os
372+
373 import pkg_resources
374-import unittest
375+
376
377 DOCTEST_FLAGS = (
378 doctest.ELLIPSIS |
379@@ -33,7 +37,7 @@
380 doctest.REPORT_NDIFF)
381
382
383-def additional_tests():
384+def load_tests(loader, tests, pattern):
385 doctest_files = [
386 os.path.abspath(
387 pkg_resources.resource_filename('wadllib', 'README.txt'))]
388@@ -44,7 +48,11 @@
389 os.path.abspath(
390 pkg_resources.resource_filename(
391 'wadllib', 'docs/%s' % name)))
392- kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
393+ globs = {
394+ future_item: getattr(__future__, future_item)
395+ for future_item in ('absolute_import', 'print_function')}
396+ kwargs = dict(
397+ module_relative=False, globs=globs, optionflags=DOCTEST_FLAGS)
398 atexit.register(pkg_resources.cleanup_resources)
399- return unittest.TestSuite((
400- doctest.DocFileSuite(*doctest_files, **kwargs)))
401+ tests.addTest(doctest.DocFileSuite(*doctest_files, **kwargs))
402+ return tests
403
404=== added file 'tox.ini'
405--- tox.ini 1970-01-01 00:00:00 +0000
406+++ tox.ini 2018-07-13 10:07:36 +0000
407@@ -0,0 +1,10 @@
408+[tox]
409+envlist =
410+ py27,py32,py33,py34,py35,py36,py37
411+
412+[testenv]
413+commands =
414+ zope-testrunner --test-path src --tests-pattern ^tests {posargs}
415+deps =
416+ .[test]
417+ zope-testrunner
418
419=== modified file 'versions.cfg'
420--- versions.cfg 2011-11-25 19:27:47 +0000
421+++ versions.cfg 2018-07-13 10:07:36 +0000
422@@ -26,10 +26,6 @@
423
424 # Required by:
425 # wadllib==1.2.0
426-elementtree = 1.2.6-20050316
427-
428-# Required by:
429-# wadllib==1.2.0
430 lazr.uri = 1.0.2
431
432 # Required by:
433@@ -46,10 +42,6 @@
434 setuptools = 0.6c11
435
436 # Required by:
437-# wadllib==1.2.0
438-simplejson = 2.2.1
439-
440-# Required by:
441 # z3c.recipe.sphinxdoc==0.0.8
442 zc.buildout = 1.5.2
443

Subscribers

People subscribed via source and target branches