1
=== modified file 'breezy/bzr/tests/test_xml.py'
2
--- breezy/bzr/tests/test_xml.py	2023-04-30 20:58:50 +0000
3
+++ breezy/bzr/tests/test_xml.py	2023-05-05 15:54:42 +0000
4
@@ -528,21 +528,14 @@
5
528
class TestEncodeAndEscape(TestCase):
528
class TestEncodeAndEscape(TestCase):
6
529
    """Whitebox testing of the _encode_and_escape function."""
529
    """Whitebox testing of the _encode_and_escape function."""
7
530
530
8
531
    def setUp(self):
9
532
        super().setUp()
10
533
        # Keep the cache clear before and after the test
11
534
        breezy.bzr.xml_serializer._clear_cache()
12
535
        self.addCleanup(breezy.bzr.xml_serializer._clear_cache)
13
536
14
537
    def test_simple_ascii(self):
531
    def test_simple_ascii(self):
15
538
        # _encode_and_escape always appends a final ", because these parameters
532
        # _encode_and_escape always appends a final ", because these parameters
16
539
        # are being used in xml attributes, and by returning it now, we have to
533
        # are being used in xml attributes, and by returning it now, we have to
17
540
        # do fewer string operations later.
534
        # do fewer string operations later.
18
541
        val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
535
        val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
19
542
        self.assertEqual(b'foo bar', val)
536
        self.assertEqual(b'foo bar', val)
20
543
        # The second time should be cached
21
544
        val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
537
        val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
23
545
        self.assertIs(val2, val)
538
        self.assertEqual(val2, val)
24
546
539
25
547
    def test_ascii_with_xml(self):
540
    def test_ascii_with_xml(self):
26
548
        self.assertEqual(b'&amp;&apos;&quot;&lt;&gt;',
541
        self.assertEqual(b'&amp;&apos;&quot;&lt;&gt;',
27
549
542
28
=== modified file 'breezy/bzr/xml_serializer.py'
29
--- breezy/bzr/xml_serializer.py	2023-05-03 17:01:16 +0000
30
+++ breezy/bzr/xml_serializer.py	2023-05-05 15:54:42 +0000
31
@@ -100,24 +100,6 @@
32
100
        return ElementTree().parse(f)
100
        return ElementTree().parse(f)
33
101
101
34
102
102
35
103
def escape_invalid_chars(message):
36
104
    """Escape the XML-invalid characters in a commit message.
37
105
38
106
    :param message: Commit message to escape
39
107
    :return: tuple with escaped message and number of characters escaped
40
108
    """
41
109
    if message is None:
42
110
        return None, 0
43
111
    # Python strings can include characters that can't be
44
112
    # represented in well-formed XML; escape characters that
45
113
    # aren't listed in the XML specification
46
114
    # (http://www.w3.org/TR/REC-xml/#NT-Char).
47
115
    return re.subn('[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
48
116
                   lambda match: match.group(0).encode(
49
117
                       'unicode_escape').decode('ascii'),
50
118
                   message)
51
119
52
120
53
121
def get_utf8_or_ascii(a_str):
103
def get_utf8_or_ascii(a_str):
54
122
    """Return a cached version of the string.
104
    """Return a cached version of the string.
55
123
105
56
@@ -139,84 +121,7 @@
57
139
        return a_str
121
        return a_str
58
140
122
59
141
123
138
142
_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')
124
from .._bzr_rs import encode_and_escape, escape_invalid_chars
61
143
_unicode_re = lazy_regex.lazy_compile('[&<>\'\"\u0080-\uffff]')
62
144
63
145
64
146
_xml_escape_map = {
65
147
    "&": '&amp;',
66
148
    "'": "&apos;",  # FIXME: overkill
67
149
    "\"": "&quot;",
68
150
    "<": "&lt;",
69
151
    ">": "&gt;",
70
152
    }
71
153
72
154
73
155
def _unicode_escape_replace(match, _map=_xml_escape_map):
74
156
    """Replace a string of non-ascii, non XML safe characters with their escape
75
157
76
158
    This will escape both Standard XML escapes, like <>"', etc.
77
159
    As well as escaping non ascii characters, because ElementTree did.
78
160
    This helps us remain compatible to older versions of bzr. We may change
79
161
    our policy in the future, though.
80
162
    """
81
163
    # jam 20060816 Benchmarks show that try/KeyError is faster if you
82
164
    # expect the entity to rarely miss. There is about a 10% difference
83
165
    # in overall time. But if you miss frequently, then if None is much
84
166
    # faster. For our use case, we *rarely* have a revision id, file id
85
167
    # or path name that is unicode. So use try/KeyError.
86
168
    try:
87
169
        return _map[match.group()]
88
170
    except KeyError:
89
171
        return "&#%d;" % ord(match.group())
90
172
91
173
92
174
def _utf8_escape_replace(match, _map=_xml_escape_map):
93
175
    """Escape utf8 characters into XML safe ones.
94
176
95
177
    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
96
178
    or it is handling characters with the high-bit set. For ascii characters,
97
179
    we just lookup the replacement in the dictionary. For everything else, we
98
180
    decode back into Unicode, and then use the XML escape code.
99
181
    """
100
182
    try:
101
183
        return _map[match.group().decode('ascii', 'replace')].encode()
102
184
    except KeyError:
103
185
        return b''.join(b'&#%d;' % ord(uni_chr)
104
186
                        for uni_chr in match.group().decode('utf8'))
105
187
106
188
107
189
_to_escaped_map: Dict[Union[bytes, str], str] = {}
108
190
109
191
110
192
def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
111
193
    """Encode the string into utf8, and escape invalid XML characters"""
112
194
    # We frequently get entities we have not seen before, so it is better
113
195
    # to check if None, rather than try/KeyError
114
196
    text = _map.get(unicode_or_utf8_str)
115
197
    if text is None:
116
198
        if isinstance(unicode_or_utf8_str, str):
117
199
            # The alternative policy is to do a regular UTF8 encoding
118
200
            # and then escape only XML meta characters.
119
201
            # Performance is equivalent once you use codecs. *However*
120
202
            # this makes the serialized texts incompatible with old versions
121
203
            # of bzr. So no net gain. (Perhaps the read code would handle utf8
122
204
            # better than entity escapes, but cElementTree seems to do just
123
205
            # fine either way)
124
206
            text = _unicode_re.sub(
125
207
                _unicode_escape_replace, unicode_or_utf8_str).encode()
126
208
        else:
127
209
            # Plain strings are considered to already be in utf-8 so we do a
128
210
            # slightly different method for escaping.
129
211
            text = _utf8_re.sub(_utf8_escape_replace,
130
212
                                unicode_or_utf8_str)
131
213
        _map[unicode_or_utf8_str] = text
132
214
    return text
133
215
134
216
135
217
def _clear_cache():
136
218
    """Clean out the unicode => escaped map"""
137
219
    _to_escaped_map.clear()
139
220
125
140
221
126
141
222
def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
127
def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
142
223
128
143
=== modified file 'crates/bazaar-py/src/lib.rs'
144
--- crates/bazaar-py/src/lib.rs	2023-05-05 12:17:33 +0000
145
+++ crates/bazaar-py/src/lib.rs	2023-05-05 15:54:42 +0000
146
@@ -4,7 +4,7 @@
147
4
use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};
4
use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};
148
5
use pyo3::import_exception;
5
use pyo3::import_exception;
149
6
use pyo3::prelude::*;
6
use pyo3::prelude::*;
151
7
use pyo3::types::{PyBytes, PyList, PyString};
7
use pyo3::types::{PyBytes, PyList, PyString, PyUnicode};
152
8
use pyo3_file::PyFileLikeObject;
8
use pyo3_file::PyFileLikeObject;
153
9
use std::collections::HashMap;
9
use std::collections::HashMap;
154
10
10
155
@@ -402,6 +402,28 @@
156
402
    }
402
    }
157
403
}
403
}
158
404
404
159
405
/// Python binding for `bazaar::xml_serializer::escape_invalid_chars`.
///
/// Forwards `message` unchanged to the Rust implementation and returns its
/// `(escaped message, count)` tuple as-is; a `None` message is passed through
/// as `None`.
#[pyfunction]
fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
    bazaar::xml_serializer::escape_invalid_chars(message)
}
163
409
164
410
#[pyfunction]
165
411
fn encode_and_escape(py: Python, unicode_or_utf8_str: PyObject) -> PyResult<&PyBytes> {
166
412
    if let Ok(text) = unicode_or_utf8_str.extract::<&str>(py) {
167
413
        Ok(PyBytes::new(
168
414
            py,
169
415
            bazaar::xml_serializer::encode_and_escape_string(text).as_slice(),
170
416
        ))
171
417
    } else if let Ok(bytes) = unicode_or_utf8_str.extract::<&[u8]>(py) {
172
418
        Ok(PyBytes::new(
173
419
            py,
174
420
            bazaar::xml_serializer::encode_and_escape_bytes(bytes).as_slice(),
175
421
        ))
176
422
    } else {
177
423
        Err(PyTypeError::new_err("expected str or bytes"))
178
424
    }
179
425
}
180
426
181
405
#[pymodule]
427
#[pymodule]
182
406
fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {
428
fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {
183
407
    m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;
429
    m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;
184
@@ -424,5 +446,7 @@
185
424
    m.add_wrapped(wrap_pyfunction!(is_null_revision))?;
446
    m.add_wrapped(wrap_pyfunction!(is_null_revision))?;
186
425
    m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;
447
    m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;
187
426
    m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;
448
    m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;
188
449
    m.add_wrapped(wrap_pyfunction!(escape_invalid_chars))?;
189
450
    m.add_wrapped(wrap_pyfunction!(encode_and_escape))?;
190
427
    Ok(())
451
    Ok(())
191
428
}
452
}
192
429
453
193
=== modified file 'crates/bazaar/src/globbing.rs'
194
--- crates/bazaar/src/globbing.rs	2023-04-22 02:01:38 +0000
195
+++ crates/bazaar/src/globbing.rs	2023-05-05 15:54:42 +0000
196
@@ -31,6 +31,8 @@
197
31
    Closure(Box<dyn FnMut(String) -> String + Sync + Send>),
31
    Closure(Box<dyn FnMut(String) -> String + Sync + Send>),
198
32
}
32
}
199
33
33
200
34
// TODO(jelmer): Consider using RegexSet from the regex crate instead.
201
35
202
34
/// Do a multiple-pattern substitution.
36
/// Do a multiple-pattern substitution.
203
35
///
37
///
204
36
/// The patterns and substitutions are combined into one, so the result of
38
/// The patterns and substitutions are combined into one, so the result of
205
37
39
206
=== modified file 'crates/bazaar/src/lib.rs'
207
--- crates/bazaar/src/lib.rs	2023-05-05 12:14:36 +0000
208
+++ crates/bazaar/src/lib.rs	2023-05-05 15:54:42 +0000
209
@@ -94,3 +94,4 @@
210
94
}
94
}
211
95
95
212
96
pub mod bencode_serializer;
96
pub mod bencode_serializer;
213
97
pub mod xml_serializer;
214
97
98
215
=== added file 'crates/bazaar/src/xml_serializer.rs'
216
--- crates/bazaar/src/xml_serializer.rs	1970-01-01 00:00:00 +0000
217
+++ crates/bazaar/src/xml_serializer.rs	2023-05-05 15:54:42 +0000
218
@@ -0,0 +1,81 @@
219
1
use std::str;
220
2
221
3
lazy_static::lazy_static! {
222
4
    static ref UTF8_RE: regex::bytes::Regex = regex::bytes::Regex::new(r#"(?-u)[&<>'"]|[\x80-\xff]+"#).unwrap();
223
5
    static ref UNICODE_RE: regex::Regex = regex::Regex::new(r#"[&<>'"\u{0080}-\u{ffff}]"#).unwrap();
224
6
225
7
}
226
8
227
9
/// Return the XML entity for one of the five ASCII characters that are
/// escaped (`&`, `'`, `"`, `<`, `>`), or `None` for any other byte.
fn escape_low(c: u8) -> Option<&'static str> {
    let entity = match c {
        b'&' => "&amp;",
        b'\'' => "&apos;",
        b'"' => "&quot;",
        b'<' => "&lt;",
        b'>' => "&gt;",
        // Everything else has no named entity.
        _ => return None,
    };
    Some(entity)
}
237
19
238
20
fn unicode_escape_replace(cap: &regex::Captures) -> String {
239
21
    let m = cap.get(0).unwrap();
240
22
    assert_eq!(m.as_str().chars().count(), 1,);
241
23
    let c = m.as_str().chars().next().unwrap();
242
24
    if m.len() == 1 {
243
25
        if let Some(ret) = escape_low(m.as_str().as_bytes()[0]) {
244
26
            return ret.to_string();
245
27
        }
246
28
    }
247
29
    format!("&#{};", c as u32)
248
30
}
249
31
250
32
fn utf8_escape_replace(cap: &regex::bytes::Captures) -> Vec<u8> {
251
33
    let m = cap.get(0).unwrap().as_bytes();
252
34
    eprintln!("m: {:?}", cap);
253
35
    if m.len() == 1 {
254
36
        if let Some(ret) = escape_low(m[0]) {
255
37
            return ret.as_bytes().to_vec();
256
38
        }
257
39
    }
258
40
    let utf8 = str::from_utf8(m).unwrap();
259
41
    utf8.chars()
260
42
        .map(|c| format!("&#{};", c as u64).into_bytes())
261
43
        .collect::<Vec<Vec<u8>>>()
262
44
        .concat()
263
45
}
264
46
265
47
pub fn encode_and_escape_string(text: &str) -> Vec<u8> {
266
48
    UNICODE_RE
267
49
        .replace_all(text, unicode_escape_replace)
268
50
        .as_bytes()
269
51
        .to_vec()
270
52
}
271
53
272
54
pub fn encode_and_escape_bytes(data: &[u8]) -> Vec<u8> {
273
55
    UTF8_RE.replace_all(data, utf8_escape_replace).to_vec()
274
56
}
275
57
276
58
/// Escape the XML-invalid characters in a commit message.
///
/// Characters outside the XML 1.0 `Char` production
/// (`#x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]`)
/// are replaced by a backslash escape: `\xNN` for code points up to 0xFF and
/// `\uNNNN` for larger ones (in practice U+FFFE and U+FFFF).
///
/// Returns the escaped message together with the number of substitutions
/// made. A run of consecutive invalid characters counts as one substitution,
/// matching the `re.subn`-based implementation this function replaces.
/// Returns `(None, 0)` when `message` is `None`.
pub fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
    /// True if `c` is allowed by the XML 1.0 `Char` production.
    fn is_xml_char(c: char) -> bool {
        matches!(
            c,
            '\t' | '\n'
                | '\r'
                | '\u{20}'..='\u{D7FF}'
                | '\u{E000}'..='\u{FFFD}'
                | '\u{10000}'..='\u{10FFFF}'
        )
    }

    let msg = match message {
        Some(msg) => msg,
        None => return (None, 0),
    };

    let mut escaped = String::with_capacity(msg.len());
    let mut substitutions = 0;
    // Tracks whether the previous character was invalid, so that a run of
    // invalid characters is counted once.
    let mut in_invalid_run = false;
    for c in msg.chars() {
        if is_xml_char(c) {
            in_invalid_run = false;
            escaped.push(c);
            continue;
        }
        if !in_invalid_run {
            substitutions += 1;
            in_invalid_run = true;
        }
        let code = u32::from(c);
        if code <= 0xFF {
            escaped.push_str(&format!("\\x{:02x}", code));
        } else {
            escaped.push_str(&format!("\\u{:04x}", code));
        }
    }

    (Some(escaped), substitutions)
}
Status:	Merged
Approved by:	Jelmer Vernooij on 2023-05-05
Approved revision:	7780
Merged at revision:	7790
Proposed branch:	lp:~jelmer/brz/serializer-xml-1
Merge into:	lp:brz
Diff against target:	299 lines (+111/-105) 6 files modified breezy/bzr/tests/test_xml.py (+1/-8) breezy/bzr/xml_serializer.py (+1/-96) crates/bazaar-py/src/lib.rs (+25/-1) crates/bazaar/src/globbing.rs (+2/-0) crates/bazaar/src/lib.rs (+1/-0) crates/bazaar/src/xml_serializer.rs (+81/-0)
To merge this branch:	bzr merge lp:~jelmer/brz/serializer-xml-1
Related bugs:	Link a bug report
Reviewer	Review Type	Date Requested	Status
Jelmer Vernooij			Approve on 2023-05-05
Review via email: mp+442461@code.launchpad.net