Merge lp:~jelmer/brz/serializer-xml-1 into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: 7780
Merged at revision: 7790
Proposed branch: lp:~jelmer/brz/serializer-xml-1
Merge into: lp:brz
Diff against target: 299 lines (+111/-105)
6 files modified
breezy/bzr/tests/test_xml.py (+1/-8)
breezy/bzr/xml_serializer.py (+1/-96)
crates/bazaar-py/src/lib.rs (+25/-1)
crates/bazaar/src/globbing.rs (+2/-0)
crates/bazaar/src/lib.rs (+1/-0)
crates/bazaar/src/xml_serializer.rs (+81/-0)
To merge this branch: bzr merge lp:~jelmer/brz/serializer-xml-1
Reviewer Review Type Date Requested Status
Jelmer Vernooij Approve
Review via email: mp+442461@code.launchpad.net

Commit message

Move some xml helper functions to rust

Description of the change

Move some xml helper functions to rust

To post a comment you must log in.
Revision history for this message
Jelmer Vernooij (jelmer) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'breezy/bzr/tests/test_xml.py'
--- breezy/bzr/tests/test_xml.py 2023-04-30 20:58:50 +0000
+++ breezy/bzr/tests/test_xml.py 2023-05-05 15:54:42 +0000
@@ -528,21 +528,14 @@
528class TestEncodeAndEscape(TestCase):528class TestEncodeAndEscape(TestCase):
529 """Whitebox testing of the _encode_and_escape function."""529 """Whitebox testing of the _encode_and_escape function."""
530530
531 def setUp(self):
532 super().setUp()
533 # Keep the cache clear before and after the test
534 breezy.bzr.xml_serializer._clear_cache()
535 self.addCleanup(breezy.bzr.xml_serializer._clear_cache)
536
537 def test_simple_ascii(self):531 def test_simple_ascii(self):
538 # _encode_and_escape always appends a final ", because these parameters532 # _encode_and_escape always appends a final ", because these parameters
539 # are being used in xml attributes, and by returning it now, we have to533 # are being used in xml attributes, and by returning it now, we have to
540 # do fewer string operations later.534 # do fewer string operations later.
541 val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')535 val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
542 self.assertEqual(b'foo bar', val)536 self.assertEqual(b'foo bar', val)
543 # The second time should be cached
544 val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')537 val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
545 self.assertIs(val2, val)538 self.assertEqual(val2, val)
546539
547 def test_ascii_with_xml(self):540 def test_ascii_with_xml(self):
548 self.assertEqual(b'&'"<>',541 self.assertEqual(b'&'"<>',
549542
=== modified file 'breezy/bzr/xml_serializer.py'
--- breezy/bzr/xml_serializer.py 2023-05-03 17:01:16 +0000
+++ breezy/bzr/xml_serializer.py 2023-05-05 15:54:42 +0000
@@ -100,24 +100,6 @@
100 return ElementTree().parse(f)100 return ElementTree().parse(f)
101101
102102
103def escape_invalid_chars(message):
104 """Escape the XML-invalid characters in a commit message.
105
106 :param message: Commit message to escape
107 :return: tuple with escaped message and number of characters escaped
108 """
109 if message is None:
110 return None, 0
111 # Python strings can include characters that can't be
112 # represented in well-formed XML; escape characters that
113 # aren't listed in the XML specification
114 # (http://www.w3.org/TR/REC-xml/#NT-Char).
115 return re.subn('[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
116 lambda match: match.group(0).encode(
117 'unicode_escape').decode('ascii'),
118 message)
119
120
121def get_utf8_or_ascii(a_str):103def get_utf8_or_ascii(a_str):
122 """Return a cached version of the string.104 """Return a cached version of the string.
123105
@@ -139,84 +121,7 @@
139 return a_str121 return a_str
140122
141123
142_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')124from .._bzr_rs import encode_and_escape, escape_invalid_chars
143_unicode_re = lazy_regex.lazy_compile('[&<>\'\"\u0080-\uffff]')
144
145
146_xml_escape_map = {
147 "&": '&amp;',
148 "'": "&apos;", # FIXME: overkill
149 "\"": "&quot;",
150 "<": "&lt;",
151 ">": "&gt;",
152 }
153
154
155def _unicode_escape_replace(match, _map=_xml_escape_map):
156 """Replace a string of non-ascii, non XML safe characters with their escape
157
158 This will escape both Standard XML escapes, like <>"', etc.
159 As well as escaping non ascii characters, because ElementTree did.
160 This helps us remain compatible to older versions of bzr. We may change
161 our policy in the future, though.
162 """
163 # jam 20060816 Benchmarks show that try/KeyError is faster if you
164 # expect the entity to rarely miss. There is about a 10% difference
165 # in overall time. But if you miss frequently, then if None is much
166 # faster. For our use case, we *rarely* have a revision id, file id
167 # or path name that is unicode. So use try/KeyError.
168 try:
169 return _map[match.group()]
170 except KeyError:
171 return "&#%d;" % ord(match.group())
172
173
174def _utf8_escape_replace(match, _map=_xml_escape_map):
175 """Escape utf8 characters into XML safe ones.
176
177 This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
178 or it is handling characters with the high-bit set. For ascii characters,
179 we just lookup the replacement in the dictionary. For everything else, we
180 decode back into Unicode, and then use the XML escape code.
181 """
182 try:
183 return _map[match.group().decode('ascii', 'replace')].encode()
184 except KeyError:
185 return b''.join(b'&#%d;' % ord(uni_chr)
186 for uni_chr in match.group().decode('utf8'))
187
188
189_to_escaped_map: Dict[Union[bytes, str], str] = {}
190
191
192def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
193 """Encode the string into utf8, and escape invalid XML characters"""
194 # We frequently get entities we have not seen before, so it is better
195 # to check if None, rather than try/KeyError
196 text = _map.get(unicode_or_utf8_str)
197 if text is None:
198 if isinstance(unicode_or_utf8_str, str):
199 # The alternative policy is to do a regular UTF8 encoding
200 # and then escape only XML meta characters.
201 # Performance is equivalent once you use codecs. *However*
202 # this makes the serialized texts incompatible with old versions
203 # of bzr. So no net gain. (Perhaps the read code would handle utf8
204 # better than entity escapes, but cElementTree seems to do just
205 # fine either way)
206 text = _unicode_re.sub(
207 _unicode_escape_replace, unicode_or_utf8_str).encode()
208 else:
209 # Plain strings are considered to already be in utf-8 so we do a
210 # slightly different method for escaping.
211 text = _utf8_re.sub(_utf8_escape_replace,
212 unicode_or_utf8_str)
213 _map[unicode_or_utf8_str] = text
214 return text
215
216
217def _clear_cache():
218 """Clean out the unicode => escaped map"""
219 _to_escaped_map.clear()
220125
221126
222def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):127def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
223128
=== modified file 'crates/bazaar-py/src/lib.rs'
--- crates/bazaar-py/src/lib.rs 2023-05-05 12:17:33 +0000
+++ crates/bazaar-py/src/lib.rs 2023-05-05 15:54:42 +0000
@@ -4,7 +4,7 @@
4use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};4use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};
5use pyo3::import_exception;5use pyo3::import_exception;
6use pyo3::prelude::*;6use pyo3::prelude::*;
7use pyo3::types::{PyBytes, PyList, PyString};7use pyo3::types::{PyBytes, PyList, PyString, PyUnicode};
8use pyo3_file::PyFileLikeObject;8use pyo3_file::PyFileLikeObject;
9use std::collections::HashMap;9use std::collections::HashMap;
1010
@@ -402,6 +402,28 @@
402 }402 }
403}403}
404404
/// Python-visible binding for `bazaar::xml_serializer::escape_invalid_chars`.
///
/// Forwards `message` unchanged and returns the `(Option<String>, usize)`
/// pair produced by the Rust implementation without further processing.
405#[pyfunction]
406fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
407 bazaar::xml_serializer::escape_invalid_chars(message)
408}
409
410#[pyfunction]
411fn encode_and_escape(py: Python, unicode_or_utf8_str: PyObject) -> PyResult<&PyBytes> {
412 if let Ok(text) = unicode_or_utf8_str.extract::<&str>(py) {
413 Ok(PyBytes::new(
414 py,
415 bazaar::xml_serializer::encode_and_escape_string(text).as_slice(),
416 ))
417 } else if let Ok(bytes) = unicode_or_utf8_str.extract::<&[u8]>(py) {
418 Ok(PyBytes::new(
419 py,
420 bazaar::xml_serializer::encode_and_escape_bytes(bytes).as_slice(),
421 ))
422 } else {
423 Err(PyTypeError::new_err("expected str or bytes"))
424 }
425}
426
405#[pymodule]427#[pymodule]
406fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {428fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {
407 m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;429 m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;
@@ -424,5 +446,7 @@
424 m.add_wrapped(wrap_pyfunction!(is_null_revision))?;446 m.add_wrapped(wrap_pyfunction!(is_null_revision))?;
425 m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;447 m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;
426 m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;448 m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;
449 m.add_wrapped(wrap_pyfunction!(escape_invalid_chars))?;
450 m.add_wrapped(wrap_pyfunction!(encode_and_escape))?;
427 Ok(())451 Ok(())
428}452}
429453
=== modified file 'crates/bazaar/src/globbing.rs'
--- crates/bazaar/src/globbing.rs 2023-04-22 02:01:38 +0000
+++ crates/bazaar/src/globbing.rs 2023-05-05 15:54:42 +0000
@@ -31,6 +31,8 @@
31 Closure(Box<dyn FnMut(String) -> String + Sync + Send>),31 Closure(Box<dyn FnMut(String) -> String + Sync + Send>),
32}32}
3333
34// TODO(jelmer): Consider using RegexSet from the regex crate instead.
35
34/// Do a multiple-pattern substitution.36/// Do a multiple-pattern substitution.
35///37///
36/// The patterns and substitutions are combined into one, so the result of38/// The patterns and substitutions are combined into one, so the result of
3739
=== modified file 'crates/bazaar/src/lib.rs'
--- crates/bazaar/src/lib.rs 2023-05-05 12:14:36 +0000
+++ crates/bazaar/src/lib.rs 2023-05-05 15:54:42 +0000
@@ -94,3 +94,4 @@
94}94}
9595
96pub mod bencode_serializer;96pub mod bencode_serializer;
97pub mod xml_serializer;
9798
=== added file 'crates/bazaar/src/xml_serializer.rs'
--- crates/bazaar/src/xml_serializer.rs 1970-01-01 00:00:00 +0000
+++ crates/bazaar/src/xml_serializer.rs 2023-05-05 15:54:42 +0000
@@ -0,0 +1,81 @@
1use std::str;
2
// Lazily-initialised regexes used by the escaping helpers in this module.
3lazy_static::lazy_static! {
// Byte-oriented pattern with Unicode mode switched off (`(?-u)`): matches a
// single `&`, `<`, `>`, `'` or `"`, or a run of one or more bytes in the
// range 0x80..=0xff.
4 static ref UTF8_RE: regex::bytes::Regex = regex::bytes::Regex::new(r#"(?-u)[&<>'"]|[\x80-\xff]+"#).unwrap();
// Text pattern: matches exactly one character that is either `&`, `<`, `>`,
// `'`, `"` or lies in U+0080..=U+FFFF.
5 static ref UNICODE_RE: regex::Regex = regex::Regex::new(r#"[&<>'"\u{0080}-\u{ffff}]"#).unwrap();
6
7}
8
/// Look up the named XML entity for an ASCII metacharacter.
///
/// Returns `Some(entity)` for `&`, `'`, `"`, `<` and `>`, and `None` for
/// every other byte.
fn escape_low(c: u8) -> Option<&'static str> {
    // (byte, entity) pairs for the five characters that get a named entity.
    const ENTITIES: [(u8, &str); 5] = [
        (b'&', "&amp;"),
        (b'\'', "&apos;"),
        (b'"', "&quot;"),
        (b'<', "&lt;"),
        (b'>', "&gt;"),
    ];
    ENTITIES
        .iter()
        .copied()
        .find(|&(byte, _)| byte == c)
        .map(|(_, entity)| entity)
}
19
20fn unicode_escape_replace(cap: &regex::Captures) -> String {
21 let m = cap.get(0).unwrap();
22 assert_eq!(m.as_str().chars().count(), 1,);
23 let c = m.as_str().chars().next().unwrap();
24 if m.len() == 1 {
25 if let Some(ret) = escape_low(m.as_str().as_bytes()[0]) {
26 return ret.to_string();
27 }
28 }
29 format!("&#{};", c as u32)
30}
31
32fn utf8_escape_replace(cap: &regex::bytes::Captures) -> Vec<u8> {
33 let m = cap.get(0).unwrap().as_bytes();
34 eprintln!("m: {:?}", cap);
35 if m.len() == 1 {
36 if let Some(ret) = escape_low(m[0]) {
37 return ret.as_bytes().to_vec();
38 }
39 }
40 let utf8 = str::from_utf8(m).unwrap();
41 utf8.chars()
42 .map(|c| format!("&#{};", c as u64).into_bytes())
43 .collect::<Vec<Vec<u8>>>()
44 .concat()
45}
46
47pub fn encode_and_escape_string(text: &str) -> Vec<u8> {
48 UNICODE_RE
49 .replace_all(text, unicode_escape_replace)
50 .as_bytes()
51 .to_vec()
52}
53
54pub fn encode_and_escape_bytes(data: &[u8]) -> Vec<u8> {
55 UTF8_RE.replace_all(data, utf8_escape_replace).to_vec()
56}
57
/// Escape the XML-invalid characters in a commit message.
///
/// ASCII control characters other than tab, line feed, carriage return and
/// DEL, plus the non-characters U+FFFE and U+FFFF, are replaced by a
/// backslash escape: `\xNN` for code points up to 0xFF and `\uNNNN` above
/// that. All other characters are copied through unchanged.
///
/// Returns the escaped message together with the number of replacements
/// made. A run of adjacent invalid characters counts as a single
/// replacement, so the count is zero exactly when nothing was escaped.
/// `None` input yields `(None, 0)`.
pub fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
    let msg = match message {
        Some(msg) => msg,
        None => return (None, 0),
    };

    let mut escaped = String::with_capacity(msg.len());
    let mut replacements = 0;
    let mut in_run = false;
    for c in msg.chars() {
        let code = c as u32;
        // A Rust `char` can never be a surrogate (U+D800..=U+DFFF), so that
        // range needs no check here.
        let invalid = match c {
            '\t' | '\n' | '\r' | '\x7f' => false,
            _ => c.is_ascii_control() || code == 0xFFFE || code == 0xFFFF,
        };
        if invalid {
            // Count each run of invalid characters once.
            if !in_run {
                replacements += 1;
            }
            if code <= 0xFF {
                escaped.push_str(&format!("\\x{:02x}", code));
            } else {
                escaped.push_str(&format!("\\u{:04x}", code));
            }
        } else {
            escaped.push(c);
        }
        in_run = invalid;
    }

    (Some(escaped), replacements)
}

Subscribers

People subscribed via source and target branches