Merge lp:~jelmer/brz/serializer-xml-1 into lp:brz

Proposed by Jelmer Vernooij
Status: Merged
Approved by: Jelmer Vernooij
Approved revision: 7780
Merged at revision: 7790
Proposed branch: lp:~jelmer/brz/serializer-xml-1
Merge into: lp:brz
Diff against target: 299 lines (+111/-105)
6 files modified
breezy/bzr/tests/test_xml.py (+1/-8)
breezy/bzr/xml_serializer.py (+1/-96)
crates/bazaar-py/src/lib.rs (+25/-1)
crates/bazaar/src/globbing.rs (+2/-0)
crates/bazaar/src/lib.rs (+1/-0)
crates/bazaar/src/xml_serializer.rs (+81/-0)
To merge this branch: bzr merge lp:~jelmer/brz/serializer-xml-1
Reviewer Review Type Date Requested Status
Jelmer Vernooij Approve
Review via email: mp+442461@code.launchpad.net

Commit message

Move some XML helper functions to Rust

Description of the change

Move some XML helper functions to Rust

To post a comment you must log in.
Revision history for this message
Jelmer Vernooij (jelmer) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'breezy/bzr/tests/test_xml.py'
2--- breezy/bzr/tests/test_xml.py 2023-04-30 20:58:50 +0000
3+++ breezy/bzr/tests/test_xml.py 2023-05-05 15:54:42 +0000
4@@ -528,21 +528,14 @@
5 class TestEncodeAndEscape(TestCase):
6 """Whitebox testing of the _encode_and_escape function."""
7
8- def setUp(self):
9- super().setUp()
10- # Keep the cache clear before and after the test
11- breezy.bzr.xml_serializer._clear_cache()
12- self.addCleanup(breezy.bzr.xml_serializer._clear_cache)
13-
14 def test_simple_ascii(self):
15 # _encode_and_escape always appends a final ", because these parameters
16 # are being used in xml attributes, and by returning it now, we have to
17 # do fewer string operations later.
18 val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
19 self.assertEqual(b'foo bar', val)
20- # The second time should be cached
21 val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
22- self.assertIs(val2, val)
23+ self.assertEqual(val2, val)
24
25 def test_ascii_with_xml(self):
26 self.assertEqual(b'&amp;&apos;&quot;&lt;&gt;',
27
28=== modified file 'breezy/bzr/xml_serializer.py'
29--- breezy/bzr/xml_serializer.py 2023-05-03 17:01:16 +0000
30+++ breezy/bzr/xml_serializer.py 2023-05-05 15:54:42 +0000
31@@ -100,24 +100,6 @@
32 return ElementTree().parse(f)
33
34
35-def escape_invalid_chars(message):
36- """Escape the XML-invalid characters in a commit message.
37-
38- :param message: Commit message to escape
39- :return: tuple with escaped message and number of characters escaped
40- """
41- if message is None:
42- return None, 0
43- # Python strings can include characters that can't be
44- # represented in well-formed XML; escape characters that
45- # aren't listed in the XML specification
46- # (http://www.w3.org/TR/REC-xml/#NT-Char).
47- return re.subn('[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
48- lambda match: match.group(0).encode(
49- 'unicode_escape').decode('ascii'),
50- message)
51-
52-
53 def get_utf8_or_ascii(a_str):
54 """Return a cached version of the string.
55
56@@ -139,84 +121,7 @@
57 return a_str
58
59
60-_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')
61-_unicode_re = lazy_regex.lazy_compile('[&<>\'\"\u0080-\uffff]')
62-
63-
64-_xml_escape_map = {
65- "&": '&amp;',
66- "'": "&apos;", # FIXME: overkill
67- "\"": "&quot;",
68- "<": "&lt;",
69- ">": "&gt;",
70- }
71-
72-
73-def _unicode_escape_replace(match, _map=_xml_escape_map):
74- """Replace a string of non-ascii, non XML safe characters with their escape
75-
76- This will escape both Standard XML escapes, like <>"', etc.
77- As well as escaping non ascii characters, because ElementTree did.
78- This helps us remain compatible to older versions of bzr. We may change
79- our policy in the future, though.
80- """
81- # jam 20060816 Benchmarks show that try/KeyError is faster if you
82- # expect the entity to rarely miss. There is about a 10% difference
83- # in overall time. But if you miss frequently, then if None is much
84- # faster. For our use case, we *rarely* have a revision id, file id
85- # or path name that is unicode. So use try/KeyError.
86- try:
87- return _map[match.group()]
88- except KeyError:
89- return "&#%d;" % ord(match.group())
90-
91-
92-def _utf8_escape_replace(match, _map=_xml_escape_map):
93- """Escape utf8 characters into XML safe ones.
94-
95- This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
96- or it is handling characters with the high-bit set. For ascii characters,
97- we just lookup the replacement in the dictionary. For everything else, we
98- decode back into Unicode, and then use the XML escape code.
99- """
100- try:
101- return _map[match.group().decode('ascii', 'replace')].encode()
102- except KeyError:
103- return b''.join(b'&#%d;' % ord(uni_chr)
104- for uni_chr in match.group().decode('utf8'))
105-
106-
107-_to_escaped_map: Dict[Union[bytes, str], str] = {}
108-
109-
110-def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
111- """Encode the string into utf8, and escape invalid XML characters"""
112- # We frequently get entities we have not seen before, so it is better
113- # to check if None, rather than try/KeyError
114- text = _map.get(unicode_or_utf8_str)
115- if text is None:
116- if isinstance(unicode_or_utf8_str, str):
117- # The alternative policy is to do a regular UTF8 encoding
118- # and then escape only XML meta characters.
119- # Performance is equivalent once you use codecs. *However*
120- # this makes the serialized texts incompatible with old versions
121- # of bzr. So no net gain. (Perhaps the read code would handle utf8
122- # better than entity escapes, but cElementTree seems to do just
123- # fine either way)
124- text = _unicode_re.sub(
125- _unicode_escape_replace, unicode_or_utf8_str).encode()
126- else:
127- # Plain strings are considered to already be in utf-8 so we do a
128- # slightly different method for escaping.
129- text = _utf8_re.sub(_utf8_escape_replace,
130- unicode_or_utf8_str)
131- _map[unicode_or_utf8_str] = text
132- return text
133-
134-
135-def _clear_cache():
136- """Clean out the unicode => escaped map"""
137- _to_escaped_map.clear()
138+from .._bzr_rs import encode_and_escape, escape_invalid_chars
139
140
141 def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
142
143=== modified file 'crates/bazaar-py/src/lib.rs'
144--- crates/bazaar-py/src/lib.rs 2023-05-05 12:17:33 +0000
145+++ crates/bazaar-py/src/lib.rs 2023-05-05 15:54:42 +0000
146@@ -4,7 +4,7 @@
147 use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};
148 use pyo3::import_exception;
149 use pyo3::prelude::*;
150-use pyo3::types::{PyBytes, PyList, PyString};
151+use pyo3::types::{PyBytes, PyList, PyString, PyUnicode};
152 use pyo3_file::PyFileLikeObject;
153 use std::collections::HashMap;
154
155@@ -402,6 +402,28 @@
156 }
157 }
158
159+#[pyfunction]
160+fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
161+ bazaar::xml_serializer::escape_invalid_chars(message)
162+}
163+
164+#[pyfunction]
165+fn encode_and_escape(py: Python, unicode_or_utf8_str: PyObject) -> PyResult<&PyBytes> {
166+ if let Ok(text) = unicode_or_utf8_str.extract::<&str>(py) {
167+ Ok(PyBytes::new(
168+ py,
169+ bazaar::xml_serializer::encode_and_escape_string(text).as_slice(),
170+ ))
171+ } else if let Ok(bytes) = unicode_or_utf8_str.extract::<&[u8]>(py) {
172+ Ok(PyBytes::new(
173+ py,
174+ bazaar::xml_serializer::encode_and_escape_bytes(bytes).as_slice(),
175+ ))
176+ } else {
177+ Err(PyTypeError::new_err("expected str or bytes"))
178+ }
179+}
180+
181 #[pymodule]
182 fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {
183 m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;
184@@ -424,5 +446,7 @@
185 m.add_wrapped(wrap_pyfunction!(is_null_revision))?;
186 m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;
187 m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;
188+ m.add_wrapped(wrap_pyfunction!(escape_invalid_chars))?;
189+ m.add_wrapped(wrap_pyfunction!(encode_and_escape))?;
190 Ok(())
191 }
192
193=== modified file 'crates/bazaar/src/globbing.rs'
194--- crates/bazaar/src/globbing.rs 2023-04-22 02:01:38 +0000
195+++ crates/bazaar/src/globbing.rs 2023-05-05 15:54:42 +0000
196@@ -31,6 +31,8 @@
197 Closure(Box<dyn FnMut(String) -> String + Sync + Send>),
198 }
199
200+// TODO(jelmer): Consider using RegexSet from the regex crate instead.
201+
202 /// Do a multiple-pattern substitution.
203 ///
204 /// The patterns and substitutions are combined into one, so the result of
205
206=== modified file 'crates/bazaar/src/lib.rs'
207--- crates/bazaar/src/lib.rs 2023-05-05 12:14:36 +0000
208+++ crates/bazaar/src/lib.rs 2023-05-05 15:54:42 +0000
209@@ -94,3 +94,4 @@
210 }
211
212 pub mod bencode_serializer;
213+pub mod xml_serializer;
214
215=== added file 'crates/bazaar/src/xml_serializer.rs'
216--- crates/bazaar/src/xml_serializer.rs 1970-01-01 00:00:00 +0000
217+++ crates/bazaar/src/xml_serializer.rs 2023-05-05 15:54:42 +0000
218@@ -0,0 +1,81 @@
219+use std::str;
220+
221+lazy_static::lazy_static! {
222+ static ref UTF8_RE: regex::bytes::Regex = regex::bytes::Regex::new(r#"(?-u)[&<>'"]|[\x80-\xff]+"#).unwrap();
223+ static ref UNICODE_RE: regex::Regex = regex::Regex::new(r#"[&<>'"\u{0080}-\u{ffff}]"#).unwrap();
224+
225+}
226+
227+fn escape_low(c: u8) -> Option<&'static str> {
228+ match c {
229+ b'&' => Some("&amp;"),
230+ b'\'' => Some("&apos;"),
231+ b'"' => Some("&quot;"),
232+ b'<' => Some("&lt;"),
233+ b'>' => Some("&gt;"),
234+ _ => None,
235+ }
236+}
237+
238+fn unicode_escape_replace(cap: &regex::Captures) -> String {
239+ let m = cap.get(0).unwrap();
240+ assert_eq!(m.as_str().chars().count(), 1,);
241+ let c = m.as_str().chars().next().unwrap();
242+ if m.len() == 1 {
243+ if let Some(ret) = escape_low(m.as_str().as_bytes()[0]) {
244+ return ret.to_string();
245+ }
246+ }
247+ format!("&#{};", c as u32)
248+}
249+
250+fn utf8_escape_replace(cap: &regex::bytes::Captures) -> Vec<u8> {
251+ let m = cap.get(0).unwrap().as_bytes();
252+ eprintln!("m: {:?}", cap);
253+ if m.len() == 1 {
254+ if let Some(ret) = escape_low(m[0]) {
255+ return ret.as_bytes().to_vec();
256+ }
257+ }
258+ let utf8 = str::from_utf8(m).unwrap();
259+ utf8.chars()
260+ .map(|c| format!("&#{};", c as u64).into_bytes())
261+ .collect::<Vec<Vec<u8>>>()
262+ .concat()
263+}
264+
265+pub fn encode_and_escape_string(text: &str) -> Vec<u8> {
266+ UNICODE_RE
267+ .replace_all(text, unicode_escape_replace)
268+ .as_bytes()
269+ .to_vec()
270+}
271+
272+pub fn encode_and_escape_bytes(data: &[u8]) -> Vec<u8> {
273+ UTF8_RE.replace_all(data, utf8_escape_replace).to_vec()
274+}
275+
276+pub fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
277+ if let Some(msg) = message {
278+ let escaped = msg
279+ .chars()
280+ .map(|c| {
281+ if c == '\t' || c == '\n' || c == '\r' || c == '\x7f' {
282+ c.to_string()
283+ } else if c.is_ascii_control()
284+ || (c as u32) > 0xD7FF && (c as u32) < 0xE000
285+ || (c as u32) > 0xFFFD && (c as u32) < 0x10000
286+ {
287+ format!("\\x{:02x}", c as u32)
288+ } else {
289+ c.to_string()
290+ }
291+ })
292+ .collect::<Vec<String>>()
293+ .join("");
294+
295+ (Some(escaped), msg.len())
296+ } else {
297+ (None, 0)
298+ }
299+}

Subscribers

People subscribed via source and target branches