Breezy

Merge lp:~jelmer/brz/serializer-xml-1 into lp:brz

serializer-xml-1
Merge into trunk

Proposed by Jelmer Vernooij on 2023-05-05

Status:	Merged
Approved by:	Jelmer Vernooij on 2023-05-05
Approved revision:	7780
Merged at revision:	7790
Proposed branch:	lp:~jelmer/brz/serializer-xml-1
Merge into:	lp:brz
Diff against target:	299 lines (+111/-105) 6 files modified breezy/bzr/tests/test_xml.py (+1/-8) breezy/bzr/xml_serializer.py (+1/-96) crates/bazaar-py/src/lib.rs (+25/-1) crates/bazaar/src/globbing.rs (+2/-0) crates/bazaar/src/lib.rs (+1/-0) crates/bazaar/src/xml_serializer.rs (+81/-0)
To merge this branch:	bzr merge lp:~jelmer/brz/serializer-xml-1
Related bugs:	Link a bug report

Reviewer	Review Type	Date Requested	Status
Jelmer Vernooij			Approve on 2023-05-05
Review via email: mp+442461@code.launchpad.net

Commit message

Move some XML helper functions to Rust

Description of the change

Move some XML helper functions to Rust

Revision history for this message

Jelmer Vernooij (jelmer) on 2023-05-05:

review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk

Subscribers

People subscribed via source and target branches

to all changes:

Breezy developers

Jelmer Vernooij

Robert Ladyman

 === modified file 'breezy/bzr/tests/test_xml.py'
 --- breezy/bzr/tests/test_xml.py	2023-04-30 20:58:50 +0000
 +++ breezy/bzr/tests/test_xml.py	2023-05-05 15:54:42 +0000
@@ -528,21 +528,14 @@
  class TestEncodeAndEscape(TestCase):
      """Whitebox testing of the _encode_and_escape function."""
--    def setUp(self):
--        super().setUp()
--        # Keep the cache clear before and after the test
--        breezy.bzr.xml_serializer._clear_cache()
--        self.addCleanup(breezy.bzr.xml_serializer._clear_cache)
--
      def test_simple_ascii(self):
          # _encode_and_escape always appends a final ", because these parameters
          # are being used in xml attributes, and by returning it now, we have to
          # do fewer string operations later.
          val = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
          self.assertEqual(b'foo bar', val)
--        # The second time should be cached
          val2 = breezy.bzr.xml_serializer.encode_and_escape('foo bar')
--        self.assertIs(val2, val)
++        self.assertEqual(val2, val)
      def test_ascii_with_xml(self):
          self.assertEqual(b'&amp;&apos;&quot;&lt;&gt;',
 === modified file 'breezy/bzr/xml_serializer.py'
 --- breezy/bzr/xml_serializer.py	2023-05-03 17:01:16 +0000
 +++ breezy/bzr/xml_serializer.py	2023-05-05 15:54:42 +0000
@@ -100,24 +100,6 @@
          return ElementTree().parse(f)
--def escape_invalid_chars(message):
--    """Escape the XML-invalid characters in a commit message.
--
--    :param message: Commit message to escape
--    :return: tuple with escaped message and number of characters escaped
--    """
--    if message is None:
--        return None, 0
--    # Python strings can include characters that can't be
--    # represented in well-formed XML; escape characters that
--    # aren't listed in the XML specification
--    # (http://www.w3.org/TR/REC-xml/#NT-Char).
--    return re.subn('[^\x09\x0A\x0D\u0020-\uD7FF\uE000-\uFFFD]+',
--                   lambda match: match.group(0).encode(
--                       'unicode_escape').decode('ascii'),
--                   message)
--
--
  def get_utf8_or_ascii(a_str):
      """Return a cached version of the string.
@@ -139,84 +121,7 @@
          return a_str
--_utf8_re = lazy_regex.lazy_compile(b'[&<>\'\"]|[\x80-\xff]+')
--_unicode_re = lazy_regex.lazy_compile('[&<>\'\"\u0080-\uffff]')
--
--
--_xml_escape_map = {
--    "&": '&amp;',
--    "'": "&apos;",  # FIXME: overkill
--    "\"": "&quot;",
--    "<": "&lt;",
--    ">": "&gt;",
--    }
--
--
--def _unicode_escape_replace(match, _map=_xml_escape_map):
--    """Replace a string of non-ascii, non XML safe characters with their escape
--
--    This will escape both Standard XML escapes, like <>"', etc.
--    As well as escaping non ascii characters, because ElementTree did.
--    This helps us remain compatible to older versions of bzr. We may change
--    our policy in the future, though.
--    """
--    # jam 20060816 Benchmarks show that try/KeyError is faster if you
--    # expect the entity to rarely miss. There is about a 10% difference
--    # in overall time. But if you miss frequently, then if None is much
--    # faster. For our use case, we *rarely* have a revision id, file id
--    # or path name that is unicode. So use try/KeyError.
--    try:
--        return _map[match.group()]
--    except KeyError:
--        return "&#%d;" % ord(match.group())
--
--
--def _utf8_escape_replace(match, _map=_xml_escape_map):
--    """Escape utf8 characters into XML safe ones.
--
--    This uses 2 tricks. It is either escaping "standard" characters, like "&<>,
--    or it is handling characters with the high-bit set. For ascii characters,
--    we just lookup the replacement in the dictionary. For everything else, we
--    decode back into Unicode, and then use the XML escape code.
--    """
--    try:
--        return _map[match.group().decode('ascii', 'replace')].encode()
--    except KeyError:
--        return b''.join(b'&#%d;' % ord(uni_chr)
--                        for uni_chr in match.group().decode('utf8'))
--
--
--_to_escaped_map: Dict[Union[bytes, str], str] = {}
--
--
--def encode_and_escape(unicode_or_utf8_str, _map=_to_escaped_map):
--    """Encode the string into utf8, and escape invalid XML characters"""
--    # We frequently get entities we have not seen before, so it is better
--    # to check if None, rather than try/KeyError
--    text = _map.get(unicode_or_utf8_str)
--    if text is None:
--        if isinstance(unicode_or_utf8_str, str):
--            # The alternative policy is to do a regular UTF8 encoding
--            # and then escape only XML meta characters.
--            # Performance is equivalent once you use codecs. *However*
--            # this makes the serialized texts incompatible with old versions
--            # of bzr. So no net gain. (Perhaps the read code would handle utf8
--            # better than entity escapes, but cElementTree seems to do just
--            # fine either way)
--            text = _unicode_re.sub(
--                _unicode_escape_replace, unicode_or_utf8_str).encode()
--        else:
--            # Plain strings are considered to already be in utf-8 so we do a
--            # slightly different method for escaping.
--            text = _utf8_re.sub(_utf8_escape_replace,
--                                unicode_or_utf8_str)
--        _map[unicode_or_utf8_str] = text
--    return text
--
--
--def _clear_cache():
--    """Clean out the unicode => escaped map"""
--    _to_escaped_map.clear()
++from .._bzr_rs import encode_and_escape, escape_invalid_chars
  def unpack_inventory_entry(elt, entry_cache=None, return_from_cache=False):
 === modified file 'crates/bazaar-py/src/lib.rs'
 --- crates/bazaar-py/src/lib.rs	2023-05-05 12:17:33 +0000
 +++ crates/bazaar-py/src/lib.rs	2023-05-05 15:54:42 +0000
@@ -4,7 +4,7 @@
  use pyo3::exceptions::{PyNotImplementedError, PyRuntimeError, PyTypeError, PyValueError};
  use pyo3::import_exception;
  use pyo3::prelude::*;
--use pyo3::types::{PyBytes, PyList, PyString};
++use pyo3::types::{PyBytes, PyList, PyString, PyUnicode};
  use pyo3_file::PyFileLikeObject;
  use std::collections::HashMap;
@@ -402,6 +402,28 @@
+     }
+ }
++#[pyfunction]
++fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
++    bazaar::xml_serializer::escape_invalid_chars(message)
++}
++
++#[pyfunction]
++fn encode_and_escape(py: Python, unicode_or_utf8_str: PyObject) -> PyResult<&PyBytes> {
++    if let Ok(text) = unicode_or_utf8_str.extract::<&str>(py) {
++        Ok(PyBytes::new(
++            py,
++            bazaar::xml_serializer::encode_and_escape_string(text).as_slice(),
++        ))
++    } else if let Ok(bytes) = unicode_or_utf8_str.extract::<&[u8]>(py) {
++        Ok(PyBytes::new(
++            py,
++            bazaar::xml_serializer::encode_and_escape_bytes(bytes).as_slice(),
++        ))
++    } else {
++        Err(PyTypeError::new_err("expected str or bytes"))
++    }
++}
++
  #[pymodule]
  fn _bzr_rs(py: Python, m: &PyModule) -> PyResult<()> {
      m.add_wrapped(wrap_pyfunction!(_next_id_suffix))?;
@@ -424,5 +446,7 @@
      m.add_wrapped(wrap_pyfunction!(is_null_revision))?;
      m.add_wrapped(wrap_pyfunction!(is_reserved_revision_id))?;
      m.add_wrapped(wrap_pyfunction!(check_not_reserved_id))?;
++    m.add_wrapped(wrap_pyfunction!(escape_invalid_chars))?;
++    m.add_wrapped(wrap_pyfunction!(encode_and_escape))?;
      Ok(())
+ }
 === modified file 'crates/bazaar/src/globbing.rs'
 --- crates/bazaar/src/globbing.rs	2023-04-22 02:01:38 +0000
 +++ crates/bazaar/src/globbing.rs	2023-05-05 15:54:42 +0000
@@ -31,6 +31,8 @@
      Closure(Box<dyn FnMut(String) -> String + Sync + Send>),
+ }
++// TODO(jelmer): Consider using RegexSet from the regex crate instead.
++
  /// Do a multiple-pattern substitution.
  ///
  /// The patterns and substitutions are combined into one, so the result of
 === modified file 'crates/bazaar/src/lib.rs'
 --- crates/bazaar/src/lib.rs	2023-05-05 12:14:36 +0000
 +++ crates/bazaar/src/lib.rs	2023-05-05 15:54:42 +0000
@@ -94,3 +94,4 @@
+ }
  pub mod bencode_serializer;
++pub mod xml_serializer;
 === added file 'crates/bazaar/src/xml_serializer.rs'
 --- crates/bazaar/src/xml_serializer.rs	1970-01-01 00:00:00 +0000
 +++ crates/bazaar/src/xml_serializer.rs	2023-05-05 15:54:42 +0000
@@ -0,0 +1,81 @@
++use std::str;
++
++lazy_static::lazy_static! {
++    static ref UTF8_RE: regex::bytes::Regex = regex::bytes::Regex::new(r#"(?-u)[&<>'"]|[\x80-\xff]+"#).unwrap();
++    static ref UNICODE_RE: regex::Regex = regex::Regex::new(r#"[&<>'"\u{0080}-\u{ffff}]"#).unwrap();
++
++}
++
++fn escape_low(c: u8) -> Option<&'static str> {
++    match c {
++        b'&' => Some("&amp;"),
++        b'\'' => Some("&apos;"),
++        b'"' => Some("&quot;"),
++        b'<' => Some("&lt;"),
++        b'>' => Some("&gt;"),
++        _ => None,
++    }
++}
++
++fn unicode_escape_replace(cap: &regex::Captures) -> String {
++    let m = cap.get(0).unwrap();
++    assert_eq!(m.as_str().chars().count(), 1,);
++    let c = m.as_str().chars().next().unwrap();
++    if m.len() == 1 {
++        if let Some(ret) = escape_low(m.as_str().as_bytes()[0]) {
++            return ret.to_string();
++        }
++    }
++    format!("&#{};", c as u32)
++}
++
++fn utf8_escape_replace(cap: &regex::bytes::Captures) -> Vec<u8> {
++    let m = cap.get(0).unwrap().as_bytes();
++    eprintln!("m: {:?}", cap);
++    if m.len() == 1 {
++        if let Some(ret) = escape_low(m[0]) {
++            return ret.as_bytes().to_vec();
++        }
++    }
++    let utf8 = str::from_utf8(m).unwrap();
++    utf8.chars()
++        .map(|c| format!("&#{};", c as u64).into_bytes())
++        .collect::<Vec<Vec<u8>>>()
++        .concat()
++}
++
++pub fn encode_and_escape_string(text: &str) -> Vec<u8> {
++    UNICODE_RE
++        .replace_all(text, unicode_escape_replace)
++        .as_bytes()
++        .to_vec()
++}
++
++pub fn encode_and_escape_bytes(data: &[u8]) -> Vec<u8> {
++    UTF8_RE.replace_all(data, utf8_escape_replace).to_vec()
++}
++
++pub fn escape_invalid_chars(message: Option<&str>) -> (Option<String>, usize) {
++    if let Some(msg) = message {
++        let escaped = msg
++            .chars()
++            .map(|c| {
++                if c == '\t' || c == '\n' || c == '\r' || c == '\x7f' {
++                    c.to_string()
++                } else if c.is_ascii_control()
++                    || (c as u32) > 0xD7FF && (c as u32) < 0xE000
++                    || (c as u32) > 0xFFFD && (c as u32) < 0x10000
++                {
++                    format!("\\x{:02x}", c as u32)
++                } else {
++                    c.to_string()
++                }
++            })
++            .collect::<Vec<String>>()
++            .join("");
++
++        (Some(escaped), msg.len())
++    } else {
++        (None, 0)
++    }
++}