unicode: remove implementations of encode_utf8

This commit removes our explicit implementations of encode_utf8 and replaces them with uses of `char::encode_utf8`, which was added to the standard library in Rust 1.15.
2025-04-12 07:34:07 +00:00 · 2018-04-28 13:11:29 -04:00 · 2018-04-28 13:11:29 -04:00 · 9604cc07ed
commit 9604cc07ed
parent fc06d1a7ea
3 changed files with 9 additions and 78 deletions
--- a/regex-syntax/src/hir/literal/mod.rs
+++ b/regex-syntax/src/hir/literal/mod.rs
@@ -19,7 +19,6 @@ use std::mem;
 use std::ops;

 use hir::{self, Hir, HirKind};
-use unicode;

 /// A set of literal byte strings extracted from a regular expression.
 ///
@@ -603,9 +602,8 @@ impl Literals {
 fn prefixes(expr: &Hir, lits: &mut Literals) {
    match *expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
-            let mut buf = [0u8; 4];
-            let i = unicode::encode_utf8(c, &mut buf).unwrap();
-            lits.cross_add(&buf[..i]);
+            let mut buf = [0; 4];
+            lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
        }
        HirKind::Literal(hir::Literal::Byte(b)) => {
            lits.cross_add(&[b]);
@@ -685,7 +683,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
    match *expr.kind() {
        HirKind::Literal(hir::Literal::Unicode(c)) => {
            let mut buf = [0u8; 4];
-            let i = unicode::encode_utf8(c, &mut buf).unwrap();
+            let i = c.encode_utf8(&mut buf).len();
            let mut buf = &mut buf[..i];
            buf.reverse();
            lits.cross_add(buf);
--- a/regex-syntax/src/unicode.rs
+++ b/regex-syntax/src/unicode.rs
@@ -25,42 +25,6 @@ pub enum Error {
    PropertyValueNotFound,
 }

-/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
-///
-/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
-/// of bytes written is returned.
-pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
-    // TODO: Remove this function once we move to at least Rust 1.15, which
-    // provides char::encode_utf8 for us.
-    const TAG_CONT: u8 = 0b1000_0000;
-    const TAG_TWO: u8 = 0b1100_0000;
-    const TAG_THREE: u8 = 0b1110_0000;
-    const TAG_FOUR: u8 = 0b1111_0000;
-
-    let code = character as u32;
-    if code <= 0x7F && !dst.is_empty() {
-        dst[0] = code as u8;
-        Some(1)
-    } else if code <= 0x7FF && dst.len() >= 2 {
-        dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
-        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(2)
-    } else if code <= 0xFFFF && dst.len() >= 3  {
-        dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
-        dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(3)
-    } else if dst.len() >= 4 {
-        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
-        dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(4)
-    } else {
-        None
-    }
-}
-
 /// An iterator over a codepoint's simple case equivalence class.
 #[derive(Debug)]
 pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);
--- a/src/utf8.rs
+++ b/src/utf8.rs
@@ -38,37 +38,6 @@ pub fn next_utf8(text: &[u8], i: usize) -> usize {
    i + inc
 }

-/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
-///
-/// If `dst` is not long enough, then `None` is returned. Otherwise, the number
-/// of bytes written is returned.
-#[allow(dead_code)]
-#[inline]
-pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
-    let code = character as u32;
-    if code <= 0x7F && !dst.is_empty() {
-        dst[0] = code as u8;
-        Some(1)
-    } else if code <= 0x7FF && dst.len() >= 2 {
-        dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO;
-        dst[1] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(2)
-    } else if code <= 0xFFFF && dst.len() >= 3  {
-        dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE;
-        dst[1] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(3)
-    } else if dst.len() >= 4 {
-        dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR;
-        dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
-        dst[2] = (code >>  6 & 0x3F) as u8 | TAG_CONT;
-        dst[3] = (code & 0x3F) as u8 | TAG_CONT;
-        Some(4)
-    } else {
-        None
-    }
-}
-
 /// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
 ///
 /// If no valid UTF-8 sequence could be found, then `None` is returned.
@@ -184,14 +153,14 @@ mod tests {

    use super::{
        TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
-        decode_utf8, decode_last_utf8, encode_utf8,
+        decode_utf8, decode_last_utf8,
    };

    #[test]
    fn prop_roundtrip() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
-            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
+            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
            let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
            encoded_len == got_len && given_cp == got_cp
        }
@@ -202,7 +171,7 @@ mod tests {
    fn prop_roundtrip_last() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
-            let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
+            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
            let (got_cp, got_len) =
                decode_last_utf8(&tmp[..encoded_len]).unwrap();
            encoded_len == got_len && given_cp == got_cp
@@ -214,7 +183,7 @@ mod tests {
    fn prop_encode_matches_std() {
        fn p(cp: char) -> bool {
            let mut got = [0; 4];
-            let n = encode_utf8(cp, &mut got).unwrap();
+            let n = cp.encode_utf8(&mut got).len();
            let expected = cp.to_string();
            &got[..n] == expected.as_bytes()
        }
@@ -225,7 +194,7 @@ mod tests {
    fn prop_decode_matches_std() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
-            let n = encode_utf8(given_cp, &mut tmp).unwrap();
+            let n = given_cp.encode_utf8(&mut tmp).len();
            let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
            let expected_cp =
                str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
@@ -238,7 +207,7 @@ mod tests {
    fn prop_decode_last_matches_std() {
        fn p(given_cp: char) -> bool {
            let mut tmp = [0; 4];
-            let n = encode_utf8(given_cp, &mut tmp).unwrap();
+            let n = given_cp.encode_utf8(&mut tmp).len();
            let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
            let expected_cp =
                str::from_utf8(&tmp[..n]).unwrap()