unicode: remove implementations of encode_utf8

This commit removes our explicit implementations of encode_utf8 and
replaces them with uses of `char::encode_utf8`, which was added to the
standard library in Rust 1.15.
This commit is contained in:
Andrew Gallant 2018-04-28 13:11:29 -04:00
parent fc06d1a7ea
commit 9604cc07ed
3 changed files with 9 additions and 78 deletions

View File

@ -19,7 +19,6 @@ use std::mem;
use std::ops;
use hir::{self, Hir, HirKind};
use unicode;
/// A set of literal byte strings extracted from a regular expression.
///
@ -603,9 +602,8 @@ impl Literals {
fn prefixes(expr: &Hir, lits: &mut Literals) {
match *expr.kind() {
HirKind::Literal(hir::Literal::Unicode(c)) => {
let mut buf = [0u8; 4];
let i = unicode::encode_utf8(c, &mut buf).unwrap();
lits.cross_add(&buf[..i]);
let mut buf = [0; 4];
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
}
HirKind::Literal(hir::Literal::Byte(b)) => {
lits.cross_add(&[b]);
@ -685,7 +683,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
match *expr.kind() {
HirKind::Literal(hir::Literal::Unicode(c)) => {
let mut buf = [0u8; 4];
let i = unicode::encode_utf8(c, &mut buf).unwrap();
let i = c.encode_utf8(&mut buf).len();
let mut buf = &mut buf[..i];
buf.reverse();
lits.cross_add(buf);

View File

@ -25,42 +25,6 @@ pub enum Error {
PropertyValueNotFound,
}
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned and `dst` is left
/// unmodified. Otherwise, the number of bytes written (between 1 and 4) is
/// returned; bytes of `dst` beyond that count are not modified.
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    // Delegate to the standard library (`char::encode_utf8`, stable since
    // Rust 1.15) instead of assembling the tag and continuation bytes by hand.
    let len = character.len_utf8();
    if dst.len() < len {
        // `char::encode_utf8` panics when the buffer is too short, whereas
        // this function's contract is to report that case as `None`.
        return None;
    }
    // Hand std exactly `len` bytes so nothing past the sequence is touched.
    Some(character.encode_utf8(&mut dst[..len]).len())
}
/// An iterator over a codepoint's simple case equivalence class.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);

View File

@ -38,37 +38,6 @@ pub fn next_utf8(text: &[u8], i: usize) -> usize {
i + inc
}
/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// If `dst` is not long enough, then `None` is returned and `dst` is left
/// unmodified. Otherwise, the number of bytes written (between 1 and 4) is
/// returned; bytes of `dst` beyond that count are not modified.
#[allow(dead_code)]
#[inline]
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    // The standard library already knows how wide the sequence is and how to
    // write it (`char::encode_utf8`, stable since Rust 1.15), so no manual
    // bit manipulation with the tag constants is needed here.
    let width = character.len_utf8();
    match dst.get_mut(..width) {
        // Only the first `width` bytes are exposed to the encoder.
        Some(head) => Some(character.encode_utf8(head).len()),
        // `char::encode_utf8` would panic on a short buffer; this function
        // reports that case as `None` instead.
        None => None,
    }
}
/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
///
/// If no valid UTF-8 sequence could be found, then `None` is returned.
@ -184,14 +153,14 @@ mod tests {
use super::{
TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
decode_utf8, decode_last_utf8, encode_utf8,
decode_utf8, decode_last_utf8,
};
#[test]
fn prop_roundtrip() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
encoded_len == got_len && given_cp == got_cp
}
@ -202,7 +171,7 @@ mod tests {
fn prop_roundtrip_last() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, got_len) =
decode_last_utf8(&tmp[..encoded_len]).unwrap();
encoded_len == got_len && given_cp == got_cp
@ -214,7 +183,7 @@ mod tests {
fn prop_encode_matches_std() {
fn p(cp: char) -> bool {
let mut got = [0; 4];
let n = encode_utf8(cp, &mut got).unwrap();
let n = cp.encode_utf8(&mut got).len();
let expected = cp.to_string();
&got[..n] == expected.as_bytes()
}
@ -225,7 +194,7 @@ mod tests {
fn prop_decode_matches_std() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let n = encode_utf8(given_cp, &mut tmp).unwrap();
let n = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
let expected_cp =
str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
@ -238,7 +207,7 @@ mod tests {
fn prop_decode_last_matches_std() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let n = encode_utf8(given_cp, &mut tmp).unwrap();
let n = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
let expected_cp =
str::from_utf8(&tmp[..n]).unwrap()