Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
unicode: remove implementations of encode_utf8
Browse files Browse the repository at this point in the history
This commit removes our explicit implementations of encode_utf8 and
replaces them with uses of `char::encode_utf8`, which was added to the
standard library in Rust 1.15.
BurntSushi committed May 1, 2018
1 parent 9510fe1 commit 8ad256b
Showing 3 changed files with 9 additions and 78 deletions.
8 changes: 3 additions & 5 deletions regex-syntax/src/hir/literal/mod.rs
Original file line number Diff line number Diff line change
@@ -19,7 +19,6 @@ use std::mem;
use std::ops;

use hir::{self, Hir, HirKind};
use unicode;

/// A set of literal byte strings extracted from a regular expression.
///
@@ -603,9 +602,8 @@ impl Literals {
fn prefixes(expr: &Hir, lits: &mut Literals) {
match *expr.kind() {
HirKind::Literal(hir::Literal::Unicode(c)) => {
let mut buf = [0u8; 4];
let i = unicode::encode_utf8(c, &mut buf).unwrap();
lits.cross_add(&buf[..i]);
let mut buf = [0; 4];
lits.cross_add(c.encode_utf8(&mut buf).as_bytes());
}
HirKind::Literal(hir::Literal::Byte(b)) => {
lits.cross_add(&[b]);
@@ -685,7 +683,7 @@ fn suffixes(expr: &Hir, lits: &mut Literals) {
match *expr.kind() {
HirKind::Literal(hir::Literal::Unicode(c)) => {
let mut buf = [0u8; 4];
let i = unicode::encode_utf8(c, &mut buf).unwrap();
let i = c.encode_utf8(&mut buf).len();
let mut buf = &mut buf[..i];
buf.reverse();
lits.cross_add(buf);
36 changes: 0 additions & 36 deletions regex-syntax/src/unicode.rs
Original file line number Diff line number Diff line change
@@ -25,42 +25,6 @@ pub enum Error {
PropertyValueNotFound,
}

/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// On success, the encoded bytes are written to the front of `dst` and the
/// number of bytes written (between 1 and 4) is returned. Any bytes of `dst`
/// beyond that length are left unmodified.
///
/// If `dst` is not long enough to hold the encoded character, then `None` is
/// returned and `dst` is left unmodified.
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    let len = character.len_utf8();
    // `char::encode_utf8` panics when the destination is too small, so the
    // capacity check is done up front to preserve the `None` contract.
    if dst.len() < len {
        return None;
    }
    // Hand the standard library exactly `len` bytes so that it performs the
    // encoding; this replaces the previous hand-rolled bit manipulation.
    character.encode_utf8(&mut dst[..len]);
    Some(len)
}

/// An iterator over a codepoint's simple case equivalence class.
#[derive(Debug)]
pub struct SimpleFoldIter(::std::slice::Iter<'static, char>);
43 changes: 6 additions & 37 deletions src/utf8.rs
Original file line number Diff line number Diff line change
@@ -38,37 +38,6 @@ pub fn next_utf8(text: &[u8], i: usize) -> usize {
i + inc
}

/// Encode the given Unicode character to `dst` as a single UTF-8 sequence.
///
/// On success, the encoded bytes are written to the front of `dst` and the
/// number of bytes written (between 1 and 4) is returned. Any bytes of `dst`
/// beyond that length are left unmodified.
///
/// If `dst` is not long enough to hold the encoded character, then `None` is
/// returned and `dst` is left unmodified.
// NOTE(review): the reason for `allow(dead_code)` is not visible here;
// presumably this helper is only exercised by tests -- confirm.
#[allow(dead_code)]
#[inline]
pub fn encode_utf8(character: char, dst: &mut [u8]) -> Option<usize> {
    let len = character.len_utf8();
    // `char::encode_utf8` panics when the destination is too small, so the
    // capacity check is done up front to preserve the `None` contract.
    if dst.len() < len {
        return None;
    }
    // Let the standard library do the encoding instead of assembling the
    // tag and continuation bytes by hand.
    character.encode_utf8(&mut dst[..len]);
    Some(len)
}

/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
///
/// If no valid UTF-8 sequence could be found, then `None` is returned.
@@ -184,14 +153,14 @@ mod tests {

use super::{
TAG_CONT, TAG_TWO, TAG_THREE, TAG_FOUR,
decode_utf8, decode_last_utf8, encode_utf8,
decode_utf8, decode_last_utf8,
};

#[test]
fn prop_roundtrip() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
encoded_len == got_len && given_cp == got_cp
}
@@ -202,7 +171,7 @@ mod tests {
fn prop_roundtrip_last() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let encoded_len = encode_utf8(given_cp, &mut tmp).unwrap();
let encoded_len = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, got_len) =
decode_last_utf8(&tmp[..encoded_len]).unwrap();
encoded_len == got_len && given_cp == got_cp
@@ -214,7 +183,7 @@ mod tests {
fn prop_encode_matches_std() {
fn p(cp: char) -> bool {
let mut got = [0; 4];
let n = encode_utf8(cp, &mut got).unwrap();
let n = cp.encode_utf8(&mut got).len();
let expected = cp.to_string();
&got[..n] == expected.as_bytes()
}
@@ -225,7 +194,7 @@ mod tests {
fn prop_decode_matches_std() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let n = encode_utf8(given_cp, &mut tmp).unwrap();
let n = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
let expected_cp =
str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
@@ -238,7 +207,7 @@ mod tests {
fn prop_decode_last_matches_std() {
fn p(given_cp: char) -> bool {
let mut tmp = [0; 4];
let n = encode_utf8(given_cp, &mut tmp).unwrap();
let n = given_cp.encode_utf8(&mut tmp).len();
let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
let expected_cp =
str::from_utf8(&tmp[..n]).unwrap()

0 comments on commit 8ad256b

Please sign in to comment.