From 02bd23bbf5ae47d9479660a1ce9e960705897d19 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 3 Jun 2020 21:41:03 +0000 Subject: [PATCH 01/30] initial import of char_collection --- Cargo.toml | 1 + components/char_collection/BUILD.gn | 32 + components/char_collection/Cargo.toml | 14 + .../meta/char_collection_lib_test.cmx | 5 + .../char_collection/src/char_collection.rs | 653 ++++++++++++++++++ components/char_collection/src/conversions.rs | 156 +++++ components/char_collection/src/lib.rs | 12 + components/char_collection/src/macros.rs | 54 ++ components/char_collection/src/operators.rs | 62 ++ 9 files changed, 989 insertions(+) create mode 100644 components/char_collection/BUILD.gn create mode 100644 components/char_collection/Cargo.toml create mode 100644 components/char_collection/meta/char_collection_lib_test.cmx create mode 100644 components/char_collection/src/char_collection.rs create mode 100644 components/char_collection/src/conversions.rs create mode 100644 components/char_collection/src/lib.rs create mode 100644 components/char_collection/src/macros.rs create mode 100644 components/char_collection/src/operators.rs diff --git a/Cargo.toml b/Cargo.toml index 17ed9200055..1752ce7e4bd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,5 +3,6 @@ members = [ "components/icu", "components/icu4x", + "components/char_collection", "components/locale", ] diff --git a/components/char_collection/BUILD.gn b/components/char_collection/BUILD.gn new file mode 100644 index 00000000000..f772f5023e6 --- /dev/null +++ b/components/char_collection/BUILD.gn @@ -0,0 +1,32 @@ +# Copyright 2019 The Fuchsia Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +import("//build/rust/rustc_library.gni") +import("//build/test/test_package.gni") +import("//build/testing/environments.gni") +# Library for working with collections of Unicode code points. +rustc_library("char_collection") { + edition = "2018" + with_unit_tests = true + deps = [ + "//src/lib/intl/unicode_utils/unicode_blocks", + "//third_party/rust_crates:anyhow", + "//third_party/rust_crates:paste", + "//third_party/rust_crates:thiserror", + "//third_party/rust_crates:unic-char-range", + "//third_party/rust_crates:unic-ucd-block", + ] +} +test_package("char_collection_tests") { + deps = [ ":char_collection_test" ] + tests = [ + { + name = "char_collection_lib_test" + environments = basic_envs + }, + ] +} +group("tests") { + testonly = true + public_deps = [ ":char_collection_tests" ] +} diff --git a/components/char_collection/Cargo.toml b/components/char_collection/Cargo.toml new file mode 100644 index 00000000000..8e66cb5a0d5 --- /dev/null +++ b/components/char_collection/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "icu-char-collection" +description = "API for managing Unicode Language and Locale Identifiers" +version = "0.0.1" +authors = ["The ICU4X Project Developers"] +edition = "2018" +readme = "README.md" +repository = "https://github.com/unicode-org/icu4x" +license = "MIT/Apache-2.0" +categories = ["internationalization"] +include = [ + "src/**/*", + "Cargo.toml", +] diff --git a/components/char_collection/meta/char_collection_lib_test.cmx b/components/char_collection/meta/char_collection_lib_test.cmx new file mode 100644 index 00000000000..3bb56a96aa2 --- /dev/null +++ b/components/char_collection/meta/char_collection_lib_test.cmx @@ -0,0 +1,5 @@ +{ + "program": { + "binary": "test/char_collection_lib_test" + } +} \ No newline at end of file diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs new file mode 100644 index 00000000000..5e97361db63 --- /dev/null +++ b/components/char_collection/src/char_collection.rs @@ -0,0 +1,653 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +use { + anyhow::{format_err, Error}, + std::{ + clone::Clone, + cmp::Ordering, + hash::{Hash, Hasher}, + iter::Iterator, + ops::Range, + vec::Vec, + }, + unic_char_range::{chars, CharIter, CharRange}, +}; +/// A trait for objects that represent one or more disjoint, non-adjacent +/// [CharRanges](unic_char_range::CharRange). +pub trait MultiCharRange { + /// Iterate over the disjoint, non-adjacent [CharRange]s in the collection in ascending order. + fn iter_ranges<'a>(&'a self) -> Box + 'a>; + /// The number of ranges in the collection. + fn range_count(&self) -> usize; +} +/// A collection of `char`s (i.e. Unicode code points), used for storing large continuous ranges +/// efficiently. +/// +/// Lookups and insertions are O(log R), where R is the number of disjoint +/// ranges in the collection. +/// +/// The easiest way to create instances is using the +/// [char_collect!](::char_collection::char_collect) macro. +/// +/// ``` +/// use char_collection::CharCollection; +/// +/// let mut collection: CharCollection = char_collect!('a'..='d', 'x'..='z'); +/// char_collection += 'e'; +/// char_collection += chars!('p'..='t'); +/// assert_eq!( +/// collection.iter_ranges().collect(), +/// vec![chars!('a'..='e'), chars!('p'..='t'), chars!('x'..='z')]); +/// +/// assert!(collection.contains(&'c')); +/// assert!(collection.contains_range(chars!('q'..='s'))); +/// assert!(!collection.contains(&'9')); +/// +/// collection -= chars!('t'..='y'); +/// assert_eq!( +/// collection.iter_ranges().collect(), +/// vec![chars!('a'..='e', chars!('p'..'s'), chars!('z'..='z'))]); +/// ``` +/// +/// TODO(kpozin): Implement IntoIter. +#[derive(Clone, Debug, Eq, PartialEq, Default)] +pub struct CharCollection { + ranges: Vec, +} +impl CharCollection { + /// Create a new, empty `CharCollection`. + pub fn new() -> CharCollection { + CharCollection::default() + } + /// Create a new `CharCollection` from a list of disjoint, non-adjacent `CharRange`s, pre-sorted + /// in ascending code point order. + /// + /// This factory method is primarily intended for use in deserializing valid representations of + /// `CharCollections`. Will return an error if ranges are out of order, overlapping, or + /// adjacent. + pub fn from_sorted_ranges(ranges: T) -> Result + where + T: IntoIterator, + { + // If the original `ranges` is also a Vec, this doesn't result in an extra copy. + let collection = CharCollection { ranges: ranges.into_iter().collect() }; + let ranges: &Vec = &collection.ranges; + match (1..ranges.len()).find(|i| (ranges[*i].low as i64 - ranges[*i - 1].high as i64) <= 1) + { + Some(i) => Err(format_err!( + "These ranges are out of order, overlapping, or adjacent: {}, {}", + format_range(&ranges[i - 1]), + format_range(&ranges[i]) + )), + None => Ok(collection), + } + } + /// Create a new `CharCollection` from a list of `char`s, pre-sorted in ascending code point + /// order. + /// + /// This factory method is primarily intended for use in deserializing valid representations of + /// `CharCollections`. Will return an error if chars are out of order or contain duplicates. + pub fn from_sorted_chars(chars: T) -> Result + where + T: IntoIterator, + { + let mut collection = CharCollection::new(); + for ch in chars.into_iter() { + collection.append(ch)?; + } + Ok(collection) + } + /// Iterate over all the `char`s in the collection. + pub fn iter(&self) -> impl Iterator + '_ { + self.ranges.iter().flat_map(CharRange::iter) + } + /// Test whether the collection contains a specific `char`. + /// + /// The time complexity is O(log R), where R is the number of ranges in + /// the collection. + pub fn contains(&self, ch: &char) -> bool { + self.find_containing_range(ch).is_ok() + } + /// Test whether the collection contains an entire range of characters. + /// + /// The time complexity is O(log R), where R is the number of ranges in + /// the collection. + pub fn contains_range(&self, range: &CharRange) -> bool { + if range.is_empty() { + return false; + } + let lower_existing_range = self.find_containing_range(&range.low); + let upper_existing_range = self.find_containing_range(&range.high); + // Fully enclosed in existing range. + return lower_existing_range == upper_existing_range && lower_existing_range.is_ok(); + } + /// Insert a `char` or other collection of chars into this collection. + /// + /// Returns `&mut self` for easy chaining. + /// + /// The time complexity is O(T log(R + T)), where R + /// is the number of ranges in this collection and T is the number of ranges in + /// `to_add`. + pub fn insert(&mut self, to_add: &V) -> &mut Self { + to_add.iter_ranges().for_each(|range| self.insert_char_range(&range)); + self + } + /// Appends a `char` to the end of the existing collection. Panics if the given `char` is not + /// higher than the highest code point in the existing collection. + /// + /// Returns `&mut self` for easy chaining. + /// + /// The time complexity is O(1). + pub fn append(&mut self, ch: char) -> Result<&mut Self, Error> { + let mut coalesced = false; + if let Some(last_range) = self.ranges.last_mut() { + if last_range.cmp_char(ch) != Ordering::Less { + return Err(format_err!("Cannot append {} after {}", ch, last_range.high)); + } + if are_chars_adjacent(&last_range.high, &ch) { + last_range.high = ch; + coalesced = true; + } + } + if !coalesced { + self.ranges.push(chars!(ch..=ch)); + } + Ok(self) + } + /// Appends a `CharRange` to the end of the existing collection. Panics if the given range is + /// not higher than the highest code point in the existing collection. (The new range _may_ be + /// adjacent to the previous highest range, but may not overlap.) + /// + /// Returns `&mut self` for easy chaining. + /// + /// The time complexity is O(1). + pub fn append_range(&mut self, range: CharRange) -> Result<&mut Self, Error> { + let mut coalesced = false; + if let Some(last_range) = self.ranges.last_mut() { + if last_range.cmp_char(range.low) != Ordering::Less { + return Err(format_err!( + "Cannot append {} after {}", + format_range(&range), + last_range.high + )); + } + if are_chars_adjacent(&last_range.high, &range.low) { + last_range.high = range.high; + coalesced = true; + } + } + if !coalesced { + self.ranges.push(range); + } + Ok(self) + } + /// Remove a `char` or other collection of chars from this collection. + /// + /// Returns `&mut self` for easy chaining. + /// + /// The time complexity is O(T log(R + T)), where R + /// is the number of ranges in this collection and T is the number of ranges in + /// `to_remove`. + pub fn remove(&mut self, to_remove: &V) -> &mut Self { + to_remove.iter_ranges().for_each(|range| self.remove_char_range(&range)); + self + } + /// Remove all entries from this collection. + /// + /// Returns `&mut self` for easy chaining. + pub fn clear(&mut self) -> &mut Self { + self.ranges.clear(); + self + } + /// Return the set union of this collection and another one. + /// + /// The time complexity is O(min(R, T) log(R + T)), + /// where R is the number of ranges in this collection and T is the number + /// of ranges in `rhs`. + pub fn union(&self, rhs: &V) -> CharCollection { + let mut result: CharCollection; + if self.range_count() > rhs.range_count() { + result = self.clone(); + result.insert(rhs); + } else { + result = rhs.into(); + result.insert(self); + } + result + } + /// Return the set intersection of this collection and another one. + /// + /// The time complexity is O(min(R, T) log(R + T)), + /// where R is the number of ranges in this collection and T is the number + /// of ranges in `rhs`. + pub fn intersection(&self, rhs: &V) -> CharCollection { + let mut result: CharCollection; + if self.range_count() > rhs.range_count() { + result = self.clone(); + let rhs: CharCollection = rhs.into(); + result.remove(&rhs.complement()); + } else { + result = rhs.into(); + result.remove(&self.complement()); + } + result + } + /// Return the (non-symmetric) set difference of this collection and another one. + /// + /// The time complexity is O(T log(R + T)), where R + /// is the number of ranges in this collection and T is the number of ranges in + /// `rhs`. + pub fn difference(&self, rhs: &V) -> CharCollection { + let mut result: CharCollection = self.clone(); + result.remove(rhs); + result + } + /// Return the set complement of this collection (over the universe of `char`s). + /// + /// The time complexity is O(R), where R is the number of ranges in this + /// collection. + pub fn complement(&self) -> CharCollection { + if self.ranges.is_empty() { + return CharCollection::from(&CharRange::all()); + } + let mut result_ranges: Vec = Vec::new(); + if self.ranges[0].low != '\u{0}' { + result_ranges.push(CharRange::open_right('\u{0}', self.ranges[0].low)); + } + let mut prev_high = self.ranges[0].high; + for range in &self.ranges[1..] { + result_ranges.push(CharRange::open(prev_high, range.low)); + prev_high = range.high; + } + if prev_high != std::char::MAX { + result_ranges.push(CharRange::open_left(prev_high, std::char::MAX)); + } + CharCollection { ranges: result_ranges } + } + /// Insert a single `CharRange`. + /// + /// Depending on how the new range relates to existing ranges in + /// the collection, it might be subsumed by an existing range, modify the endpoints of an + /// existing range, or replace one or more existing ranges. + fn insert_char_range(&mut self, new_range: &CharRange) { + if new_range.is_empty() { + return; + } + let lower_existing_range = self.find_containing_range(&new_range.low); + let upper_existing_range = self.find_containing_range(&new_range.high); + // Fully enclosed in existing range. + if lower_existing_range == upper_existing_range && lower_existing_range.is_ok() { + return; + } + let new_low: char; + let new_high: char; + let remove_from_idx: usize; + let remove_to_idx: usize; + match lower_existing_range { + Ok((idx, lower_existing_range)) => { + new_low = lower_existing_range.low; + remove_from_idx = idx; + } + Err(idx) => { + new_low = new_range.low; + remove_from_idx = idx; + } + } + match upper_existing_range { + Ok((idx, higher_existing_range)) => { + new_high = higher_existing_range.high; + remove_to_idx = idx + 1; + } + Err(idx) => { + new_high = new_range.high; + remove_to_idx = idx; + } + } + self.replace_ranges(chars!(new_low..=new_high), remove_from_idx..remove_to_idx); + } + /// Remove a single `CharRange`. + /// + /// Depending on how the removed range relates to existing ranges in the collection, it might + /// remove or modify the endpoints of existing ranges. + fn remove_char_range(&mut self, range_to_remove: &CharRange) { + if range_to_remove.is_empty() { + return; + } + let lower_existing_range = self.find_containing_range(&range_to_remove.low); + let upper_existing_range = self.find_containing_range(&range_to_remove.high); + let mut replacement_ranges: Vec = Vec::new(); + let remove_from_idx: usize; + let remove_to_idx: usize; + match lower_existing_range { + Ok((idx, lower_existing_range)) => { + if lower_existing_range.low < range_to_remove.low { + replacement_ranges + .push(CharRange::open_right(lower_existing_range.low, range_to_remove.low)); + } + remove_from_idx = idx; + } + Err(idx) => remove_from_idx = idx, + } + match upper_existing_range { + Ok((idx, higher_existing_range)) => { + if range_to_remove.high < higher_existing_range.high { + replacement_ranges.push(CharRange::open_left( + range_to_remove.high, + higher_existing_range.high, + )); + } + remove_to_idx = idx + 1; + } + Err(idx) => { + remove_to_idx = idx; + } + } + self.ranges.splice(remove_from_idx..remove_to_idx, replacement_ranges); + } + /// Delete all the existing `CharRange`s that fall within `indices_to_replace` in the vector, + /// and insert `char_range_to_insert` in their place. If the newly formed range is adjacent to + /// a kept range on its left or right, coalesce them. + fn replace_ranges( + &mut self, + mut char_range_to_insert: CharRange, + mut indices_to_replace: Range, + ) { + // If the newly formed range is adjacent to the range on its left, coalesce the two. + if indices_to_replace.start > 0 { + let prev_char_range = self.ranges[indices_to_replace.start - 1]; + if are_chars_adjacent(&prev_char_range.high, &char_range_to_insert.low) { + char_range_to_insert.low = prev_char_range.low; + indices_to_replace.start -= 1; + } + } + // If the newly formed range is adjacent to the range on its right, coalesce the two. + if indices_to_replace.end < self.ranges.len() { + let next_char_range = self.ranges[indices_to_replace.end]; + if are_chars_adjacent(&char_range_to_insert.high, &next_char_range.low) { + char_range_to_insert.high = next_char_range.high; + indices_to_replace.end += 1; + } + } + self.ranges.splice(indices_to_replace, vec![char_range_to_insert]); + } + fn find_containing_range(&self, query: &char) -> Result<(usize, CharRange), usize> { + let result = self.ranges.binary_search_by(|range| range.cmp_char(query.clone())); + match result { + Ok(index) => Ok((index, self.ranges[index])), + Err(index) => Err(index), + } + } +} +impl MultiCharRange for CharCollection { + fn iter_ranges<'a>(&'a self) -> Box + 'a> { + Box::new(self.ranges.iter().map(|range| range.clone())) + } + fn range_count(&self) -> usize { + self.ranges.len() + } +} +impl Hash for CharCollection { + fn hash(&self, state: &mut H) { + self.ranges.iter().for_each(|range| hash_char_range(range, state)); + } +} +fn hash_char_range(range: &CharRange, state: &mut H) { + range.low.hash(state); + range.high.hash(state); +} +fn are_chars_adjacent(left: &char, right: &char) -> bool { + let mut iter: CharIter = CharRange::open_right(left.clone(), right.clone()).iter(); + match iter.next_back() { + None => false, + Some(next_right) => left == &next_right, + } +} +fn format_range(range: &CharRange) -> String { + format!("{}..={}", range.low, range.high) +} +#[cfg(test)] +mod tests { + use { + super::{are_chars_adjacent, CharCollection}, + anyhow::Error, + std::char, + unic_char_range::{chars, CharRange}, + }; + #[test] + fn test_from_sorted_ranges() -> Result<(), Error> { + let expected = char_collect!('a'..='d', 'g'..='l', 'z'); + let actual = CharCollection::from_sorted_ranges(vec![ + chars!('a'..='d'), + chars!('g'..='l'), + chars!('z'..='z'), + ])?; + assert_eq!(actual, expected); + Ok(()) + } + #[test] + fn test_from_sorted_ranges_out_of_order() { + assert!(CharCollection::from_sorted_ranges(vec![ + chars!('g'..='l'), + chars!('a'..='d'), + chars!('z'..='z'), + ]) + .is_err()); + } + #[test] + fn test_from_sorted_ranges_overlap() { + assert!(CharCollection::from_sorted_ranges(vec![ + chars!('a'..='d'), + chars!('c'..='l'), + chars!('z'..='z'), + ]) + .is_err()); + } + #[test] + fn test_from_sorted_ranges_adjacent() { + assert!( + CharCollection::from_sorted_ranges(vec![chars!('a'..='d'), chars!('e'..='g')]).is_err() + ); + } + #[test] + fn test_from_sorted_chars() -> Result<(), Error> { + let chars = vec!['a', 'b', 'c', 'd', 'g', 'h', 'i', 'j', 'k', 'l', 'z']; + let expected = char_collect!('a'..='d', 'g'..='l', 'z'); + let actual = CharCollection::from_sorted_chars(chars)?; + assert_eq!(actual, expected); + Ok(()) + } + #[test] + fn test_from_sorted_chars_out_of_order() { + let chars = vec!['a', 'b', 'c', 'd', 'g', 'h', 'i', 'j', 'k', 'l', 'e']; + assert!(CharCollection::from_sorted_chars(chars).is_err()); + } + #[test] + fn test_find_containing_range() { + let collection = char_collect!({ ('a'..='d') + ('g'..='j') + ('l'..='o') + 'z' }); + assert_eq!(collection.find_containing_range(&'0'), Err(0)); + assert_eq!(collection.find_containing_range(&'c'), Ok((0, chars!('a'..='d')))); + assert_eq!(collection.find_containing_range(&'e'), Err(1)); + } + #[test] + fn test_insert_initial() { + let collection = char_collect!('a'..='d'); + assert_eq!(collection.ranges, vec![chars!('a'..='d')]) + } + #[test] + fn test_insert_exact_match() { + let mut collection = char_collect!('a'..='d', 'g'..='l'); + collection += 'a'..='d'; + assert_eq!(collection.ranges, vec![chars!('a'..='d'), chars!('g'..='l')]); + } + #[test] + fn test_insert_non_overlapping_sorted() { + let collection = char_collect!('a'..='d', 'g'..='j', 'l'..='o'); + assert_eq!( + collection.ranges, + vec![chars!('a'..='d'), chars!('g'..='j'), chars!('l'..='o')] + ); + } + #[test] + fn test_insert_non_overlapping_unsorted() { + let collection = char_collect!('l'..='o', 'a'..='d', 'l'..='o', 'a'..='d', 'g'..='j'); + assert_eq!( + collection.ranges, + vec![chars!('a'..='d'), chars!('g'..='j'), chars!('l'..='o')] + ); + } + #[test] + fn test_insert_overlapping_all_existent() { + let mut collection = char_collect!('l'..='o', 'a'..='d'); + collection += 'a'..='o'; + assert_eq!(collection.ranges, vec![chars!('a'..='o')]); + } + #[test] + fn test_insert_overlapping_some_existent() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection += 'i'..='n'; + assert_eq!( + collection.ranges, + vec![chars!('c'..='e'), chars!('i'..='n'), chars!('p'..='s')] + ); + } + #[test] + fn test_insert_overlapping_with_intersections() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection += 'd'..='k'; + assert_eq!(collection.ranges, vec![chars!('c'..='m'), chars!('p'..='s')]); + } + #[test] + fn test_insert_coalesce_adjacent_ranges() { + let mut collection = char_collect!('a'..='c', 'j'..='m'); + collection += 'd'..='i'; + assert_eq!(collection.ranges, vec![chars!('a'..='m')]); + } + #[test] + fn test_append() -> Result<(), Error> { + let mut collection = char_collect!('a'..='c'); + collection.append('d')?.append('g')?.append('h')?.append('i')?.append('z')?; + assert_eq!(collection, char_collect!('a'..='d', 'g'..='i', 'z')); + Ok(()) + } + #[test] + fn test_append_out_of_order() -> Result<(), Error> { + let mut collection = char_collect!('a'..='c'); + assert!(collection + .append('d')? + .append('g')? + .append('h')? + .append('i')? + .append('e') + .is_err()); + Ok(()) + } + #[test] + fn test_append_range() -> Result<(), Error> { + let mut collection = char_collect!('a'..='c'); + collection.append_range(chars!('g'..='i'))?.append_range(chars!('j'..='m'))?; + assert_eq!(collection, char_collect!('a'..='c', 'g'..='m')); + Ok(()) + } + #[test] + fn test_append_range_out_of_order() -> Result<(), Error> { + let mut collection = char_collect!('a'..='c'); + assert!(collection + .append_range(chars!('g'..='i'))? + .append_range(chars!('j'..='m'))? + .append_range(chars!('k'..='m')) + .is_err()); + Ok(()) + } + #[test] + fn test_remove_exact_range() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection -= 'j'..='m'; + assert_eq!(collection.ranges, vec![chars!('c'..='e'), chars!['p'..='s']]); + } + #[test] + fn test_remove_overlapping_all_existent() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection -= 'c'..='s'; + assert_eq!(collection.ranges, vec![]); + } + #[test] + fn test_remove_overlapping_all_existent_superset() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection -= 'a'..='z'; + assert_eq!(collection.ranges, vec![]); + } + #[test] + fn test_remove_one_subrange() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection -= 'k'..='l'; + assert_eq!( + collection.ranges, + vec![chars!('c'..='e'), chars!('j'..='j'), chars!('m'..='m'), chars!('p'..='s')] + ); + } + #[test] + fn test_remove_intersection() { + let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); + collection -= 'd'..='q'; + assert_eq!(collection.ranges, vec![chars!('c'..='c'), chars!('r'..='s')]); + } + #[test] + fn test_complement_simple() { + let collection = char_collect!(0x10..=0x50, 0x70..=0x70, 0x99..=0x640); + assert_eq!( + collection.complement(), + char_collect!(0x00..=0x0F, 0x51..=0x6F, 0x71..=0x98, 0x641..=(char::MAX as u32)) + ); + } + #[test] + fn test_complement_all() { + let collection = char_collect!(CharRange::all()); + assert_eq!(collection.complement(), char_collect!()); + } + #[test] + fn test_complement_none() { + let collection = char_collect!(); + assert_eq!(collection.complement(), char_collect!(CharRange::all())); + } + #[test] + fn test_complement_includes_min_and_max() { + let collection = char_collect!(0x0..=0x10, 0x40..=0x50, 0xCCCC..=(char::MAX as u32)); + assert_eq!(collection.complement(), char_collect!(0x11..=0x3F, 0x51..=0xCCCB)); + } + #[test] + fn test_union() { + let collection_a = char_collect!('a'..='g', 'm'..='z', 'B'..='R'); + let collection_b = char_collect!('e'..='q', 'W'..='Y'); + let expected = char_collect!('a'..='z', 'B'..='R', 'W'..='Y'); + assert_eq!(collection_a.union(&collection_b), expected); + assert_eq!(collection_b.union(&collection_a), expected); + } + #[test] + fn test_intersection() { + let collection_a = char_collect!('a'..='g', 'm'..='z'); + let collection_b = char_collect!('e'..='q'); + let expected = char_collect!('e'..='g', 'm'..='q'); + assert_eq!(collection_a.intersection(&collection_b), expected); + assert_eq!(collection_b.intersection(&collection_a), expected); + } + #[test] + fn test_macro_expressions() { + use unicode_blocks::UnicodeBlockId::Arabic; + let collection = + char_collect!({ ('c'..='e') + ('f'..='h') - ('a'..='d') + Arabic + (0x5..=0x42) }); + assert_eq!(collection, char_collect!(0x5..=0x42, 'e'..='h', Arabic)); + } + #[test] + fn test_iter() { + let collection = char_collect!('a'..='c', 'j'..='l', 'x'..='z'); + let v = collection.iter().collect::>(); + assert_eq!(v, vec!['a', 'b', 'c', 'j', 'k', 'l', 'x', 'y', 'z']); + } + #[test] + fn test_are_chars_adjacent() { + assert!(are_chars_adjacent(&'a', &'b')); + assert!(!are_chars_adjacent(&'b', &'a')); + assert!(!are_chars_adjacent(&'a', &'c')); + } +} \ No newline at end of file diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs new file mode 100644 index 00000000000..a9a58b6e7ff --- /dev/null +++ b/components/char_collection/src/conversions.rs @@ -0,0 +1,156 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +//! Conversion (`From`) implementations for [CharCollection], via [MultiCharRange]. +use std::boxed::Box; +use std::convert::TryFrom; +use std::iter; +use std::ops::RangeInclusive; +use unic_char_range::CharRange; +use unic_ucd_block::Block; +use unicode_blocks::UnicodeBlockId; +use crate::{CharCollection, MultiCharRange}; +macro_rules! impl_for_range_inclusive_int_type { + ($($t:ty),*) => {$( + impl MultiCharRange for RangeInclusive<$t> { + fn iter_ranges(&self) -> Box> { + Box::new(iter::once(to_char_range!(self))) + } + fn range_count(&self) -> usize { + 1 + } + })*} +} +// This macro is needed because there is no way to express "can be cast as u32" using traits. +macro_rules! to_char_range { + ($range:expr) => { + CharRange::closed( + char::try_from(*$range.start() as u32).unwrap(), + char::try_from(*$range.end() as u32).unwrap(), + ) + }; +} +impl MultiCharRange for char { + fn iter_ranges(&self) -> Box> { + Box::new(std::iter::once(CharRange::closed(*self, *self))) + } + fn range_count(&self) -> usize { + 1 + } +} +impl MultiCharRange for CharRange { + fn iter_ranges(&self) -> Box> { + Box::new(iter::once(self.clone())) + } + fn range_count(&self) -> usize { + 1 + } +} +impl MultiCharRange for RangeInclusive { + fn iter_ranges(&self) -> Box> { + Box::new(iter::once(CharRange::closed(*self.start(), *self.end()))) + } + fn range_count(&self) -> usize { + 1 + } +} +impl_for_range_inclusive_int_type!(u8, i8, u32, i32); +impl MultiCharRange for UnicodeBlockId { + fn iter_ranges(&self) -> Box> { + self.block().iter_ranges() + } + fn range_count(&self) -> usize { + 1 + } +} +impl MultiCharRange for Block { + fn iter_ranges<'a>(&'a self) -> Box + 'a> { + Box::new(self.range.iter_ranges()) + } + fn range_count(&self) -> usize { + 1 + } +} +impl From<&T> for CharCollection { + fn from(source: &T) -> Self { + let mut collection = CharCollection::new(); + collection.insert(source); + collection + } +} +#[cfg(test)] +mod multi_char_range_tests { + use crate::MultiCharRange; + use paste; + use unic_char_range::{chars, CharRange}; + #[test] + fn test_char() { + let source = 'a'; + assert_eq!(source.iter_ranges().collect::>(), vec![chars!('a'..='a')]); + assert_eq!(source.range_count(), 1); + } + #[test] + fn test_char_range() { + let source = chars!('d'..='g'); + assert_eq!(source.iter_ranges().collect::>(), vec![chars!('d'..='g')]); + assert_eq!(source.range_count(), 1); + } + #[test] + fn test_range_inclusive_char() { + let source = 'd'..='g'; + assert_eq!(source.iter_ranges().collect::>(), vec![chars!('d'..='g')]); + assert_eq!(source.range_count(), 1); + } + macro_rules! test_range_inclusive_int { + ($t:ty) => { + paste::item! { + #[test] + fn []() { + let source: std::ops::RangeInclusive<$t> = 0x0..=0x9; + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('\u{0}'..='\u{9}')] + ); + assert_eq!(source.range_count(), 1); + } + } + }; + } + test_range_inclusive_int!(u8); + test_range_inclusive_int!(i8); + test_range_inclusive_int!(u32); + test_range_inclusive_int!(i32); + #[test] + fn test_unicode_block_id() { + let source = unicode_blocks::UnicodeBlockId::BasicLatin; + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('\u{0000}'..='\u{007f}')] + ); + assert_eq!(source.range_count(), 1); + } + #[test] + fn test_unicode_block() { + let source = unicode_blocks::UnicodeBlockId::BasicLatin.block(); + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('\u{0000}'..='\u{007f}')] + ); + assert_eq!(source.range_count(), 1); + } +} +#[cfg(test)] +mod from_tests { + use crate::CharCollection; + use unicode_blocks::UnicodeBlockId; + #[test] + fn test_char() { + let actual: CharCollection = (&'a').into(); + assert_eq!(actual, char_collect!('a'..='a')); + } + #[test] + fn test_unicode_block_id() { + let actual: CharCollection = (&UnicodeBlockId::BasicLatin).into(); + assert_eq!(actual, char_collect!('\u{0000}'..='\u{007f}')); + } +} \ No newline at end of file diff --git a/components/char_collection/src/lib.rs b/components/char_collection/src/lib.rs new file mode 100644 index 00000000000..ee5e0de5205 --- /dev/null +++ b/components/char_collection/src/lib.rs @@ -0,0 +1,12 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#[macro_use] +mod macros; +mod char_collection; +mod conversions; +mod operators; +pub use char_collection::CharCollection; +pub use char_collection::MultiCharRange; +pub use conversions::*; +pub use operators::*; \ No newline at end of file diff --git a/components/char_collection/src/macros.rs b/components/char_collection/src/macros.rs new file mode 100644 index 00000000000..83a6a1b45fc --- /dev/null +++ b/components/char_collection/src/macros.rs @@ -0,0 +1,54 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +/// Generate a [CharCollection] from a sequence of `char`s, +/// [CharRanges](unic_char_range::CharRange), or Unicode [Blocks](unic_ucd_block::Block). +/// +/// The macro can be used with either a comma-separated list of items, or with an expression +/// representing set operations. +/// +/// ``` +/// use char_collection::char_collect; +/// use unicode_blocks::UnicodeBlockId; +/// use unic_char_range::CharRange; +/// +/// let c1 = char_collect!( +/// 'a'..='z', +/// CharRange::closed('D', 'G'), +/// UnicodeBlockId::Cyrillic, +/// 0x01..=0x05, +/// '@'); +/// +/// let c2 = char_collect!({ ('a'..='z') - ('p'..='t') + UnicodeBlockId::Bengali }); +/// ``` +/// +/// *NOTE:* Parenthetical expressions currently aren't supported unless they start with a +/// `CharCollection`. +/// ``` +/// use char_collection::char_collect; +/// +/// // This works: +/// let c1 = char_collect!({ ('a'..='z') + (char_collect!('A'..='Z') - ('L'..='P')) }); +/// +/// // This doesn't: +/// let c1 = char_collect!({ ('a'..='z') + (('A'..='Z') - ('L'..='P')) }); +/// ``` +#[macro_export] +macro_rules! char_collect { + ({ $($x:tt)+ }) => { + { + $crate::CharCollection::new() + $($x)* + } + }; + ( $( $x:expr ),* ) => { + { + // Allow unused mut in case the collection is empty. + #[allow(unused_mut)] + let mut col = $crate::CharCollection::new(); + $( + col.insert(& $x); + )* + col + } + }; +} \ No newline at end of file diff --git a/components/char_collection/src/operators.rs b/components/char_collection/src/operators.rs new file mode 100644 index 00000000000..1beb4656073 --- /dev/null +++ b/components/char_collection/src/operators.rs @@ -0,0 +1,62 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +//! Implementations of standard operators for [CharCollection]. +//! +//! `+` and `|` are equivalent. `+` is easier to use with `-`, as they have the same operator +//! precedence. +use crate::{CharCollection, MultiCharRange}; +use std::convert::Into; +use std::ops; +impl ops::BitOr for CharCollection { + type Output = CharCollection; + fn bitor(self, rhs: V) -> Self::Output { + let result: CharCollection = self.into(); + result.union(&rhs) + } +} +impl ops::Add for CharCollection { + type Output = CharCollection; + fn add(self, rhs: V) -> Self::Output { + let result: CharCollection = self.into(); + result.union(&rhs) + } +} +impl ops::BitOrAssign for CharCollection { + fn bitor_assign(&mut self, rhs: V) { + self.insert(&rhs); + } +} +impl ops::AddAssign for CharCollection { + fn add_assign(&mut self, rhs: V) { + self.insert(&rhs); + } +} +impl ops::Sub for CharCollection { + type Output = CharCollection; + fn sub(self, rhs: V) -> Self::Output { + self.difference(&rhs) + } +} +impl ops::SubAssign for CharCollection { + fn sub_assign(&mut self, rhs: V) { + self.remove(&rhs); + } +} +impl ops::BitAnd for CharCollection { + type Output = CharCollection; + fn bitand(self, rhs: V) -> Self::Output { + self.intersection(&rhs) + } +} +impl ops::BitAndAssign for CharCollection { + fn bitand_assign(&mut self, rhs: V) { + *self = self.intersection(&rhs); + } +} +impl ops::Not for CharCollection { + type Output = CharCollection; + fn not(self) -> Self::Output { + self.complement() + } +} \ No newline at end of file From c66b7707e4f5f4668f5f810eb9e270ae9aef9fa3 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 3 Jun 2020 22:49:11 +0000 Subject: [PATCH 02/30] anyhow::Error dependency removed and std::error::Error added --- .../char_collection/src/char_collection.rs | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs index 5e97361db63..b31def4f444 100644 --- a/components/char_collection/src/char_collection.rs +++ b/components/char_collection/src/char_collection.rs @@ -2,7 +2,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. use { - anyhow::{format_err, Error}, std::{ clone::Clone, cmp::Ordering, @@ -10,6 +9,7 @@ use { iter::Iterator, ops::Range, vec::Vec, + error::Error, }, unic_char_range::{chars, CharIter, CharRange}, }; @@ -66,7 +66,7 @@ impl CharCollection { /// This factory method is primarily intended for use in deserializing valid representations of /// `CharCollections`. Will return an error if ranges are out of order, overlapping, or /// adjacent. - pub fn from_sorted_ranges(ranges: T) -> Result + pub fn from_sorted_ranges(ranges: T) -> Result> where T: IntoIterator, { @@ -75,11 +75,11 @@ impl CharCollection { let ranges: &Vec = &collection.ranges; match (1..ranges.len()).find(|i| (ranges[*i].low as i64 - ranges[*i - 1].high as i64) <= 1) { - Some(i) => Err(format_err!( - "These ranges are out of order, overlapping, or adjacent: {}, {}", + Some(i) => Err(format!( + "These ranges are out of order, overlapping, or adjacent: {:?}, {:?}", format_range(&ranges[i - 1]), format_range(&ranges[i]) - )), + ).into()), None => Ok(collection), } } @@ -88,7 +88,7 @@ impl CharCollection { /// /// This factory method is primarily intended for use in deserializing valid representations of /// `CharCollections`. Will return an error if chars are out of order or contain duplicates. - pub fn from_sorted_chars(chars: T) -> Result + pub fn from_sorted_chars(chars: T) -> Result> where T: IntoIterator, { @@ -139,11 +139,11 @@ impl CharCollection { /// Returns `&mut self` for easy chaining. /// /// The time complexity is O(1). - pub fn append(&mut self, ch: char) -> Result<&mut Self, Error> { + pub fn append(&mut self, ch: char) -> Result<&mut Self, Box> { let mut coalesced = false; if let Some(last_range) = self.ranges.last_mut() { if last_range.cmp_char(ch) != Ordering::Less { - return Err(format_err!("Cannot append {} after {}", ch, last_range.high)); + return Err(format!("Cannot append {:?} after {:?}", ch, last_range.high).into()); } if are_chars_adjacent(&last_range.high, &ch) { last_range.high = ch; @@ -162,15 +162,15 @@ impl CharCollection { /// Returns `&mut self` for easy chaining. /// /// The time complexity is O(1). - pub fn append_range(&mut self, range: CharRange) -> Result<&mut Self, Error> { + pub fn append_range(&mut self, range: CharRange) -> Result<&mut Self, Box> { let mut coalesced = false; if let Some(last_range) = self.ranges.last_mut() { if last_range.cmp_char(range.low) != Ordering::Less { - return Err(format_err!( - "Cannot append {} after {}", + return Err(format!( + "Cannot append {:?} after {:?}", format_range(&range), last_range.high - )); + ).into()); } if are_chars_adjacent(&last_range.high, &range.low) { last_range.high = range.high; @@ -410,12 +410,12 @@ fn format_range(range: &CharRange) -> String { mod tests { use { super::{are_chars_adjacent, CharCollection}, - anyhow::Error, + std::error::Error, std::char, unic_char_range::{chars, CharRange}, }; #[test] - fn test_from_sorted_ranges() -> Result<(), Error> { + fn test_from_sorted_ranges() -> Result<(), Box> { let expected = char_collect!('a'..='d', 'g'..='l', 'z'); let actual = CharCollection::from_sorted_ranges(vec![ chars!('a'..='d'), @@ -450,7 +450,7 @@ mod tests { ); } #[test] - fn test_from_sorted_chars() -> Result<(), Error> { + fn test_from_sorted_chars() -> Result<(), Box> { let chars = vec!['a', 'b', 'c', 'd', 'g', 'h', 'i', 'j', 'k', 'l', 'z']; let expected = char_collect!('a'..='d', 'g'..='l', 'z'); let actual = CharCollection::from_sorted_chars(chars)?; @@ -524,14 +524,14 @@ mod tests { assert_eq!(collection.ranges, vec![chars!('a'..='m')]); } #[test] - fn test_append() -> Result<(), Error> { + fn test_append() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); collection.append('d')?.append('g')?.append('h')?.append('i')?.append('z')?; assert_eq!(collection, char_collect!('a'..='d', 'g'..='i', 'z')); Ok(()) } #[test] - fn test_append_out_of_order() -> Result<(), Error> { + fn test_append_out_of_order() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); assert!(collection .append('d')? @@ -543,14 +543,14 @@ mod tests { Ok(()) } #[test] - fn test_append_range() -> Result<(), Error> { + fn test_append_range() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); collection.append_range(chars!('g'..='i'))?.append_range(chars!('j'..='m'))?; assert_eq!(collection, char_collect!('a'..='c', 'g'..='m')); Ok(()) } #[test] - fn test_append_range_out_of_order() -> Result<(), Error> { + fn test_append_range_out_of_order() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); assert!(collection .append_range(chars!('g'..='i'))? From e4b8f56ad7e69c4ed2171d71b2fab8254679eeeb Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 3 Jun 2020 22:55:26 +0000 Subject: [PATCH 03/30] std imports made consistent and unic-ucd-block dependency removed --- .../char_collection/src/char_collection.rs | 2 + components/char_collection/src/conversions.rs | 84 +++++++++---------- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs index b31def4f444..1094c4964c6 100644 --- a/components/char_collection/src/char_collection.rs +++ b/components/char_collection/src/char_collection.rs @@ -10,6 +10,8 @@ use { ops::Range, vec::Vec, error::Error, + convert::Into, + boxed::Box }, unic_char_range::{chars, CharIter, CharRange}, }; diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs index a9a58b6e7ff..f2c4d18ffa7 100644 --- a/components/char_collection/src/conversions.rs +++ b/components/char_collection/src/conversions.rs @@ -7,8 +7,8 @@ use std::convert::TryFrom; use std::iter; use std::ops::RangeInclusive; use unic_char_range::CharRange; -use unic_ucd_block::Block; -use unicode_blocks::UnicodeBlockId; +// use unic_ucd_block::Block; +// use unicode_blocks::UnicodeBlockId; use crate::{CharCollection, MultiCharRange}; macro_rules! impl_for_range_inclusive_int_type { ($($t:ty),*) => {$( @@ -55,22 +55,22 @@ impl MultiCharRange for RangeInclusive { } } impl_for_range_inclusive_int_type!(u8, i8, u32, i32); -impl MultiCharRange for UnicodeBlockId { - fn iter_ranges(&self) -> Box> { - self.block().iter_ranges() - } - fn range_count(&self) -> usize { - 1 - } -} -impl MultiCharRange for Block { - fn iter_ranges<'a>(&'a self) -> Box + 'a> { - Box::new(self.range.iter_ranges()) - } - fn range_count(&self) -> usize { - 1 - } -} +// impl MultiCharRange for UnicodeBlockId { +// fn iter_ranges(&self) -> Box> { +// self.block().iter_ranges() +// } +// fn range_count(&self) -> usize { +// 1 +// } +// } +// impl MultiCharRange for Block { +// fn iter_ranges<'a>(&'a self) -> Box + 'a> { +// Box::new(self.range.iter_ranges()) +// } +// fn range_count(&self) -> usize { +// 1 +// } +// } impl From<&T> for CharCollection { fn from(source: &T) -> Self { let mut collection = CharCollection::new(); @@ -120,37 +120,37 @@ mod multi_char_range_tests { test_range_inclusive_int!(i8); test_range_inclusive_int!(u32); test_range_inclusive_int!(i32); - #[test] - fn test_unicode_block_id() { - let source = unicode_blocks::UnicodeBlockId::BasicLatin; - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('\u{0000}'..='\u{007f}')] - ); - assert_eq!(source.range_count(), 1); - } - #[test] - fn test_unicode_block() { - let source = unicode_blocks::UnicodeBlockId::BasicLatin.block(); - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('\u{0000}'..='\u{007f}')] - ); - assert_eq!(source.range_count(), 1); - } + // #[test] + // fn test_unicode_block_id() { + // let source = unicode_blocks::UnicodeBlockId::BasicLatin; + // assert_eq!( + // source.iter_ranges().collect::>(), + // vec![chars!('\u{0000}'..='\u{007f}')] + // ); + // assert_eq!(source.range_count(), 1); + // } + // #[test] + // fn test_unicode_block() { + // let source = unicode_blocks::UnicodeBlockId::BasicLatin.block(); + // assert_eq!( + // source.iter_ranges().collect::>(), + // vec![chars!('\u{0000}'..='\u{007f}')] + // ); + // assert_eq!(source.range_count(), 1); + // } } #[cfg(test)] mod from_tests { use crate::CharCollection; - use unicode_blocks::UnicodeBlockId; + // use unicode_blocks::UnicodeBlockId; #[test] fn test_char() { let actual: CharCollection = (&'a').into(); assert_eq!(actual, char_collect!('a'..='a')); } - #[test] - fn test_unicode_block_id() { - let actual: CharCollection = (&UnicodeBlockId::BasicLatin).into(); - assert_eq!(actual, char_collect!('\u{0000}'..='\u{007f}')); - } + // #[test] + // fn test_unicode_block_id() { + // let actual: CharCollection = (&UnicodeBlockId::BasicLatin).into(); + // assert_eq!(actual, char_collect!('\u{0000}'..='\u{007f}')); + // } } \ No newline at end of file From 4963cc1a5d2e6c548ecf04f02dfcb7a3b93fa4ca Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 10 Jun 2020 01:05:18 +0000 Subject: [PATCH 04/30] Replaced CharRange, passing 36/40 tests --- components/char_collection/Cargo.toml | 4 + .../char_collection/src/char_collection.rs | 164 ++++++++++++++++-- components/char_collection/src/conversions.rs | 8 +- components/char_collection/src/lib.rs | 2 + components/char_collection/src/macros.rs | 38 ++-- 5 files changed, 176 insertions(+), 40 deletions(-) diff --git a/components/char_collection/Cargo.toml b/components/char_collection/Cargo.toml index 8e66cb5a0d5..8ed0275f2a2 100644 --- a/components/char_collection/Cargo.toml +++ b/components/char_collection/Cargo.toml @@ -12,3 +12,7 @@ include = [ "src/**/*", "Cargo.toml", ] + +[dependencies] +unic-char-range = "0.9.0" +paste = "0.1.16" diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs index 1094c4964c6..63f4c7cb88b 100644 --- a/components/char_collection/src/char_collection.rs +++ b/components/char_collection/src/char_collection.rs @@ -3,6 +3,7 @@ // found in the LICENSE file. use { std::{ + char, clone::Clone, cmp::Ordering, hash::{Hash, Hasher}, @@ -13,8 +14,151 @@ use { convert::Into, boxed::Box }, - unic_char_range::{chars, CharIter, CharRange}, + // unic_char_range::{chars, CharIter, CharRange}, }; + +#[derive(Copy, Clone, Debug, Eq)] +pub struct CharRange { + low: char, + high: char, +} + +impl CharRange { +// open_right + // would we want this to return a Option next time? + pub fn open_right(low: char, high: char) -> CharRange { + // nothing happens if this fails + let high: char = char::from_u32(high as u32 - 1).unwrap(); + CharRange{low, high} + } +// closed + pub fn closed(low: char, high: char) -> CharRange { + // if low == '\u{0}' { // need way to handle this + // // for now just leave alone + // } + CharRange{low, high} + } +// open + pub fn open(low: char, high: char) -> CharRange { + // this is repeated here + let low: char = char::from_u32(low as u32 + 1).unwrap(); + let high: char = char::from_u32(high as u32 - 1).unwrap(); + CharRange{low, high} + } +// open_left + pub fn open_left(low: char, high: char) -> CharRange { + // this is repeated here + let high: char = char::from_u32(high as u32 + 1).unwrap(); + CharRange{low, high} + } +// all + pub fn all() -> CharRange { + CharRange{low: '\u{0}', high: char::MAX} + } +// cmp_char + pub fn cmp_char(&self, comp_char: char) -> Ordering { + if self.high < comp_char { + Ordering::Less + } + else if self.low > comp_char { + Ordering::Greater + } + else { + Ordering::Equal + } + } +// contains + pub fn contains(&self, ch: char) -> bool { + self.low <= ch && ch <= self.high + } +// is_empty + pub fn is_empty(&self) -> bool { + self.low > self.high + } + pub fn iter(&self) -> CharIter { + (*self).into() + } +} + +impl IntoIterator for CharRange { + type IntoIter = CharIter; + type Item = char; + fn into_iter(self) -> CharIter { + self.iter() + } +} + +impl PartialEq for CharRange { + fn eq(&self, other: &CharRange) -> bool { + (self.is_empty() && other.is_empty()) || (self.low == other.low && self.high == other.high) + } +} + +#[derive(Clone, Debug)] +pub struct CharIter { + low: char, + high: char +} + +impl From for CharIter { + fn from(range: CharRange) -> CharIter { + CharIter { + low: range.low, + high: range.high + } + } +} + +impl From for CharRange { + fn from(iter: CharIter) -> CharRange { + CharRange { + low: iter.low, + high: iter.high + } + } +} + +impl CharIter { + fn advance(&mut self) { + if self.low == char::MAX { + self.high = '\0'; + } + else { + self.low = char::from_u32(self.low as u32 + 1).unwrap(); + } + } + fn retreat(&mut self) { + if self.high == '\0' { + self.low = char::MAX; + } + else { + self.high = char::from_u32(self.high as u32 - 1).unwrap(); + } + } + fn next_back(&mut self) -> Option { + if self.low > self.high { + None + } + else { + let ch = self.high; + self.retreat(); + Some(ch) + } + } +} + +impl Iterator for CharIter { + type Item = char; + fn next(&mut self) -> Option { + if self.low > self.high { + return None; + } + let ch = self.low; + self.advance(); + Some(ch) + } +} + /// A trait for objects that represent one or more disjoint, non-adjacent /// [CharRanges](unic_char_range::CharRange). pub trait MultiCharRange { @@ -411,10 +555,10 @@ fn format_range(range: &CharRange) -> String { #[cfg(test)] mod tests { use { - super::{are_chars_adjacent, CharCollection}, + super::{are_chars_adjacent, CharCollection, CharRange}, std::error::Error, std::char, - unic_char_range::{chars, CharRange}, + // unic_char_range::{chars, CharRange}, }; #[test] fn test_from_sorted_ranges() -> Result<(), Box> { @@ -633,13 +777,13 @@ mod tests { assert_eq!(collection_a.intersection(&collection_b), expected); assert_eq!(collection_b.intersection(&collection_a), expected); } - #[test] - fn test_macro_expressions() { - use unicode_blocks::UnicodeBlockId::Arabic; - let collection = - char_collect!({ ('c'..='e') + ('f'..='h') - ('a'..='d') + Arabic + (0x5..=0x42) }); - assert_eq!(collection, char_collect!(0x5..=0x42, 'e'..='h', Arabic)); - } + // #[test] + // fn test_macro_expressions() { + // use unicode_blocks::UnicodeBlockId::Arabic; + // let collection = + // char_collect!({ ('c'..='e') + ('f'..='h') - ('a'..='d') + Arabic + (0x5..=0x42) }); + // assert_eq!(collection, char_collect!(0x5..=0x42, 'e'..='h', Arabic)); + // } #[test] fn test_iter() { let collection = char_collect!('a'..='c', 'j'..='l', 'x'..='z'); diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs index f2c4d18ffa7..1ca0d36b26d 100644 --- a/components/char_collection/src/conversions.rs +++ b/components/char_collection/src/conversions.rs @@ -6,10 +6,10 @@ use std::boxed::Box; use std::convert::TryFrom; use std::iter; use std::ops::RangeInclusive; -use unic_char_range::CharRange; +// use unic_char_range::CharRange; // use unic_ucd_block::Block; // use unicode_blocks::UnicodeBlockId; -use crate::{CharCollection, MultiCharRange}; +use crate::{CharRange, CharCollection, MultiCharRange}; macro_rules! impl_for_range_inclusive_int_type { ($($t:ty),*) => {$( impl MultiCharRange for RangeInclusive<$t> { @@ -80,9 +80,9 @@ impl From<&T> for CharCollection { } #[cfg(test)] mod multi_char_range_tests { - use crate::MultiCharRange; + use crate::{MultiCharRange, CharRange}; use paste; - use unic_char_range::{chars, CharRange}; + // use unic_char_range::{chars, CharRange}; #[test] fn test_char() { let source = 'a'; diff --git a/components/char_collection/src/lib.rs b/components/char_collection/src/lib.rs index ee5e0de5205..d6d4a1096f5 100644 --- a/components/char_collection/src/lib.rs +++ b/components/char_collection/src/lib.rs @@ -8,5 +8,7 @@ mod conversions; mod operators; pub use char_collection::CharCollection; pub use char_collection::MultiCharRange; +pub use char_collection::CharRange; +pub use char_collection::CharIter; pub use conversions::*; pub use operators::*; \ No newline at end of file diff --git a/components/char_collection/src/macros.rs b/components/char_collection/src/macros.rs index 83a6a1b45fc..3830b9d3f66 100644 --- a/components/char_collection/src/macros.rs +++ b/components/char_collection/src/macros.rs @@ -7,32 +7,6 @@ /// The macro can be used with either a comma-separated list of items, or with an expression /// representing set operations. /// -/// ``` -/// use char_collection::char_collect; -/// use unicode_blocks::UnicodeBlockId; -/// use unic_char_range::CharRange; -/// -/// let c1 = char_collect!( -/// 'a'..='z', -/// CharRange::closed('D', 'G'), -/// UnicodeBlockId::Cyrillic, -/// 0x01..=0x05, -/// '@'); -/// -/// let c2 = char_collect!({ ('a'..='z') - ('p'..='t') + UnicodeBlockId::Bengali }); -/// ``` -/// -/// *NOTE:* Parenthetical expressions currently aren't supported unless they start with a -/// `CharCollection`. -/// ``` -/// use char_collection::char_collect; -/// -/// // This works: -/// let c1 = char_collect!({ ('a'..='z') + (char_collect!('A'..='Z') - ('L'..='P')) }); -/// -/// // This doesn't: -/// let c1 = char_collect!({ ('a'..='z') + (('A'..='Z') - ('L'..='P')) }); -/// ``` #[macro_export] macro_rules! char_collect { ({ $($x:tt)+ }) => { @@ -51,4 +25,16 @@ macro_rules! char_collect { col } }; +} +#[macro_export] +macro_rules! chars { + ($low:tt .. $high:tt) => { + $crate::CharRange::open_right($low, $high) + }; + ($low:tt ..= $high:tt) => { + $crate::CharRange::closed($low, $high) + }; + (..) => { + $crate::CharRange::all() + } } \ No newline at end of file From 01ca5efa5784bc635cec90f4fbbea2a74b1f1b65 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 10 Jun 2020 02:13:01 +0000 Subject: [PATCH 05/30] Fixed bug, pass all 40 tests --- .../char_collection/src/char_collection.rs | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs index 63f4c7cb88b..e7914c9993a 100644 --- a/components/char_collection/src/char_collection.rs +++ b/components/char_collection/src/char_collection.rs @@ -48,7 +48,7 @@ impl CharRange { // open_left pub fn open_left(low: char, high: char) -> CharRange { // this is repeated here - let high: char = char::from_u32(high as u32 + 1).unwrap(); + let low: char = char::from_u32(low as u32 + 1).unwrap(); CharRange{low, high} } // all @@ -176,25 +176,6 @@ pub trait MultiCharRange { /// The easiest way to create instances is using the /// [char_collect!](::char_collection::char_collect) macro. /// -/// ``` -/// use char_collection::CharCollection; -/// -/// let mut collection: CharCollection = char_collect!('a'..='d', 'x'..='z'); -/// char_collection += 'e'; -/// char_collection += chars!('p'..='t'); -/// assert_eq!( -/// collection.iter_ranges().collect(), -/// vec![chars!('a'..='e'), chars!('p'..='t'), chars!('x'..='z')]); -/// -/// assert!(collection.contains(&'c')); -/// assert!(collection.contains_range(chars!('q'..='s'))); -/// assert!(!collection.contains(&'9')); -/// -/// collection -= chars!('t'..='y'); -/// assert_eq!( -/// collection.iter_ranges().collect(), -/// vec![chars!('a'..='e', chars!('p'..'s'), chars!('z'..='z'))]); -/// ``` /// /// TODO(kpozin): Implement IntoIter. #[derive(Clone, Debug, Eq, PartialEq, Default)] From b57d18fbeb95c02ad413bb36adfc4f5a504e066a Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Fri, 12 Jun 2020 16:19:43 +0000 Subject: [PATCH 06/30] Remove dependency file --- components/char_collection/BUILD.gn | 32 ----------------------------- 1 file changed, 32 deletions(-) delete mode 100644 components/char_collection/BUILD.gn diff --git a/components/char_collection/BUILD.gn b/components/char_collection/BUILD.gn deleted file mode 100644 index f772f5023e6..00000000000 --- a/components/char_collection/BUILD.gn +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright 2019 The Fuchsia Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. -import("//build/rust/rustc_library.gni") -import("//build/test/test_package.gni") -import("//build/testing/environments.gni") -# Library for working with collections of Unicode code points. -rustc_library("char_collection") { - edition = "2018" - with_unit_tests = true - deps = [ - "//src/lib/intl/unicode_utils/unicode_blocks", - "//third_party/rust_crates:anyhow", - "//third_party/rust_crates:paste", - "//third_party/rust_crates:thiserror", - "//third_party/rust_crates:unic-char-range", - "//third_party/rust_crates:unic-ucd-block", - ] -} -test_package("char_collection_tests") { - deps = [ ":char_collection_test" ] - tests = [ - { - name = "char_collection_lib_test" - environments = basic_envs - }, - ] -} -group("tests") { - testonly = true - public_deps = [ ":char_collection_tests" ] -} From dedebe25466cf513923af8de8b56380051908a41 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 16 Jun 2020 19:24:57 +0000 Subject: [PATCH 07/30] github actions and README fixes --- components/char_collection/README.md | 12 ++ .../char_collection/src/char_collection.rs | 186 +++++++++++------- components/char_collection/src/conversions.rs | 21 +- components/char_collection/src/lib.rs | 6 +- components/char_collection/src/macros.rs | 4 +- components/char_collection/src/operators.rs | 2 +- 6 files changed, 150 insertions(+), 81 deletions(-) create mode 100644 components/char_collection/README.md diff --git a/components/char_collection/README.md b/components/char_collection/README.md new file mode 100644 index 00000000000..9cb580caa61 --- /dev/null +++ b/components/char_collection/README.md @@ -0,0 +1,12 @@ +# ICU4X + +ICU4X is a set of internationalization components for Unicode. + +# Status [![crates.io](http://meritbadge.herokuapp.com/icu4x)](https://crates.io/crates/icu4x) + +The project is in an incubation period. + +# Authors + +The project is managed by a subcommittee of ICU-TC in the Unicode Consortium focused on providing solutions for client-side internationalization. + diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs index e7914c9993a..639e955e7c1 100644 --- a/components/char_collection/src/char_collection.rs +++ b/components/char_collection/src/char_collection.rs @@ -1,20 +1,17 @@ // Copyright 2019 The Fuchsia Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. -use { - std::{ - char, - clone::Clone, - cmp::Ordering, - hash::{Hash, Hasher}, - iter::Iterator, - ops::Range, - vec::Vec, - error::Error, - convert::Into, - boxed::Box - }, - // unic_char_range::{chars, CharIter, CharRange}, +use std::{ + boxed::Box, + char, + clone::Clone, + cmp::Ordering, + convert::Into, + error::Error, + hash::{Hash, Hasher}, + iter::Iterator, + ops::Range, + vec::Vec, }; #[derive(Copy, Clone, Debug, Eq)] @@ -24,54 +21,55 @@ pub struct CharRange { } impl CharRange { -// open_right + // open_right // would we want this to return a Option next time? - pub fn open_right(low: char, high: char) -> CharRange { + pub fn open_right(low: char, high: char) -> CharRange { // nothing happens if this fails let high: char = char::from_u32(high as u32 - 1).unwrap(); - CharRange{low, high} + CharRange { low, high } } -// closed + // closed pub fn closed(low: char, high: char) -> CharRange { // if low == '\u{0}' { // need way to handle this - // // for now just leave alone - // } - CharRange{low, high} + // // for now just leave alone + // } + CharRange { low, high } } -// open + // open pub fn open(low: char, high: char) -> CharRange { // this is repeated here let low: char = char::from_u32(low as u32 + 1).unwrap(); let high: char = char::from_u32(high as u32 - 1).unwrap(); - CharRange{low, high} + CharRange { low, high } } -// open_left + // open_left pub fn open_left(low: char, high: char) -> CharRange { // this is repeated here let low: char = char::from_u32(low as u32 + 1).unwrap(); - CharRange{low, high} + CharRange { low, high } } -// all + // all pub fn all() -> CharRange { - CharRange{low: '\u{0}', high: char::MAX} + CharRange { + low: '\u{0}', + high: char::MAX, + } } -// cmp_char + // cmp_char pub fn cmp_char(&self, comp_char: char) -> Ordering { if self.high < comp_char { Ordering::Less - } - else if self.low > comp_char { + } else if self.low > comp_char { Ordering::Greater - } - else { + } else { Ordering::Equal } } -// contains + // contains pub fn contains(&self, ch: char) -> bool { - self.low <= ch && ch <= self.high + self.low <= ch && ch <= self.high } -// is_empty + // is_empty pub fn is_empty(&self) -> bool { self.low > self.high } @@ -90,21 +88,21 @@ impl IntoIterator for CharRange { impl PartialEq for CharRange { fn eq(&self, other: &CharRange) -> bool { - (self.is_empty() && other.is_empty()) || (self.low == other.low && self.high == other.high) + (self.is_empty() && other.is_empty()) || (self.low == other.low && self.high == other.high) } } #[derive(Clone, Debug)] pub struct CharIter { low: char, - high: char + high: char, } impl From for CharIter { fn from(range: CharRange) -> CharIter { CharIter { low: range.low, - high: range.high + high: range.high, } } } @@ -113,7 +111,7 @@ impl From for CharRange { fn from(iter: CharIter) -> CharRange { CharRange { low: iter.low, - high: iter.high + high: iter.high, } } } @@ -122,24 +120,21 @@ impl CharIter { fn advance(&mut self) { if self.low == char::MAX { self.high = '\0'; - } - else { + } else { self.low = char::from_u32(self.low as u32 + 1).unwrap(); } } fn retreat(&mut self) { if self.high == '\0' { self.low = char::MAX; - } - else { + } else { self.high = char::from_u32(self.high as u32 - 1).unwrap(); } } fn next_back(&mut self) -> Option { if self.low > self.high { None - } - else { + } else { let ch = self.high; self.retreat(); Some(ch) @@ -198,7 +193,9 @@ impl CharCollection { T: IntoIterator, { // If the original `ranges` is also a Vec, this doesn't result in an extra copy. - let collection = CharCollection { ranges: ranges.into_iter().collect() }; + let collection = CharCollection { + ranges: ranges.into_iter().collect(), + }; let ranges: &Vec = &collection.ranges; match (1..ranges.len()).find(|i| (ranges[*i].low as i64 - ranges[*i - 1].high as i64) <= 1) { @@ -206,7 +203,8 @@ impl CharCollection { "These ranges are out of order, overlapping, or adjacent: {:?}, {:?}", format_range(&ranges[i - 1]), format_range(&ranges[i]) - ).into()), + ) + .into()), None => Ok(collection), } } @@ -257,7 +255,9 @@ impl CharCollection { /// is the number of ranges in this collection and T is the number of ranges in /// `to_add`. pub fn insert(&mut self, to_add: &V) -> &mut Self { - to_add.iter_ranges().for_each(|range| self.insert_char_range(&range)); + to_add + .iter_ranges() + .for_each(|range| self.insert_char_range(&range)); self } /// Appends a `char` to the end of the existing collection. Panics if the given `char` is not @@ -297,7 +297,8 @@ impl CharCollection { "Cannot append {:?} after {:?}", format_range(&range), last_range.high - ).into()); + ) + .into()); } if are_chars_adjacent(&last_range.high, &range.low) { last_range.high = range.high; @@ -317,7 +318,9 @@ impl CharCollection { /// is the number of ranges in this collection and T is the number of ranges in /// `to_remove`. pub fn remove(&mut self, to_remove: &V) -> &mut Self { - to_remove.iter_ranges().for_each(|range| self.remove_char_range(&range)); + to_remove + .iter_ranges() + .for_each(|range| self.remove_char_range(&range)); self } /// Remove all entries from this collection. @@ -390,7 +393,9 @@ impl CharCollection { if prev_high != std::char::MAX { result_ranges.push(CharRange::open_left(prev_high, std::char::MAX)); } - CharCollection { ranges: result_ranges } + CharCollection { + ranges: result_ranges, + } } /// Insert a single `CharRange`. /// @@ -449,8 +454,10 @@ impl CharCollection { match lower_existing_range { Ok((idx, lower_existing_range)) => { if lower_existing_range.low < range_to_remove.low { - replacement_ranges - .push(CharRange::open_right(lower_existing_range.low, range_to_remove.low)); + replacement_ranges.push(CharRange::open_right( + lower_existing_range.low, + range_to_remove.low, + )); } remove_from_idx = idx; } @@ -470,7 +477,8 @@ impl CharCollection { remove_to_idx = idx; } } - self.ranges.splice(remove_from_idx..remove_to_idx, replacement_ranges); + self.ranges + .splice(remove_from_idx..remove_to_idx, replacement_ranges); } /// Delete all the existing `CharRange`s that fall within `indices_to_replace` in the vector, /// and insert `char_range_to_insert` in their place. If the newly formed range is adjacent to @@ -496,10 +504,13 @@ impl CharCollection { indices_to_replace.end += 1; } } - self.ranges.splice(indices_to_replace, vec![char_range_to_insert]); + self.ranges + .splice(indices_to_replace, vec![char_range_to_insert]); } fn find_containing_range(&self, query: &char) -> Result<(usize, CharRange), usize> { - let result = self.ranges.binary_search_by(|range| range.cmp_char(query.clone())); + let result = self + .ranges + .binary_search_by(|range| range.cmp_char(query.clone())); match result { Ok(index) => Ok((index, self.ranges[index])), Err(index) => Err(index), @@ -516,7 +527,9 @@ impl MultiCharRange for CharCollection { } impl Hash for CharCollection { fn hash(&self, state: &mut H) { - self.ranges.iter().for_each(|range| hash_char_range(range, state)); + self.ranges + .iter() + .for_each(|range| hash_char_range(range, state)); } } fn hash_char_range(range: &CharRange, state: &mut H) { @@ -537,9 +550,9 @@ fn format_range(range: &CharRange) -> String { mod tests { use { super::{are_chars_adjacent, CharCollection, CharRange}, - std::error::Error, std::char, // unic_char_range::{chars, CharRange}, + std::error::Error, }; #[test] fn test_from_sorted_ranges() -> Result<(), Box> { @@ -593,7 +606,10 @@ mod tests { fn test_find_containing_range() { let collection = char_collect!({ ('a'..='d') + ('g'..='j') + ('l'..='o') + 'z' }); assert_eq!(collection.find_containing_range(&'0'), Err(0)); - assert_eq!(collection.find_containing_range(&'c'), Ok((0, chars!('a'..='d')))); + assert_eq!( + collection.find_containing_range(&'c'), + Ok((0, chars!('a'..='d'))) + ); assert_eq!(collection.find_containing_range(&'e'), Err(1)); } #[test] @@ -605,7 +621,10 @@ mod tests { fn test_insert_exact_match() { let mut collection = char_collect!('a'..='d', 'g'..='l'); collection += 'a'..='d'; - assert_eq!(collection.ranges, vec![chars!('a'..='d'), chars!('g'..='l')]); + assert_eq!( + collection.ranges, + vec![chars!('a'..='d'), chars!('g'..='l')] + ); } #[test] fn test_insert_non_overlapping_sorted() { @@ -642,7 +661,10 @@ mod tests { fn test_insert_overlapping_with_intersections() { let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); collection += 'd'..='k'; - assert_eq!(collection.ranges, vec![chars!('c'..='m'), chars!('p'..='s')]); + assert_eq!( + collection.ranges, + vec![chars!('c'..='m'), chars!('p'..='s')] + ); } #[test] fn test_insert_coalesce_adjacent_ranges() { @@ -653,7 +675,12 @@ mod tests { #[test] fn test_append() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); - collection.append('d')?.append('g')?.append('h')?.append('i')?.append('z')?; + collection + .append('d')? + .append('g')? + .append('h')? + .append('i')? + .append('z')?; assert_eq!(collection, char_collect!('a'..='d', 'g'..='i', 'z')); Ok(()) } @@ -672,7 +699,9 @@ mod tests { #[test] fn test_append_range() -> Result<(), Box> { let mut collection = char_collect!('a'..='c'); - collection.append_range(chars!('g'..='i'))?.append_range(chars!('j'..='m'))?; + collection + .append_range(chars!('g'..='i'))? + .append_range(chars!('j'..='m'))?; assert_eq!(collection, char_collect!('a'..='c', 'g'..='m')); Ok(()) } @@ -690,7 +719,10 @@ mod tests { fn test_remove_exact_range() { let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); collection -= 'j'..='m'; - assert_eq!(collection.ranges, vec![chars!('c'..='e'), chars!['p'..='s']]); + assert_eq!( + collection.ranges, + vec![chars!('c'..='e'), chars!['p'..='s']] + ); } #[test] fn test_remove_overlapping_all_existent() { @@ -710,21 +742,34 @@ mod tests { collection -= 'k'..='l'; assert_eq!( collection.ranges, - vec![chars!('c'..='e'), chars!('j'..='j'), chars!('m'..='m'), chars!('p'..='s')] + vec![ + chars!('c'..='e'), + chars!('j'..='j'), + chars!('m'..='m'), + chars!('p'..='s') + ] ); } #[test] fn test_remove_intersection() { let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); collection -= 'd'..='q'; - assert_eq!(collection.ranges, vec![chars!('c'..='c'), chars!('r'..='s')]); + assert_eq!( + collection.ranges, + vec![chars!('c'..='c'), chars!('r'..='s')] + ); } #[test] fn test_complement_simple() { let collection = char_collect!(0x10..=0x50, 0x70..=0x70, 0x99..=0x640); assert_eq!( collection.complement(), - char_collect!(0x00..=0x0F, 0x51..=0x6F, 0x71..=0x98, 0x641..=(char::MAX as u32)) + char_collect!( + 0x00..=0x0F, + 0x51..=0x6F, + 0x71..=0x98, + 0x641..=(char::MAX as u32) + ) ); } #[test] @@ -740,7 +785,10 @@ mod tests { #[test] fn test_complement_includes_min_and_max() { let collection = char_collect!(0x0..=0x10, 0x40..=0x50, 0xCCCC..=(char::MAX as u32)); - assert_eq!(collection.complement(), char_collect!(0x11..=0x3F, 0x51..=0xCCCB)); + assert_eq!( + collection.complement(), + char_collect!(0x11..=0x3F, 0x51..=0xCCCB) + ); } #[test] fn test_union() { @@ -777,4 +825,4 @@ mod tests { assert!(!are_chars_adjacent(&'b', &'a')); assert!(!are_chars_adjacent(&'a', &'c')); } -} \ No newline at end of file +} diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs index 1ca0d36b26d..02e3c223ace 100644 --- a/components/char_collection/src/conversions.rs +++ b/components/char_collection/src/conversions.rs @@ -9,7 +9,7 @@ use std::ops::RangeInclusive; // use unic_char_range::CharRange; // use unic_ucd_block::Block; // use unicode_blocks::UnicodeBlockId; -use crate::{CharRange, CharCollection, MultiCharRange}; +use crate::{CharCollection, CharRange, MultiCharRange}; macro_rules! impl_for_range_inclusive_int_type { ($($t:ty),*) => {$( impl MultiCharRange for RangeInclusive<$t> { @@ -80,25 +80,34 @@ impl From<&T> for CharCollection { } #[cfg(test)] mod multi_char_range_tests { - use crate::{MultiCharRange, CharRange}; + use crate::{CharRange, MultiCharRange}; use paste; // use unic_char_range::{chars, CharRange}; #[test] fn test_char() { let source = 'a'; - assert_eq!(source.iter_ranges().collect::>(), vec![chars!('a'..='a')]); + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('a'..='a')] + ); assert_eq!(source.range_count(), 1); } #[test] fn test_char_range() { let source = chars!('d'..='g'); - assert_eq!(source.iter_ranges().collect::>(), vec![chars!('d'..='g')]); + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('d'..='g')] + ); assert_eq!(source.range_count(), 1); } #[test] fn test_range_inclusive_char() { let source = 'd'..='g'; - assert_eq!(source.iter_ranges().collect::>(), vec![chars!('d'..='g')]); + assert_eq!( + source.iter_ranges().collect::>(), + vec![chars!('d'..='g')] + ); assert_eq!(source.range_count(), 1); } macro_rules! test_range_inclusive_int { @@ -153,4 +162,4 @@ mod from_tests { // let actual: CharCollection = (&UnicodeBlockId::BasicLatin).into(); // assert_eq!(actual, char_collect!('\u{0000}'..='\u{007f}')); // } -} \ No newline at end of file +} diff --git a/components/char_collection/src/lib.rs b/components/char_collection/src/lib.rs index d6d4a1096f5..f135d871fe0 100644 --- a/components/char_collection/src/lib.rs +++ b/components/char_collection/src/lib.rs @@ -7,8 +7,8 @@ mod char_collection; mod conversions; mod operators; pub use char_collection::CharCollection; -pub use char_collection::MultiCharRange; -pub use char_collection::CharRange; pub use char_collection::CharIter; +pub use char_collection::CharRange; +pub use char_collection::MultiCharRange; pub use conversions::*; -pub use operators::*; \ No newline at end of file +pub use operators::*; diff --git a/components/char_collection/src/macros.rs b/components/char_collection/src/macros.rs index 3830b9d3f66..49569781f50 100644 --- a/components/char_collection/src/macros.rs +++ b/components/char_collection/src/macros.rs @@ -36,5 +36,5 @@ macro_rules! chars { }; (..) => { $crate::CharRange::all() - } -} \ No newline at end of file + }; +} diff --git a/components/char_collection/src/operators.rs b/components/char_collection/src/operators.rs index 1beb4656073..cec8cad4e3d 100644 --- a/components/char_collection/src/operators.rs +++ b/components/char_collection/src/operators.rs @@ -59,4 +59,4 @@ impl ops::Not for CharCollection { fn not(self) -> Self::Output { self.complement() } -} \ No newline at end of file +} From a1c2d6afcbb6a86e866ef253585891bf43210182 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 23 Jun 2020 06:19:05 +0000 Subject: [PATCH 08/30] L1 initial completion, unit tests not complete --- components/char_collection/src/lib.rs | 2 + components/char_collection/src/uniset.rs | 182 +++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 components/char_collection/src/uniset.rs diff --git a/components/char_collection/src/lib.rs b/components/char_collection/src/lib.rs index f135d871fe0..54eacbe847e 100644 --- a/components/char_collection/src/lib.rs +++ b/components/char_collection/src/lib.rs @@ -6,9 +6,11 @@ mod macros; mod char_collection; mod conversions; mod operators; +mod uniset; pub use char_collection::CharCollection; pub use char_collection::CharIter; pub use char_collection::CharRange; pub use char_collection::MultiCharRange; pub use conversions::*; pub use operators::*; +pub use uniset::UnicodeSet; diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs new file mode 100644 index 00000000000..8ebfda757eb --- /dev/null +++ b/components/char_collection/src/uniset.rs @@ -0,0 +1,182 @@ +use std::{ + boxed::Box, + char, + clone::Clone, + cmp::Ordering, + convert::From, // https://doc.rust-lang.org/std/convert/trait.From.html rust practice says do not use Into + error::Error, + hash::{Hash, Hasher}, + iter::Iterator, + num::ParseIntError, + ops::Range, + str::Split, + vec::Vec, +}; + +const UNICODESET_MAX: u32 = 0x110000; // does max imply inclusive? else should be 10FFFF +const UNICODESET_MIN: u32 = 0x000000; + +/// Given string representation of inversion list create set +/// Check if sorted during iteration +fn parse_serial_string(serialize_str: &str) -> Result, Box> { + // wondering how much this method catches in tests + // let split_serialize: Split<&str> = serialize.split(" "); + // let capacity: u8 = split_serialize.next().unwrap(). + let mut serialize = serialize_str.split(" "); + let capacity: usize = serialize.next().unwrap().parse()?; + if capacity % 2 != 0 { + return Err("Capacity must be even".into()); + } + let mut serialized_vec: Vec = Vec::with_capacity(capacity); + let mut prev: u32 = 0; + for str_ele in serialize { + // unsure if the capacity matters if we can expand, but that might be an issue if you expand into too much memory + // otherwise shrink_to_fit is possible + let parsed: u32 = str_ele.parse()?; + if serialized_vec.len() + 1 > serialized_vec.capacity() { + return Err("Serialization capacity is too small".into()); + } + if parsed < prev { + return Err("Serialization must be sorted".into()); + } + serialized_vec.push(parsed); + prev = parsed; + } + if serialized_vec.len() % 2 != 0 { + return Err("Serialization must be even".into()); + } + serialized_vec.shrink_to_fit(); // necessary if the length < capacity + Ok(serialized_vec) +} + +//#[derive(Copy, Clone, Debug, Eq)] +pub struct UnicodeSet { + // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature + // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html + // Allows for traits of fixed size arrays + set: Vec, // is set misleading? could be uset +} + +impl UnicodeSet { + pub fn new(serialize: &str) -> Result> { + match parse_serial_string(serialize) { + Ok(serialize) => Ok(UnicodeSet { set: serialize }), + Err(e) => Err(e), + } + } + + pub fn from_range(start: &u32, end: &u32) -> UnicodeSet { + UnicodeSet { + set: vec![*start, *end], + } + } + + pub fn all() -> UnicodeSet { + UnicodeSet { + set: vec![UNICODESET_MIN, UNICODESET_MAX], + } + } + + pub fn bmp() -> UnicodeSet { + UnicodeSet { + set: vec![UNICODESET_MIN, 0xFFFF], + } + } + + pub fn contains(&self, query: &u32) -> bool { + // need an enforcement of pattern + //Need to evaluate + // let mut low = 0; + // let mut high = self.set.len() - 1; + // if low >= high || query > self.set[high] || query < self.set[low]{ + // false + // } + // // [2, 5, 10, 12] => [2, 4], [10, 11] + // // [2, 5, 10] => [2, 4], [10] + // // [2, 5, 10, 10, 12] + // // [1, 1, 0] + // // 5, 9 + // let mut pos: i8 = -1; + // while low <= high { + // let middle = (low + high) >> 1; + // let check = self.set[middle]; + // if middle == low { + // pos = middle; + // break + // } + // if check < query { + // low = middle + 1; + // } + // else { + // high = middle - 1; + // } + // } + // if pos == -1 { + // pos = middle + 1; + // } + // [2, 5, 10, 15] + match self.set.binary_search(query) { + // relies on having even # elements + Ok(pos) => { + if pos % 2 == 0 { + true + } else { + if pos > 0 && &self.set[pos - 1] == query { + true + } else { + false + } + } + } + Err(pos) => { + if pos % 2 == 0 { + false + } else { + if pos >= self.set.len() - 1 { + false + } else { + true + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use { + super::{parse_serial_string, UnicodeSet}, + std::num::ParseIntError, + }; + // parse_serial_string + #[test] + fn test_parse_serial_string() { + let expected = vec![2, 3, 4, 5]; + let actual = parse_serial_string("4 2 3 4 5").unwrap(); + assert_eq!(actual, expected); + } + #[test] + fn test_parse_serial_string_no_char() { + assert!(parse_serial_string("4 2 A 3 4 5").is_err()); + } + #[test] + fn test_parse_serial_string_empty() { + assert!(parse_serial_string("").is_err()); + } + #[test] + fn test_parse_serial_string_wrong_format() { + assert!(parse_serial_string("[4, 2, 3, 4, 5 ]").is_err()); + } + #[test] + fn test_parse_serial_string_capacity_not_even() { + assert!(parse_serial_string("3 2 3 4").is_err()); + } + #[test] + fn test_parse_serial_string_size_not_even() { + assert!(parse_serial_string("4 3 2 1").is_err()); + } + + // UnicodeSet constructors +} +// impl From Date: Tue, 23 Jun 2020 16:51:57 +0000 Subject: [PATCH 09/30] UnicodeSet tests --- components/char_collection/src/uniset.rs | 119 +++++++++++++++-------- 1 file changed, 77 insertions(+), 42 deletions(-) diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index 8ebfda757eb..217d6900daa 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -13,11 +13,17 @@ use std::{ vec::Vec, }; -const UNICODESET_MAX: u32 = 0x110000; // does max imply inclusive? else should be 10FFFF +const UNICODESET_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF const UNICODESET_MIN: u32 = 0x000000; - +const BMP_MAX: u32 = 0xFFFF; /// Given string representation of inversion list create set -/// Check if sorted during iteration +/// +/// Requires starting capacity integer, followed by space delimited integer code points. +/// There must be an even number of elements (not including the capacity int), and must be +/// in ascending sorted order. +/// +/// Example String: `4 0 5 10 15` designates a capacity of size 4, followed by 2 ranges +/// The ranges are {0, 4} and {10, 14} inclusive fn parse_serial_string(serialize_str: &str) -> Result, Box> { // wondering how much this method catches in tests // let split_serialize: Split<&str> = serialize.split(" "); @@ -49,6 +55,10 @@ fn parse_serial_string(serialize_str: &str) -> Result, Box> Ok(serialized_vec) } +/// UnicodeSet membership wrapper +/// +/// Provides exposure to membership functions and constructors from serialized UnicodeSets +/// and predefined ranges. //#[derive(Copy, Clone, Debug, Eq)] pub struct UnicodeSet { // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature @@ -65,58 +75,32 @@ impl UnicodeSet { } } - pub fn from_range(start: &u32, end: &u32) -> UnicodeSet { - UnicodeSet { - set: vec![*start, *end], + pub fn from_range(start: &u32, end: &u32) -> Result> { + if start > end { + return Err("Range is out of order".into()) + } + if start < &UNICODESET_MIN || end > &UNICODESET_MAX { + return Err("Range is out of bounds".into()) } + Ok(UnicodeSet { + set: vec![*start, *end], + }) } pub fn all() -> UnicodeSet { UnicodeSet { - set: vec![UNICODESET_MIN, UNICODESET_MAX], + set: vec![UNICODESET_MIN, UNICODESET_MAX + 1], } } pub fn bmp() -> UnicodeSet { UnicodeSet { - set: vec![UNICODESET_MIN, 0xFFFF], + set: vec![UNICODESET_MIN, BMP_MAX + 1], } } pub fn contains(&self, query: &u32) -> bool { - // need an enforcement of pattern - //Need to evaluate - // let mut low = 0; - // let mut high = self.set.len() - 1; - // if low >= high || query > self.set[high] || query < self.set[low]{ - // false - // } - // // [2, 5, 10, 12] => [2, 4], [10, 11] - // // [2, 5, 10] => [2, 4], [10] - // // [2, 5, 10, 10, 12] - // // [1, 1, 0] - // // 5, 9 - // let mut pos: i8 = -1; - // while low <= high { - // let middle = (low + high) >> 1; - // let check = self.set[middle]; - // if middle == low { - // pos = middle; - // break - // } - // if check < query { - // low = middle + 1; - // } - // else { - // high = middle - 1; - // } - // } - // if pos == -1 { - // pos = middle + 1; - // } - // [2, 5, 10, 15] match self.set.binary_search(query) { - // relies on having even # elements Ok(pos) => { if pos % 2 == 0 { true @@ -132,7 +116,7 @@ impl UnicodeSet { if pos % 2 == 0 { false } else { - if pos >= self.set.len() - 1 { + if pos >= self.set.len() { false } else { true @@ -146,7 +130,7 @@ impl UnicodeSet { #[cfg(test)] mod tests { use { - super::{parse_serial_string, UnicodeSet}, + super::{parse_serial_string, UnicodeSet, UNICODESET_MIN, UNICODESET_MAX, BMP_MAX}, std::num::ParseIntError, }; // parse_serial_string @@ -178,5 +162,56 @@ mod tests { } // UnicodeSet constructors + #[test] + fn test_unicodeset_new() { + let expected = vec![2, 3, 4, 5]; + let actual = UnicodeSet::new("4 2 3 4 5").unwrap().set; + assert_eq!(actual, expected); + } + #[test] + fn test_unicodeset_new_error() { + assert!(UnicodeSet::new("3 2 4 3").is_err()); + } + #[test] + fn test_unicodeset_from_range() { + let expected = vec![4, 10]; + let actual = UnicodeSet::from_range(&4, &10).unwrap().set; + assert_eq!(actual, expected); + } + #[test] + fn test_unicodeset_from_range_bad_order() { + assert!(UnicodeSet::from_range(&10, &5).is_err()); + } + #[test] + fn test_unicodeset_from_range_out_of_bounds() { + assert!(UnicodeSet::from_range(&0, &0x110000).is_err()); + } + #[test] + fn test_unicodeset_all() { + let expected = vec![UNICODESET_MIN, UNICODESET_MAX + 1]; + assert_eq!(UnicodeSet::all().set, expected); + } + #[test] + fn test_unicodeset_bmp() { + let expected = vec![UNICODESET_MIN, BMP_MAX + 1]; + assert_eq!(UnicodeSet::bmp().set, expected); + } + #[test] + fn test_unicodeset_contains() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert!(check.contains(&2)); + assert!(check.contains(&4)); + assert!(check.contains(&10)); + assert!(check.contains(&14)); + } + #[test] + fn test_unicodeset_contains_false() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert!(!check.contains(&1)); + assert!(!check.contains(&5)); + assert!(!check.contains(&9)); + assert!(!check.contains(&15)); + assert!(!check.contains(&16)); + } } // impl From Date: Tue, 23 Jun 2020 17:04:20 +0000 Subject: [PATCH 10/30] Complete contains test and docs --- components/char_collection/src/uniset.rs | 27 ++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index 217d6900daa..7e066829c21 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -16,6 +16,7 @@ use std::{ const UNICODESET_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF const UNICODESET_MIN: u32 = 0x000000; const BMP_MAX: u32 = 0xFFFF; + /// Given string representation of inversion list create set /// /// Requires starting capacity integer, followed by space delimited integer code points. @@ -68,6 +69,15 @@ pub struct UnicodeSet { } impl UnicodeSet { + /// Returns Result of UnicodeSet from serialized string + /// + /// Returns an error if the serialized string fails to parse. + /// The serialized string requires starting capacity integer, followed by space delimited + /// integer code points. There must be an even number of elements (not including the + /// capacity int), and must be in ascending sorted order. + /// + /// Example String: `"4 0 5 10 15"` designates a capacity of size `4`, followed by 2 ranges + /// The ranges are `{0, 4}` and `{10, 14}` inclusive pub fn new(serialize: &str) -> Result> { match parse_serial_string(serialize) { Ok(serialize) => Ok(UnicodeSet { set: serialize }), @@ -75,6 +85,13 @@ impl UnicodeSet { } } + /// Returns Result of UnicodeSet from a single pair of integers defining a range + /// + /// `start`: inclusive, `end`: exclusive + /// + /// Returns an error if the range is invalid (out of order and out of bounds). + /// + /// Example Call: `UnicodeSet::from_range(&0, &15)` pub fn from_range(start: &u32, end: &u32) -> Result> { if start > end { return Err("Range is out of order".into()) @@ -87,18 +104,28 @@ impl UnicodeSet { }) } + /// Returns UnicodeSet spanning entire Unicode range + /// + /// The range spans from `0x0 -> 0x10FFFF` inclusive pub fn all() -> UnicodeSet { UnicodeSet { set: vec![UNICODESET_MIN, UNICODESET_MAX + 1], } } + /// Returns UnicodeSet spanning BMP range + /// + /// The range spans from `0x0 -> 0xFFFF` inclusive pub fn bmp() -> UnicodeSet { UnicodeSet { set: vec![UNICODESET_MIN, BMP_MAX + 1], } } + /// Checks to see the query is in the UnicodeSet + /// + /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points + /// on the set using `std::vec::Vec` implementation pub fn contains(&self, query: &u32) -> bool { match self.set.binary_search(query) { Ok(pos) => { From 0a55933cf3680c318533ffcf04de7ebb8beb26a7 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 23 Jun 2020 17:04:39 +0000 Subject: [PATCH 11/30] formatting --- components/char_collection/src/uniset.rs | 30 ++++++++++++------------ 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index 7e066829c21..0c85d26daea 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -69,11 +69,11 @@ pub struct UnicodeSet { } impl UnicodeSet { - /// Returns Result of UnicodeSet from serialized string + /// Returns Result of UnicodeSet from serialized string /// - /// Returns an error if the serialized string fails to parse. + /// Returns an error if the serialized string fails to parse. /// The serialized string requires starting capacity integer, followed by space delimited - /// integer code points. There must be an even number of elements (not including the + /// integer code points. There must be an even number of elements (not including the /// capacity int), and must be in ascending sorted order. /// /// Example String: `"4 0 5 10 15"` designates a capacity of size `4`, followed by 2 ranges @@ -86,26 +86,26 @@ impl UnicodeSet { } /// Returns Result of UnicodeSet from a single pair of integers defining a range - /// + /// /// `start`: inclusive, `end`: exclusive - /// + /// /// Returns an error if the range is invalid (out of order and out of bounds). - /// + /// /// Example Call: `UnicodeSet::from_range(&0, &15)` pub fn from_range(start: &u32, end: &u32) -> Result> { if start > end { - return Err("Range is out of order".into()) + return Err("Range is out of order".into()); } if start < &UNICODESET_MIN || end > &UNICODESET_MAX { - return Err("Range is out of bounds".into()) + return Err("Range is out of bounds".into()); } Ok(UnicodeSet { set: vec![*start, *end], }) } - /// Returns UnicodeSet spanning entire Unicode range - /// + /// Returns UnicodeSet spanning entire Unicode range + /// /// The range spans from `0x0 -> 0x10FFFF` inclusive pub fn all() -> UnicodeSet { UnicodeSet { @@ -113,8 +113,8 @@ impl UnicodeSet { } } - /// Returns UnicodeSet spanning BMP range - /// + /// Returns UnicodeSet spanning BMP range + /// /// The range spans from `0x0 -> 0xFFFF` inclusive pub fn bmp() -> UnicodeSet { UnicodeSet { @@ -123,9 +123,9 @@ impl UnicodeSet { } /// Checks to see the query is in the UnicodeSet - /// + /// /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// on the set using `std::vec::Vec` implementation + /// on the set using `std::vec::Vec` implementation pub fn contains(&self, query: &u32) -> bool { match self.set.binary_search(query) { Ok(pos) => { @@ -157,7 +157,7 @@ impl UnicodeSet { #[cfg(test)] mod tests { use { - super::{parse_serial_string, UnicodeSet, UNICODESET_MIN, UNICODESET_MAX, BMP_MAX}, + super::{parse_serial_string, UnicodeSet, BMP_MAX, UNICODESET_MAX, UNICODESET_MIN}, std::num::ParseIntError, }; // parse_serial_string From 4a04a96f3c4479593792c1e291f7346462ef41e1 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 23 Jun 2020 18:14:54 +0000 Subject: [PATCH 12/30] added is_empty() and size() --- components/char_collection/src/uniset.rs | 71 +++++++++++++++++------- 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index 0c85d26daea..d399e34f00e 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -1,17 +1,4 @@ -use std::{ - boxed::Box, - char, - clone::Clone, - cmp::Ordering, - convert::From, // https://doc.rust-lang.org/std/convert/trait.From.html rust practice says do not use Into - error::Error, - hash::{Hash, Hasher}, - iter::Iterator, - num::ParseIntError, - ops::Range, - str::Split, - vec::Vec, -}; +use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, vec::Vec}; const UNICODESET_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF const UNICODESET_MIN: u32 = 0x000000; @@ -43,7 +30,7 @@ fn parse_serial_string(serialize_str: &str) -> Result, Box> if serialized_vec.len() + 1 > serialized_vec.capacity() { return Err("Serialization capacity is too small".into()); } - if parsed < prev { + if parsed <= prev { return Err("Serialization must be sorted".into()); } serialized_vec.push(parsed); @@ -121,6 +108,25 @@ impl UnicodeSet { set: vec![UNICODESET_MIN, BMP_MAX + 1], } } + /// Returns an `Iter` of start and stop `u32` points of the UnicodeSet + pub fn iter(&self) -> Iter { + self.set.iter() + } + + /// Returns the cardinality of the UnicodeSet + pub fn size(&self) -> Result> { + if self.set.len() < 2 { + return Err("UnicodeSet length < 2".into()); + } + let end: u32 = self.iter().skip(1).step_by(2).sum::(); + let start: u32 = self.iter().step_by(2).sum::(); + Ok((end - start) as usize) + } + + /// Returns whether or not the UnicodeSet is empty + pub fn is_empty(&self) -> bool { + self.set.len() < 2 // unsure if this is appropriate definition of just self.set.is_empty() + } /// Checks to see the query is in the UnicodeSet /// @@ -156,10 +162,7 @@ impl UnicodeSet { #[cfg(test)] mod tests { - use { - super::{parse_serial_string, UnicodeSet, BMP_MAX, UNICODESET_MAX, UNICODESET_MIN}, - std::num::ParseIntError, - }; + use super::{parse_serial_string, UnicodeSet, BMP_MAX, UNICODESET_MAX, UNICODESET_MIN}; // parse_serial_string #[test] fn test_parse_serial_string() { @@ -180,6 +183,14 @@ mod tests { assert!(parse_serial_string("[4, 2, 3, 4, 5 ]").is_err()); } #[test] + fn test_parse_serial_string_wrong_order() { + assert!(parse_serial_string("4 1 0 4 2").is_err()); + } + #[test] + fn test_parse_serial_string_single_char_error() { + assert!(parse_serial_string("4 1 1 2 2").is_err()); + } + #[test] fn test_parse_serial_string_capacity_not_even() { assert!(parse_serial_string("3 2 3 4").is_err()); } @@ -240,5 +251,27 @@ mod tests { assert!(!check.contains(&15)); assert!(!check.contains(&16)); } + #[test] + fn test_unicodeset_size() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert_eq!(8, check.size().unwrap()); + let check = UnicodeSet::all(); + let expected = UNICODESET_MAX + 1 - UNICODESET_MIN; + assert_eq!(expected as usize, check.size().unwrap()); + } + #[test] + fn test_unicodeset_size_error() { + let check = UnicodeSet { set: vec![0] }; + assert!(check.size().is_err()); + } + #[test] + fn test_unicodeset_is_empty() { + let check = UnicodeSet { set: vec![] }; + assert!(check.is_empty()); + let check = UnicodeSet { set: vec![0] }; + assert!(check.is_empty()); + let check = UnicodeSet::all(); + assert!(!check.is_empty()); + } } // impl From Date: Tue, 23 Jun 2020 18:24:24 +0000 Subject: [PATCH 13/30] proposed changes --- components/char_collection/Cargo.toml | 1 + components/char_collection/src/conversions.rs | 48 ++----------------- components/char_collection/src/uniset.rs | 1 - 3 files changed, 4 insertions(+), 46 deletions(-) diff --git a/components/char_collection/Cargo.toml b/components/char_collection/Cargo.toml index 8ed0275f2a2..b649bc92db3 100644 --- a/components/char_collection/Cargo.toml +++ b/components/char_collection/Cargo.toml @@ -7,6 +7,7 @@ edition = "2018" readme = "README.md" repository = "https://github.com/unicode-org/icu4x" license = "MIT/Apache-2.0" +license-file = "LICENSE" categories = ["internationalization"] include = [ "src/**/*", diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs index 02e3c223ace..7da45e5467d 100644 --- a/components/char_collection/src/conversions.rs +++ b/components/char_collection/src/conversions.rs @@ -2,14 +2,12 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. //! Conversion (`From`) implementations for [CharCollection], via [MultiCharRange]. +use crate::{CharCollection, CharRange, MultiCharRange}; use std::boxed::Box; use std::convert::TryFrom; use std::iter; use std::ops::RangeInclusive; -// use unic_char_range::CharRange; -// use unic_ucd_block::Block; -// use unicode_blocks::UnicodeBlockId; -use crate::{CharCollection, CharRange, MultiCharRange}; + macro_rules! impl_for_range_inclusive_int_type { ($($t:ty),*) => {$( impl MultiCharRange for RangeInclusive<$t> { @@ -55,22 +53,7 @@ impl MultiCharRange for RangeInclusive { } } impl_for_range_inclusive_int_type!(u8, i8, u32, i32); -// impl MultiCharRange for UnicodeBlockId { -// fn iter_ranges(&self) -> Box> { -// self.block().iter_ranges() -// } -// fn range_count(&self) -> usize { -// 1 -// } -// } -// impl MultiCharRange for Block { -// fn iter_ranges<'a>(&'a self) -> Box + 'a> { -// Box::new(self.range.iter_ranges()) -// } -// fn range_count(&self) -> usize { -// 1 -// } -// } + impl From<&T> for CharCollection { fn from(source: &T) -> Self { let mut collection = CharCollection::new(); @@ -82,7 +65,6 @@ impl From<&T> for CharCollection { mod multi_char_range_tests { use crate::{CharRange, MultiCharRange}; use paste; - // use unic_char_range::{chars, CharRange}; #[test] fn test_char() { let source = 'a'; @@ -129,37 +111,13 @@ mod multi_char_range_tests { test_range_inclusive_int!(i8); test_range_inclusive_int!(u32); test_range_inclusive_int!(i32); - // #[test] - // fn test_unicode_block_id() { - // let source = unicode_blocks::UnicodeBlockId::BasicLatin; - // assert_eq!( - // source.iter_ranges().collect::>(), - // vec![chars!('\u{0000}'..='\u{007f}')] - // ); - // assert_eq!(source.range_count(), 1); - // } - // #[test] - // fn test_unicode_block() { - // let source = unicode_blocks::UnicodeBlockId::BasicLatin.block(); - // assert_eq!( - // source.iter_ranges().collect::>(), - // vec![chars!('\u{0000}'..='\u{007f}')] - // ); - // assert_eq!(source.range_count(), 1); - // } } #[cfg(test)] mod from_tests { use crate::CharCollection; - // use unicode_blocks::UnicodeBlockId; #[test] fn test_char() { let actual: CharCollection = (&'a').into(); assert_eq!(actual, char_collect!('a'..='a')); } - // #[test] - // fn test_unicode_block_id() { - // let actual: CharCollection = (&UnicodeBlockId::BasicLatin).into(); - // assert_eq!(actual, char_collect!('\u{0000}'..='\u{007f}')); - // } } diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index d399e34f00e..5604c45043e 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -274,4 +274,3 @@ mod tests { assert!(!check.is_empty()); } } -// impl From Date: Tue, 23 Jun 2020 20:13:03 +0000 Subject: [PATCH 14/30] Closure for contains and docs --- components/char_collection/Cargo.toml | 3 +- components/char_collection/src/uniset.rs | 111 ++++++++++++++++------- 2 files changed, 79 insertions(+), 35 deletions(-) diff --git a/components/char_collection/Cargo.toml b/components/char_collection/Cargo.toml index b649bc92db3..1dd07f3dc60 100644 --- a/components/char_collection/Cargo.toml +++ b/components/char_collection/Cargo.toml @@ -6,8 +6,7 @@ authors = ["The ICU4X Project Developers"] edition = "2018" readme = "README.md" repository = "https://github.com/unicode-org/icu4x" -license = "MIT/Apache-2.0" -license-file = "LICENSE" +license-file = "../../LICENSE" categories = ["internationalization"] include = [ "src/**/*", diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs index 5604c45043e..950cb2c74cc 100644 --- a/components/char_collection/src/uniset.rs +++ b/components/char_collection/src/uniset.rs @@ -13,16 +13,13 @@ const BMP_MAX: u32 = 0xFFFF; /// Example String: `4 0 5 10 15` designates a capacity of size 4, followed by 2 ranges /// The ranges are {0, 4} and {10, 14} inclusive fn parse_serial_string(serialize_str: &str) -> Result, Box> { - // wondering how much this method catches in tests - // let split_serialize: Split<&str> = serialize.split(" "); - // let capacity: u8 = split_serialize.next().unwrap(). let mut serialize = serialize_str.split(" "); let capacity: usize = serialize.next().unwrap().parse()?; if capacity % 2 != 0 { return Err("Capacity must be even".into()); } let mut serialized_vec: Vec = Vec::with_capacity(capacity); - let mut prev: u32 = 0; + let mut prev: Option = None; for str_ele in serialize { // unsure if the capacity matters if we can expand, but that might be an issue if you expand into too much memory // otherwise shrink_to_fit is possible @@ -30,11 +27,11 @@ fn parse_serial_string(serialize_str: &str) -> Result, Box> if serialized_vec.len() + 1 > serialized_vec.capacity() { return Err("Serialization capacity is too small".into()); } - if parsed <= prev { + if Some(parsed) <= prev { return Err("Serialization must be sorted".into()); } serialized_vec.push(parsed); - prev = parsed; + prev = Some(parsed); } if serialized_vec.len() % 2 != 0 { return Err("Serialization must be even".into()); @@ -128,36 +125,62 @@ impl UnicodeSet { self.set.len() < 2 // unsure if this is appropriate definition of just self.set.is_empty() } - /// Checks to see the query is in the UnicodeSet - /// - /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// on the set using `std::vec::Vec` implementation - pub fn contains(&self, query: &u32) -> bool { + /// Wrapper for contains conditions closures + fn contains(&self, query: &u32, condition: C) -> bool + where + C: Fn(usize) -> bool, + { match self.set.binary_search(query) { Ok(pos) => { if pos % 2 == 0 { - true + return condition(pos); } else { - if pos > 0 && &self.set[pos - 1] == query { - true - } else { - false - } + false } } Err(pos) => { - if pos % 2 == 0 { - false + if pos % 2 != 0 && pos < self.set.len() { + return condition(pos); } else { - if pos >= self.set.len() { - false - } else { - true - } + false } } } } + + /// Checks to see the query is in the UnicodeSet + /// + /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points + /// in the set using `std::vec::Vec` implementation + /// + /// Example: `contains_point(&10)` + pub fn contains_point(&self, query: &u32) -> bool { + let condition_closure = |_: usize| -> bool { true }; + self.contains(query, condition_closure) + } + + /// Checks to see if the range is in the UnicodeSet, returns a Result + /// + /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points + /// in the set using `std::vec::Vec` implementation + /// + /// Only runs the search once on the `start` parameter, while the `end` parameter is checked + /// in a single `O(1)` step + /// + /// Example: `contains_range(&0, &10)` + pub fn contains_range(&self, start: &u32, end: &u32) -> Result> { + if start >= end { + return Err("Range cannot be out of order".into()); + } + let condition_closure = |pos: usize| -> bool { + if end < &self.set[pos + 1] { + true + } else { + false + } + }; + return Ok(self.contains(start, condition_closure)); + } } #[cfg(test)] @@ -237,19 +260,41 @@ mod tests { #[test] fn test_unicodeset_contains() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(check.contains(&2)); - assert!(check.contains(&4)); - assert!(check.contains(&10)); - assert!(check.contains(&14)); + assert!(check.contains_point(&2)); + assert!(check.contains_point(&4)); + assert!(check.contains_point(&10)); + assert!(check.contains_point(&14)); } #[test] fn test_unicodeset_contains_false() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(!check.contains(&1)); - assert!(!check.contains(&5)); - assert!(!check.contains(&9)); - assert!(!check.contains(&15)); - assert!(!check.contains(&16)); + assert!(!check.contains_point(&1)); + assert!(!check.contains_point(&5)); + assert!(!check.contains_point(&9)); + assert!(!check.contains_point(&15)); + assert!(!check.contains_point(&16)); + } + #[test] + fn test_unicodeset_contains_range() { + let check = UnicodeSet::new("4 0 10 15 25").unwrap(); + assert!(check.contains_range(&2, &5).unwrap()); + assert!(check.contains_range(&0, &9).unwrap()); + assert!(check.contains_range(&15, &24).unwrap()); + } + #[test] + fn test_unicodeset_contains_range_false() { + let check = UnicodeSet::new("4 0 10 15 25").unwrap(); + assert!(!check.contains_range(&0, &10).unwrap()); + assert!(!check.contains_range(&15, &25).unwrap()); + assert!(!check.contains_range(&0, &16).unwrap()); + assert!(!check.contains_range(&10, &15).unwrap()); + assert!(!check.contains_range(&11, &14).unwrap()); + } + #[test] + fn test_unicodeset_contains_range_invalid() { + let check = UnicodeSet::all(); + assert!(check.contains_range(&10, &0).is_err()); + assert!(check.contains_range(&0, &0).is_err()); } #[test] fn test_unicodeset_size() { From 8e2a34aa189c57b1901da83aae0e9786698f9128 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 23 Jun 2020 22:57:41 +0000 Subject: [PATCH 15/30] Removed unnecessary files and formatting changes --- Cargo.toml | 2 +- components/char_collection/Cargo.toml | 18 - components/char_collection/README.md | 12 - .../meta/char_collection_lib_test.cmx | 5 - .../char_collection/src/char_collection.rs | 828 ------------------ components/char_collection/src/conversions.rs | 123 --- components/char_collection/src/lib.rs | 16 - components/char_collection/src/macros.rs | 40 - components/char_collection/src/operators.rs | 62 -- components/char_collection/src/uniset.rs | 321 ------- 10 files changed, 1 insertion(+), 1426 deletions(-) delete mode 100644 components/char_collection/Cargo.toml delete mode 100644 components/char_collection/README.md delete mode 100644 components/char_collection/meta/char_collection_lib_test.cmx delete mode 100644 components/char_collection/src/char_collection.rs delete mode 100644 components/char_collection/src/conversions.rs delete mode 100644 components/char_collection/src/lib.rs delete mode 100644 components/char_collection/src/macros.rs delete mode 100644 components/char_collection/src/operators.rs delete mode 100644 components/char_collection/src/uniset.rs diff --git a/Cargo.toml b/Cargo.toml index 1752ce7e4bd..d679ba2c20e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,6 @@ members = [ "components/icu", "components/icu4x", - "components/char_collection", + "components/uniset", "components/locale", ] diff --git a/components/char_collection/Cargo.toml b/components/char_collection/Cargo.toml deleted file mode 100644 index 1dd07f3dc60..00000000000 --- a/components/char_collection/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "icu-char-collection" -description = "API for managing Unicode Language and Locale Identifiers" -version = "0.0.1" -authors = ["The ICU4X Project Developers"] -edition = "2018" -readme = "README.md" -repository = "https://github.com/unicode-org/icu4x" -license-file = "../../LICENSE" -categories = ["internationalization"] -include = [ - "src/**/*", - "Cargo.toml", -] - -[dependencies] -unic-char-range = "0.9.0" -paste = "0.1.16" diff --git a/components/char_collection/README.md b/components/char_collection/README.md deleted file mode 100644 index 9cb580caa61..00000000000 --- a/components/char_collection/README.md +++ /dev/null @@ -1,12 +0,0 @@ -# ICU4X - -ICU4X is a set of internationalization components for Unicode. - -# Status [![crates.io](http://meritbadge.herokuapp.com/icu4x)](https://crates.io/crates/icu4x) - -The project is in an incubation period. - -# Authors - -The project is managed by a subcommittee of ICU-TC in the Unicode Consortium focused on providing solutions for client-side internationalization. - diff --git a/components/char_collection/meta/char_collection_lib_test.cmx b/components/char_collection/meta/char_collection_lib_test.cmx deleted file mode 100644 index 3bb56a96aa2..00000000000 --- a/components/char_collection/meta/char_collection_lib_test.cmx +++ /dev/null @@ -1,5 +0,0 @@ -{ - "program": { - "binary": "test/char_collection_lib_test" - } -} \ No newline at end of file diff --git a/components/char_collection/src/char_collection.rs b/components/char_collection/src/char_collection.rs deleted file mode 100644 index 639e955e7c1..00000000000 --- a/components/char_collection/src/char_collection.rs +++ /dev/null @@ -1,828 +0,0 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -use std::{ - boxed::Box, - char, - clone::Clone, - cmp::Ordering, - convert::Into, - error::Error, - hash::{Hash, Hasher}, - iter::Iterator, - ops::Range, - vec::Vec, -}; - -#[derive(Copy, Clone, Debug, Eq)] -pub struct CharRange { - low: char, - high: char, -} - -impl CharRange { - // open_right - // would we want this to return a Option next time? - pub fn open_right(low: char, high: char) -> CharRange { - // nothing happens if this fails - let high: char = char::from_u32(high as u32 - 1).unwrap(); - CharRange { low, high } - } - // closed - pub fn closed(low: char, high: char) -> CharRange { - // if low == '\u{0}' { // need way to handle this - // // for now just leave alone - // } - CharRange { low, high } - } - // open - pub fn open(low: char, high: char) -> CharRange { - // this is repeated here - let low: char = char::from_u32(low as u32 + 1).unwrap(); - let high: char = char::from_u32(high as u32 - 1).unwrap(); - CharRange { low, high } - } - // open_left - pub fn open_left(low: char, high: char) -> CharRange { - // this is repeated here - let low: char = char::from_u32(low as u32 + 1).unwrap(); - CharRange { low, high } - } - // all - pub fn all() -> CharRange { - CharRange { - low: '\u{0}', - high: char::MAX, - } - } - // cmp_char - pub fn cmp_char(&self, comp_char: char) -> Ordering { - if self.high < comp_char { - Ordering::Less - } else if self.low > comp_char { - Ordering::Greater - } else { - Ordering::Equal - } - } - // contains - pub fn contains(&self, ch: char) -> bool { - self.low <= ch && ch <= self.high - } - // is_empty - pub fn is_empty(&self) -> bool { - self.low > self.high - } - pub fn iter(&self) -> CharIter { - (*self).into() - } -} - -impl IntoIterator for CharRange { - type IntoIter = CharIter; - type Item = char; - fn into_iter(self) -> CharIter { - self.iter() - } -} - -impl PartialEq for CharRange { - fn eq(&self, other: &CharRange) -> bool { - (self.is_empty() && other.is_empty()) || (self.low == other.low && self.high == other.high) - } -} - -#[derive(Clone, Debug)] -pub struct CharIter { - low: char, - high: char, -} - -impl From for CharIter { - fn from(range: CharRange) -> CharIter { - CharIter { - low: range.low, - high: range.high, - } - } -} - -impl From for CharRange { - fn from(iter: CharIter) -> CharRange { - CharRange { - low: iter.low, - high: iter.high, - } - } -} - -impl CharIter { - fn advance(&mut self) { - if self.low == char::MAX { - self.high = '\0'; - } else { - self.low = char::from_u32(self.low as u32 + 1).unwrap(); - } - } - fn retreat(&mut self) { - if self.high == '\0' { - self.low = char::MAX; - } else { - self.high = char::from_u32(self.high as u32 - 1).unwrap(); - } - } - fn next_back(&mut self) -> Option { - if self.low > self.high { - None - } else { - let ch = self.high; - self.retreat(); - Some(ch) - } - } -} - -impl Iterator for CharIter { - type Item = char; - fn next(&mut self) -> Option { - if self.low > self.high { - return None; - } - let ch = self.low; - self.advance(); - Some(ch) - } -} - -/// A trait for objects that represent one or more disjoint, non-adjacent -/// [CharRanges](unic_char_range::CharRange). -pub trait MultiCharRange { - /// Iterate over the disjoint, non-adjacent [CharRange]s in the collection in ascending order. - fn iter_ranges<'a>(&'a self) -> Box + 'a>; - /// The number of ranges in the collection. - fn range_count(&self) -> usize; -} -/// A collection of `char`s (i.e. Unicode code points), used for storing large continuous ranges -/// efficiently. -/// -/// Lookups and insertions are O(log R), where R is the number of disjoint -/// ranges in the collection. -/// -/// The easiest way to create instances is using the -/// [char_collect!](::char_collection::char_collect) macro. -/// -/// -/// TODO(kpozin): Implement IntoIter. -#[derive(Clone, Debug, Eq, PartialEq, Default)] -pub struct CharCollection { - ranges: Vec, -} -impl CharCollection { - /// Create a new, empty `CharCollection`. - pub fn new() -> CharCollection { - CharCollection::default() - } - /// Create a new `CharCollection` from a list of disjoint, non-adjacent `CharRange`s, pre-sorted - /// in ascending code point order. - /// - /// This factory method is primarily intended for use in deserializing valid representations of - /// `CharCollections`. Will return an error if ranges are out of order, overlapping, or - /// adjacent. - pub fn from_sorted_ranges(ranges: T) -> Result> - where - T: IntoIterator, - { - // If the original `ranges` is also a Vec, this doesn't result in an extra copy. - let collection = CharCollection { - ranges: ranges.into_iter().collect(), - }; - let ranges: &Vec = &collection.ranges; - match (1..ranges.len()).find(|i| (ranges[*i].low as i64 - ranges[*i - 1].high as i64) <= 1) - { - Some(i) => Err(format!( - "These ranges are out of order, overlapping, or adjacent: {:?}, {:?}", - format_range(&ranges[i - 1]), - format_range(&ranges[i]) - ) - .into()), - None => Ok(collection), - } - } - /// Create a new `CharCollection` from a list of `char`s, pre-sorted in ascending code point - /// order. - /// - /// This factory method is primarily intended for use in deserializing valid representations of - /// `CharCollections`. Will return an error if chars are out of order or contain duplicates. - pub fn from_sorted_chars(chars: T) -> Result> - where - T: IntoIterator, - { - let mut collection = CharCollection::new(); - for ch in chars.into_iter() { - collection.append(ch)?; - } - Ok(collection) - } - /// Iterate over all the `char`s in the collection. - pub fn iter(&self) -> impl Iterator + '_ { - self.ranges.iter().flat_map(CharRange::iter) - } - /// Test whether the collection contains a specific `char`. - /// - /// The time complexity is O(log R), where R is the number of ranges in - /// the collection. - pub fn contains(&self, ch: &char) -> bool { - self.find_containing_range(ch).is_ok() - } - /// Test whether the collection contains an entire range of characters. - /// - /// The time complexity is O(log R), where R is the number of ranges in - /// the collection. - pub fn contains_range(&self, range: &CharRange) -> bool { - if range.is_empty() { - return false; - } - let lower_existing_range = self.find_containing_range(&range.low); - let upper_existing_range = self.find_containing_range(&range.high); - // Fully enclosed in existing range. - return lower_existing_range == upper_existing_range && lower_existing_range.is_ok(); - } - /// Insert a `char` or other collection of chars into this collection. - /// - /// Returns `&mut self` for easy chaining. - /// - /// The time complexity is O(T log(R + T)), where R - /// is the number of ranges in this collection and T is the number of ranges in - /// `to_add`. - pub fn insert(&mut self, to_add: &V) -> &mut Self { - to_add - .iter_ranges() - .for_each(|range| self.insert_char_range(&range)); - self - } - /// Appends a `char` to the end of the existing collection. Panics if the given `char` is not - /// higher than the highest code point in the existing collection. - /// - /// Returns `&mut self` for easy chaining. - /// - /// The time complexity is O(1). - pub fn append(&mut self, ch: char) -> Result<&mut Self, Box> { - let mut coalesced = false; - if let Some(last_range) = self.ranges.last_mut() { - if last_range.cmp_char(ch) != Ordering::Less { - return Err(format!("Cannot append {:?} after {:?}", ch, last_range.high).into()); - } - if are_chars_adjacent(&last_range.high, &ch) { - last_range.high = ch; - coalesced = true; - } - } - if !coalesced { - self.ranges.push(chars!(ch..=ch)); - } - Ok(self) - } - /// Appends a `CharRange` to the end of the existing collection. Panics if the given range is - /// not higher than the highest code point in the existing collection. (The new range _may_ be - /// adjacent to the previous highest range, but may not overlap.) - /// - /// Returns `&mut self` for easy chaining. - /// - /// The time complexity is O(1). - pub fn append_range(&mut self, range: CharRange) -> Result<&mut Self, Box> { - let mut coalesced = false; - if let Some(last_range) = self.ranges.last_mut() { - if last_range.cmp_char(range.low) != Ordering::Less { - return Err(format!( - "Cannot append {:?} after {:?}", - format_range(&range), - last_range.high - ) - .into()); - } - if are_chars_adjacent(&last_range.high, &range.low) { - last_range.high = range.high; - coalesced = true; - } - } - if !coalesced { - self.ranges.push(range); - } - Ok(self) - } - /// Remove a `char` or other collection of chars from this collection. - /// - /// Returns `&mut self` for easy chaining. - /// - /// The time complexity is O(T log(R + T)), where R - /// is the number of ranges in this collection and T is the number of ranges in - /// `to_remove`. - pub fn remove(&mut self, to_remove: &V) -> &mut Self { - to_remove - .iter_ranges() - .for_each(|range| self.remove_char_range(&range)); - self - } - /// Remove all entries from this collection. - /// - /// Returns `&mut self` for easy chaining. - pub fn clear(&mut self) -> &mut Self { - self.ranges.clear(); - self - } - /// Return the set union of this collection and another one. - /// - /// The time complexity is O(min(R, T) log(R + T)), - /// where R is the number of ranges in this collection and T is the number - /// of ranges in `rhs`. - pub fn union(&self, rhs: &V) -> CharCollection { - let mut result: CharCollection; - if self.range_count() > rhs.range_count() { - result = self.clone(); - result.insert(rhs); - } else { - result = rhs.into(); - result.insert(self); - } - result - } - /// Return the set intersection of this collection and another one. - /// - /// The time complexity is O(min(R, T) log(R + T)), - /// where R is the number of ranges in this collection and T is the number - /// of ranges in `rhs`. - pub fn intersection(&self, rhs: &V) -> CharCollection { - let mut result: CharCollection; - if self.range_count() > rhs.range_count() { - result = self.clone(); - let rhs: CharCollection = rhs.into(); - result.remove(&rhs.complement()); - } else { - result = rhs.into(); - result.remove(&self.complement()); - } - result - } - /// Return the (non-symmetric) set difference of this collection and another one. - /// - /// The time complexity is O(T log(R + T)), where R - /// is the number of ranges in this collection and T is the number of ranges in - /// `rhs`. - pub fn difference(&self, rhs: &V) -> CharCollection { - let mut result: CharCollection = self.clone(); - result.remove(rhs); - result - } - /// Return the set complement of this collection (over the universe of `char`s). - /// - /// The time complexity is O(R), where R is the number of ranges in this - /// collection. - pub fn complement(&self) -> CharCollection { - if self.ranges.is_empty() { - return CharCollection::from(&CharRange::all()); - } - let mut result_ranges: Vec = Vec::new(); - if self.ranges[0].low != '\u{0}' { - result_ranges.push(CharRange::open_right('\u{0}', self.ranges[0].low)); - } - let mut prev_high = self.ranges[0].high; - for range in &self.ranges[1..] { - result_ranges.push(CharRange::open(prev_high, range.low)); - prev_high = range.high; - } - if prev_high != std::char::MAX { - result_ranges.push(CharRange::open_left(prev_high, std::char::MAX)); - } - CharCollection { - ranges: result_ranges, - } - } - /// Insert a single `CharRange`. - /// - /// Depending on how the new range relates to existing ranges in - /// the collection, it might be subsumed by an existing range, modify the endpoints of an - /// existing range, or replace one or more existing ranges. - fn insert_char_range(&mut self, new_range: &CharRange) { - if new_range.is_empty() { - return; - } - let lower_existing_range = self.find_containing_range(&new_range.low); - let upper_existing_range = self.find_containing_range(&new_range.high); - // Fully enclosed in existing range. - if lower_existing_range == upper_existing_range && lower_existing_range.is_ok() { - return; - } - let new_low: char; - let new_high: char; - let remove_from_idx: usize; - let remove_to_idx: usize; - match lower_existing_range { - Ok((idx, lower_existing_range)) => { - new_low = lower_existing_range.low; - remove_from_idx = idx; - } - Err(idx) => { - new_low = new_range.low; - remove_from_idx = idx; - } - } - match upper_existing_range { - Ok((idx, higher_existing_range)) => { - new_high = higher_existing_range.high; - remove_to_idx = idx + 1; - } - Err(idx) => { - new_high = new_range.high; - remove_to_idx = idx; - } - } - self.replace_ranges(chars!(new_low..=new_high), remove_from_idx..remove_to_idx); - } - /// Remove a single `CharRange`. - /// - /// Depending on how the removed range relates to existing ranges in the collection, it might - /// remove or modify the endpoints of existing ranges. - fn remove_char_range(&mut self, range_to_remove: &CharRange) { - if range_to_remove.is_empty() { - return; - } - let lower_existing_range = self.find_containing_range(&range_to_remove.low); - let upper_existing_range = self.find_containing_range(&range_to_remove.high); - let mut replacement_ranges: Vec = Vec::new(); - let remove_from_idx: usize; - let remove_to_idx: usize; - match lower_existing_range { - Ok((idx, lower_existing_range)) => { - if lower_existing_range.low < range_to_remove.low { - replacement_ranges.push(CharRange::open_right( - lower_existing_range.low, - range_to_remove.low, - )); - } - remove_from_idx = idx; - } - Err(idx) => remove_from_idx = idx, - } - match upper_existing_range { - Ok((idx, higher_existing_range)) => { - if range_to_remove.high < higher_existing_range.high { - replacement_ranges.push(CharRange::open_left( - range_to_remove.high, - higher_existing_range.high, - )); - } - remove_to_idx = idx + 1; - } - Err(idx) => { - remove_to_idx = idx; - } - } - self.ranges - .splice(remove_from_idx..remove_to_idx, replacement_ranges); - } - /// Delete all the existing `CharRange`s that fall within `indices_to_replace` in the vector, - /// and insert `char_range_to_insert` in their place. If the newly formed range is adjacent to - /// a kept range on its left or right, coalesce them. - fn replace_ranges( - &mut self, - mut char_range_to_insert: CharRange, - mut indices_to_replace: Range, - ) { - // If the newly formed range is adjacent to the range on its left, coalesce the two. - if indices_to_replace.start > 0 { - let prev_char_range = self.ranges[indices_to_replace.start - 1]; - if are_chars_adjacent(&prev_char_range.high, &char_range_to_insert.low) { - char_range_to_insert.low = prev_char_range.low; - indices_to_replace.start -= 1; - } - } - // If the newly formed range is adjacent to the range on its right, coalesce the two. - if indices_to_replace.end < self.ranges.len() { - let next_char_range = self.ranges[indices_to_replace.end]; - if are_chars_adjacent(&char_range_to_insert.high, &next_char_range.low) { - char_range_to_insert.high = next_char_range.high; - indices_to_replace.end += 1; - } - } - self.ranges - .splice(indices_to_replace, vec![char_range_to_insert]); - } - fn find_containing_range(&self, query: &char) -> Result<(usize, CharRange), usize> { - let result = self - .ranges - .binary_search_by(|range| range.cmp_char(query.clone())); - match result { - Ok(index) => Ok((index, self.ranges[index])), - Err(index) => Err(index), - } - } -} -impl MultiCharRange for CharCollection { - fn iter_ranges<'a>(&'a self) -> Box + 'a> { - Box::new(self.ranges.iter().map(|range| range.clone())) - } - fn range_count(&self) -> usize { - self.ranges.len() - } -} -impl Hash for CharCollection { - fn hash(&self, state: &mut H) { - self.ranges - .iter() - .for_each(|range| hash_char_range(range, state)); - } -} -fn hash_char_range(range: &CharRange, state: &mut H) { - range.low.hash(state); - range.high.hash(state); -} -fn are_chars_adjacent(left: &char, right: &char) -> bool { - let mut iter: CharIter = CharRange::open_right(left.clone(), right.clone()).iter(); - match iter.next_back() { - None => false, - Some(next_right) => left == &next_right, - } -} -fn format_range(range: &CharRange) -> String { - format!("{}..={}", range.low, range.high) -} -#[cfg(test)] -mod tests { - use { - super::{are_chars_adjacent, CharCollection, CharRange}, - std::char, - // unic_char_range::{chars, CharRange}, - std::error::Error, - }; - #[test] - fn test_from_sorted_ranges() -> Result<(), Box> { - let expected = char_collect!('a'..='d', 'g'..='l', 'z'); - let actual = CharCollection::from_sorted_ranges(vec![ - chars!('a'..='d'), - chars!('g'..='l'), - chars!('z'..='z'), - ])?; - assert_eq!(actual, expected); - Ok(()) - } - #[test] - fn test_from_sorted_ranges_out_of_order() { - assert!(CharCollection::from_sorted_ranges(vec![ - chars!('g'..='l'), - chars!('a'..='d'), - chars!('z'..='z'), - ]) - .is_err()); - } - #[test] - fn test_from_sorted_ranges_overlap() { - assert!(CharCollection::from_sorted_ranges(vec![ - chars!('a'..='d'), - chars!('c'..='l'), - chars!('z'..='z'), - ]) - .is_err()); - } - #[test] - fn test_from_sorted_ranges_adjacent() { - assert!( - CharCollection::from_sorted_ranges(vec![chars!('a'..='d'), chars!('e'..='g')]).is_err() - ); - } - #[test] - fn test_from_sorted_chars() -> Result<(), Box> { - let chars = vec!['a', 'b', 'c', 'd', 'g', 'h', 'i', 'j', 'k', 'l', 'z']; - let expected = char_collect!('a'..='d', 'g'..='l', 'z'); - let actual = CharCollection::from_sorted_chars(chars)?; - assert_eq!(actual, expected); - Ok(()) - } - #[test] - fn test_from_sorted_chars_out_of_order() { - let chars = vec!['a', 'b', 'c', 'd', 'g', 'h', 'i', 'j', 'k', 'l', 'e']; - assert!(CharCollection::from_sorted_chars(chars).is_err()); - } - #[test] - fn test_find_containing_range() { - let collection = char_collect!({ ('a'..='d') + ('g'..='j') + ('l'..='o') + 'z' }); - assert_eq!(collection.find_containing_range(&'0'), Err(0)); - assert_eq!( - collection.find_containing_range(&'c'), - Ok((0, chars!('a'..='d'))) - ); - assert_eq!(collection.find_containing_range(&'e'), Err(1)); - } - #[test] - fn test_insert_initial() { - let collection = char_collect!('a'..='d'); - assert_eq!(collection.ranges, vec![chars!('a'..='d')]) - } - #[test] - fn test_insert_exact_match() { - let mut collection = char_collect!('a'..='d', 'g'..='l'); - collection += 'a'..='d'; - assert_eq!( - collection.ranges, - vec![chars!('a'..='d'), chars!('g'..='l')] - ); - } - #[test] - fn test_insert_non_overlapping_sorted() { - let collection = char_collect!('a'..='d', 'g'..='j', 'l'..='o'); - assert_eq!( - collection.ranges, - vec![chars!('a'..='d'), chars!('g'..='j'), chars!('l'..='o')] - ); - } - #[test] - fn test_insert_non_overlapping_unsorted() { - let collection = char_collect!('l'..='o', 'a'..='d', 'l'..='o', 'a'..='d', 'g'..='j'); - assert_eq!( - collection.ranges, - vec![chars!('a'..='d'), chars!('g'..='j'), chars!('l'..='o')] - ); - } - #[test] - fn test_insert_overlapping_all_existent() { - let mut collection = char_collect!('l'..='o', 'a'..='d'); - collection += 'a'..='o'; - assert_eq!(collection.ranges, vec![chars!('a'..='o')]); - } - #[test] - fn test_insert_overlapping_some_existent() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection += 'i'..='n'; - assert_eq!( - collection.ranges, - vec![chars!('c'..='e'), chars!('i'..='n'), chars!('p'..='s')] - ); - } - #[test] - fn test_insert_overlapping_with_intersections() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection += 'd'..='k'; - assert_eq!( - collection.ranges, - vec![chars!('c'..='m'), chars!('p'..='s')] - ); - } - #[test] - fn test_insert_coalesce_adjacent_ranges() { - let mut collection = char_collect!('a'..='c', 'j'..='m'); - collection += 'd'..='i'; - assert_eq!(collection.ranges, vec![chars!('a'..='m')]); - } - #[test] - fn test_append() -> Result<(), Box> { - let mut collection = char_collect!('a'..='c'); - collection - .append('d')? - .append('g')? - .append('h')? - .append('i')? - .append('z')?; - assert_eq!(collection, char_collect!('a'..='d', 'g'..='i', 'z')); - Ok(()) - } - #[test] - fn test_append_out_of_order() -> Result<(), Box> { - let mut collection = char_collect!('a'..='c'); - assert!(collection - .append('d')? - .append('g')? - .append('h')? - .append('i')? - .append('e') - .is_err()); - Ok(()) - } - #[test] - fn test_append_range() -> Result<(), Box> { - let mut collection = char_collect!('a'..='c'); - collection - .append_range(chars!('g'..='i'))? - .append_range(chars!('j'..='m'))?; - assert_eq!(collection, char_collect!('a'..='c', 'g'..='m')); - Ok(()) - } - #[test] - fn test_append_range_out_of_order() -> Result<(), Box> { - let mut collection = char_collect!('a'..='c'); - assert!(collection - .append_range(chars!('g'..='i'))? - .append_range(chars!('j'..='m'))? - .append_range(chars!('k'..='m')) - .is_err()); - Ok(()) - } - #[test] - fn test_remove_exact_range() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection -= 'j'..='m'; - assert_eq!( - collection.ranges, - vec![chars!('c'..='e'), chars!['p'..='s']] - ); - } - #[test] - fn test_remove_overlapping_all_existent() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection -= 'c'..='s'; - assert_eq!(collection.ranges, vec![]); - } - #[test] - fn test_remove_overlapping_all_existent_superset() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection -= 'a'..='z'; - assert_eq!(collection.ranges, vec![]); - } - #[test] - fn test_remove_one_subrange() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection -= 'k'..='l'; - assert_eq!( - collection.ranges, - vec![ - chars!('c'..='e'), - chars!('j'..='j'), - chars!('m'..='m'), - chars!('p'..='s') - ] - ); - } - #[test] - fn test_remove_intersection() { - let mut collection = char_collect!('c'..='e', 'j'..='m', 'p'..='s'); - collection -= 'd'..='q'; - assert_eq!( - collection.ranges, - vec![chars!('c'..='c'), chars!('r'..='s')] - ); - } - #[test] - fn test_complement_simple() { - let collection = char_collect!(0x10..=0x50, 0x70..=0x70, 0x99..=0x640); - assert_eq!( - collection.complement(), - char_collect!( - 0x00..=0x0F, - 0x51..=0x6F, - 0x71..=0x98, - 0x641..=(char::MAX as u32) - ) - ); - } - #[test] - fn test_complement_all() { - let collection = char_collect!(CharRange::all()); - assert_eq!(collection.complement(), char_collect!()); - } - #[test] - fn test_complement_none() { - let collection = char_collect!(); - assert_eq!(collection.complement(), char_collect!(CharRange::all())); - } - #[test] - fn test_complement_includes_min_and_max() { - let collection = char_collect!(0x0..=0x10, 0x40..=0x50, 0xCCCC..=(char::MAX as u32)); - assert_eq!( - collection.complement(), - char_collect!(0x11..=0x3F, 0x51..=0xCCCB) - ); - } - #[test] - fn test_union() { - let collection_a = char_collect!('a'..='g', 'm'..='z', 'B'..='R'); - let collection_b = char_collect!('e'..='q', 'W'..='Y'); - let expected = char_collect!('a'..='z', 'B'..='R', 'W'..='Y'); - assert_eq!(collection_a.union(&collection_b), expected); - assert_eq!(collection_b.union(&collection_a), expected); - } - #[test] - fn test_intersection() { - let collection_a = char_collect!('a'..='g', 'm'..='z'); - let collection_b = char_collect!('e'..='q'); - let expected = char_collect!('e'..='g', 'm'..='q'); - assert_eq!(collection_a.intersection(&collection_b), expected); - assert_eq!(collection_b.intersection(&collection_a), expected); - } - // #[test] - // fn test_macro_expressions() { - // use unicode_blocks::UnicodeBlockId::Arabic; - // let collection = - // char_collect!({ ('c'..='e') + ('f'..='h') - ('a'..='d') + Arabic + (0x5..=0x42) }); - // assert_eq!(collection, char_collect!(0x5..=0x42, 'e'..='h', Arabic)); - // } - #[test] - fn test_iter() { - let collection = char_collect!('a'..='c', 'j'..='l', 'x'..='z'); - let v = collection.iter().collect::>(); - assert_eq!(v, vec!['a', 'b', 'c', 'j', 'k', 'l', 'x', 'y', 'z']); - } - #[test] - fn test_are_chars_adjacent() { - assert!(are_chars_adjacent(&'a', &'b')); - assert!(!are_chars_adjacent(&'b', &'a')); - assert!(!are_chars_adjacent(&'a', &'c')); - } -} diff --git a/components/char_collection/src/conversions.rs b/components/char_collection/src/conversions.rs deleted file mode 100644 index 7da45e5467d..00000000000 --- a/components/char_collection/src/conversions.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -//! Conversion (`From`) implementations for [CharCollection], via [MultiCharRange]. -use crate::{CharCollection, CharRange, MultiCharRange}; -use std::boxed::Box; -use std::convert::TryFrom; -use std::iter; -use std::ops::RangeInclusive; - -macro_rules! impl_for_range_inclusive_int_type { - ($($t:ty),*) => {$( - impl MultiCharRange for RangeInclusive<$t> { - fn iter_ranges(&self) -> Box> { - Box::new(iter::once(to_char_range!(self))) - } - fn range_count(&self) -> usize { - 1 - } - })*} -} -// This macro is needed because there is no way to express "can be cast as u32" using traits. -macro_rules! to_char_range { - ($range:expr) => { - CharRange::closed( - char::try_from(*$range.start() as u32).unwrap(), - char::try_from(*$range.end() as u32).unwrap(), - ) - }; -} -impl MultiCharRange for char { - fn iter_ranges(&self) -> Box> { - Box::new(std::iter::once(CharRange::closed(*self, *self))) - } - fn range_count(&self) -> usize { - 1 - } -} -impl MultiCharRange for CharRange { - fn iter_ranges(&self) -> Box> { - Box::new(iter::once(self.clone())) - } - fn range_count(&self) -> usize { - 1 - } -} -impl MultiCharRange for RangeInclusive { - fn iter_ranges(&self) -> Box> { - Box::new(iter::once(CharRange::closed(*self.start(), *self.end()))) - } - fn range_count(&self) -> usize { - 1 - } -} -impl_for_range_inclusive_int_type!(u8, i8, u32, i32); - -impl From<&T> for CharCollection { - fn from(source: &T) -> Self { - let mut collection = CharCollection::new(); - collection.insert(source); - collection - } -} -#[cfg(test)] -mod multi_char_range_tests { - use crate::{CharRange, MultiCharRange}; - use paste; - #[test] - fn test_char() { - let source = 'a'; - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('a'..='a')] - ); - assert_eq!(source.range_count(), 1); - } - #[test] - fn test_char_range() { - let source = chars!('d'..='g'); - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('d'..='g')] - ); - assert_eq!(source.range_count(), 1); - } - #[test] - fn test_range_inclusive_char() { - let source = 'd'..='g'; - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('d'..='g')] - ); - assert_eq!(source.range_count(), 1); - } - macro_rules! test_range_inclusive_int { - ($t:ty) => { - paste::item! { - #[test] - fn []() { - let source: std::ops::RangeInclusive<$t> = 0x0..=0x9; - assert_eq!( - source.iter_ranges().collect::>(), - vec![chars!('\u{0}'..='\u{9}')] - ); - assert_eq!(source.range_count(), 1); - } - } - }; - } - test_range_inclusive_int!(u8); - test_range_inclusive_int!(i8); - test_range_inclusive_int!(u32); - test_range_inclusive_int!(i32); -} -#[cfg(test)] -mod from_tests { - use crate::CharCollection; - #[test] - fn test_char() { - let actual: CharCollection = (&'a').into(); - assert_eq!(actual, char_collect!('a'..='a')); - } -} diff --git a/components/char_collection/src/lib.rs b/components/char_collection/src/lib.rs deleted file mode 100644 index 54eacbe847e..00000000000 --- a/components/char_collection/src/lib.rs +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -#[macro_use] -mod macros; -mod char_collection; -mod conversions; -mod operators; -mod uniset; -pub use char_collection::CharCollection; -pub use char_collection::CharIter; -pub use char_collection::CharRange; -pub use char_collection::MultiCharRange; -pub use conversions::*; -pub use operators::*; -pub use uniset::UnicodeSet; diff --git a/components/char_collection/src/macros.rs b/components/char_collection/src/macros.rs deleted file mode 100644 index 49569781f50..00000000000 --- a/components/char_collection/src/macros.rs +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -/// Generate a [CharCollection] from a sequence of `char`s, -/// [CharRanges](unic_char_range::CharRange), or Unicode [Blocks](unic_ucd_block::Block). -/// -/// The macro can be used with either a comma-separated list of items, or with an expression -/// representing set operations. -/// -#[macro_export] -macro_rules! char_collect { - ({ $($x:tt)+ }) => { - { - $crate::CharCollection::new() + $($x)* - } - }; - ( $( $x:expr ),* ) => { - { - // Allow unused mut in case the collection is empty. - #[allow(unused_mut)] - let mut col = $crate::CharCollection::new(); - $( - col.insert(& $x); - )* - col - } - }; -} -#[macro_export] -macro_rules! chars { - ($low:tt .. $high:tt) => { - $crate::CharRange::open_right($low, $high) - }; - ($low:tt ..= $high:tt) => { - $crate::CharRange::closed($low, $high) - }; - (..) => { - $crate::CharRange::all() - }; -} diff --git a/components/char_collection/src/operators.rs b/components/char_collection/src/operators.rs deleted file mode 100644 index cec8cad4e3d..00000000000 --- a/components/char_collection/src/operators.rs +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. -//! Implementations of standard operators for [CharCollection]. -//! -//! `+` and `|` are equivalent. `+` is easier to use with `-`, as they have the same operator -//! precedence. -use crate::{CharCollection, MultiCharRange}; -use std::convert::Into; -use std::ops; -impl ops::BitOr for CharCollection { - type Output = CharCollection; - fn bitor(self, rhs: V) -> Self::Output { - let result: CharCollection = self.into(); - result.union(&rhs) - } -} -impl ops::Add for CharCollection { - type Output = CharCollection; - fn add(self, rhs: V) -> Self::Output { - let result: CharCollection = self.into(); - result.union(&rhs) - } -} -impl ops::BitOrAssign for CharCollection { - fn bitor_assign(&mut self, rhs: V) { - self.insert(&rhs); - } -} -impl ops::AddAssign for CharCollection { - fn add_assign(&mut self, rhs: V) { - self.insert(&rhs); - } -} -impl ops::Sub for CharCollection { - type Output = CharCollection; - fn sub(self, rhs: V) -> Self::Output { - self.difference(&rhs) - } -} -impl ops::SubAssign for CharCollection { - fn sub_assign(&mut self, rhs: V) { - self.remove(&rhs); - } -} -impl ops::BitAnd for CharCollection { - type Output = CharCollection; - fn bitand(self, rhs: V) -> Self::Output { - self.intersection(&rhs) - } -} -impl ops::BitAndAssign for CharCollection { - fn bitand_assign(&mut self, rhs: V) { - *self = self.intersection(&rhs); - } -} -impl ops::Not for CharCollection { - type Output = CharCollection; - fn not(self) -> Self::Output { - self.complement() - } -} diff --git a/components/char_collection/src/uniset.rs b/components/char_collection/src/uniset.rs deleted file mode 100644 index 950cb2c74cc..00000000000 --- a/components/char_collection/src/uniset.rs +++ /dev/null @@ -1,321 +0,0 @@ -use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, vec::Vec}; - -const UNICODESET_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF -const UNICODESET_MIN: u32 = 0x000000; -const BMP_MAX: u32 = 0xFFFF; - -/// Given string representation of inversion list create set -/// -/// Requires starting capacity integer, followed by space delimited integer code points. -/// There must be an even number of elements (not including the capacity int), and must be -/// in ascending sorted order. -/// -/// Example String: `4 0 5 10 15` designates a capacity of size 4, followed by 2 ranges -/// The ranges are {0, 4} and {10, 14} inclusive -fn parse_serial_string(serialize_str: &str) -> Result, Box> { - let mut serialize = serialize_str.split(" "); - let capacity: usize = serialize.next().unwrap().parse()?; - if capacity % 2 != 0 { - return Err("Capacity must be even".into()); - } - let mut serialized_vec: Vec = Vec::with_capacity(capacity); - let mut prev: Option = None; - for str_ele in serialize { - // unsure if the capacity matters if we can expand, but that might be an issue if you expand into too much memory - // otherwise shrink_to_fit is possible - let parsed: u32 = str_ele.parse()?; - if serialized_vec.len() + 1 > serialized_vec.capacity() { - return Err("Serialization capacity is too small".into()); - } - if Some(parsed) <= prev { - return Err("Serialization must be sorted".into()); - } - serialized_vec.push(parsed); - prev = Some(parsed); - } - if serialized_vec.len() % 2 != 0 { - return Err("Serialization must be even".into()); - } - serialized_vec.shrink_to_fit(); // necessary if the length < capacity - Ok(serialized_vec) -} - -/// UnicodeSet membership wrapper -/// -/// Provides exposure to membership functions and constructors from serialized UnicodeSets -/// and predefined ranges. -//#[derive(Copy, Clone, Debug, Eq)] -pub struct UnicodeSet { - // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature - // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html - // Allows for traits of fixed size arrays - set: Vec, // is set misleading? could be uset -} - -impl UnicodeSet { - /// Returns Result of UnicodeSet from serialized string - /// - /// Returns an error if the serialized string fails to parse. - /// The serialized string requires starting capacity integer, followed by space delimited - /// integer code points. There must be an even number of elements (not including the - /// capacity int), and must be in ascending sorted order. - /// - /// Example String: `"4 0 5 10 15"` designates a capacity of size `4`, followed by 2 ranges - /// The ranges are `{0, 4}` and `{10, 14}` inclusive - pub fn new(serialize: &str) -> Result> { - match parse_serial_string(serialize) { - Ok(serialize) => Ok(UnicodeSet { set: serialize }), - Err(e) => Err(e), - } - } - - /// Returns Result of UnicodeSet from a single pair of integers defining a range - /// - /// `start`: inclusive, `end`: exclusive - /// - /// Returns an error if the range is invalid (out of order and out of bounds). - /// - /// Example Call: `UnicodeSet::from_range(&0, &15)` - pub fn from_range(start: &u32, end: &u32) -> Result> { - if start > end { - return Err("Range is out of order".into()); - } - if start < &UNICODESET_MIN || end > &UNICODESET_MAX { - return Err("Range is out of bounds".into()); - } - Ok(UnicodeSet { - set: vec![*start, *end], - }) - } - - /// Returns UnicodeSet spanning entire Unicode range - /// - /// The range spans from `0x0 -> 0x10FFFF` inclusive - pub fn all() -> UnicodeSet { - UnicodeSet { - set: vec![UNICODESET_MIN, UNICODESET_MAX + 1], - } - } - - /// Returns UnicodeSet spanning BMP range - /// - /// The range spans from `0x0 -> 0xFFFF` inclusive - pub fn bmp() -> UnicodeSet { - UnicodeSet { - set: vec![UNICODESET_MIN, BMP_MAX + 1], - } - } - /// Returns an `Iter` of start and stop `u32` points of the UnicodeSet - pub fn iter(&self) -> Iter { - self.set.iter() - } - - /// Returns the cardinality of the UnicodeSet - pub fn size(&self) -> Result> { - if self.set.len() < 2 { - return Err("UnicodeSet length < 2".into()); - } - let end: u32 = self.iter().skip(1).step_by(2).sum::(); - let start: u32 = self.iter().step_by(2).sum::(); - Ok((end - start) as usize) - } - - /// Returns whether or not the UnicodeSet is empty - pub fn is_empty(&self) -> bool { - self.set.len() < 2 // unsure if this is appropriate definition of just self.set.is_empty() - } - - /// Wrapper for contains conditions closures - fn contains(&self, query: &u32, condition: C) -> bool - where - C: Fn(usize) -> bool, - { - match self.set.binary_search(query) { - Ok(pos) => { - if pos % 2 == 0 { - return condition(pos); - } else { - false - } - } - Err(pos) => { - if pos % 2 != 0 && pos < self.set.len() { - return condition(pos); - } else { - false - } - } - } - } - - /// Checks to see the query is in the UnicodeSet - /// - /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// in the set using `std::vec::Vec` implementation - /// - /// Example: `contains_point(&10)` - pub fn contains_point(&self, query: &u32) -> bool { - let condition_closure = |_: usize| -> bool { true }; - self.contains(query, condition_closure) - } - - /// Checks to see if the range is in the UnicodeSet, returns a Result - /// - /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// in the set using `std::vec::Vec` implementation - /// - /// Only runs the search once on the `start` parameter, while the `end` parameter is checked - /// in a single `O(1)` step - /// - /// Example: `contains_range(&0, &10)` - pub fn contains_range(&self, start: &u32, end: &u32) -> Result> { - if start >= end { - return Err("Range cannot be out of order".into()); - } - let condition_closure = |pos: usize| -> bool { - if end < &self.set[pos + 1] { - true - } else { - false - } - }; - return Ok(self.contains(start, condition_closure)); - } -} - -#[cfg(test)] -mod tests { - use super::{parse_serial_string, UnicodeSet, BMP_MAX, UNICODESET_MAX, UNICODESET_MIN}; - // parse_serial_string - #[test] - fn test_parse_serial_string() { - let expected = vec![2, 3, 4, 5]; - let actual = parse_serial_string("4 2 3 4 5").unwrap(); - assert_eq!(actual, expected); - } - #[test] - fn test_parse_serial_string_no_char() { - assert!(parse_serial_string("4 2 A 3 4 5").is_err()); - } - #[test] - fn test_parse_serial_string_empty() { - assert!(parse_serial_string("").is_err()); - } - #[test] - fn test_parse_serial_string_wrong_format() { - assert!(parse_serial_string("[4, 2, 3, 4, 5 ]").is_err()); - } - #[test] - fn test_parse_serial_string_wrong_order() { - assert!(parse_serial_string("4 1 0 4 2").is_err()); - } - #[test] - fn test_parse_serial_string_single_char_error() { - assert!(parse_serial_string("4 1 1 2 2").is_err()); - } - #[test] - fn test_parse_serial_string_capacity_not_even() { - assert!(parse_serial_string("3 2 3 4").is_err()); - } - #[test] - fn test_parse_serial_string_size_not_even() { - assert!(parse_serial_string("4 3 2 1").is_err()); - } - - // UnicodeSet constructors - #[test] - fn test_unicodeset_new() { - let expected = vec![2, 3, 4, 5]; - let actual = UnicodeSet::new("4 2 3 4 5").unwrap().set; - assert_eq!(actual, expected); - } - #[test] - fn test_unicodeset_new_error() { - assert!(UnicodeSet::new("3 2 4 3").is_err()); - } - #[test] - fn test_unicodeset_from_range() { - let expected = vec![4, 10]; - let actual = UnicodeSet::from_range(&4, &10).unwrap().set; - assert_eq!(actual, expected); - } - #[test] - fn test_unicodeset_from_range_bad_order() { - assert!(UnicodeSet::from_range(&10, &5).is_err()); - } - #[test] - fn test_unicodeset_from_range_out_of_bounds() { - assert!(UnicodeSet::from_range(&0, &0x110000).is_err()); - } - #[test] - fn test_unicodeset_all() { - let expected = vec![UNICODESET_MIN, UNICODESET_MAX + 1]; - assert_eq!(UnicodeSet::all().set, expected); - } - #[test] - fn test_unicodeset_bmp() { - let expected = vec![UNICODESET_MIN, BMP_MAX + 1]; - assert_eq!(UnicodeSet::bmp().set, expected); - } - #[test] - fn test_unicodeset_contains() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(check.contains_point(&2)); - assert!(check.contains_point(&4)); - assert!(check.contains_point(&10)); - assert!(check.contains_point(&14)); - } - #[test] - fn test_unicodeset_contains_false() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(!check.contains_point(&1)); - assert!(!check.contains_point(&5)); - assert!(!check.contains_point(&9)); - assert!(!check.contains_point(&15)); - assert!(!check.contains_point(&16)); - } - #[test] - fn test_unicodeset_contains_range() { - let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(check.contains_range(&2, &5).unwrap()); - assert!(check.contains_range(&0, &9).unwrap()); - assert!(check.contains_range(&15, &24).unwrap()); - } - #[test] - fn test_unicodeset_contains_range_false() { - let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(!check.contains_range(&0, &10).unwrap()); - assert!(!check.contains_range(&15, &25).unwrap()); - assert!(!check.contains_range(&0, &16).unwrap()); - assert!(!check.contains_range(&10, &15).unwrap()); - assert!(!check.contains_range(&11, &14).unwrap()); - } - #[test] - fn test_unicodeset_contains_range_invalid() { - let check = UnicodeSet::all(); - assert!(check.contains_range(&10, &0).is_err()); - assert!(check.contains_range(&0, &0).is_err()); - } - #[test] - fn test_unicodeset_size() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert_eq!(8, check.size().unwrap()); - let check = UnicodeSet::all(); - let expected = UNICODESET_MAX + 1 - UNICODESET_MIN; - assert_eq!(expected as usize, check.size().unwrap()); - } - #[test] - fn test_unicodeset_size_error() { - let check = UnicodeSet { set: vec![0] }; - assert!(check.size().is_err()); - } - #[test] - fn test_unicodeset_is_empty() { - let check = UnicodeSet { set: vec![] }; - assert!(check.is_empty()); - let check = UnicodeSet { set: vec![0] }; - assert!(check.is_empty()); - let check = UnicodeSet::all(); - assert!(!check.is_empty()); - } -} From cc781c81e876cae03b957e042cef150d2262cf29 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 23 Jun 2020 22:59:31 +0000 Subject: [PATCH 16/30] Update to repo --- components/uniset/Cargo.toml | 14 + components/uniset/README.md | 12 + .../uniset/meta/char_collection_lib_test.cmx | 5 + components/uniset/src/lib.rs | 6 + components/uniset/src/uniset.rs | 328 ++++++++++++++++++ 5 files changed, 365 insertions(+) create mode 100644 components/uniset/Cargo.toml create mode 100644 components/uniset/README.md create mode 100644 components/uniset/meta/char_collection_lib_test.cmx create mode 100644 components/uniset/src/lib.rs create mode 100644 components/uniset/src/uniset.rs diff --git a/components/uniset/Cargo.toml b/components/uniset/Cargo.toml new file mode 100644 index 00000000000..b7e4103b289 --- /dev/null +++ b/components/uniset/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "icu4x-unicodeset" +description = "API for managing Unicode Language and Locale Identifiers" +version = "0.0.1" +authors = ["The ICU4X Project Developers"] +edition = "2018" +readme = "README.md" +repository = "https://github.com/unicode-org/icu4x" +license-file = "../../LICENSE" +categories = ["internationalization"] +include = [ + "src/**/*", + "Cargo.toml", +] diff --git a/components/uniset/README.md b/components/uniset/README.md new file mode 100644 index 00000000000..9cb580caa61 --- /dev/null +++ b/components/uniset/README.md @@ -0,0 +1,12 @@ +# ICU4X + +ICU4X is a set of internationalization components for Unicode. + +# Status [![crates.io](http://meritbadge.herokuapp.com/icu4x)](https://crates.io/crates/icu4x) + +The project is in an incubation period. + +# Authors + +The project is managed by a subcommittee of ICU-TC in the Unicode Consortium focused on providing solutions for client-side internationalization. + diff --git a/components/uniset/meta/char_collection_lib_test.cmx b/components/uniset/meta/char_collection_lib_test.cmx new file mode 100644 index 00000000000..3bb56a96aa2 --- /dev/null +++ b/components/uniset/meta/char_collection_lib_test.cmx @@ -0,0 +1,5 @@ +{ + "program": { + "binary": "test/char_collection_lib_test" + } +} \ No newline at end of file diff --git a/components/uniset/src/lib.rs b/components/uniset/src/lib.rs new file mode 100644 index 00000000000..c9e5f2c2d87 --- /dev/null +++ b/components/uniset/src/lib.rs @@ -0,0 +1,6 @@ +// Copyright 2019 The Fuchsia Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +#[macro_use] +mod uniset; +pub use uniset::UnicodeSet; diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs new file mode 100644 index 00000000000..f265fb3cf41 --- /dev/null +++ b/components/uniset/src/uniset.rs @@ -0,0 +1,328 @@ +use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, vec::Vec}; + +const CODEPOINT_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF +const CODEPOINT_MIN: u32 = 0x000000; +const BMP_MAX: u32 = 0xFFFF; + +/// Given string representation of inversion list create set +/// +/// See UnicodeSet::new for conditions +fn parse_serial_string(serialize_str: &str) -> Result, Box> { + let mut serialize = serialize_str.split(" "); + let capacity: usize = serialize + .next() + .ok_or("Splitting did not yield anything")? + .parse()?; + if capacity % 2 != 0 { + return Err("Capacity must be even".into()); + } + let mut serialized_vec: Vec = Vec::with_capacity(capacity); + let mut prev: Option = None; + for str_elem in serialize { + let parsed: u32 = str_elem.parse()?; + if serialized_vec.len() + 1 > serialized_vec.capacity() { + return Err("Serialization capacity is too small".into()); + } + if Some(parsed) <= prev { + return Err("Serialization must be sorted".into()); + } + serialized_vec.push(parsed); + prev = Some(parsed); + } + if serialized_vec.len() % 2 != 0 { + return Err("Serialization must be even".into()); + } + serialized_vec.shrink_to_fit(); // necessary if the length < capacity + Ok(serialized_vec) +} + +/// UnicodeSet membership wrapper +/// +/// Provides exposure to membership functions and constructors from serialized UnicodeSets +/// and predefined ranges. +//#[derive(Copy, Clone, Debug, Eq)] +pub struct UnicodeSet { + // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature + // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html + // Allows for traits of fixed size arrays + inv_list: Vec, +} + +impl UnicodeSet { + /// Returns Result of UnicodeSet from serialized string + /// + /// Returns an error if the serialized string fails to parse. + /// The serialized string requires starting capacity integer, followed by space delimited + /// integer code points. There must be an even number of elements (not including the + /// capacity int), and must be in ascending sorted order. + /// + /// Example String: `"4 0 5 10 15"` designates a capacity of size `4`, followed by 2 ranges + /// The ranges are `{0, 4}` and `{10, 14}` inclusive + pub fn new(serialize: &str) -> Result> { + match parse_serial_string(serialize) { + Ok(serialize) => Ok(UnicodeSet { + inv_list: serialize, + }), + Err(e) => Err(e), + } + } + + /// Returns Result of UnicodeSet from a single pair of integers defining a range + /// + /// `start`: inclusive, `end`: exclusive + /// + /// Returns an error if the range is invalid (out of order and out of bounds). + /// + /// Example Call: `UnicodeSet::from_range(&0, &15)` + pub fn from_range(start: &u32, end: &u32) -> Result> { + if start > end { + return Err("Range is out of order".into()); + } + if start < &CODEPOINT_MIN || end > &CODEPOINT_MAX { + return Err("Range is out of bounds".into()); + } + Ok(UnicodeSet { + inv_list: vec![*start, *end], + }) + } + + /// Returns UnicodeSet spanning entire Unicode range + /// + /// The range spans from `0x0 -> 0x10FFFF` inclusive + pub fn all() -> UnicodeSet { + UnicodeSet { + inv_list: vec![CODEPOINT_MIN, CODEPOINT_MAX + 1], + } + } + + /// Returns UnicodeSet spanning BMP range + /// + /// The range spans from `0x0 -> 0xFFFF` inclusive + pub fn bmp() -> UnicodeSet { + UnicodeSet { + inv_list: vec![CODEPOINT_MIN, BMP_MAX + 1], + } + } + + /// Returns an `Iter` of start and stop `u32` points of the UnicodeSet + pub fn iter(&self) -> Iter { + self.inv_list.iter() + } + + /// Returns the cardinality of the UnicodeSet + /// + /// + pub fn size(&self) -> Result> { + if self.inv_list.len() < 2 { + return Err("UnicodeSet length < 2".into()); + } + let end_point_sum: u32 = self.iter().skip(1).step_by(2).sum::(); + let start_point_sum: u32 = self.iter().step_by(2).sum::(); + Ok((end_point_sum - start_point_sum) as usize) + } + + /// Returns whether or not the UnicodeSet is empty + pub fn is_empty(&self) -> bool { + self.inv_list.len() < 2 // unsure if this is appropriate definition of just self.inv_list.is_empty() + } + + /// Wrapper for contains conditions closures + fn contains(&self, query: &u32, condition: C) -> bool + where + C: Fn(usize) -> bool, + { + match self.inv_list.binary_search(query) { + Ok(pos) => { + if pos % 2 == 0 { + condition(pos) + } else { + false + } + } + Err(pos) => { + if pos % 2 != 0 && pos < self.inv_list.len() { + condition(pos) + } else { + false + } + } + } + } + + /// Checks to see the query is in the UnicodeSet + /// + /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points + /// in the set using `std::vec::Vec` implementation + /// + /// Example: `contains_point(&10)` + pub fn contains_point(&self, query: &u32) -> bool { + let condition_closure = |_: usize| -> bool { true }; + self.contains(query, condition_closure) + } + + /// Checks to see if the range is in the UnicodeSet, returns a Result + /// + /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points + /// in the set using `std::vec::Vec` implementation + /// + /// Only runs the search once on the `start` parameter, while the `end` parameter is checked + /// in a single `O(1)` step + /// + /// Example: `contains_range(&0, &10)` + pub fn contains_range(&self, start: &u32, end: &u32) -> Result> { + if start >= end { + return Err("Range cannot be out of order".into()); + } + let condition_closure = |pos: usize| -> bool { + if end < &self.inv_list[pos + 1] { + true + } else { + false + } + }; + return Ok(self.contains(start, condition_closure)); + } +} + +#[cfg(test)] +mod tests { + use super::{parse_serial_string, UnicodeSet, BMP_MAX, CODEPOINT_MAX, CODEPOINT_MIN}; + + // parse_serial_string + #[test] + fn test_parse_serial_string() { + let expected = vec![2, 3, 4, 5]; + let actual = parse_serial_string("4 2 3 4 5").unwrap(); + assert_eq!(actual, expected); + } + #[test] + fn test_parse_serial_string_no_char() { + assert!(parse_serial_string("4 2 A 3 4 5").is_err()); + } + #[test] + fn test_parse_serial_string_empty() { + assert!(parse_serial_string("").is_err()); + } + #[test] + fn test_parse_serial_string_wrong_format() { + assert!(parse_serial_string("[4, 2, 3, 4, 5 ]").is_err()); + } + #[test] + fn test_parse_serial_string_wrong_order() { + assert!(parse_serial_string("4 1 0 4 2").is_err()); + } + #[test] + fn test_parse_serial_string_single_char_error() { + assert!(parse_serial_string("4 1 1 2 2").is_err()); + } + #[test] + fn test_parse_serial_string_capacity_not_even() { + assert!(parse_serial_string("3 2 3 4").is_err()); + } + #[test] + fn test_parse_serial_string_size_not_even() { + assert!(parse_serial_string("4 3 2 1").is_err()); + } + + // UnicodeSet constructors + #[test] + fn test_unicodeset_new() { + let expected = vec![2, 3, 4, 5]; + let actual = UnicodeSet::new("4 2 3 4 5").unwrap().inv_list; + assert_eq!(actual, expected); + } + #[test] + fn test_unicodeset_new_error() { + assert!(UnicodeSet::new("3 2 4 3").is_err()); + } + #[test] + fn test_unicodeset_from_range() { + let expected = vec![4, 10]; + let actual = UnicodeSet::from_range(&4, &10).unwrap().inv_list; + assert_eq!(actual, expected); + } + #[test] + fn test_unicodeset_from_range_bad_order() { + assert!(UnicodeSet::from_range(&10, &5).is_err()); + } + #[test] + fn test_unicodeset_from_range_out_of_bounds() { + assert!(UnicodeSet::from_range(&0, &0x110000).is_err()); + } + #[test] + fn test_unicodeset_all() { + let expected = vec![CODEPOINT_MIN, CODEPOINT_MAX + 1]; + assert_eq!(UnicodeSet::all().inv_list, expected); + } + #[test] + fn test_unicodeset_bmp() { + let expected = vec![CODEPOINT_MIN, BMP_MAX + 1]; + assert_eq!(UnicodeSet::bmp().inv_list, expected); + } + + // UnicodeSet membership functions + #[test] + fn test_unicodeset_contains() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert!(check.contains_point(&2)); + assert!(check.contains_point(&4)); + assert!(check.contains_point(&10)); + assert!(check.contains_point(&14)); + } + #[test] + fn test_unicodeset_contains_false() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert!(!check.contains_point(&1)); + assert!(!check.contains_point(&5)); + assert!(!check.contains_point(&9)); + assert!(!check.contains_point(&15)); + assert!(!check.contains_point(&16)); + } + #[test] + fn test_unicodeset_contains_range() { + let check = UnicodeSet::new("4 0 10 15 25").unwrap(); + assert!(check.contains_range(&2, &5).unwrap()); + assert!(check.contains_range(&0, &9).unwrap()); + assert!(check.contains_range(&15, &24).unwrap()); + } + #[test] + fn test_unicodeset_contains_range_false() { + let check = UnicodeSet::new("4 0 10 15 25").unwrap(); + assert!(!check.contains_range(&0, &10).unwrap()); + assert!(!check.contains_range(&15, &25).unwrap()); + assert!(!check.contains_range(&0, &16).unwrap()); + assert!(!check.contains_range(&10, &15).unwrap()); + assert!(!check.contains_range(&11, &14).unwrap()); + } + #[test] + fn test_unicodeset_contains_range_invalid() { + let check = UnicodeSet::all(); + assert!(check.contains_range(&10, &0).is_err()); + assert!(check.contains_range(&0, &0).is_err()); + } + #[test] + fn test_unicodeset_size() { + let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + assert_eq!(8, check.size().unwrap()); + let check = UnicodeSet::all(); + let expected = CODEPOINT_MAX + 1 - CODEPOINT_MIN; + assert_eq!(expected as usize, check.size().unwrap()); + } + #[test] + fn test_unicodeset_size_error() { + let check = UnicodeSet { inv_list: vec![0] }; + assert!(check.size().is_err()); + } + #[test] + fn test_unicodeset_is_empty() { + let check = UnicodeSet { inv_list: vec![] }; + assert!(check.is_empty()); + let check = UnicodeSet { inv_list: vec![0] }; + assert!(check.is_empty()); + } + #[test] + fn test_unicodeset_is_not_empty() { + let check = UnicodeSet::all(); + assert!(!check.is_empty()); + } +} From cfd9edfc4fb614b529dceea7953b7b2be2ef6870 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 24 Jun 2020 22:37:18 +0000 Subject: [PATCH 17/30] formatting and cleaning up changes --- components/uniset/src/uniset.rs | 160 ++++++++++++++++++++------------ 1 file changed, 99 insertions(+), 61 deletions(-) diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index f265fb3cf41..a8185957a62 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -1,7 +1,10 @@ use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, vec::Vec}; -const CODEPOINT_MAX: u32 = 0x10FFFF; // does max imply inclusive? else should be 10FFFF +/// Represents the maximum Unicode Code Point, inclusive +const CODEPOINT_MAX: u32 = 0x10FFFF; +/// Represents the minimum UNicode Code Point, inclusive const CODEPOINT_MIN: u32 = 0x000000; +/// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0 , inclusive const BMP_MAX: u32 = 0xFFFF; /// Given string representation of inversion list create set @@ -11,26 +14,43 @@ fn parse_serial_string(serialize_str: &str) -> Result, Box> let mut serialize = serialize_str.split(" "); let capacity: usize = serialize .next() - .ok_or("Splitting did not yield anything")? + .ok_or(format!( + "Splitting by spaces did not yield a capacity: {:?}", + serialize + ))? .parse()?; if capacity % 2 != 0 { - return Err("Capacity must be even".into()); + return Err(format!("Capacity must be even. Parsed Capacity: {}", capacity).into()); } let mut serialized_vec: Vec = Vec::with_capacity(capacity); let mut prev: Option = None; for str_elem in serialize { let parsed: u32 = str_elem.parse()?; if serialized_vec.len() + 1 > serialized_vec.capacity() { - return Err("Serialization capacity is too small".into()); + return Err(format!( + "Serialization capacity is too small. Allocated Capacity: {}", + capacity + ) + .into()); } if Some(parsed) <= prev { - return Err("Serialization must be sorted".into()); + return Err(format!( + "Serialization must be sorted. {:?} followed by {:?}", + Some(parsed), + prev + ) + .into()); } serialized_vec.push(parsed); prev = Some(parsed); } if serialized_vec.len() % 2 != 0 { - return Err("Serialization must be even".into()); + return Err(format!( + "Serialization must be even. Serialization: {:?} Length: {}", + serialized_vec, + serialized_vec.len() + ) + .into()); } serialized_vec.shrink_to_fit(); // necessary if the length < capacity Ok(serialized_vec) @@ -40,6 +60,7 @@ fn parse_serial_string(serialize_str: &str) -> Result, Box> /// /// Provides exposure to membership functions and constructors from serialized UnicodeSets /// and predefined ranges. +/// Implements an inversion list. //#[derive(Copy, Clone, Debug, Eq)] pub struct UnicodeSet { // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature @@ -74,15 +95,19 @@ impl UnicodeSet { /// Returns an error if the range is invalid (out of order and out of bounds). /// /// Example Call: `UnicodeSet::from_range(&0, &15)` - pub fn from_range(start: &u32, end: &u32) -> Result> { + pub fn from_range(start: u32, end: u32) -> Result> { if start > end { - return Err("Range is out of order".into()); + return Err(format!("Range is out of order. start: {} end: {}", start, end).into()); } - if start < &CODEPOINT_MIN || end > &CODEPOINT_MAX { - return Err("Range is out of bounds".into()); + if start < CODEPOINT_MIN || end > CODEPOINT_MAX { + return Err(format!( + "Range is out of bounds. start: {}, min: {}, end: {}, max: {}", + start, CODEPOINT_MIN, end, CODEPOINT_MAX + ) + .into()); } Ok(UnicodeSet { - inv_list: vec![*start, *end], + inv_list: vec![start, end], }) } @@ -104,21 +129,32 @@ impl UnicodeSet { } } - /// Returns an `Iter` of start and stop `u32` points of the UnicodeSet + /// Yields an iterator of start and stop points of ranges in the UnicodeSet + /// + /// Example: + /// + /// ``` + /// use icu4x_unicodeset::UnicodeSet; + /// let example = UnicodeSet::new("4 0 10 15 20"); + /// let mut example_iter = example.iter(); + /// example_iter.next(); // => 0 + /// example_iter.next(); // => 10 + /// example_iter.next(); // => 10, etc. + /// ``` pub fn iter(&self) -> Iter { self.inv_list.iter() } - /// Returns the cardinality of the UnicodeSet - /// + /// Returns the number of elements of the UnicodeSet /// - pub fn size(&self) -> Result> { - if self.inv_list.len() < 2 { - return Err("UnicodeSet length < 2".into()); + /// Returns an error if a complete range is not defined in the UnicodeSet + pub fn size(&self) -> usize { + if self.is_empty() { + return 0; } let end_point_sum: u32 = self.iter().skip(1).step_by(2).sum::(); let start_point_sum: u32 = self.iter().step_by(2).sum::(); - Ok((end_point_sum - start_point_sum) as usize) + (end_point_sum - start_point_sum) as usize } /// Returns whether or not the UnicodeSet is empty @@ -126,12 +162,20 @@ impl UnicodeSet { self.inv_list.len() < 2 // unsure if this is appropriate definition of just self.inv_list.is_empty() } - /// Wrapper for contains conditions closures - fn contains(&self, query: &u32, condition: C) -> bool + /// Wrapper for contains + /// + /// Takes in a single code point `query`, and a closure `condition` + /// to see if the `query` is located in the inversion list. + /// + /// Example: + /// + /// `let condition_closure = |_: usize| -> bool {true};` + /// `self.contains(10, condition_closure);` + fn contains(&self, query: u32, condition: C) -> bool where C: Fn(usize) -> bool, { - match self.inv_list.binary_search(query) { + match self.inv_list.binary_search(&query) { Ok(pos) => { if pos % 2 == 0 { condition(pos) @@ -155,8 +199,8 @@ impl UnicodeSet { /// in the set using `std::vec::Vec` implementation /// /// Example: `contains_point(&10)` - pub fn contains_point(&self, query: &u32) -> bool { - let condition_closure = |_: usize| -> bool { true }; + pub fn contains_point(&self, query: u32) -> bool { + let condition_closure = |_| -> bool { true }; self.contains(query, condition_closure) } @@ -169,24 +213,19 @@ impl UnicodeSet { /// in a single `O(1)` step /// /// Example: `contains_range(&0, &10)` - pub fn contains_range(&self, start: &u32, end: &u32) -> Result> { + pub fn contains_range(&self, start: u32, end: u32) -> Result> { if start >= end { return Err("Range cannot be out of order".into()); } - let condition_closure = |pos: usize| -> bool { - if end < &self.inv_list[pos + 1] { - true - } else { - false - } - }; - return Ok(self.contains(start, condition_closure)); + let condition_closure = |pos| end < self.inv_list[pos + 1]; + Ok(self.contains(start, condition_closure)) } } #[cfg(test)] mod tests { use super::{parse_serial_string, UnicodeSet, BMP_MAX, CODEPOINT_MAX, CODEPOINT_MIN}; + use std::vec::Vec; // parse_serial_string #[test] @@ -238,16 +277,16 @@ mod tests { #[test] fn test_unicodeset_from_range() { let expected = vec![4, 10]; - let actual = UnicodeSet::from_range(&4, &10).unwrap().inv_list; + let actual = UnicodeSet::from_range(4, 10).unwrap().inv_list; assert_eq!(actual, expected); } #[test] fn test_unicodeset_from_range_bad_order() { - assert!(UnicodeSet::from_range(&10, &5).is_err()); + assert!(UnicodeSet::from_range(10, 5).is_err()); } #[test] fn test_unicodeset_from_range_out_of_bounds() { - assert!(UnicodeSet::from_range(&0, &0x110000).is_err()); + assert!(UnicodeSet::from_range(0, 0x110000).is_err()); } #[test] fn test_unicodeset_all() { @@ -264,54 +303,53 @@ mod tests { #[test] fn test_unicodeset_contains() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(check.contains_point(&2)); - assert!(check.contains_point(&4)); - assert!(check.contains_point(&10)); - assert!(check.contains_point(&14)); + assert!(check.contains_point(2)); + assert!(check.contains_point(4)); + assert!(check.contains_point(10)); + assert!(check.contains_point(14)); } #[test] fn test_unicodeset_contains_false() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(!check.contains_point(&1)); - assert!(!check.contains_point(&5)); - assert!(!check.contains_point(&9)); - assert!(!check.contains_point(&15)); - assert!(!check.contains_point(&16)); + assert!(!check.contains_point(1)); + assert!(!check.contains_point(5)); + assert!(!check.contains_point(9)); + assert!(!check.contains_point(15)); + assert!(!check.contains_point(16)); } #[test] fn test_unicodeset_contains_range() { let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(check.contains_range(&2, &5).unwrap()); - assert!(check.contains_range(&0, &9).unwrap()); - assert!(check.contains_range(&15, &24).unwrap()); + assert!(check.contains_range(2, 5).unwrap()); + assert!(check.contains_range(0, 9).unwrap()); + assert!(check.contains_range(15, 24).unwrap()); } #[test] fn test_unicodeset_contains_range_false() { let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(!check.contains_range(&0, &10).unwrap()); - assert!(!check.contains_range(&15, &25).unwrap()); - assert!(!check.contains_range(&0, &16).unwrap()); - assert!(!check.contains_range(&10, &15).unwrap()); - assert!(!check.contains_range(&11, &14).unwrap()); + assert!(!check.contains_range(0, 10).unwrap()); + assert!(!check.contains_range(15, 25).unwrap()); + assert!(!check.contains_range(0, 16).unwrap()); + assert!(!check.contains_range(10, 15).unwrap()); + assert!(!check.contains_range(11, 14).unwrap()); } #[test] fn test_unicodeset_contains_range_invalid() { let check = UnicodeSet::all(); - assert!(check.contains_range(&10, &0).is_err()); - assert!(check.contains_range(&0, &0).is_err()); + assert!(check.contains_range(10, 0).is_err()); + assert!(check.contains_range(0, 0).is_err()); } #[test] fn test_unicodeset_size() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert_eq!(8, check.size().unwrap()); + assert_eq!(8, check.size()); let check = UnicodeSet::all(); let expected = CODEPOINT_MAX + 1 - CODEPOINT_MIN; - assert_eq!(expected as usize, check.size().unwrap()); - } - #[test] - fn test_unicodeset_size_error() { - let check = UnicodeSet { inv_list: vec![0] }; - assert!(check.size().is_err()); + assert_eq!(expected as usize, check.size()); + let check = UnicodeSet { + inv_list: Vec::new(), + }; + assert_eq!(check.size(), 0); } #[test] fn test_unicodeset_is_empty() { From c41fa76ca1e9a560600d168dcefc18547015053c Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Fri, 26 Jun 2020 21:40:16 +0000 Subject: [PATCH 18/30] replace u32 with char and fix typos and optimizations --- components/uniset/src/uniset.rs | 81 +++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index a8185957a62..5828c52e0ca 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -1,8 +1,8 @@ -use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, vec::Vec}; +use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, str::FromStr, vec::Vec}; /// Represents the maximum Unicode Code Point, inclusive const CODEPOINT_MAX: u32 = 0x10FFFF; -/// Represents the minimum UNicode Code Point, inclusive +/// Represents the minimum Unicode Code Point, inclusive const CODEPOINT_MIN: u32 = 0x000000; /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0 , inclusive const BMP_MAX: u32 = 0xFFFF; @@ -69,6 +69,24 @@ pub struct UnicodeSet { inv_list: Vec, } +impl FromStr for UnicodeSet { + type Err = Box; + + fn from_str(serialize: &str) -> Result { + match parse_serial_string(serialize) { + Ok(serialize) => { + if serialize.len() % 2 != 0 { + return Err("Array length must be even".into()); + } + Ok(UnicodeSet { + inv_list: serialize, + }) + } + Err(e) => Err(e.into()), + } + } +} + impl UnicodeSet { /// Returns Result of UnicodeSet from serialized string /// @@ -152,9 +170,11 @@ impl UnicodeSet { if self.is_empty() { return 0; } - let end_point_sum: u32 = self.iter().skip(1).step_by(2).sum::(); - let start_point_sum: u32 = self.iter().step_by(2).sum::(); - (end_point_sum - start_point_sum) as usize + let mut sum = 0; + for (i, end_point) in self.iter().skip(1).step_by(2).enumerate() { + sum += end_point - self.inv_list[2 * i]; + } + sum as usize } /// Returns whether or not the UnicodeSet is empty @@ -171,11 +191,11 @@ impl UnicodeSet { /// /// `let condition_closure = |_: usize| -> bool {true};` /// `self.contains(10, condition_closure);` - fn contains(&self, query: u32, condition: C) -> bool + fn contains(&self, query: char, condition: C) -> bool where C: Fn(usize) -> bool, { - match self.inv_list.binary_search(&query) { + match self.inv_list.binary_search(&(query as u32)) { Ok(pos) => { if pos % 2 == 0 { condition(pos) @@ -199,7 +219,7 @@ impl UnicodeSet { /// in the set using `std::vec::Vec` implementation /// /// Example: `contains_point(&10)` - pub fn contains_point(&self, query: u32) -> bool { + pub fn contains_point(&self, query: char) -> bool { let condition_closure = |_| -> bool { true }; self.contains(query, condition_closure) } @@ -213,11 +233,11 @@ impl UnicodeSet { /// in a single `O(1)` step /// /// Example: `contains_range(&0, &10)` - pub fn contains_range(&self, start: u32, end: u32) -> Result> { + pub fn contains_range(&self, start: char, end: char) -> Result> { if start >= end { return Err("Range cannot be out of order".into()); } - let condition_closure = |pos| end < self.inv_list[pos + 1]; + let condition_closure = |pos| (end as u32) < self.inv_list[pos + 1]; Ok(self.contains(start, condition_closure)) } } @@ -225,6 +245,7 @@ impl UnicodeSet { #[cfg(test)] mod tests { use super::{parse_serial_string, UnicodeSet, BMP_MAX, CODEPOINT_MAX, CODEPOINT_MIN}; + use std::str::FromStr; use std::vec::Vec; // parse_serial_string @@ -267,7 +288,7 @@ mod tests { #[test] fn test_unicodeset_new() { let expected = vec![2, 3, 4, 5]; - let actual = UnicodeSet::new("4 2 3 4 5").unwrap().inv_list; + let actual = UnicodeSet::from_str("4 2 3 4 5").unwrap().inv_list; assert_eq!(actual, expected); } #[test] @@ -303,41 +324,41 @@ mod tests { #[test] fn test_unicodeset_contains() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(check.contains_point(2)); - assert!(check.contains_point(4)); - assert!(check.contains_point(10)); - assert!(check.contains_point(14)); + assert!(check.contains_point(2 as char)); + assert!(check.contains_point(4 as char)); + assert!(check.contains_point(10 as char)); + assert!(check.contains_point(14 as char)); } #[test] fn test_unicodeset_contains_false() { let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(!check.contains_point(1)); - assert!(!check.contains_point(5)); - assert!(!check.contains_point(9)); - assert!(!check.contains_point(15)); - assert!(!check.contains_point(16)); + assert!(!check.contains_point(1 as char)); + assert!(!check.contains_point(5 as char)); + assert!(!check.contains_point(9 as char)); + assert!(!check.contains_point(15 as char)); + assert!(!check.contains_point(16 as char)); } #[test] fn test_unicodeset_contains_range() { let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(check.contains_range(2, 5).unwrap()); - assert!(check.contains_range(0, 9).unwrap()); - assert!(check.contains_range(15, 24).unwrap()); + assert!(check.contains_range(2 as char, 5 as char).unwrap()); + assert!(check.contains_range(0 as char, 9 as char).unwrap()); + assert!(check.contains_range(15 as char, 24 as char).unwrap()); } #[test] fn test_unicodeset_contains_range_false() { let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(!check.contains_range(0, 10).unwrap()); - assert!(!check.contains_range(15, 25).unwrap()); - assert!(!check.contains_range(0, 16).unwrap()); - assert!(!check.contains_range(10, 15).unwrap()); - assert!(!check.contains_range(11, 14).unwrap()); + assert!(!check.contains_range(0 as char, 10 as char).unwrap()); + assert!(!check.contains_range(15 as char, 25 as char).unwrap()); + assert!(!check.contains_range(0 as char, 16 as char).unwrap()); + assert!(!check.contains_range(10 as char, 15 as char).unwrap()); + assert!(!check.contains_range(11 as char, 14 as char).unwrap()); } #[test] fn test_unicodeset_contains_range_invalid() { let check = UnicodeSet::all(); - assert!(check.contains_range(10, 0).is_err()); - assert!(check.contains_range(0, 0).is_err()); + assert!(check.contains_range(10 as char, 0 as char).is_err()); + assert!(check.contains_range(0 as char, 0 as char).is_err()); } #[test] fn test_unicodeset_size() { From da5eecf10b9c39caa00fcf6429f7b4fc3f3c26a2 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Thu, 9 Jul 2020 20:11:55 +0000 Subject: [PATCH 19/30] remove unnecessary imports, made more rusty --- .../uniset/meta/char_collection_lib_test.cmx | 5 - components/uniset/src/conversions.rs | 123 ++++++ components/uniset/src/lib.rs | 6 + components/uniset/src/uniset.rs | 395 +++++++----------- components/uniset/src/utils.rs | 64 +++ 5 files changed, 352 insertions(+), 241 deletions(-) delete mode 100644 components/uniset/meta/char_collection_lib_test.cmx create mode 100644 components/uniset/src/conversions.rs create mode 100644 components/uniset/src/utils.rs diff --git a/components/uniset/meta/char_collection_lib_test.cmx b/components/uniset/meta/char_collection_lib_test.cmx deleted file mode 100644 index 3bb56a96aa2..00000000000 --- a/components/uniset/meta/char_collection_lib_test.cmx +++ /dev/null @@ -1,5 +0,0 @@ -{ - "program": { - "binary": "test/char_collection_lib_test" - } -} \ No newline at end of file diff --git a/components/uniset/src/conversions.rs b/components/uniset/src/conversions.rs new file mode 100644 index 00000000000..484f693d166 --- /dev/null +++ b/components/uniset/src/conversions.rs @@ -0,0 +1,123 @@ +use crate::utils::deconstruct_range; +use crate::UnicodeSet; +use std::{ + convert::TryFrom, + ops::{Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, +}; + +fn try_from_range_impl(range: impl RangeBounds) -> Result { + let (from, till) = deconstruct_range(range); + if from < till { + let set = vec![from, till]; + Ok(UnicodeSet::try_from(set).unwrap()) + } else { + Err((from, till)) + } +} + +impl TryFrom> for UnicodeSet { + type Error = String; + + fn try_from(range: Range) -> Result { + match try_from_range_impl(range) { + Ok(u) => Ok(u), + Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), + } + } +} + +impl TryFrom> for UnicodeSet { + type Error = String; + + fn try_from(range: RangeFrom) -> Result { + match try_from_range_impl(range) { + Ok(u) => Ok(u), + Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), + } + } +} + +impl TryFrom for UnicodeSet { + type Error = String; + + fn try_from(_: RangeFull) -> Result { + Ok(UnicodeSet::all()) + } +} + +impl TryFrom> for UnicodeSet { + type Error = String; + + fn try_from(range: RangeInclusive) -> Result { + match try_from_range_impl(range) { + Ok(u) => Ok(u), + Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), + } + } +} + +impl TryFrom> for UnicodeSet { + type Error = String; + + fn try_from(range: RangeTo) -> Result { + match try_from_range_impl(range) { + Ok(u) => Ok(u), + Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), + } + } +} + +impl TryFrom> for UnicodeSet { + type Error = String; + + fn try_from(range: RangeToInclusive) -> Result { + Ok(try_from_range_impl(range).unwrap()) + } +} + +#[cfg(test)] +mod tests { + use crate::UnicodeSet; + use std::convert::TryFrom; + + #[test] + fn test_try_from_range() { + assert!(UnicodeSet::try_from('A'..'B').is_ok()); + } + #[test] + fn test_try_from_range_error() { + assert!(UnicodeSet::try_from('A'..'A').is_err()); + } + #[test] + fn test_try_from_range_inclusive() { + assert!(UnicodeSet::try_from('A'..='A').is_ok()); + } + #[test] + fn test_try_from_range_inclusive_err() { + assert!(UnicodeSet::try_from('B'..='A').is_err()); + } + #[test] + fn test_try_from_range_from() { + assert!(UnicodeSet::try_from('A'..).is_ok()); + } + #[test] + fn test_try_from_range_from_err() { + assert!(UnicodeSet::try_from((std::char::MAX)..).is_err()); + } + #[test] + fn test_try_from_range_to() { + assert!(UnicodeSet::try_from(..'A').is_ok()); + } + #[test] + fn test_try_from_range_to_err() { + assert!(UnicodeSet::try_from(..(0 as char)).is_err()); + } + #[test] + fn test_try_from_range_to_inclusive() { + assert!(UnicodeSet::try_from(..='A').is_ok()); + } + #[test] + fn test_try_from_range_full() { + assert!(UnicodeSet::try_from(..).is_ok()); + } +} diff --git a/components/uniset/src/lib.rs b/components/uniset/src/lib.rs index c9e5f2c2d87..1cff18c16f5 100644 --- a/components/uniset/src/lib.rs +++ b/components/uniset/src/lib.rs @@ -3,4 +3,10 @@ // found in the LICENSE file. #[macro_use] mod uniset; +mod conversions; +mod utils; +// mod iter; +pub use conversions::*; pub use uniset::UnicodeSet; +pub use utils::*; +// pub use iter::UnicodeSetIter; diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index 5828c52e0ca..b8b52e2b041 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -1,61 +1,15 @@ -use std::{boxed::Box, error::Error, iter::Iterator, slice::Iter, str::FromStr, vec::Vec}; +use std::{ + char::{from_u32, MAX}, + convert::TryFrom, + ops::RangeBounds, + slice::Iter, +}; -/// Represents the maximum Unicode Code Point, inclusive -const CODEPOINT_MAX: u32 = 0x10FFFF; -/// Represents the minimum Unicode Code Point, inclusive -const CODEPOINT_MIN: u32 = 0x000000; +use crate::utils::{deconstruct_range, is_sorted}; +// use crate::UnicodeSetIter; /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0 , inclusive const BMP_MAX: u32 = 0xFFFF; -/// Given string representation of inversion list create set -/// -/// See UnicodeSet::new for conditions -fn parse_serial_string(serialize_str: &str) -> Result, Box> { - let mut serialize = serialize_str.split(" "); - let capacity: usize = serialize - .next() - .ok_or(format!( - "Splitting by spaces did not yield a capacity: {:?}", - serialize - ))? - .parse()?; - if capacity % 2 != 0 { - return Err(format!("Capacity must be even. Parsed Capacity: {}", capacity).into()); - } - let mut serialized_vec: Vec = Vec::with_capacity(capacity); - let mut prev: Option = None; - for str_elem in serialize { - let parsed: u32 = str_elem.parse()?; - if serialized_vec.len() + 1 > serialized_vec.capacity() { - return Err(format!( - "Serialization capacity is too small. Allocated Capacity: {}", - capacity - ) - .into()); - } - if Some(parsed) <= prev { - return Err(format!( - "Serialization must be sorted. {:?} followed by {:?}", - Some(parsed), - prev - ) - .into()); - } - serialized_vec.push(parsed); - prev = Some(parsed); - } - if serialized_vec.len() % 2 != 0 { - return Err(format!( - "Serialization must be even. Serialization: {:?} Length: {}", - serialized_vec, - serialized_vec.len() - ) - .into()); - } - serialized_vec.shrink_to_fit(); // necessary if the length < capacity - Ok(serialized_vec) -} - /// UnicodeSet membership wrapper /// /// Provides exposure to membership functions and constructors from serialized UnicodeSets @@ -69,72 +23,28 @@ pub struct UnicodeSet { inv_list: Vec, } -impl FromStr for UnicodeSet { - type Err = Box; +impl TryFrom> for UnicodeSet { + type Error = String; - fn from_str(serialize: &str) -> Result { - match parse_serial_string(serialize) { - Ok(serialize) => { - if serialize.len() % 2 != 0 { - return Err("Array length must be even".into()); - } - Ok(UnicodeSet { - inv_list: serialize, - }) - } - Err(e) => Err(e.into()), + fn try_from(set: Vec) -> Result { + if is_sorted(&set) { + Ok(UnicodeSet { inv_list: set }) + } else { + Err(format!( + "UnicodeSet set must be sorted without duplicates: {:?}", + set + )) } } } impl UnicodeSet { - /// Returns Result of UnicodeSet from serialized string - /// - /// Returns an error if the serialized string fails to parse. - /// The serialized string requires starting capacity integer, followed by space delimited - /// integer code points. There must be an even number of elements (not including the - /// capacity int), and must be in ascending sorted order. - /// - /// Example String: `"4 0 5 10 15"` designates a capacity of size `4`, followed by 2 ranges - /// The ranges are `{0, 4}` and `{10, 14}` inclusive - pub fn new(serialize: &str) -> Result> { - match parse_serial_string(serialize) { - Ok(serialize) => Ok(UnicodeSet { - inv_list: serialize, - }), - Err(e) => Err(e), - } - } - - /// Returns Result of UnicodeSet from a single pair of integers defining a range - /// - /// `start`: inclusive, `end`: exclusive - /// - /// Returns an error if the range is invalid (out of order and out of bounds). - /// - /// Example Call: `UnicodeSet::from_range(&0, &15)` - pub fn from_range(start: u32, end: u32) -> Result> { - if start > end { - return Err(format!("Range is out of order. start: {} end: {}", start, end).into()); - } - if start < CODEPOINT_MIN || end > CODEPOINT_MAX { - return Err(format!( - "Range is out of bounds. start: {}, min: {}, end: {}, max: {}", - start, CODEPOINT_MIN, end, CODEPOINT_MAX - ) - .into()); - } - Ok(UnicodeSet { - inv_list: vec![start, end], - }) - } - /// Returns UnicodeSet spanning entire Unicode range /// /// The range spans from `0x0 -> 0x10FFFF` inclusive pub fn all() -> UnicodeSet { UnicodeSet { - inv_list: vec![CODEPOINT_MIN, CODEPOINT_MAX + 1], + inv_list: vec![0, (MAX as u32) + 1], } } @@ -143,7 +53,7 @@ impl UnicodeSet { /// The range spans from `0x0 -> 0xFFFF` inclusive pub fn bmp() -> UnicodeSet { UnicodeSet { - inv_list: vec![CODEPOINT_MIN, BMP_MAX + 1], + inv_list: vec![0, BMP_MAX + 1], } } @@ -153,28 +63,54 @@ impl UnicodeSet { /// /// ``` /// use icu4x_unicodeset::UnicodeSet; - /// let example = UnicodeSet::new("4 0 10 15 20"); - /// let mut example_iter = example.iter(); - /// example_iter.next(); // => 0 - /// example_iter.next(); // => 10 - /// example_iter.next(); // => 10, etc. + /// use std::convert::TryFrom; + /// let example_list = vec![0, 10, 15, 20]; + /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let mut example_ranges = example.ranges(); + /// assert_eq!(Some(&0), example_ranges.next()); + /// assert_eq!(Some(&10), example_ranges.next()); + /// assert_eq!(Some(&15), example_ranges.next()); + /// assert_eq!(Some(&20), example_ranges.next()); + /// assert_eq!(None, example_ranges.next()); /// ``` - pub fn iter(&self) -> Iter { + pub fn ranges(&self) -> Iter { self.inv_list.iter() } - /// Returns the number of elements of the UnicodeSet + /// Yields an iterator going through the character set in the UnicodeSet /// - /// Returns an error if a complete range is not defined in the UnicodeSet + /// Example: + /// + /// ``` + /// use icu4x_unicodeset::UnicodeSet; + /// use std::convert::TryFrom; + /// let example_list = vec![65, 68, 69, 70]; + /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let mut example_iter = example.iter(); + /// assert_eq!(Some('A'), example_iter.next()); + /// assert_eq!(Some('B'), example_iter.next()); + /// assert_eq!(Some('C'), example_iter.next()); + /// assert_eq!(Some('E'), example_iter.next()); + /// assert_eq!(None, example_iter.next()); + /// ``` + pub fn iter(&self) -> impl Iterator + '_ { + self.inv_list + .chunks(2) + .flat_map(|pair| (pair[0]..pair[1])) + .map(|val| from_u32(val).unwrap()) + } + + /// Returns the number of elements of the UnicodeSet pub fn size(&self) -> usize { if self.is_empty() { return 0; } - let mut sum = 0; - for (i, end_point) in self.iter().skip(1).step_by(2).enumerate() { - sum += end_point - self.inv_list[2 * i]; - } - sum as usize + let s: u32 = self + .inv_list + .chunks(2) + .map(|end_points| end_points[1] - end_points[0]) + .sum(); + s as usize } /// Returns whether or not the UnicodeSet is empty @@ -184,30 +120,21 @@ impl UnicodeSet { /// Wrapper for contains /// - /// Takes in a single code point `query`, and a closure `condition` - /// to see if the `query` is located in the inversion list. - /// - /// Example: - /// - /// `let condition_closure = |_: usize| -> bool {true};` - /// `self.contains(10, condition_closure);` - fn contains(&self, query: char, condition: C) -> bool - where - C: Fn(usize) -> bool, - { - match self.inv_list.binary_search(&(query as u32)) { + /// Returns an Option as to whether or not it is possible for the query to be contained + fn contains_impl(&self, query: u32) -> Option { + match self.inv_list.binary_search(&query) { Ok(pos) => { if pos % 2 == 0 { - condition(pos) + Some(pos) } else { - false + None } } Err(pos) => { if pos % 2 != 0 && pos < self.inv_list.len() { - condition(pos) + Some(pos) } else { - false + None } } } @@ -218,10 +145,21 @@ impl UnicodeSet { /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points /// in the set using `std::vec::Vec` implementation /// - /// Example: `contains_point(&10)` - pub fn contains_point(&self, query: char) -> bool { - let condition_closure = |_| -> bool { true }; - self.contains(query, condition_closure) + /// Example: + /// + /// ``` + /// use icu4x_unicodeset::UnicodeSet; + /// use std::convert::TryFrom; + /// let example_list = vec![65, 67, 68, 69]; + /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// assert!(example.contains('A')); + /// assert!(!example.contains('C')); + /// ``` + pub fn contains(&self, query: char) -> bool { + match self.contains_impl(query as u32) { + Some(_) => true, + None => false, + } } /// Checks to see if the range is in the UnicodeSet, returns a Result @@ -232,140 +170,103 @@ impl UnicodeSet { /// Only runs the search once on the `start` parameter, while the `end` parameter is checked /// in a single `O(1)` step /// - /// Example: `contains_range(&0, &10)` - pub fn contains_range(&self, start: char, end: char) -> Result> { - if start >= end { - return Err("Range cannot be out of order".into()); + /// Example: + /// + /// ``` + /// use icu4x_unicodeset::UnicodeSet; + /// use std::convert::TryFrom; + /// let example_list = vec![65, 67, 68, 69]; + /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// assert!(example.contains_range('A'..'C')); + /// assert!(example.contains_range('A'..='B')); + /// assert!(!example.contains_range('A'..='C')); + /// ``` + pub fn contains_range(&self, range: impl RangeBounds) -> bool { + let (from, till) = deconstruct_range(range); + if from >= till { + return false; + } + match self.contains_impl(from) { + Some(pos) => (till) <= self.inv_list[pos + 1], + None => false, } - let condition_closure = |pos| (end as u32) < self.inv_list[pos + 1]; - Ok(self.contains(start, condition_closure)) } } #[cfg(test)] mod tests { - use super::{parse_serial_string, UnicodeSet, BMP_MAX, CODEPOINT_MAX, CODEPOINT_MIN}; - use std::str::FromStr; - use std::vec::Vec; - - // parse_serial_string - #[test] - fn test_parse_serial_string() { - let expected = vec![2, 3, 4, 5]; - let actual = parse_serial_string("4 2 3 4 5").unwrap(); - assert_eq!(actual, expected); - } - #[test] - fn test_parse_serial_string_no_char() { - assert!(parse_serial_string("4 2 A 3 4 5").is_err()); - } - #[test] - fn test_parse_serial_string_empty() { - assert!(parse_serial_string("").is_err()); - } - #[test] - fn test_parse_serial_string_wrong_format() { - assert!(parse_serial_string("[4, 2, 3, 4, 5 ]").is_err()); - } - #[test] - fn test_parse_serial_string_wrong_order() { - assert!(parse_serial_string("4 1 0 4 2").is_err()); - } - #[test] - fn test_parse_serial_string_single_char_error() { - assert!(parse_serial_string("4 1 1 2 2").is_err()); - } - #[test] - fn test_parse_serial_string_capacity_not_even() { - assert!(parse_serial_string("3 2 3 4").is_err()); - } - #[test] - fn test_parse_serial_string_size_not_even() { - assert!(parse_serial_string("4 3 2 1").is_err()); - } + use super::{UnicodeSet, BMP_MAX}; + use std::{char::MAX, convert::TryFrom, vec::Vec}; - // UnicodeSet constructors #[test] - fn test_unicodeset_new() { - let expected = vec![2, 3, 4, 5]; - let actual = UnicodeSet::from_str("4 2 3 4 5").unwrap().inv_list; - assert_eq!(actual, expected); + fn test_unicodeset_try_from_vec() { + let check = vec![2, 3, 4, 5]; + assert!(UnicodeSet::try_from(check).is_ok()); } #[test] - fn test_unicodeset_new_error() { - assert!(UnicodeSet::new("3 2 4 3").is_err()); - } - #[test] - fn test_unicodeset_from_range() { - let expected = vec![4, 10]; - let actual = UnicodeSet::from_range(4, 10).unwrap().inv_list; - assert_eq!(actual, expected); - } - #[test] - fn test_unicodeset_from_range_bad_order() { - assert!(UnicodeSet::from_range(10, 5).is_err()); - } - #[test] - fn test_unicodeset_from_range_out_of_bounds() { - assert!(UnicodeSet::from_range(0, 0x110000).is_err()); + fn test_unicodeset_try_from_vec_error() { + let check = vec![1, 1, 2, 3, 4]; + assert!(UnicodeSet::try_from(check).is_err()); } #[test] fn test_unicodeset_all() { - let expected = vec![CODEPOINT_MIN, CODEPOINT_MAX + 1]; + let expected = vec![0, (MAX as u32) + 1]; assert_eq!(UnicodeSet::all().inv_list, expected); } #[test] fn test_unicodeset_bmp() { - let expected = vec![CODEPOINT_MIN, BMP_MAX + 1]; + let expected = vec![0, BMP_MAX + 1]; assert_eq!(UnicodeSet::bmp().inv_list, expected); } // UnicodeSet membership functions #[test] fn test_unicodeset_contains() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(check.contains_point(2 as char)); - assert!(check.contains_point(4 as char)); - assert!(check.contains_point(10 as char)); - assert!(check.contains_point(14 as char)); + let ex = vec![2, 5, 10, 15]; + let check = UnicodeSet::try_from(ex).unwrap(); + assert!(check.contains(2 as char)); + assert!(check.contains(4 as char)); + assert!(check.contains(10 as char)); + assert!(check.contains(14 as char)); } #[test] fn test_unicodeset_contains_false() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); - assert!(!check.contains_point(1 as char)); - assert!(!check.contains_point(5 as char)); - assert!(!check.contains_point(9 as char)); - assert!(!check.contains_point(15 as char)); - assert!(!check.contains_point(16 as char)); + let ex = vec![2, 5, 10, 15]; + let check = UnicodeSet::try_from(ex).unwrap(); + assert!(!check.contains(1 as char)); + assert!(!check.contains(5 as char)); + assert!(!check.contains(9 as char)); + assert!(!check.contains(15 as char)); + assert!(!check.contains(16 as char)); } #[test] fn test_unicodeset_contains_range() { - let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(check.contains_range(2 as char, 5 as char).unwrap()); - assert!(check.contains_range(0 as char, 9 as char).unwrap()); - assert!(check.contains_range(15 as char, 24 as char).unwrap()); + let ex = vec![65, 70, 75, 85]; + let check = UnicodeSet::try_from(ex).unwrap(); + assert!(check.contains_range('A'..='E')); // 65 - 69 + assert!(check.contains_range('K'..'U')); // 75 - 84 } #[test] fn test_unicodeset_contains_range_false() { - let check = UnicodeSet::new("4 0 10 15 25").unwrap(); - assert!(!check.contains_range(0 as char, 10 as char).unwrap()); - assert!(!check.contains_range(15 as char, 25 as char).unwrap()); - assert!(!check.contains_range(0 as char, 16 as char).unwrap()); - assert!(!check.contains_range(10 as char, 15 as char).unwrap()); - assert!(!check.contains_range(11 as char, 14 as char).unwrap()); + let ex = vec![65, 70, 75, 85]; + let check = UnicodeSet::try_from(ex).unwrap(); + assert!(!check.contains_range('!'..'A')); // 33 - 65 + assert!(!check.contains_range('F'..'K')); // 70 - 74 + assert!(!check.contains_range('U'..)); } #[test] fn test_unicodeset_contains_range_invalid() { let check = UnicodeSet::all(); - assert!(check.contains_range(10 as char, 0 as char).is_err()); - assert!(check.contains_range(0 as char, 0 as char).is_err()); + assert!(!check.contains_range('A'..'!')); // 65 - 33 + assert!(!check.contains_range('A'..'A')); } #[test] fn test_unicodeset_size() { - let check = UnicodeSet::new("4 2 5 10 15").unwrap(); + let ex = vec![2, 5, 10, 15]; + let check = UnicodeSet::try_from(ex).unwrap(); assert_eq!(8, check.size()); let check = UnicodeSet::all(); - let expected = CODEPOINT_MAX + 1 - CODEPOINT_MIN; + let expected = (MAX as u32) + 1; assert_eq!(expected as usize, check.size()); let check = UnicodeSet { inv_list: Vec::new(), @@ -384,4 +285,26 @@ mod tests { let check = UnicodeSet::all(); assert!(!check.is_empty()); } + #[test] + fn test_unicodeset_ranges() { + let ex = vec![65, 70, 75, 85]; + let check = UnicodeSet::try_from(ex).unwrap(); + let mut iter = check.ranges(); + assert_eq!(iter.next().unwrap(), &65); + assert_eq!(iter.next().unwrap(), &70); + assert_eq!(iter.next().unwrap(), &75); + assert_eq!(iter.next().unwrap(), &85); + assert_eq!(iter.next(), None); + } + #[test] + fn test_unicodeset_iter() { + let ex = vec![65, 68, 69, 70]; + let check = UnicodeSet::try_from(ex).unwrap(); + let mut iter = check.iter(); + assert_eq!(Some('A'), iter.next()); + assert_eq!(Some('B'), iter.next()); + assert_eq!(Some('C'), iter.next()); + assert_eq!(Some('E'), iter.next()); + assert_eq!(None, iter.next()); + } } diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs new file mode 100644 index 00000000000..562c9f08af8 --- /dev/null +++ b/components/uniset/src/utils.rs @@ -0,0 +1,64 @@ +use std::{ + char::MAX, + ops::{Bound::*, RangeBounds}, +}; + +/// Returns whether the vector is sorted ascending non inclusive +pub fn is_sorted(v: &Vec) -> bool { + v.chunks(2).all(|chunk| chunk[0] < chunk[1]) +} + +/// Returns start (inclusive) and end (exclusive) bounds of RangeBounds +pub fn deconstruct_range(range: impl RangeBounds) -> (u32, u32) { + let from = match range.start_bound() { + Included(b) => (*b as u32), + Excluded(b) => (*b as u32), + Unbounded => 0, + }; + let till = match range.end_bound() { + Included(b) => (*b as u32) + 1, + Excluded(b) => (*b as u32), + Unbounded => MAX as u32, + }; + (from, till) +} + +#[cfg(test)] +mod tests { + use super::{deconstruct_range, is_sorted}; + use std::char::MAX; + + #[test] + fn test_is_sorted() { + let check = vec![2, 3, 4, 5]; + assert!(is_sorted(&check)); + } + #[test] + fn test_is_sorted_out_of_order() { + let check = vec![5, 4, 5, 6, 7]; + assert!(!is_sorted(&check)); + } + #[test] + fn test_is_sorted_duplicate() { + let check = vec![1, 2, 3, 3, 5]; + assert!(!is_sorted(&check)); + } + + // deconstruct_range + #[test] + fn test_deconstruct_range() { + let expected = (65, 69); + let check = deconstruct_range('A'..'E'); // Range + assert_eq!(check, expected); + let check = deconstruct_range('A'..='D'); // Range Inclusive + assert_eq!(check, expected); + let check = deconstruct_range('A'..); // Range From + assert_eq!(check, (65, MAX as u32)); + let check = deconstruct_range(..'A'); // Range To + assert_eq!(check, (0, 65)); + let check = deconstruct_range(..='A'); // Range To Inclusive + assert_eq!(check, (0, 66)); + let check = deconstruct_range(..); // Range Full + assert_eq!(check, (0, MAX as u32)); + } +} From 22b4fe0ff9689e58912ca73e80bb40a4e6391f50 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Thu, 9 Jul 2020 20:20:02 +0000 Subject: [PATCH 20/30] clipply fix --- components/uniset/src/uniset.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index b8b52e2b041..3c1a3b2c4ee 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -156,10 +156,7 @@ impl UnicodeSet { /// assert!(!example.contains('C')); /// ``` pub fn contains(&self, query: char) -> bool { - match self.contains_impl(query as u32) { - Some(_) => true, - None => false, - } + self.contains_impl(query as u32).is_some() } /// Checks to see if the range is in the UnicodeSet, returns a Result From 8d9138eaab753c91dab1717faa40f4d59704c4e7 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Thu, 9 Jul 2020 20:40:00 +0000 Subject: [PATCH 21/30] more clippy lint fixes --- components/uniset/src/utils.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs index 562c9f08af8..c324360a698 100644 --- a/components/uniset/src/utils.rs +++ b/components/uniset/src/utils.rs @@ -4,7 +4,7 @@ use std::{ }; /// Returns whether the vector is sorted ascending non inclusive -pub fn is_sorted(v: &Vec) -> bool { +pub fn is_sorted(v: &[u32]) -> bool { v.chunks(2).all(|chunk| chunk[0] < chunk[1]) } From dac8a4a4551b03c68c161ae41c2b752545fe50e8 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 14 Jul 2020 20:08:50 +0000 Subject: [PATCH 22/30] Architecture checks minus benchmarks --- components/uniset/Cargo.toml | 2 +- components/uniset/src/conversions.rs | 100 +++++++++++++-------------- components/uniset/src/lib.rs | 13 ++-- components/uniset/src/uniset.rs | 68 +++++++++--------- components/uniset/src/utils.rs | 54 +++++++++------ 5 files changed, 123 insertions(+), 114 deletions(-) diff --git a/components/uniset/Cargo.toml b/components/uniset/Cargo.toml index b7e4103b289..bbe4fe2c01b 100644 --- a/components/uniset/Cargo.toml +++ b/components/uniset/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "icu4x-unicodeset" +name = "icu-unicodeset" description = "API for managing Unicode Language and Locale Identifiers" version = "0.0.1" authors = ["The ICU4X Project Developers"] diff --git a/components/uniset/src/conversions.rs b/components/uniset/src/conversions.rs index 484f693d166..68e039e9086 100644 --- a/components/uniset/src/conversions.rs +++ b/components/uniset/src/conversions.rs @@ -1,3 +1,4 @@ +use super::USetError; use crate::utils::deconstruct_range; use crate::UnicodeSet; use std::{ @@ -5,119 +6,116 @@ use std::{ ops::{Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, }; -fn try_from_range_impl(range: impl RangeBounds) -> Result { +fn try_from_range_impl(range: &impl RangeBounds) -> Result { let (from, till) = deconstruct_range(range); if from < till { let set = vec![from, till]; Ok(UnicodeSet::try_from(set).unwrap()) } else { - Err((from, till)) + Err(USetError::InvalidRange(from, till)) } } -impl TryFrom> for UnicodeSet { - type Error = String; +impl TryFrom<&Range> for UnicodeSet { + type Error = USetError; - fn try_from(range: Range) -> Result { - match try_from_range_impl(range) { - Ok(u) => Ok(u), - Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), - } + fn try_from(range: &Range) -> Result { + try_from_range_impl(range) } } -impl TryFrom> for UnicodeSet { - type Error = String; +impl TryFrom<&RangeFrom> for UnicodeSet { + type Error = USetError; - fn try_from(range: RangeFrom) -> Result { - match try_from_range_impl(range) { - Ok(u) => Ok(u), - Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), - } + fn try_from(range: &RangeFrom) -> Result { + try_from_range_impl(range) } } -impl TryFrom for UnicodeSet { - type Error = String; +impl TryFrom<&RangeFull> for UnicodeSet { + type Error = USetError; - fn try_from(_: RangeFull) -> Result { + fn try_from(_: &RangeFull) -> Result { Ok(UnicodeSet::all()) } } -impl TryFrom> for UnicodeSet { - type Error = String; +impl TryFrom<&RangeInclusive> for UnicodeSet { + type Error = USetError; - fn try_from(range: RangeInclusive) -> Result { - match try_from_range_impl(range) { - Ok(u) => Ok(u), - Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), - } + fn try_from(range: &RangeInclusive) -> Result { + try_from_range_impl(range) } } -impl TryFrom> for UnicodeSet { - type Error = String; +impl TryFrom<&RangeTo> for UnicodeSet { + type Error = USetError; - fn try_from(range: RangeTo) -> Result { - match try_from_range_impl(range) { - Ok(u) => Ok(u), - Err((from, till)) => Err(format!("Range must be ascending: {} - {}", from, till)), - } + fn try_from(range: &RangeTo) -> Result { + try_from_range_impl(range) } } -impl TryFrom> for UnicodeSet { - type Error = String; +impl TryFrom<&RangeToInclusive> for UnicodeSet { + type Error = USetError; - fn try_from(range: RangeToInclusive) -> Result { - Ok(try_from_range_impl(range).unwrap()) + fn try_from(range: &RangeToInclusive) -> Result { + try_from_range_impl(range) } } #[cfg(test)] mod tests { + use super::USetError; use crate::UnicodeSet; use std::convert::TryFrom; - #[test] fn test_try_from_range() { - assert!(UnicodeSet::try_from('A'..'B').is_ok()); + let check: Vec = UnicodeSet::try_from(&('A'..'B')).unwrap().iter().collect(); + assert_eq!(vec!['A'], check); } #[test] fn test_try_from_range_error() { - assert!(UnicodeSet::try_from('A'..'A').is_err()); + let check = UnicodeSet::try_from(&('A'..'A')); + assert_eq!(Err(USetError::InvalidRange(65, 65)), check); } #[test] fn test_try_from_range_inclusive() { - assert!(UnicodeSet::try_from('A'..='A').is_ok()); + let check: Vec = UnicodeSet::try_from(&('A'..='A')).unwrap().iter().collect(); + assert_eq!(vec!['A'], check); } #[test] fn test_try_from_range_inclusive_err() { - assert!(UnicodeSet::try_from('B'..='A').is_err()); + let check = UnicodeSet::try_from(&('B'..'A')); + assert_eq!(Err(USetError::InvalidRange(66, 65)), check); } #[test] fn test_try_from_range_from() { - assert!(UnicodeSet::try_from('A'..).is_ok()); - } - #[test] - fn test_try_from_range_from_err() { - assert!(UnicodeSet::try_from((std::char::MAX)..).is_err()); + let uset = UnicodeSet::try_from(&('A'..)).unwrap(); + let check: Vec<&u32> = uset.ranges().collect(); + assert_eq!(vec![&65, &((std::char::MAX as u32) + 1)], check); } #[test] fn test_try_from_range_to() { - assert!(UnicodeSet::try_from(..'A').is_ok()); + let uset = UnicodeSet::try_from(&(..'A')).unwrap(); + let check: Vec<&u32> = uset.ranges().collect(); + assert_eq!(vec![&0, &65], check); } #[test] fn test_try_from_range_to_err() { - assert!(UnicodeSet::try_from(..(0 as char)).is_err()); + let check = UnicodeSet::try_from(&(..(0 as char))); + assert_eq!(Err(USetError::InvalidRange(0, 0)), check); } #[test] fn test_try_from_range_to_inclusive() { - assert!(UnicodeSet::try_from(..='A').is_ok()); + let uset = UnicodeSet::try_from(&(..='A')).unwrap(); + let check: Vec<&u32> = uset.ranges().collect(); + assert_eq!(vec![&0, &66], check); } #[test] fn test_try_from_range_full() { - assert!(UnicodeSet::try_from(..).is_ok()); + let uset = UnicodeSet::try_from(&(..)).unwrap(); + let check: Vec<&u32> = uset.ranges().collect(); + assert_eq!(vec![&0, &((std::char::MAX as u32) + 1)], check); } } diff --git a/components/uniset/src/lib.rs b/components/uniset/src/lib.rs index 1cff18c16f5..870eb001e30 100644 --- a/components/uniset/src/lib.rs +++ b/components/uniset/src/lib.rs @@ -1,12 +1,15 @@ -// Copyright 2019 The Fuchsia Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. #[macro_use] mod uniset; mod conversions; mod utils; -// mod iter; + pub use conversions::*; pub use uniset::UnicodeSet; pub use utils::*; -// pub use iter::UnicodeSetIter; + +/// Custom Errors for UnicodeSet. +#[derive(Debug, PartialEq)] +pub enum USetError { + InvalidSet(Vec), + InvalidRange(u32, u32), +} diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index 3c1a3b2c4ee..ae58d99f9b9 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -5,8 +5,8 @@ use std::{ slice::Iter, }; -use crate::utils::{deconstruct_range, is_sorted}; -// use crate::UnicodeSetIter; +use super::USetError; +use crate::utils::{deconstruct_range, is_valid}; /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0 , inclusive const BMP_MAX: u32 = 0xFFFF; @@ -14,8 +14,8 @@ const BMP_MAX: u32 = 0xFFFF; /// /// Provides exposure to membership functions and constructors from serialized UnicodeSets /// and predefined ranges. -/// Implements an inversion list. -//#[derive(Copy, Clone, Debug, Eq)] +/// Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list) +#[derive(Debug, PartialEq)] pub struct UnicodeSet { // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html @@ -24,16 +24,13 @@ pub struct UnicodeSet { } impl TryFrom> for UnicodeSet { - type Error = String; + type Error = USetError; fn try_from(set: Vec) -> Result { - if is_sorted(&set) { + if is_valid(&set) { Ok(UnicodeSet { inv_list: set }) } else { - Err(format!( - "UnicodeSet set must be sorted without duplicates: {:?}", - set - )) + Err(USetError::InvalidSet(set)) } } } @@ -62,7 +59,7 @@ impl UnicodeSet { /// Example: /// /// ``` - /// use icu4x_unicodeset::UnicodeSet; + /// use icu_unicodeset::UnicodeSet; /// use std::convert::TryFrom; /// let example_list = vec![0, 10, 15, 20]; /// let example = UnicodeSet::try_from(example_list).unwrap(); @@ -82,7 +79,7 @@ impl UnicodeSet { /// Example: /// /// ``` - /// use icu4x_unicodeset::UnicodeSet; + /// use icu_unicodeset::UnicodeSet; /// use std::convert::TryFrom; /// let example_list = vec![65, 68, 69, 70]; /// let example = UnicodeSet::try_from(example_list).unwrap(); @@ -115,7 +112,7 @@ impl UnicodeSet { /// Returns whether or not the UnicodeSet is empty pub fn is_empty(&self) -> bool { - self.inv_list.len() < 2 // unsure if this is appropriate definition of just self.inv_list.is_empty() + self.inv_list.is_empty() } /// Wrapper for contains @@ -148,7 +145,7 @@ impl UnicodeSet { /// Example: /// /// ``` - /// use icu4x_unicodeset::UnicodeSet; + /// use icu_unicodeset::UnicodeSet; /// use std::convert::TryFrom; /// let example_list = vec![65, 67, 68, 69]; /// let example = UnicodeSet::try_from(example_list).unwrap(); @@ -170,15 +167,15 @@ impl UnicodeSet { /// Example: /// /// ``` - /// use icu4x_unicodeset::UnicodeSet; + /// use icu_unicodeset::UnicodeSet; /// use std::convert::TryFrom; /// let example_list = vec![65, 67, 68, 69]; /// let example = UnicodeSet::try_from(example_list).unwrap(); - /// assert!(example.contains_range('A'..'C')); - /// assert!(example.contains_range('A'..='B')); - /// assert!(!example.contains_range('A'..='C')); + /// assert!(example.contains_range(&('A'..'C'))); + /// assert!(example.contains_range(&('A'..='B'))); + /// assert!(!example.contains_range(&('A'..='C'))); /// ``` - pub fn contains_range(&self, range: impl RangeBounds) -> bool { + pub fn contains_range(&self, range: &impl RangeBounds) -> bool { let (from, till) = deconstruct_range(range); if from >= till { return false; @@ -192,18 +189,19 @@ impl UnicodeSet { #[cfg(test)] mod tests { - use super::{UnicodeSet, BMP_MAX}; + use super::{USetError, UnicodeSet, BMP_MAX}; use std::{char::MAX, convert::TryFrom, vec::Vec}; #[test] fn test_unicodeset_try_from_vec() { - let check = vec![2, 3, 4, 5]; - assert!(UnicodeSet::try_from(check).is_ok()); + let check = UnicodeSet::try_from(vec![2, 3, 4, 5]).unwrap().inv_list; + assert_eq!(vec![2, 3, 4, 5], check); } #[test] fn test_unicodeset_try_from_vec_error() { let check = vec![1, 1, 2, 3, 4]; - assert!(UnicodeSet::try_from(check).is_err()); + let set = UnicodeSet::try_from(vec![1, 1, 2, 3, 4]); + assert_eq!(Err(USetError::InvalidSet(check)), set); } #[test] fn test_unicodeset_all() { @@ -240,22 +238,22 @@ mod tests { fn test_unicodeset_contains_range() { let ex = vec![65, 70, 75, 85]; let check = UnicodeSet::try_from(ex).unwrap(); - assert!(check.contains_range('A'..='E')); // 65 - 69 - assert!(check.contains_range('K'..'U')); // 75 - 84 + assert!(check.contains_range(&('A'..='E'))); // 65 - 69 + assert!(check.contains_range(&('K'..'U'))); // 75 - 84 } #[test] fn test_unicodeset_contains_range_false() { let ex = vec![65, 70, 75, 85]; let check = UnicodeSet::try_from(ex).unwrap(); - assert!(!check.contains_range('!'..'A')); // 33 - 65 - assert!(!check.contains_range('F'..'K')); // 70 - 74 - assert!(!check.contains_range('U'..)); + assert!(!check.contains_range(&('!'..'A'))); // 33 - 65 + assert!(!check.contains_range(&('F'..'K'))); // 70 - 74 + assert!(!check.contains_range(&('U'..))); } #[test] fn test_unicodeset_contains_range_invalid() { let check = UnicodeSet::all(); - assert!(!check.contains_range('A'..'!')); // 65 - 33 - assert!(!check.contains_range('A'..'A')); + assert!(!check.contains_range(&('A'..'!'))); // 65 - 33 + assert!(!check.contains_range(&('A'..'A'))); } #[test] fn test_unicodeset_size() { @@ -274,8 +272,6 @@ mod tests { fn test_unicodeset_is_empty() { let check = UnicodeSet { inv_list: vec![] }; assert!(check.is_empty()); - let check = UnicodeSet { inv_list: vec![0] }; - assert!(check.is_empty()); } #[test] fn test_unicodeset_is_not_empty() { @@ -287,10 +283,10 @@ mod tests { let ex = vec![65, 70, 75, 85]; let check = UnicodeSet::try_from(ex).unwrap(); let mut iter = check.ranges(); - assert_eq!(iter.next().unwrap(), &65); - assert_eq!(iter.next().unwrap(), &70); - assert_eq!(iter.next().unwrap(), &75); - assert_eq!(iter.next().unwrap(), &85); + assert_eq!(iter.next(), Some(&65)); + assert_eq!(iter.next(), Some(&70)); + assert_eq!(iter.next(), Some(&75)); + assert_eq!(iter.next(), Some(&85)); assert_eq!(iter.next(), None); } #[test] diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs index c324360a698..a953074e285 100644 --- a/components/uniset/src/utils.rs +++ b/components/uniset/src/utils.rs @@ -3,13 +3,16 @@ use std::{ ops::{Bound::*, RangeBounds}, }; -/// Returns whether the vector is sorted ascending non inclusive -pub fn is_sorted(v: &[u32]) -> bool { - v.chunks(2).all(|chunk| chunk[0] < chunk[1]) +/// Returns whether the vector is sorted ascending non inclusive, of even length, +/// and within the bounds of `0x0 -> 0x10FFFF` inclusive. +pub fn is_valid(v: &[u32]) -> bool { + v.len() % 2 == 0 + && v.chunks(2).all(|chunk| chunk[0] < chunk[1]) + && v[v.len() - 1] <= (MAX as u32) + 1 } /// Returns start (inclusive) and end (exclusive) bounds of RangeBounds -pub fn deconstruct_range(range: impl RangeBounds) -> (u32, u32) { +pub fn deconstruct_range(range: &impl RangeBounds) -> (u32, u32) { let from = match range.start_bound() { Included(b) => (*b as u32), Excluded(b) => (*b as u32), @@ -18,47 +21,56 @@ pub fn deconstruct_range(range: impl RangeBounds) -> (u32, u32) { let till = match range.end_bound() { Included(b) => (*b as u32) + 1, Excluded(b) => (*b as u32), - Unbounded => MAX as u32, + Unbounded => (MAX as u32) + 1, }; (from, till) } #[cfg(test)] mod tests { - use super::{deconstruct_range, is_sorted}; + use super::{deconstruct_range, is_valid}; use std::char::MAX; #[test] - fn test_is_sorted() { + fn test_is_valid() { let check = vec![2, 3, 4, 5]; - assert!(is_sorted(&check)); + assert!(is_valid(&check)); } #[test] - fn test_is_sorted_out_of_order() { + fn test_is_valid_out_of_order() { let check = vec![5, 4, 5, 6, 7]; - assert!(!is_sorted(&check)); + assert!(!is_valid(&check)); } #[test] - fn test_is_sorted_duplicate() { + fn test_is_valid_duplicate() { let check = vec![1, 2, 3, 3, 5]; - assert!(!is_sorted(&check)); + assert!(!is_valid(&check)); + } + #[test] + fn test_is_valid_odd() { + let check = vec![1, 2, 3, 4, 5]; + assert!(!is_valid(&check)); + } + #[test] + fn test_is_valid_out_of_range() { + let check = vec![1, 2, 3, 4, (MAX as u32) + 1]; + assert!(!is_valid(&check)); } - // deconstruct_range #[test] fn test_deconstruct_range() { let expected = (65, 69); - let check = deconstruct_range('A'..'E'); // Range + let check = deconstruct_range(&('A'..'E')); // Range assert_eq!(check, expected); - let check = deconstruct_range('A'..='D'); // Range Inclusive + let check = deconstruct_range(&('A'..='D')); // Range Inclusive assert_eq!(check, expected); - let check = deconstruct_range('A'..); // Range From - assert_eq!(check, (65, MAX as u32)); - let check = deconstruct_range(..'A'); // Range To + let check = deconstruct_range(&('A'..)); // Range From + assert_eq!(check, (65, (MAX as u32) + 1)); + let check = deconstruct_range(&(..'A')); // Range To assert_eq!(check, (0, 65)); - let check = deconstruct_range(..='A'); // Range To Inclusive + let check = deconstruct_range(&(..='A')); // Range To Inclusive assert_eq!(check, (0, 66)); - let check = deconstruct_range(..); // Range Full - assert_eq!(check, (0, MAX as u32)); + let check = deconstruct_range(&(..)); // Range Full + assert_eq!(check, (0, (MAX as u32) + 1)); } } From 7fd908202d3ceeb4c7cac1e224d2dc5cb6db44d8 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Tue, 14 Jul 2020 23:07:06 +0000 Subject: [PATCH 23/30] added benchmarks and fixed surrogate code points in iter --- components/uniset/Cargo.toml | 7 ++++ components/uniset/benches/inv_list.rs | 59 +++++++++++++++++++++++++++ components/uniset/src/uniset.rs | 26 ++++++++---- 3 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 components/uniset/benches/inv_list.rs diff --git a/components/uniset/Cargo.toml b/components/uniset/Cargo.toml index bbe4fe2c01b..279467b3aa6 100644 --- a/components/uniset/Cargo.toml +++ b/components/uniset/Cargo.toml @@ -12,3 +12,10 @@ include = [ "src/**/*", "Cargo.toml", ] + +[dev-dependencies] +criterion = "0.3" + +[[bench]] +name = "inv_list" +harness = false \ No newline at end of file diff --git a/components/uniset/benches/inv_list.rs b/components/uniset/benches/inv_list.rs new file mode 100644 index 00000000000..fd7e98b6abc --- /dev/null +++ b/components/uniset/benches/inv_list.rs @@ -0,0 +1,59 @@ +use criterion::{criterion_group, criterion_main, Criterion}; +use icu_unicodeset::UnicodeSet; +use std::convert::TryFrom; + +/// Best Case Contains +/// +/// Create a single small range and check contains on every value in range +fn best_case_contains(c: &mut Criterion) { + let check = vec![65, 70]; + let uset = UnicodeSet::try_from(check).unwrap(); + c.bench_function("inv_list/contains_best", |b| { + b.iter(|| uset.iter().map(|c| uset.contains(c))) + }); +} + +/// Worst Case Contains +/// +/// Create the maximum number of ranges ([0, 1, 2, 3], etc.) and check contains on 100 first values +fn worst_case_contains(c: &mut Criterion) { + let check: Vec = (0..((std::char::MAX as u32) + 1)).collect(); + let uset = UnicodeSet::try_from(check).unwrap(); + c.bench_function("inv_list/contains_worst", |b| { + b.iter(|| uset.iter().take(100).map(|c| uset.contains(c))) + }); +} +/// Best Case Contains Range +/// +/// Create a single small range and check contains on every value in range +fn best_case_contains_range(c: &mut Criterion) { + let check = vec![65, 70]; + let uset = UnicodeSet::try_from(check).unwrap(); + c.bench_function("inv_list/contains_range_best", |b| { + b.iter(|| uset.iter().map(|c| uset.contains_range(&('A'..c)))) + }); +} + +/// Worst Case Contains Range +/// +/// Create the maximum number of ranges ([0, 1, 2, 3], etc.) and check contains on 100 first values +fn worst_case_contains_range(c: &mut Criterion) { + let check: Vec = (0..((std::char::MAX as u32) + 1)).collect(); + let start = std::char::from_u32(0).unwrap(); + let uset = UnicodeSet::try_from(check).unwrap(); + c.bench_function("inv_list/contains_range_worst", |b| { + b.iter(|| { + uset.iter() + .take(100) + .map(|c| uset.contains_range(&(start..c))) + }) + }); +} +criterion_group!( + benches, + best_case_contains, + worst_case_contains, + best_case_contains_range, + worst_case_contains_range +); +criterion_main!(benches); diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index ae58d99f9b9..0a18d337419 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -94,7 +94,7 @@ impl UnicodeSet { self.inv_list .chunks(2) .flat_map(|pair| (pair[0]..pair[1])) - .map(|val| from_u32(val).unwrap()) + .filter_map(from_u32) } /// Returns the number of elements of the UnicodeSet @@ -140,7 +140,7 @@ impl UnicodeSet { /// Checks to see the query is in the UnicodeSet /// /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// in the set using `std::vec::Vec` implementation + /// in the set using `std` implementation /// /// Example: /// @@ -159,10 +159,8 @@ impl UnicodeSet { /// Checks to see if the range is in the UnicodeSet, returns a Result /// /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points - /// in the set using `std::vec::Vec` implementation - /// - /// Only runs the search once on the `start` parameter, while the `end` parameter is checked - /// in a single `O(1)` step + /// in the set using `std::vec::Vec` implementation Only runs the search once on the `start` + /// parameter, while the `end` parameter is checked in a single `O(1)` step /// /// Example: /// @@ -175,6 +173,20 @@ impl UnicodeSet { /// assert!(example.contains_range(&('A'..='B'))); /// assert!(!example.contains_range(&('A'..='C'))); /// ``` + /// + /// Surrogate points (`0xD800 -> 0xDFFF`) will return false if the Range contains them but the + /// UnicodeSet does not. + /// + /// Example: + /// + /// ``` + /// use icu_unicodeset::UnicodeSet; + /// use std::{convert::TryFrom, char::from_u32}; + /// let check = from_u32(0xD7FE).unwrap() .. from_u32(0xE001).unwrap(); + /// let example_list = vec![0xD7FE, 0xD7FF, 0xE000, 0xE001]; + /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// assert!(!example.contains_range(&(check))); + /// ``` pub fn contains_range(&self, range: &impl RangeBounds) -> bool { let (from, till) = deconstruct_range(range); if from >= till { @@ -291,7 +303,7 @@ mod tests { } #[test] fn test_unicodeset_iter() { - let ex = vec![65, 68, 69, 70]; + let ex = vec![65, 68, 69, 70, 0xD800, 0xD801]; let check = UnicodeSet::try_from(ex).unwrap(); let mut iter = check.iter(); assert_eq!(Some('A'), iter.next()); From 0f5a021b4f5b35ac81cd3fc41e6ccd7eb8a21b0c Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 15 Jul 2020 16:28:02 +0000 Subject: [PATCH 24/30] fix to is_valid --- components/uniset/src/uniset.rs | 7 ++++--- components/uniset/src/utils.rs | 7 ++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index 0a18d337419..6938057b5ef 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -206,13 +206,14 @@ mod tests { #[test] fn test_unicodeset_try_from_vec() { - let check = UnicodeSet::try_from(vec![2, 3, 4, 5]).unwrap().inv_list; - assert_eq!(vec![2, 3, 4, 5], check); + let ex = vec![2, 3, 4, 5]; + let check = UnicodeSet::try_from(ex.clone()).unwrap().inv_list; + assert_eq!(ex, check); } #[test] fn test_unicodeset_try_from_vec_error() { let check = vec![1, 1, 2, 3, 4]; - let set = UnicodeSet::try_from(vec![1, 1, 2, 3, 4]); + let set = UnicodeSet::try_from(check.clone()); assert_eq!(Err(USetError::InvalidSet(check)), set); } #[test] diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs index a953074e285..6d1d7206b50 100644 --- a/components/uniset/src/utils.rs +++ b/components/uniset/src/utils.rs @@ -7,7 +7,7 @@ use std::{ /// and within the bounds of `0x0 -> 0x10FFFF` inclusive. pub fn is_valid(v: &[u32]) -> bool { v.len() % 2 == 0 - && v.chunks(2).all(|chunk| chunk[0] < chunk[1]) + && v.windows(2).all(|chunk| chunk[0] < chunk[1]) && v[v.len() - 1] <= (MAX as u32) + 1 } @@ -37,6 +37,11 @@ mod tests { assert!(is_valid(&check)); } #[test] + fn test_is_valid_overlapping() { + let check = vec![2, 5, 4, 6]; + assert!(!is_valid(&check)); + } + #[test] fn test_is_valid_out_of_order() { let check = vec![5, 4, 5, 6, 7]; assert!(!is_valid(&check)); From c7c4330bb7e82c1310f85170555da6fefac1d9e2 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 15 Jul 2020 18:26:20 +0000 Subject: [PATCH 25/30] bench changes and other minor fixes --- components/uniset/benches/inv_list.rs | 74 +++++++++++---------------- components/uniset/src/conversions.rs | 12 ++--- components/uniset/src/uniset.rs | 6 +-- components/uniset/src/utils.rs | 10 ++-- 4 files changed, 45 insertions(+), 57 deletions(-) diff --git a/components/uniset/benches/inv_list.rs b/components/uniset/benches/inv_list.rs index fd7e98b6abc..d53e21f5f2e 100644 --- a/components/uniset/benches/inv_list.rs +++ b/components/uniset/benches/inv_list.rs @@ -1,59 +1,43 @@ use criterion::{criterion_group, criterion_main, Criterion}; use icu_unicodeset::UnicodeSet; -use std::convert::TryFrom; +use std::{convert::TryFrom, char::{MAX, from_u32}}; -/// Best Case Contains -/// -/// Create a single small range and check contains on every value in range -fn best_case_contains(c: &mut Criterion) { - let check = vec![65, 70]; - let uset = UnicodeSet::try_from(check).unwrap(); - c.bench_function("inv_list/contains_best", |b| { - b.iter(|| uset.iter().map(|c| uset.contains(c))) - }); -} +fn contains_bench(c: &mut Criterion) { + let best_ex = vec![65, 70]; + let best_sample = UnicodeSet::try_from(best_ex).unwrap(); + let worst_ex: Vec = (0..((MAX as u32) + 1)).collect(); + let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); -/// Worst Case Contains -/// -/// Create the maximum number of ranges ([0, 1, 2, 3], etc.) and check contains on 100 first values -fn worst_case_contains(c: &mut Criterion) { - let check: Vec = (0..((std::char::MAX as u32) + 1)).collect(); - let uset = UnicodeSet::try_from(check).unwrap(); - c.bench_function("inv_list/contains_worst", |b| { - b.iter(|| uset.iter().take(100).map(|c| uset.contains(c))) + let mut group = c.benchmark_group("uniset/contains"); + group.bench_with_input("best", &best_sample, |b, sample| { + b.iter(|| sample.iter().map(|ch| sample.contains(ch))) }); -} -/// Best Case Contains Range -/// -/// Create a single small range and check contains on every value in range -fn best_case_contains_range(c: &mut Criterion) { - let check = vec![65, 70]; - let uset = UnicodeSet::try_from(check).unwrap(); - c.bench_function("inv_list/contains_range_best", |b| { - b.iter(|| uset.iter().map(|c| uset.contains_range(&('A'..c)))) + group.bench_with_input("worst", &worst_sample, |b, sample| { + b.iter(|| sample.iter().take(100).map(|ch| sample.contains(ch))) }); + group.finish(); } -/// Worst Case Contains Range -/// -/// Create the maximum number of ranges ([0, 1, 2, 3], etc.) and check contains on 100 first values -fn worst_case_contains_range(c: &mut Criterion) { - let check: Vec = (0..((std::char::MAX as u32) + 1)).collect(); - let start = std::char::from_u32(0).unwrap(); - let uset = UnicodeSet::try_from(check).unwrap(); - c.bench_function("inv_list/contains_range_worst", |b| { +fn contains_range_bench(c: &mut Criterion) { + let best_ex = vec![65, 70]; + let best_sample = UnicodeSet::try_from(best_ex).unwrap(); + let worst_ex: Vec = (0..((MAX as u32) + 1)).collect(); + let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); + + let mut group = c.benchmark_group("uniset/contains_range"); + group.bench_with_input("best", &best_sample, |b, sample| { + b.iter(|| sample.iter().map(|ch| sample.contains_range(&('A'..ch)))) + }); + group.bench_with_input("worst", &worst_sample, |b, sample| { b.iter(|| { - uset.iter() + sample + .iter() .take(100) - .map(|c| uset.contains_range(&(start..c))) + .map(|ch| sample.contains_range(&(from_u32(0).unwrap()..ch))) }) }); + group.finish(); } -criterion_group!( - benches, - best_case_contains, - worst_case_contains, - best_case_contains_range, - worst_case_contains_range -); + +criterion_group!(benches, contains_bench, contains_range_bench); criterion_main!(benches); diff --git a/components/uniset/src/conversions.rs b/components/uniset/src/conversions.rs index 68e039e9086..04230a22c87 100644 --- a/components/uniset/src/conversions.rs +++ b/components/uniset/src/conversions.rs @@ -6,7 +6,7 @@ use std::{ ops::{Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, }; -fn try_from_range_impl(range: &impl RangeBounds) -> Result { +fn try_from_range(range: &impl RangeBounds) -> Result { let (from, till) = deconstruct_range(range); if from < till { let set = vec![from, till]; @@ -20,7 +20,7 @@ impl TryFrom<&Range> for UnicodeSet { type Error = USetError; fn try_from(range: &Range) -> Result { - try_from_range_impl(range) + try_from_range(range) } } @@ -28,7 +28,7 @@ impl TryFrom<&RangeFrom> for UnicodeSet { type Error = USetError; fn try_from(range: &RangeFrom) -> Result { - try_from_range_impl(range) + try_from_range(range) } } @@ -44,7 +44,7 @@ impl TryFrom<&RangeInclusive> for UnicodeSet { type Error = USetError; fn try_from(range: &RangeInclusive) -> Result { - try_from_range_impl(range) + try_from_range(range) } } @@ -52,7 +52,7 @@ impl TryFrom<&RangeTo> for UnicodeSet { type Error = USetError; fn try_from(range: &RangeTo) -> Result { - try_from_range_impl(range) + try_from_range(range) } } @@ -60,7 +60,7 @@ impl TryFrom<&RangeToInclusive> for UnicodeSet { type Error = USetError; fn try_from(range: &RangeToInclusive) -> Result { - try_from_range_impl(range) + try_from_range(range) } } diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index 6938057b5ef..e3d5f6fa46a 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -118,7 +118,7 @@ impl UnicodeSet { /// Wrapper for contains /// /// Returns an Option as to whether or not it is possible for the query to be contained - fn contains_impl(&self, query: u32) -> Option { + fn contains_query(&self, query: u32) -> Option { match self.inv_list.binary_search(&query) { Ok(pos) => { if pos % 2 == 0 { @@ -153,7 +153,7 @@ impl UnicodeSet { /// assert!(!example.contains('C')); /// ``` pub fn contains(&self, query: char) -> bool { - self.contains_impl(query as u32).is_some() + self.contains_query(query as u32).is_some() } /// Checks to see if the range is in the UnicodeSet, returns a Result @@ -192,7 +192,7 @@ impl UnicodeSet { if from >= till { return false; } - match self.contains_impl(from) { + match self.contains_query(from) { Some(pos) => (till) <= self.inv_list[pos + 1], None => false, } diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs index 6d1d7206b50..b6044c46e30 100644 --- a/components/uniset/src/utils.rs +++ b/components/uniset/src/utils.rs @@ -8,14 +8,13 @@ use std::{ pub fn is_valid(v: &[u32]) -> bool { v.len() % 2 == 0 && v.windows(2).all(|chunk| chunk[0] < chunk[1]) - && v[v.len() - 1] <= (MAX as u32) + 1 + && v.last().map_or(false, |e| e <= &((MAX as u32) + 1)) } /// Returns start (inclusive) and end (exclusive) bounds of RangeBounds pub fn deconstruct_range(range: &impl RangeBounds) -> (u32, u32) { let from = match range.start_bound() { - Included(b) => (*b as u32), - Excluded(b) => (*b as u32), + Included(b) | Excluded(b) => (*b as u32), Unbounded => 0, }; let till = match range.end_bound() { @@ -37,6 +36,11 @@ mod tests { assert!(is_valid(&check)); } #[test] + fn test_is_valid_empty() { + let check = vec![]; + assert!(!is_valid(&check)); + } + #[test] fn test_is_valid_overlapping() { let check = vec![2, 5, 4, 6]; assert!(!is_valid(&check)); From a8a4b503c9928802f85eced847b94124e80f76b8 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Wed, 15 Jul 2020 18:28:47 +0000 Subject: [PATCH 26/30] forgot to run fmt --- components/uniset/benches/inv_list.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/components/uniset/benches/inv_list.rs b/components/uniset/benches/inv_list.rs index d53e21f5f2e..d588ce53883 100644 --- a/components/uniset/benches/inv_list.rs +++ b/components/uniset/benches/inv_list.rs @@ -1,6 +1,9 @@ use criterion::{criterion_group, criterion_main, Criterion}; use icu_unicodeset::UnicodeSet; -use std::{convert::TryFrom, char::{MAX, from_u32}}; +use std::{ + char::{from_u32, MAX}, + convert::TryFrom, +}; fn contains_bench(c: &mut Criterion) { let best_ex = vec![65, 70]; From 845bc352dd1953af2d2154a0682f45dad4deb4aa Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Thu, 16 Jul 2020 17:21:02 +0000 Subject: [PATCH 27/30] change to std::char, and unreachable!() optimizations --- components/uniset/benches/inv_list.rs | 11 ++++------- components/uniset/src/uniset.rs | 21 ++++++++------------- components/uniset/src/utils.rs | 19 ++++++++++--------- 3 files changed, 22 insertions(+), 29 deletions(-) diff --git a/components/uniset/benches/inv_list.rs b/components/uniset/benches/inv_list.rs index d588ce53883..9ef3e7ea63b 100644 --- a/components/uniset/benches/inv_list.rs +++ b/components/uniset/benches/inv_list.rs @@ -1,14 +1,11 @@ use criterion::{criterion_group, criterion_main, Criterion}; use icu_unicodeset::UnicodeSet; -use std::{ - char::{from_u32, MAX}, - convert::TryFrom, -}; +use std::{char, convert::TryFrom}; fn contains_bench(c: &mut Criterion) { let best_ex = vec![65, 70]; let best_sample = UnicodeSet::try_from(best_ex).unwrap(); - let worst_ex: Vec = (0..((MAX as u32) + 1)).collect(); + let worst_ex: Vec = (0..((char::MAX as u32) + 1)).collect(); let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); let mut group = c.benchmark_group("uniset/contains"); @@ -24,7 +21,7 @@ fn contains_bench(c: &mut Criterion) { fn contains_range_bench(c: &mut Criterion) { let best_ex = vec![65, 70]; let best_sample = UnicodeSet::try_from(best_ex).unwrap(); - let worst_ex: Vec = (0..((MAX as u32) + 1)).collect(); + let worst_ex: Vec = (0..((char::MAX as u32) + 1)).collect(); let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); let mut group = c.benchmark_group("uniset/contains_range"); @@ -36,7 +33,7 @@ fn contains_range_bench(c: &mut Criterion) { sample .iter() .take(100) - .map(|ch| sample.contains_range(&(from_u32(0).unwrap()..ch))) + .map(|ch| sample.contains_range(&(char::from_u32(0).unwrap()..ch))) }) }); group.finish(); diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index e3d5f6fa46a..4cdf6f69f0a 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -1,9 +1,4 @@ -use std::{ - char::{from_u32, MAX}, - convert::TryFrom, - ops::RangeBounds, - slice::Iter, -}; +use std::{char, convert::TryFrom, ops::RangeBounds, slice::Iter}; use super::USetError; use crate::utils::{deconstruct_range, is_valid}; @@ -41,7 +36,7 @@ impl UnicodeSet { /// The range spans from `0x0 -> 0x10FFFF` inclusive pub fn all() -> UnicodeSet { UnicodeSet { - inv_list: vec![0, (MAX as u32) + 1], + inv_list: vec![0, (char::MAX as u32) + 1], } } @@ -94,7 +89,7 @@ impl UnicodeSet { self.inv_list .chunks(2) .flat_map(|pair| (pair[0]..pair[1])) - .filter_map(from_u32) + .filter_map(char::from_u32) } /// Returns the number of elements of the UnicodeSet @@ -181,8 +176,8 @@ impl UnicodeSet { /// /// ``` /// use icu_unicodeset::UnicodeSet; - /// use std::{convert::TryFrom, char::from_u32}; - /// let check = from_u32(0xD7FE).unwrap() .. from_u32(0xE001).unwrap(); + /// use std::{convert::TryFrom, char}; + /// let check = char::from_u32(0xD7FE).unwrap() .. char::from_u32(0xE001).unwrap(); /// let example_list = vec![0xD7FE, 0xD7FF, 0xE000, 0xE001]; /// let example = UnicodeSet::try_from(example_list).unwrap(); /// assert!(!example.contains_range(&(check))); @@ -202,7 +197,7 @@ impl UnicodeSet { #[cfg(test)] mod tests { use super::{USetError, UnicodeSet, BMP_MAX}; - use std::{char::MAX, convert::TryFrom, vec::Vec}; + use std::{char, convert::TryFrom, vec::Vec}; #[test] fn test_unicodeset_try_from_vec() { @@ -218,7 +213,7 @@ mod tests { } #[test] fn test_unicodeset_all() { - let expected = vec![0, (MAX as u32) + 1]; + let expected = vec![0, (char::MAX as u32) + 1]; assert_eq!(UnicodeSet::all().inv_list, expected); } #[test] @@ -274,7 +269,7 @@ mod tests { let check = UnicodeSet::try_from(ex).unwrap(); assert_eq!(8, check.size()); let check = UnicodeSet::all(); - let expected = (MAX as u32) + 1; + let expected = (char::MAX as u32) + 1; assert_eq!(expected as usize, check.size()); let check = UnicodeSet { inv_list: Vec::new(), diff --git a/components/uniset/src/utils.rs b/components/uniset/src/utils.rs index b6044c46e30..b20c96db391 100644 --- a/components/uniset/src/utils.rs +++ b/components/uniset/src/utils.rs @@ -1,5 +1,5 @@ use std::{ - char::MAX, + char, ops::{Bound::*, RangeBounds}, }; @@ -8,19 +8,20 @@ use std::{ pub fn is_valid(v: &[u32]) -> bool { v.len() % 2 == 0 && v.windows(2).all(|chunk| chunk[0] < chunk[1]) - && v.last().map_or(false, |e| e <= &((MAX as u32) + 1)) + && v.last().map_or(false, |e| e <= &((char::MAX as u32) + 1)) } -/// Returns start (inclusive) and end (exclusive) bounds of RangeBounds +/// Returns start (inclusive) and end (excluisive) bounds of RangeBounds pub fn deconstruct_range(range: &impl RangeBounds) -> (u32, u32) { let from = match range.start_bound() { - Included(b) | Excluded(b) => (*b as u32), + Included(b) => (*b as u32), + Excluded(_) => unreachable!(), Unbounded => 0, }; let till = match range.end_bound() { Included(b) => (*b as u32) + 1, Excluded(b) => (*b as u32), - Unbounded => (MAX as u32) + 1, + Unbounded => (char::MAX as u32) + 1, }; (from, till) } @@ -28,7 +29,7 @@ pub fn deconstruct_range(range: &impl RangeBounds) -> (u32, u32) { #[cfg(test)] mod tests { use super::{deconstruct_range, is_valid}; - use std::char::MAX; + use std::char; #[test] fn test_is_valid() { @@ -62,7 +63,7 @@ mod tests { } #[test] fn test_is_valid_out_of_range() { - let check = vec![1, 2, 3, 4, (MAX as u32) + 1]; + let check = vec![1, 2, 3, 4, (char::MAX as u32) + 1]; assert!(!is_valid(&check)); } // deconstruct_range @@ -74,12 +75,12 @@ mod tests { let check = deconstruct_range(&('A'..='D')); // Range Inclusive assert_eq!(check, expected); let check = deconstruct_range(&('A'..)); // Range From - assert_eq!(check, (65, (MAX as u32) + 1)); + assert_eq!(check, (65, (char::MAX as u32) + 1)); let check = deconstruct_range(&(..'A')); // Range To assert_eq!(check, (0, 65)); let check = deconstruct_range(&(..='A')); // Range To Inclusive assert_eq!(check, (0, 66)); let check = deconstruct_range(&(..)); // Range Full - assert_eq!(check, (0, (MAX as u32) + 1)); + assert_eq!(check, (0, (char::MAX as u32) + 1)); } } From dbf3100ca1760041ae33c52c44c95f910b3a8c0d Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Fri, 17 Jul 2020 17:53:29 +0000 Subject: [PATCH 28/30] size() is now constant check, ranges() temp removed --- components/uniset/src/conversions.rs | 56 ++++++----- components/uniset/src/lib.rs | 2 +- components/uniset/src/uniset.rs | 139 +++++++++++++-------------- 3 files changed, 98 insertions(+), 99 deletions(-) diff --git a/components/uniset/src/conversions.rs b/components/uniset/src/conversions.rs index 04230a22c87..03778d56618 100644 --- a/components/uniset/src/conversions.rs +++ b/components/uniset/src/conversions.rs @@ -1,23 +1,24 @@ -use super::USetError; -use crate::utils::deconstruct_range; -use crate::UnicodeSet; use std::{ convert::TryFrom, ops::{Range, RangeBounds, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, }; -fn try_from_range(range: &impl RangeBounds) -> Result { +use super::UnicodeSetError; +use crate::utils::deconstruct_range; +use crate::UnicodeSet; + +fn try_from_range(range: &impl RangeBounds) -> Result { let (from, till) = deconstruct_range(range); if from < till { let set = vec![from, till]; - Ok(UnicodeSet::try_from(set).unwrap()) + Ok(UnicodeSet::from_inversion_list(set).unwrap()) } else { - Err(USetError::InvalidRange(from, till)) + Err(UnicodeSetError::InvalidRange(from, till)) } } impl TryFrom<&Range> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(range: &Range) -> Result { try_from_range(range) @@ -25,7 +26,7 @@ impl TryFrom<&Range> for UnicodeSet { } impl TryFrom<&RangeFrom> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(range: &RangeFrom) -> Result { try_from_range(range) @@ -33,7 +34,7 @@ impl TryFrom<&RangeFrom> for UnicodeSet { } impl TryFrom<&RangeFull> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(_: &RangeFull) -> Result { Ok(UnicodeSet::all()) @@ -41,7 +42,7 @@ impl TryFrom<&RangeFull> for UnicodeSet { } impl TryFrom<&RangeInclusive> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(range: &RangeInclusive) -> Result { try_from_range(range) @@ -49,7 +50,7 @@ impl TryFrom<&RangeInclusive> for UnicodeSet { } impl TryFrom<&RangeTo> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(range: &RangeTo) -> Result { try_from_range(range) @@ -57,7 +58,7 @@ impl TryFrom<&RangeTo> for UnicodeSet { } impl TryFrom<&RangeToInclusive> for UnicodeSet { - type Error = USetError; + type Error = UnicodeSetError; fn try_from(range: &RangeToInclusive) -> Result { try_from_range(range) @@ -66,9 +67,10 @@ impl TryFrom<&RangeToInclusive> for UnicodeSet { #[cfg(test)] mod tests { - use super::USetError; + use super::UnicodeSetError; use crate::UnicodeSet; - use std::convert::TryFrom; + use std::{char, convert::TryFrom}; + #[test] fn test_try_from_range() { let check: Vec = UnicodeSet::try_from(&('A'..'B')).unwrap().iter().collect(); @@ -77,7 +79,7 @@ mod tests { #[test] fn test_try_from_range_error() { let check = UnicodeSet::try_from(&('A'..'A')); - assert_eq!(Err(USetError::InvalidRange(65, 65)), check); + assert_eq!(Err(UnicodeSetError::InvalidRange(65, 65)), check); } #[test] fn test_try_from_range_inclusive() { @@ -87,35 +89,39 @@ mod tests { #[test] fn test_try_from_range_inclusive_err() { let check = UnicodeSet::try_from(&('B'..'A')); - assert_eq!(Err(USetError::InvalidRange(66, 65)), check); + assert_eq!(Err(UnicodeSetError::InvalidRange(66, 65)), check); } #[test] fn test_try_from_range_from() { let uset = UnicodeSet::try_from(&('A'..)).unwrap(); - let check: Vec<&u32> = uset.ranges().collect(); - assert_eq!(vec![&65, &((std::char::MAX as u32) + 1)], check); + let check: usize = uset.size(); + let expected: usize = (char::MAX as usize) + 1 - 65; + assert_eq!(expected, check); } #[test] fn test_try_from_range_to() { let uset = UnicodeSet::try_from(&(..'A')).unwrap(); - let check: Vec<&u32> = uset.ranges().collect(); - assert_eq!(vec![&0, &65], check); + let check: usize = uset.size(); + let expected: usize = 65; + assert_eq!(expected, check); } #[test] fn test_try_from_range_to_err() { let check = UnicodeSet::try_from(&(..(0 as char))); - assert_eq!(Err(USetError::InvalidRange(0, 0)), check); + assert_eq!(Err(UnicodeSetError::InvalidRange(0, 0)), check); } #[test] fn test_try_from_range_to_inclusive() { let uset = UnicodeSet::try_from(&(..='A')).unwrap(); - let check: Vec<&u32> = uset.ranges().collect(); - assert_eq!(vec![&0, &66], check); + let check: usize = uset.size(); + let expected: usize = 66; + assert_eq!(expected, check); } #[test] fn test_try_from_range_full() { let uset = UnicodeSet::try_from(&(..)).unwrap(); - let check: Vec<&u32> = uset.ranges().collect(); - assert_eq!(vec![&0, &((std::char::MAX as u32) + 1)], check); + let check: usize = uset.size(); + let expected: usize = (char::MAX as usize) + 1; + assert_eq!(expected, check); } } diff --git a/components/uniset/src/lib.rs b/components/uniset/src/lib.rs index 870eb001e30..48bb002b9b1 100644 --- a/components/uniset/src/lib.rs +++ b/components/uniset/src/lib.rs @@ -9,7 +9,7 @@ pub use utils::*; /// Custom Errors for UnicodeSet. #[derive(Debug, PartialEq)] -pub enum USetError { +pub enum UnicodeSetError { InvalidSet(Vec), InvalidRange(u32, u32), } diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index 4cdf6f69f0a..ff9243562ed 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -1,6 +1,6 @@ -use std::{char, convert::TryFrom, ops::RangeBounds, slice::Iter}; +use std::{char, ops::RangeBounds}; -use super::USetError; +use super::UnicodeSetError; use crate::utils::{deconstruct_range, is_valid}; /// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0 , inclusive const BMP_MAX: u32 = 0xFFFF; @@ -9,34 +9,51 @@ const BMP_MAX: u32 = 0xFFFF; /// /// Provides exposure to membership functions and constructors from serialized UnicodeSets /// and predefined ranges. -/// Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list) #[derive(Debug, PartialEq)] pub struct UnicodeSet { // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html // Allows for traits of fixed size arrays + + // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list) inv_list: Vec, + size: usize, } -impl TryFrom> for UnicodeSet { - type Error = USetError; - - fn try_from(set: Vec) -> Result { - if is_valid(&set) { - Ok(UnicodeSet { inv_list: set }) +impl UnicodeSet { + /// Returns UnicodeSet from an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list) + /// represented by a `Vec` of codepoints. + /// + /// The inversion list must be of even length, sorted ascending non-overlapping, + /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive. + /// + /// Example: + /// + /// ``` + /// use icu_unicodeset::UnicodeSet; + /// use icu_unicodeset::UnicodeSetError; + /// let empty: Vec = vec![]; + /// assert_eq!(UnicodeSet::from_inversion_list(empty.clone()), Err(UnicodeSetError::InvalidSet(empty.clone()))) + /// ``` + pub fn from_inversion_list(inv_list: Vec) -> Result { + if is_valid(&inv_list) { + let size: usize = inv_list + .chunks(2) + .map(|end_points| end_points[1] - end_points[0]) + .sum::() as usize; + Ok(UnicodeSet { inv_list, size }) } else { - Err(USetError::InvalidSet(set)) + Err(UnicodeSetError::InvalidSet(inv_list)) } } -} -impl UnicodeSet { /// Returns UnicodeSet spanning entire Unicode range /// /// The range spans from `0x0 -> 0x10FFFF` inclusive pub fn all() -> UnicodeSet { UnicodeSet { inv_list: vec![0, (char::MAX as u32) + 1], + size: (char::MAX as usize) + 1, } } @@ -46,38 +63,18 @@ impl UnicodeSet { pub fn bmp() -> UnicodeSet { UnicodeSet { inv_list: vec![0, BMP_MAX + 1], + size: (BMP_MAX as usize) + 1, } } - /// Yields an iterator of start and stop points of ranges in the UnicodeSet - /// - /// Example: - /// - /// ``` - /// use icu_unicodeset::UnicodeSet; - /// use std::convert::TryFrom; - /// let example_list = vec![0, 10, 15, 20]; - /// let example = UnicodeSet::try_from(example_list).unwrap(); - /// let mut example_ranges = example.ranges(); - /// assert_eq!(Some(&0), example_ranges.next()); - /// assert_eq!(Some(&10), example_ranges.next()); - /// assert_eq!(Some(&15), example_ranges.next()); - /// assert_eq!(Some(&20), example_ranges.next()); - /// assert_eq!(None, example_ranges.next()); - /// ``` - pub fn ranges(&self) -> Iter { - self.inv_list.iter() - } - /// Yields an iterator going through the character set in the UnicodeSet /// /// Example: /// /// ``` /// use icu_unicodeset::UnicodeSet; - /// use std::convert::TryFrom; /// let example_list = vec![65, 68, 69, 70]; - /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let example = UnicodeSet::from_inversion_list(example_list).unwrap(); /// let mut example_iter = example.iter(); /// assert_eq!(Some('A'), example_iter.next()); /// assert_eq!(Some('B'), example_iter.next()); @@ -97,12 +94,7 @@ impl UnicodeSet { if self.is_empty() { return 0; } - let s: u32 = self - .inv_list - .chunks(2) - .map(|end_points| end_points[1] - end_points[0]) - .sum(); - s as usize + self.size } /// Returns whether or not the UnicodeSet is empty @@ -112,7 +104,8 @@ impl UnicodeSet { /// Wrapper for contains /// - /// Returns an Option as to whether or not it is possible for the query to be contained + /// Returns an `Option` as to whether or not it is possible for the query to be contained. + /// The value in the `Option` is the start index of the range that contains the query. fn contains_query(&self, query: u32) -> Option { match self.inv_list.binary_search(&query) { Ok(pos) => { @@ -141,9 +134,8 @@ impl UnicodeSet { /// /// ``` /// use icu_unicodeset::UnicodeSet; - /// use std::convert::TryFrom; /// let example_list = vec![65, 67, 68, 69]; - /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let example = UnicodeSet::from_inversion_list(example_list).unwrap(); /// assert!(example.contains('A')); /// assert!(!example.contains('C')); /// ``` @@ -161,9 +153,8 @@ impl UnicodeSet { /// /// ``` /// use icu_unicodeset::UnicodeSet; - /// use std::convert::TryFrom; /// let example_list = vec![65, 67, 68, 69]; - /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let example = UnicodeSet::from_inversion_list(example_list).unwrap(); /// assert!(example.contains_range(&('A'..'C'))); /// assert!(example.contains_range(&('A'..='B'))); /// assert!(!example.contains_range(&('A'..='C'))); @@ -176,10 +167,10 @@ impl UnicodeSet { /// /// ``` /// use icu_unicodeset::UnicodeSet; - /// use std::{convert::TryFrom, char}; + /// use std::char; /// let check = char::from_u32(0xD7FE).unwrap() .. char::from_u32(0xE001).unwrap(); /// let example_list = vec![0xD7FE, 0xD7FF, 0xE000, 0xE001]; - /// let example = UnicodeSet::try_from(example_list).unwrap(); + /// let example = UnicodeSet::from_inversion_list(example_list).unwrap(); /// assert!(!example.contains_range(&(check))); /// ``` pub fn contains_range(&self, range: &impl RangeBounds) -> bool { @@ -196,37 +187,46 @@ impl UnicodeSet { #[cfg(test)] mod tests { - use super::{USetError, UnicodeSet, BMP_MAX}; - use std::{char, convert::TryFrom, vec::Vec}; + use super::{UnicodeSet, UnicodeSetError, BMP_MAX}; + use std::{char, vec::Vec}; #[test] fn test_unicodeset_try_from_vec() { let ex = vec![2, 3, 4, 5]; - let check = UnicodeSet::try_from(ex.clone()).unwrap().inv_list; - assert_eq!(ex, check); + let check = UnicodeSet::from_inversion_list(ex.clone()).unwrap(); + assert_eq!(ex, check.inv_list); + assert_eq!(2, check.size()); } #[test] fn test_unicodeset_try_from_vec_error() { let check = vec![1, 1, 2, 3, 4]; - let set = UnicodeSet::try_from(check.clone()); - assert_eq!(Err(USetError::InvalidSet(check)), set); + let set = UnicodeSet::from_inversion_list(check.clone()); + assert_eq!(Err(UnicodeSetError::InvalidSet(check)), set); } #[test] fn test_unicodeset_all() { let expected = vec![0, (char::MAX as u32) + 1]; - assert_eq!(UnicodeSet::all().inv_list, expected); + assert_eq!(UnicodeSet::all().inv_list, expected.clone()); + assert_eq!( + UnicodeSet::all().size(), + (expected[1] - expected[0]) as usize + ) } #[test] fn test_unicodeset_bmp() { let expected = vec![0, BMP_MAX + 1]; - assert_eq!(UnicodeSet::bmp().inv_list, expected); + assert_eq!(UnicodeSet::bmp().inv_list, expected.clone()); + assert_eq!( + UnicodeSet::bmp().size(), + (expected[1] - expected[0]) as usize + ); } // UnicodeSet membership functions #[test] fn test_unicodeset_contains() { let ex = vec![2, 5, 10, 15]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); assert!(check.contains(2 as char)); assert!(check.contains(4 as char)); assert!(check.contains(10 as char)); @@ -235,7 +235,7 @@ mod tests { #[test] fn test_unicodeset_contains_false() { let ex = vec![2, 5, 10, 15]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); assert!(!check.contains(1 as char)); assert!(!check.contains(5 as char)); assert!(!check.contains(9 as char)); @@ -245,14 +245,14 @@ mod tests { #[test] fn test_unicodeset_contains_range() { let ex = vec![65, 70, 75, 85]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); assert!(check.contains_range(&('A'..='E'))); // 65 - 69 assert!(check.contains_range(&('K'..'U'))); // 75 - 84 } #[test] fn test_unicodeset_contains_range_false() { let ex = vec![65, 70, 75, 85]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); assert!(!check.contains_range(&('!'..'A'))); // 33 - 65 assert!(!check.contains_range(&('F'..'K'))); // 70 - 74 assert!(!check.contains_range(&('U'..))); @@ -266,19 +266,23 @@ mod tests { #[test] fn test_unicodeset_size() { let ex = vec![2, 5, 10, 15]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); assert_eq!(8, check.size()); let check = UnicodeSet::all(); let expected = (char::MAX as u32) + 1; assert_eq!(expected as usize, check.size()); let check = UnicodeSet { inv_list: Vec::new(), + size: 0, }; assert_eq!(check.size(), 0); } #[test] fn test_unicodeset_is_empty() { - let check = UnicodeSet { inv_list: vec![] }; + let check = UnicodeSet { + inv_list: vec![], + size: 0, + }; assert!(check.is_empty()); } #[test] @@ -287,20 +291,9 @@ mod tests { assert!(!check.is_empty()); } #[test] - fn test_unicodeset_ranges() { - let ex = vec![65, 70, 75, 85]; - let check = UnicodeSet::try_from(ex).unwrap(); - let mut iter = check.ranges(); - assert_eq!(iter.next(), Some(&65)); - assert_eq!(iter.next(), Some(&70)); - assert_eq!(iter.next(), Some(&75)); - assert_eq!(iter.next(), Some(&85)); - assert_eq!(iter.next(), None); - } - #[test] fn test_unicodeset_iter() { let ex = vec![65, 68, 69, 70, 0xD800, 0xD801]; - let check = UnicodeSet::try_from(ex).unwrap(); + let check = UnicodeSet::from_inversion_list(ex).unwrap(); let mut iter = check.iter(); assert_eq!(Some('A'), iter.next()); assert_eq!(Some('B'), iter.next()); From a9acfa28948f92377ab02b268fb8667e65f4f9ce Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Fri, 17 Jul 2020 19:07:40 +0000 Subject: [PATCH 29/30] fixed bench --- components/uniset/benches/inv_list.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/components/uniset/benches/inv_list.rs b/components/uniset/benches/inv_list.rs index 9ef3e7ea63b..31dacb530ee 100644 --- a/components/uniset/benches/inv_list.rs +++ b/components/uniset/benches/inv_list.rs @@ -1,12 +1,12 @@ use criterion::{criterion_group, criterion_main, Criterion}; use icu_unicodeset::UnicodeSet; -use std::{char, convert::TryFrom}; +use std::char; fn contains_bench(c: &mut Criterion) { let best_ex = vec![65, 70]; - let best_sample = UnicodeSet::try_from(best_ex).unwrap(); + let best_sample = UnicodeSet::from_inversion_list(best_ex).unwrap(); let worst_ex: Vec = (0..((char::MAX as u32) + 1)).collect(); - let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); + let worst_sample = UnicodeSet::from_inversion_list(worst_ex).unwrap(); let mut group = c.benchmark_group("uniset/contains"); group.bench_with_input("best", &best_sample, |b, sample| { @@ -20,9 +20,9 @@ fn contains_bench(c: &mut Criterion) { fn contains_range_bench(c: &mut Criterion) { let best_ex = vec![65, 70]; - let best_sample = UnicodeSet::try_from(best_ex).unwrap(); + let best_sample = UnicodeSet::from_inversion_list(best_ex).unwrap(); let worst_ex: Vec = (0..((char::MAX as u32) + 1)).collect(); - let worst_sample = UnicodeSet::try_from(worst_ex).unwrap(); + let worst_sample = UnicodeSet::from_inversion_list(worst_ex).unwrap(); let mut group = c.benchmark_group("uniset/contains_range"); group.bench_with_input("best", &best_sample, |b, sample| { From 2c54b63897c9c372dd8e96ed1cb792c7b434aba2 Mon Sep 17 00:00:00 2001 From: Evan Peng Date: Fri, 17 Jul 2020 19:34:28 +0000 Subject: [PATCH 30/30] clippy checks that cargo clippy doesn't catch locally --- components/uniset/src/uniset.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/uniset/src/uniset.rs b/components/uniset/src/uniset.rs index ff9243562ed..d1fa6095c51 100644 --- a/components/uniset/src/uniset.rs +++ b/components/uniset/src/uniset.rs @@ -206,7 +206,7 @@ mod tests { #[test] fn test_unicodeset_all() { let expected = vec![0, (char::MAX as u32) + 1]; - assert_eq!(UnicodeSet::all().inv_list, expected.clone()); + assert_eq!(UnicodeSet::all().inv_list, expected); assert_eq!( UnicodeSet::all().size(), (expected[1] - expected[0]) as usize @@ -215,7 +215,7 @@ mod tests { #[test] fn test_unicodeset_bmp() { let expected = vec![0, BMP_MAX + 1]; - assert_eq!(UnicodeSet::bmp().inv_list, expected.clone()); + assert_eq!(UnicodeSet::bmp().inv_list, expected); assert_eq!( UnicodeSet::bmp().size(), (expected[1] - expected[0]) as usize