From 9ae42b606b41136292e2ba7c3f0b131a536a155f Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 14:48:47 -0800 Subject: [PATCH 01/11] Add repr(C) for ResourceKey --- provider/core/src/extract.rs | 73 ++++++++++++++++++++++++++++++ provider/core/src/lib.rs | 2 + provider/core/src/resource.rs | 83 +++++++++++++++++++++++++++++------ provider/core/tests/sizes.rs | 6 ++- 4 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 provider/core/src/extract.rs diff --git a/provider/core/src/extract.rs b/provider/core/src/extract.rs new file mode 100644 index 00000000000..11c256e8d44 --- /dev/null +++ b/provider/core/src/extract.rs @@ -0,0 +1,73 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Utilities for extracting `ResourceKey` objects from a byte stream. Requires the "std" feature. + +use crate::prelude::*; +use std::io::BufRead; +use std::io::BufReader; +use std::io; + +pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result> { + let mut reader = BufReader::with_capacity(1024, stream); + let mut working_buffer = [0u8; 1024 + 39]; + let mut output = Vec::new(); + loop { + let reader_buffer = reader.fill_buf()?; + if reader_buffer.len() == 0 { + break; + } + let len = reader_buffer.len(); + // Save 39 bytes from iteration to iteration: one less than a 40-byte window + working_buffer[39..(39+len)].copy_from_slice(reader_buffer); + for window in working_buffer[..(39+len)].windows(40) { + if &window[0..8] == b"ICURES[[" && &window[36..40] == b"]]**" { + let mut bytes: [u8; 40] = [0; 40]; + bytes.copy_from_slice(window); + let resc_key = match ResourceKey::from_repr_c(bytes) { + Some(k) => k, + None => continue + }; + output.push(resc_key); + } + } + reader.consume(len); + working_buffer.copy_within(len.., 0); + } + output.sort(); + output.dedup(); + Ok(output) 
+} + +#[cfg(test)] +mod test { + use super::*; + use crate::resource_key; + + const GOLDEN_BYTES: &[u8] = b"\x00\x00ICURES[[\x02\x00\x00\x00\x00\x00\x00\x00skeletons\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00]]**ICURES[[\x02\x00\x00\x00\x00\x00\x00\x00symbols\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00]]**\x00\x00"; + + #[test] + fn test_extract_golden() { + let keys = extract_keys_from_byte_stream(&*GOLDEN_BYTES).unwrap(); + assert_eq!(keys, vec![ + resource_key!(DateTime, "skeletons", 1), + resource_key!(DateTime, "symbols", 1), + ]); + } + + #[test] + fn test_extract_large() { + let keys: Vec = (0u8..=255u8).map(|i| resource_key!(Core, "demo", i)).collect(); + let mut buffer: Vec = Vec::new(); + for key in keys.iter() { + // Insert some garbage + buffer.extend(b"\x00ICURES[[\x00\x00]]**\x00\x00"); + // This is safe because we are transmuting to a POD type + let key_bytes: [u8; 40] = unsafe { core::mem::transmute(*key) }; + buffer.extend(&key_bytes); + } + let extracted_keys = extract_keys_from_byte_stream(&*buffer).unwrap(); + assert_eq!(keys, extracted_keys); + } +} diff --git a/provider/core/src/lib.rs b/provider/core/src/lib.rs index 703bddfd714..6ad62a2f27b 100644 --- a/provider/core/src/lib.rs +++ b/provider/core/src/lib.rs @@ -113,6 +113,8 @@ mod error; #[macro_use] pub mod erased; pub mod export; +#[cfg(feature = "std")] +pub mod extract; pub mod filter; pub mod hello_world; pub mod inv; diff --git a/provider/core/src/resource.rs b/provider/core/src/resource.rs index 94aa9696db0..2c624cddc2a 100644 --- a/provider/core/src/resource.rs +++ b/provider/core/src/resource.rs @@ -15,12 +15,13 @@ use core::default::Default; use core::fmt; use core::fmt::Write; use icu_locid::LanguageIdentifier; -use tinystr::{TinyStr16, TinyStr4}; +use tinystr::{tinystr4, tinystr8, TinyStr16, TinyStr4, TinyStr8}; use writeable::{LengthHint, Writeable}; /// A top-level collection of related resource keys. 
#[non_exhaustive] #[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug)] +#[repr(C)] pub enum ResourceCategory { Core, Calendar, @@ -79,10 +80,64 @@ impl writeable::Writeable for ResourceCategory { /// /// Use [`resource_key!`] as a shortcut to create resource keys in code. #[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone)] +#[repr(C)] pub struct ResourceKey { + _tag0: TinyStr8, pub category: ResourceCategory, pub sub_category: TinyStr16, - pub version: u16, + pub version: u8, + _tag1: TinyStr4, +} + +impl ResourceKey { + /// Creates a new [`ResourceKey`]. + pub const fn new( + category: ResourceCategory, + sub_category: TinyStr16, + version: u8, + ) -> ResourceKey { + ResourceKey { + _tag0: tinystr8!("ICURES[["), + category, + sub_category, + version, + _tag1: tinystr4!("]]**"), + } + } + + /// Recovers a [`ResourceKey`] from its `repr(C)` bytes. + /// + /// Returns `None` if the bytes are not a valid [`ResourceKey`]. + /// + /// # Examples + /// + /// ``` + /// use icu_provider::prelude::*; + /// + /// let demo_key = icu_provider::resource_key!(Core, "demo", 1); + /// // This is safe because we are transmuting to a POD type + /// let repr_c_bytes: [u8; 40] = unsafe { + /// core::mem::transmute(demo_key) + /// }; + /// let recovered_key = ResourceKey::from_repr_c(repr_c_bytes) + /// .expect("The bytes are valid"); + /// + /// assert_eq!(demo_key, recovered_key); + /// ``` + pub fn from_repr_c(bytes: [u8; 40]) -> Option { + // Smoke check + if &bytes[0..8] != b"ICURES[[" || &bytes[36..40] != b"]]**" { + return None; + } + + // TODO(#1457): Use a ULE-like code path here. + // TODO(#1457): This is not safe! + // - We can't currently verify the ResourceCategory! + // - TinyStr does not currently have a function that takes a byte *array* (with NULs). + unsafe { + Some(core::mem::transmute(bytes)) + } + } } /// Shortcut to construct a const resource identifier. @@ -119,11 +174,11 @@ macro_rules! 
resource_key { ) }; ($category:expr, $sub_category:literal, $version:tt) => { - $crate::ResourceKey { - category: $category, - sub_category: $crate::internal::tinystr16!($sub_category), - version: $version, - } + $crate::ResourceKey::new( + $category, + $crate::internal::tinystr16!($sub_category), + $version, + ) }; } @@ -433,11 +488,11 @@ mod tests { expected: "core/cardinal@1", }, KeyTestCase { - resc_key: ResourceKey { - category: ResourceCategory::PrivateUse(tinystr4!("priv")), - sub_category: tinystr::tinystr16!("cardinal"), - version: 1, - }, + resc_key: ResourceKey::new( + ResourceCategory::PrivateUse(tinystr4!("priv")), + tinystr::tinystr16!("cardinal"), + 1, + ), expected: "x-priv/cardinal@1", }, KeyTestCase { @@ -445,8 +500,8 @@ mod tests { expected: "core/maxlengthsubcatg@1", }, KeyTestCase { - resc_key: resource_key!(Core, "cardinal", 65535), - expected: "core/cardinal@65535", + resc_key: resource_key!(Core, "cardinal", 255), + expected: "core/cardinal@255", }, ] } diff --git a/provider/core/tests/sizes.rs b/provider/core/tests/sizes.rs index f2377eea994..ea082694287 100644 --- a/provider/core/tests/sizes.rs +++ b/provider/core/tests/sizes.rs @@ -5,7 +5,9 @@ use icu_provider::prelude::*; use static_assertions::const_assert_eq; +const_assert_eq!(8, core::mem::size_of::()); const_assert_eq!(8, core::mem::size_of::()); const_assert_eq!(16, core::mem::size_of::()); -const_assert_eq!(4, core::mem::size_of::()); -const_assert_eq!(32, core::mem::size_of::()); +const_assert_eq!(1, core::mem::size_of::()); +const_assert_eq!(4, core::mem::size_of::()); +const_assert_eq!(40, core::mem::size_of::()); From b4c75aedab9f45cd8cabbd9c901d4b3209fbcbeb Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Thu, 30 Dec 2021 15:25:29 -0800 Subject: [PATCH 02/11] Add icu4x-key-extract --- tools/datagen/Cargo.toml | 5 ++ tools/datagen/README.md | 1 + tools/datagen/src/bin/key-extract.rs | 89 ++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) create mode 100644 tools/datagen/src/bin/key-extract.rs diff --git a/tools/datagen/Cargo.toml b/tools/datagen/Cargo.toml index 3382f605568..e09d8a7c6b8 100644 --- a/tools/datagen/Cargo.toml +++ b/tools/datagen/Cargo.toml @@ -29,6 +29,7 @@ all-features = true [dependencies] eyre = "0.6" clap = "2.33" +either = "1.6" futures = "0.3" icu_locid = { version = "0.4", path = "../../components/locid", features = ["std"]} icu_properties = { version = "0.4", path = "../../components/properties", features = ["std"]} @@ -52,3 +53,7 @@ path = "src/bin/datagen.rs" [[bin]] name = "icu4x-testdata-download" path = "src/bin/testdata-download.rs" + +[[bin]] +name = "icu4x-key-extract" +path = "src/bin/key-extract.rs" diff --git a/tools/datagen/README.md b/tools/datagen/README.md index 5337abcc59d..9dad544448a 100644 --- a/tools/datagen/README.md +++ b/tools/datagen/README.md @@ -6,6 +6,7 @@ The tools include: 1. `icu4x-datagen`: Read source data (CLDR JSON) and dump ICU4X-format data. 2. `icu4x-testdata-download`: Download fresh CLDR JSON for testdata. +3. `icu4x-key-extract`: Extract `ResourceKey` objects present in a compiled executable. More details on each tool can be found by running `--help`. diff --git a/tools/datagen/src/bin/key-extract.rs b/tools/datagen/src/bin/key-extract.rs new file mode 100644 index 00000000000..f51b10c0855 --- /dev/null +++ b/tools/datagen/src/bin/key-extract.rs @@ -0,0 +1,89 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use clap::{App, Arg}; +use either::Either; +use eyre::WrapErr; +use simple_logger::SimpleLogger; +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; + +fn main() -> eyre::Result<()> { + let args = App::new("ICU4X Resource Key Extractor") + .version("0.0.1") + .author("The ICU4X Project Developers") + .about("Extract resource keys from a compiled binary") + .arg( + Arg::with_name("VERBOSE") + .short("v") + .long("verbose") + .multiple(true) + .help("Sets the level of verbosity (-v, -vv, or -vvv)"), + ) + .arg( + Arg::with_name("INPUT") + .short("i") + .long("input") + .help( + "Path to the file to scan. If omitted, reads from standard input." + ) + .takes_value(true) + ) + .arg( + Arg::with_name("OUTPUT") + .short("o") + .long("out") + .help( + "Path to a file to print the detected resource keys. If omitted, writes to standard output.", + ) + .takes_value(true) + ) + .get_matches(); + + match args.occurrences_of("VERBOSE") { + 0 => SimpleLogger::new() + .env() + .with_level(log::LevelFilter::Info) + .init() + .unwrap(), + 1 => SimpleLogger::new() + .with_level(log::LevelFilter::Debug) + .init() + .unwrap(), + 2 => SimpleLogger::new() + .with_level(log::LevelFilter::Trace) + .init() + .unwrap(), + _ => eyre::bail!("Only -v and -vv are supported"), + } + + let read_stream: Either<_, _> = match args.value_of("INPUT") { + Some(path_str) => { + let path_buf = PathBuf::from(path_str); + Either::Left( + File::open(&path_buf).with_context(|| path_buf.to_string_lossy().into_owned())?, + ) + } + None => Either::Right(std::io::stdin()), + }; + + let mut write_stream: Either<_, _> = match args.value_of("OUTPUT") { + Some(path_str) => { + let path_buf = PathBuf::from(path_str); + Either::Left( + File::create(&path_buf).with_context(|| path_buf.to_string_lossy().into_owned())?, + ) + } + None => Either::Right(std::io::stdout()), + }; + + let keys = icu_provider::extract::extract_keys_from_byte_stream(read_stream)?; + + for key in keys { + writeln!(write_stream, 
"{}", key)?; + } + + Ok(()) +} From c511e8d9b6b72a0841a04ea4af4b969871ba2ad2 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 17:06:39 -0800 Subject: [PATCH 03/11] Add icu4x-key-extract test to CI --- .github/workflows/build-test.yml | 15 +++-- Makefile.toml | 2 + provider/core/src/extract.rs | 2 +- provider/core/src/resource.rs | 4 +- .../datagen/tests/testdata/work_log+keys.txt | 4 ++ tools/scripts/data.toml | 28 ++++++++++ tools/scripts/wasm.toml | 56 +++++++++++++++++++ 7 files changed, 103 insertions(+), 8 deletions(-) create mode 100644 tools/datagen/tests/testdata/work_log+keys.txt diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 0a3a395f14c..dfbf13e8052 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -231,6 +231,10 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Install Node.js v14.17.0 + uses: actions/setup-node@v1 + with: + node-version: 14.17.0 - name: Load nightly Rust toolchain for WASM. run: | rustup install nightly-2021-12-22 @@ -320,15 +324,16 @@ jobs: with: command: make args: wasm-release - - name: Install Node.js v14.17.0 - uses: actions/setup-node@v1 - with: - node-version: 14.17.0 - - name: Build + - name: Test uses: actions-rs/cargo@v1.0.1 with: command: make args: wasm-test-release + - name: Build Examples and test icu4x-key-extract + uses: actions-rs/cargo@v1.0.1 + with: + command: make + args: wasm-compare-worklog-keys # This has to be a separate test since the emscripten sdk # will otherwise interfere with other node-using tests - name: Run emscripten test diff --git a/Makefile.toml b/Makefile.toml index 2f66eb57583..6f4d74d5fde 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -91,6 +91,8 @@ dependencies = [ # we have to set up the environment for the emscripten job separately # Instead, each of these is called individually. 
"wasm-release", + "wasm-test-release", + "wasm-compare-worklog-keys", "wasm-cpp-emscripten", ] diff --git a/provider/core/src/extract.rs b/provider/core/src/extract.rs index 11c256e8d44..4c493ab5dd4 100644 --- a/provider/core/src/extract.rs +++ b/provider/core/src/extract.rs @@ -25,7 +25,7 @@ pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result k, None => continue }; diff --git a/provider/core/src/resource.rs b/provider/core/src/resource.rs index 2c624cddc2a..79e9ce3c15a 100644 --- a/provider/core/src/resource.rs +++ b/provider/core/src/resource.rs @@ -124,7 +124,7 @@ impl ResourceKey { /// /// assert_eq!(demo_key, recovered_key); /// ``` - pub fn from_repr_c(bytes: [u8; 40]) -> Option { + pub fn from_repr_c(bytes: &[u8; 40]) -> Option { // Smoke check if &bytes[0..8] != b"ICURES[[" || &bytes[36..40] != b"]]**" { return None; @@ -135,7 +135,7 @@ impl ResourceKey { // - We can't currently verify the ResourceCategory! // - TinyStr does not currently have a function that takes a byte *array* (with NULs). unsafe { - Some(core::mem::transmute(bytes)) + Some(core::mem::transmute(*bytes)) } } } diff --git a/tools/datagen/tests/testdata/work_log+keys.txt b/tools/datagen/tests/testdata/work_log+keys.txt new file mode 100644 index 00000000000..9f03fd10b11 --- /dev/null +++ b/tools/datagen/tests/testdata/work_log+keys.txt @@ -0,0 +1,4 @@ +datetime/lengths@1 +datetime/skeletons@1 +datetime/symbols@1 +plurals/ordinal@1 diff --git a/tools/scripts/data.toml b/tools/scripts/data.toml index f7668eae749..5fccdd58444 100644 --- a/tools/scripts/data.toml +++ b/tools/scripts/data.toml @@ -15,6 +15,17 @@ args = [ "-v", ] +[tasks.icu4x-key-extract] +description = "Build and smoke-test the icu4x-key-extract tool" +category = "ICU4X Data" +command = "cargo" +args = [ + "run", + "--bin=icu4x-key-extract", + "--", + "--help", +] + [tasks.testdata-build-json] description = "Build ICU4X JSON from the downloaded CLDR JSON, overwriting the existing ICU4X JSON." 
category = "ICU4X Data" @@ -65,6 +76,22 @@ args = [ "--overwrite", ] +[tasks.testdata-build-worklog-json] +description = "Build JSON files for the work_log example" +category = "ICU4X Data" +command = "cargo" +args = [ + "run", + "--bin=icu4x-datagen", + "--", + "--input-from-testdata", + "--out=tools/datagen/tests/testdata/work_log_json", + "--key-file=tools/datagen/tests/testdata/work_log+keys.txt", + "--locales=de", + "--locales=es", + "--overwrite", +] + [tasks.testdata] description = "Rebuild all ICU4X testdata from source data checked into the repository" category = "ICU4X Data" @@ -72,6 +99,7 @@ dependencies = [ "testdata-build-json", "testdata-build-blob", "testdata-build-blob-smaller", + "testdata-build-worklog-json", ] [tasks.testdata-check] diff --git a/tools/scripts/wasm.toml b/tools/scripts/wasm.toml index f1717a37bef..5c84478bb5f 100644 --- a/tools/scripts/wasm.toml +++ b/tools/scripts/wasm.toml @@ -334,6 +334,62 @@ end ''' dependencies = ["wasm-wasm-examples"] +[tasks.wasm-key-extract-examples] +description = "Run icu4x-key-extract on WASM files" +category = "ICU4X WASM" +script_runner = "@duckscript" +script = ''' +exit_on_error true + +mkdir wasmpkg/keys + +handle = glob_array wasmpkg/*.wasm +for src_path in ${handle} + path_no_extension = substring ${src_path} -5 + basename = substring ${path_no_extension} 8 + out_path = concat wasmpkg/keys/ ${basename} "+keys.txt" + + out_exists = is_path_exists ${out_path} + up_to_date = set false + if ${out_exists} + src_time = get_last_modified_time ${src_path} + out_time = get_last_modified_time ${out_path} + up_to_date = less_than ${src_time} ${out_time} + end + + if not ${up_to_date} + echo Writing ${out_path} + exec --fail-on-error cargo run --bin icu4x-key-extract -- -i ${src_path} -o ${out_path} + end +end +''' +dependencies = ["wasm-wasm-examples", "icu4x-key-extract"] + +[tasks.wasm-compare-worklog-keys] +description = "Compare the generated worklog key file with the golden version" +category = "ICU4X 
WASM" +script_runner = "@duckscript" +script = ''' +exit_on_error true + +expected = readfile tools/datagen/tests/testdata/work_log+keys.txt +actual = readfile wasmpkg/keys/work_log+keys.txt +are_equal = eq ${expected} ${actual} + +if ${are_equal} + exit 0 +else + echo "*****" + echo "work_log+keys.txt do not match! Actual generated output:" + echo "" + echo ${actual} + echo "If this is expected, copy the above output into tools/datagen/tests/testdata/work_log+keys.txt" + echo "*****" + exit 1 +end +''' +dependencies = ["wasm-key-extract-examples"] + [tasks.wasm-dev] description = "All-in-one command to build dev-mode WASM FFI to wasmpkg" category = "ICU4X WASM" From 84660258091d8e8e77707136107c0e216a05caf2 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 17:51:57 -0800 Subject: [PATCH 04/11] Add support for and test --key-file in icu4x-datagen --- .gitignore | 1 + Cargo.lock | 1 + tools/datagen/src/bin/datagen.rs | 37 ++++++++++++++++++++++---------- tools/scripts/data.toml | 33 ++++++++++++++-------------- 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 799786906a2..3c63338f000 100644 --- a/.gitignore +++ b/.gitignore @@ -23,6 +23,7 @@ dhat-heap.json # Do not check-in bincode test data provider/testdata/data/bincode +tools/datagen/tests/testdata/work_log_bincode # Ignore irrelevant files that get generated on macOS **/.DS_Store diff --git a/Cargo.lock b/Cargo.lock index c7b5d59c6c6..3234fde405d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1096,6 +1096,7 @@ name = "icu_datagen" version = "0.4.0" dependencies = [ "clap", + "either", "eyre", "futures", "icu_locid", diff --git a/tools/datagen/src/bin/datagen.rs b/tools/datagen/src/bin/datagen.rs index 5bb35054e3b..e1762174ab0 100644 --- a/tools/datagen/src/bin/datagen.rs +++ b/tools/datagen/src/bin/datagen.rs @@ -27,6 +27,10 @@ use simple_logger::SimpleLogger; use std::collections::HashSet; use std::path::PathBuf; use std::str::FromStr; +use std::io; 
+use std::io::BufRead; +use std::fs::File; +use std::borrow::Cow; use writeable::Writeable; fn main() -> eyre::Result<()> { @@ -240,10 +244,6 @@ fn main() -> eyre::Result<()> { .unwrap() } - if matches.is_present("KEY_FILE") { - eyre::bail!("Key file is not yet supported (see #192)",); - } - if matches.is_present("DRY_RUN") { eyre::bail!("Dry-run is not yet supported"); } @@ -280,14 +280,29 @@ fn main() -> eyre::Result<()> { _ => unreachable!(), }; + let mut allowed_keys: Option>> = None; + if let Some(keys) = matches.values_of("KEYS") { + let allowed_keys = allowed_keys.get_or_insert_with(Default::default); + allowed_keys.extend(keys.map(|s| Cow::Borrowed(s))); + } + if let Some(key_file_path) = matches.value_of_os("KEY_FILE") { + // eyre::bail!("Key file is not yet supported (see #192)",); + let allowed_keys = allowed_keys.get_or_insert_with(Default::default); + let file = File::open(key_file_path).with_context(|| key_file_path.to_string_lossy().into_owned())?; + for line in io::BufReader::new(file).lines() { + let line_string = line.with_context(|| key_file_path.to_string_lossy().into_owned())?; + allowed_keys.insert(Cow::Owned(line_string)); + } + } + if matches.is_present("ALL_KEYS") || matches.is_present("KEYS") + || matches.is_present("KEY_FILE") || matches.is_present("TEST_KEYS") { - let keys = matches.values_of("KEYS").map(|values| values.collect()); - export_cldr(&matches, exporter, locales_vec.as_deref(), keys.as_ref())?; - export_set_props(&matches, exporter, keys.as_ref())?; - export_map_props(&matches, exporter, keys.as_ref())?; + export_cldr(&matches, exporter, locales_vec.as_deref(), allowed_keys.as_ref())?; + export_set_props(&matches, exporter, allowed_keys.as_ref())?; + export_map_props(&matches, exporter, allowed_keys.as_ref())?; } if matches.is_present("HELLO_WORLD") { @@ -379,7 +394,7 @@ fn export_cldr( matches: &ArgMatches, exporter: &mut (impl DataExporter + ?Sized), allowed_locales: Option<&[LanguageIdentifier]>, - allowed_keys: 
Option<&HashSet<&str>>, + allowed_keys: Option<&HashSet>>, ) -> eyre::Result<()> { let locale_subset = matches.value_of("CLDR_LOCALE_SUBSET").unwrap_or("full"); let cldr_paths: Box = if let Some(tag) = matches.value_of("CLDR_TAG") { @@ -437,7 +452,7 @@ fn export_cldr( fn export_set_props( matches: &ArgMatches, exporter: &mut (impl DataExporter + ?Sized), - allowed_keys: Option<&HashSet<&str>>, + allowed_keys: Option<&HashSet>>, ) -> eyre::Result<()> { log::trace!("Loading data for binary properties..."); @@ -485,7 +500,7 @@ fn export_set_props( fn export_map_props( matches: &ArgMatches, exporter: &mut (impl DataExporter + ?Sized), - allowed_keys: Option<&HashSet<&str>>, + allowed_keys: Option<&HashSet>>, ) -> eyre::Result<()> { log::trace!("Loading data for enumerated properties..."); diff --git a/tools/scripts/data.toml b/tools/scripts/data.toml index 5fccdd58444..9aaaaaa0d87 100644 --- a/tools/scripts/data.toml +++ b/tools/scripts/data.toml @@ -76,22 +76,6 @@ args = [ "--overwrite", ] -[tasks.testdata-build-worklog-json] -description = "Build JSON files for the work_log example" -category = "ICU4X Data" -command = "cargo" -args = [ - "run", - "--bin=icu4x-datagen", - "--", - "--input-from-testdata", - "--out=tools/datagen/tests/testdata/work_log_json", - "--key-file=tools/datagen/tests/testdata/work_log+keys.txt", - "--locales=de", - "--locales=es", - "--overwrite", -] - [tasks.testdata] description = "Rebuild all ICU4X testdata from source data checked into the repository" category = "ICU4X Data" @@ -155,3 +139,20 @@ args = [ "--syntax=bincode", "--overwrite", ] + +[tasks.testdata-build-worklog-bincode] +description = "Build Bincode files for the work_log example" +category = "ICU4X Data" +command = "cargo" +args = [ + "run", + "--bin=icu4x-datagen", + "--", + "--input-from-testdata", + "--out=tools/datagen/tests/testdata/work_log_bincode", + "--key-file=tools/datagen/tests/testdata/work_log+keys.txt", + "--locales=de", + "--locales=es", + "--syntax=bincode", + 
"--overwrite", +] From 22cabee43dfc5a99f4dcac0afcca92be601d5b9d Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 18:00:47 -0800 Subject: [PATCH 05/11] Update docs for new icu4x-key-extract tool --- tools/datagen/README.md | 35 ++++++++++---------- tools/datagen/src/main.rs | 68 +++++++++++++++++++-------------------- 2 files changed, 51 insertions(+), 52 deletions(-) diff --git a/tools/datagen/README.md b/tools/datagen/README.md index 9dad544448a..be4eb5b2894 100644 --- a/tools/datagen/README.md +++ b/tools/datagen/README.md @@ -12,39 +12,38 @@ More details on each tool can be found by running `--help`. ## Examples -Generate ICU4X JSON file tree: +Generate ICU4X Postcard blob (single file) for all keys and all locales: ```bash # Run from the icu4x project folder $ cargo run --bin icu4x-datagen -- \ - --cldr-tag 39.0.0 \ - --all-keys \ - --all-locales \ - --out /tmp/icu4x_data/json + --cldr-tag 39.0.0 \ + --all-keys \ + --all-locales \ + --format blob \ + --out /tmp/icu4x_data/icu4x_data.postcard ``` -Generate ICU4X Postcard blob (single file): +Extract the keys from an executable into a key file: ```bash # Run from the icu4x project folder -$ cargo run --bin icu4x-datagen -- \ - --cldr-tag 39.0.0 \ - --all-keys \ - --all-locales \ - --format blob \ - --out /tmp/icu4x_data/icu4x_data.postcard +$ cargo build --example work_log +$ cargo run --bin icu4x-key-extract -- \ + -i target/debug/examples/work_log + -o /tmp/icu4x_data/work_log+keys.txt ``` -Generate ICU4X Bincode file tree: +Generate ICU4X JSON file tree from the key file for Spanish and German: ```bash # Run from the icu4x project folder $ cargo run --bin icu4x-datagen -- \ - --cldr-tag 39.0.0 \ - --all-keys \ - --all-locales \ - --syntax bincode \ - --out /tmp/icu4x_data/bincode + --cldr-tag 39.0.0 \ + --key-file /tmp/icu4x_data/work_log+keys.txt \ + --locales es \ + --locales de \ + --out /tmp/icu4x_data/work_log_json ``` ## More Information diff --git a/tools/datagen/src/main.rs 
b/tools/datagen/src/main.rs index 0b70db1c863..e38627bcd3c 100644 --- a/tools/datagen/src/main.rs +++ b/tools/datagen/src/main.rs @@ -8,45 +8,45 @@ //! //! 1. `icu4x-datagen`: Read source data (CLDR JSON) and dump ICU4X-format data. //! 2. `icu4x-testdata-download`: Download fresh CLDR JSON for testdata. +//! 3. `icu4x-key-extract`: Extract `ResourceKey` objects present in a compiled executable. //! //! More details on each tool can be found by running `--help`. //! //! # Examples //! -//! Generate ICU4X JSON file tree: -//! -//!```bash -//!# Run from the icu4x project folder -//!$ cargo run --bin icu4x-datagen -- \ -//! --cldr-tag 39.0.0 \ -//! --all-keys \ -//! --all-locales \ -//! --out /tmp/icu4x_data/json -//!``` -//! -//! Generate ICU4X Postcard blob (single file): -//! -//!```bash -//!# Run from the icu4x project folder -//!$ cargo run --bin icu4x-datagen -- \ -//! --cldr-tag 39.0.0 \ -//! --all-keys \ -//! --all-locales \ -//! --format blob \ -//! --out /tmp/icu4x_data/icu4x_data.postcard -//!``` -//! -//! Generate ICU4X Bincode file tree: -//! -//!```bash -//!# Run from the icu4x project folder -//!$ cargo run --bin icu4x-datagen -- \ -//! --cldr-tag 39.0.0 \ -//! --all-keys \ -//! --all-locales \ -//! --syntax bincode \ -//! --out /tmp/icu4x_data/bincode -//!``` +//! Generate ICU4X Postcard blob (single file) for all keys and all locales: +//! +//! ```bash +//! # Run from the icu4x project folder +//! $ cargo run --bin icu4x-datagen -- \ +//! --cldr-tag 39.0.0 \ +//! --all-keys \ +//! --all-locales \ +//! --format blob \ +//! --out /tmp/icu4x_data/icu4x_data.postcard +//! ``` +//! +//! Extract the keys from an executable into a key file: +//! +//! ```bash +//! # Run from the icu4x project folder +//! $ cargo build --example work_log +//! $ cargo run --bin icu4x-key-extract -- \ +//! -i target/debug/examples/work_log +//! -o /tmp/icu4x_data/work_log+keys.txt +//! ``` +//! +//! Generate ICU4X JSON file tree from the key file for Spanish and German: +//! 
+//! ```bash +//! # Run from the icu4x project folder +//! $ cargo run --bin icu4x-datagen -- \ +//! --cldr-tag 39.0.0 \ +//! --key-file /tmp/icu4x_data/work_log+keys.txt \ +//! --locales es \ +//! --locales de \ +//! --out /tmp/icu4x_data/work_log_json +//! ``` fn main() { panic!("Please run a more specific binary") From 596e252a159e4249504a347af854a42f0f687a19 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 18:11:07 -0800 Subject: [PATCH 06/11] Update readmes again --- tools/datagen/README.md | 5 +++-- tools/datagen/src/main.rs | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/datagen/README.md b/tools/datagen/README.md index be4eb5b2894..84116467c8c 100644 --- a/tools/datagen/README.md +++ b/tools/datagen/README.md @@ -28,10 +28,11 @@ Extract the keys from an executable into a key file: ```bash # Run from the icu4x project folder -$ cargo build --example work_log +$ cargo build --example work_log --release $ cargo run --bin icu4x-key-extract -- \ - -i target/debug/examples/work_log + -i target/release/examples/work_log \ -o /tmp/icu4x_data/work_log+keys.txt +$ cat /tmp/icu4x_data/work_log+keys.txt ``` Generate ICU4X JSON file tree from the key file for Spanish and German: diff --git a/tools/datagen/src/main.rs b/tools/datagen/src/main.rs index e38627bcd3c..466332214e2 100644 --- a/tools/datagen/src/main.rs +++ b/tools/datagen/src/main.rs @@ -30,10 +30,11 @@ //! //! ```bash //! # Run from the icu4x project folder -//! $ cargo build --example work_log +//! $ cargo build --example work_log --release //! $ cargo run --bin icu4x-key-extract -- \ -//! -i target/debug/examples/work_log +//! -i target/release/examples/work_log \ //! -o /tmp/icu4x_data/work_log+keys.txt +//! $ cat /tmp/icu4x_data/work_log+keys.txt //! ``` //! //! Generate ICU4X JSON file tree from the key file for Spanish and German: From 87c93d11b546042e564af4a433ac01dce468d810 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Thu, 30 Dec 2021 18:16:01 -0800 Subject: [PATCH 07/11] fmt --- provider/core/src/extract.rs | 23 ++++++++++++++--------- provider/core/src/resource.rs | 6 ++---- tools/datagen/src/bin/datagen.rs | 18 ++++++++++++------ tools/datagen/src/main.rs | 4 ++-- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/provider/core/src/extract.rs b/provider/core/src/extract.rs index 4c493ab5dd4..5cdd4445d59 100644 --- a/provider/core/src/extract.rs +++ b/provider/core/src/extract.rs @@ -5,9 +5,9 @@ //! Utilities for extracting `ResourceKey` objects from a byte stream. Requires the "std" feature. use crate::prelude::*; +use std::io; use std::io::BufRead; use std::io::BufReader; -use std::io; pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result> { let mut reader = BufReader::with_capacity(1024, stream); @@ -20,14 +20,14 @@ pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result k, - None => continue + None => continue, }; output.push(resc_key); } @@ -50,15 +50,20 @@ mod test { #[test] fn test_extract_golden() { let keys = extract_keys_from_byte_stream(&*GOLDEN_BYTES).unwrap(); - assert_eq!(keys, vec![ - resource_key!(DateTime, "skeletons", 1), - resource_key!(DateTime, "symbols", 1), - ]); + assert_eq!( + keys, + vec![ + resource_key!(DateTime, "skeletons", 1), + resource_key!(DateTime, "symbols", 1), + ] + ); } #[test] fn test_extract_large() { - let keys: Vec = (0u8..=255u8).map(|i| resource_key!(Core, "demo", i)).collect(); + let keys: Vec = (0u8..=255u8) + .map(|i| resource_key!(Core, "demo", i)) + .collect(); let mut buffer: Vec = Vec::new(); for key in keys.iter() { // Insert some garbage diff --git a/provider/core/src/resource.rs b/provider/core/src/resource.rs index 79e9ce3c15a..49adb8c21a0 100644 --- a/provider/core/src/resource.rs +++ b/provider/core/src/resource.rs @@ -106,7 +106,7 @@ impl ResourceKey { } /// Recovers a [`ResourceKey`] from its `repr(C)` bytes. 
- /// + /// /// Returns `None` if the bytes are not a valid [`ResourceKey`]. /// /// # Examples @@ -134,9 +134,7 @@ impl ResourceKey { // TODO(#1457): This is not safe! // - We can't currently verify the ResourceCategory! // - TinyStr does not currently have a function that takes a byte *array* (with NULs). - unsafe { - Some(core::mem::transmute(*bytes)) - } + unsafe { Some(core::mem::transmute(*bytes)) } } } diff --git a/tools/datagen/src/bin/datagen.rs b/tools/datagen/src/bin/datagen.rs index e1762174ab0..d1715fe944f 100644 --- a/tools/datagen/src/bin/datagen.rs +++ b/tools/datagen/src/bin/datagen.rs @@ -24,13 +24,13 @@ use icu_provider_fs::export::FilesystemExporter; use icu_provider_fs::manifest; use icu_provider_uprops::{EnumeratedPropertyCodePointTrieProvider, PropertiesDataProvider}; use simple_logger::SimpleLogger; +use std::borrow::Cow; use std::collections::HashSet; -use std::path::PathBuf; -use std::str::FromStr; +use std::fs::File; use std::io; use std::io::BufRead; -use std::fs::File; -use std::borrow::Cow; +use std::path::PathBuf; +use std::str::FromStr; use writeable::Writeable; fn main() -> eyre::Result<()> { @@ -288,7 +288,8 @@ fn main() -> eyre::Result<()> { if let Some(key_file_path) = matches.value_of_os("KEY_FILE") { // eyre::bail!("Key file is not yet supported (see #192)",); let allowed_keys = allowed_keys.get_or_insert_with(Default::default); - let file = File::open(key_file_path).with_context(|| key_file_path.to_string_lossy().into_owned())?; + let file = File::open(key_file_path) + .with_context(|| key_file_path.to_string_lossy().into_owned())?; for line in io::BufReader::new(file).lines() { let line_string = line.with_context(|| key_file_path.to_string_lossy().into_owned())?; allowed_keys.insert(Cow::Owned(line_string)); @@ -300,7 +301,12 @@ fn main() -> eyre::Result<()> { || matches.is_present("KEY_FILE") || matches.is_present("TEST_KEYS") { - export_cldr(&matches, exporter, locales_vec.as_deref(), allowed_keys.as_ref())?; + export_cldr( 
+ &matches, + exporter, + locales_vec.as_deref(), + allowed_keys.as_ref(), + )?; export_set_props(&matches, exporter, allowed_keys.as_ref())?; export_map_props(&matches, exporter, allowed_keys.as_ref())?; } diff --git a/tools/datagen/src/main.rs b/tools/datagen/src/main.rs index 466332214e2..07c5cf2cddb 100644 --- a/tools/datagen/src/main.rs +++ b/tools/datagen/src/main.rs @@ -25,9 +25,9 @@ //! --format blob \ //! --out /tmp/icu4x_data/icu4x_data.postcard //! ``` -//! +//! //! Extract the keys from an executable into a key file: -//! +//! //! ```bash //! # Run from the icu4x project folder //! $ cargo build --example work_log --release From 6e8b75de8e62b5e91bbd8caab35df3845dd7941b Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 18:22:23 -0800 Subject: [PATCH 08/11] Add missing docs --- provider/core/src/extract.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/provider/core/src/extract.rs b/provider/core/src/extract.rs index 5cdd4445d59..b62380ce1c3 100644 --- a/provider/core/src/extract.rs +++ b/provider/core/src/extract.rs @@ -9,13 +9,21 @@ use std::io; use std::io::BufRead; use std::io::BufReader; +/// Extracts [`ResourceKey`] objects from a byte stream. +/// +/// This function looks for all occurrences of the `repr(C)` of `ResourceKey` in the byte stream, +/// and reads them back as Rust objects. +/// +/// The byte stream is often the output of a compiler, like a WASM file or an x86 executable. +/// +/// To run this function as a command-line tool, use `icu4x-key-extract`. pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result<Vec<ResourceKey>> { let mut reader = BufReader::with_capacity(1024, stream); let mut working_buffer = [0u8; 1024 + 39]; let mut output = Vec::new(); loop { let reader_buffer = reader.fill_buf()?; - if reader_buffer.len() == 0 { + if reader_buffer.is_empty() { break; } let len = reader_buffer.len(); From 2f7e1c231a6edb7e7cf325c6bee322c0333cc889 Mon Sep 17 00:00:00 2001 From: "Shane F.
Carr" Date: Thu, 30 Dec 2021 20:55:47 -0800 Subject: [PATCH 09/11] Fix up bincode jobs --- Makefile.toml | 2 +- tools/scripts/data.toml | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile.toml b/Makefile.toml index 6f4d74d5fde..b643b027119 100644 --- a/Makefile.toml +++ b/Makefile.toml @@ -48,7 +48,7 @@ dependencies = [ "test-all-features", "test-docs-default", "test-docs", - "testdata-build-bincode-all", + "testdata-build-worklog-bincode", "testdata-check", ] diff --git a/tools/scripts/data.toml b/tools/scripts/data.toml index 9aaaaaa0d87..5b3121372a7 100644 --- a/tools/scripts/data.toml +++ b/tools/scripts/data.toml @@ -83,7 +83,6 @@ dependencies = [ "testdata-build-json", "testdata-build-blob", "testdata-build-blob-smaller", - "testdata-build-worklog-json", ] [tasks.testdata-check] From 63b08a54b48f7d730f34274f4b7fc18268d2afdd Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Thu, 30 Dec 2021 23:37:39 -0800 Subject: [PATCH 10/11] Fix docs test --- provider/core/src/resource.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/core/src/resource.rs b/provider/core/src/resource.rs index 49adb8c21a0..1df56960ea5 100644 --- a/provider/core/src/resource.rs +++ b/provider/core/src/resource.rs @@ -119,7 +119,7 @@ impl ResourceKey { /// let repr_c_bytes: [u8; 40] = unsafe { /// core::mem::transmute(demo_key) /// }; - /// let recovered_key = ResourceKey::from_repr_c(repr_c_bytes) + /// let recovered_key = ResourceKey::from_repr_c(&repr_c_bytes) /// .expect("The bytes are valid"); /// /// assert_eq!(demo_key, recovered_key); From b665dbb1c51a882214d781681eabfe84fc20feb1 Mon Sep 17 00:00:00 2001 From: "Shane F. 
Carr" Date: Fri, 31 Dec 2021 11:16:59 -0800 Subject: [PATCH 11/11] Remove obsolete comment --- tools/datagen/src/bin/datagen.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/datagen/src/bin/datagen.rs b/tools/datagen/src/bin/datagen.rs index d1715fe944f..13a4308cdb1 100644 --- a/tools/datagen/src/bin/datagen.rs +++ b/tools/datagen/src/bin/datagen.rs @@ -286,7 +286,6 @@ fn main() -> eyre::Result<()> { allowed_keys.extend(keys.map(|s| Cow::Borrowed(s))); } if let Some(key_file_path) = matches.value_of_os("KEY_FILE") { - // eyre::bail!("Key file is not yet supported (see #192)",); let allowed_keys = allowed_keys.get_or_insert_with(Default::default); let file = File::open(key_file_path) .with_context(|| key_file_path.to_string_lossy().into_owned())?;