-
Notifications
You must be signed in to change notification settings - Fork 184
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add icu4x-key-extract for Static Data Slicing #1460
Changes from 1 commit
9ae42b6
b4c75ae
c511e8d
8466025
22cabee
596e252
87c93d1
6e8b75d
2f7e1c2
63b08a5
b665dbb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
//! Utilities for extracting `ResourceKey` objects from a byte stream. Requires the "std" feature. | ||
|
||
use crate::prelude::*; | ||
use std::io::BufRead; | ||
use std::io::BufReader; | ||
use std::io; | ||
|
||
pub fn extract_keys_from_byte_stream(stream: impl io::Read) -> io::Result<Vec<ResourceKey>> { | ||
let mut reader = BufReader::with_capacity(1024, stream); | ||
let mut working_buffer = [0u8; 1024 + 39]; | ||
let mut output = Vec::new(); | ||
loop { | ||
let reader_buffer = reader.fill_buf()?; | ||
if reader_buffer.len() == 0 { | ||
break; | ||
} | ||
let len = reader_buffer.len(); | ||
// Save 39 bytes from iteration to iteration: one less than a 40-byte window | ||
working_buffer[39..(39+len)].copy_from_slice(reader_buffer); | ||
for window in working_buffer[..(39+len)].windows(40) { | ||
if &window[0..8] == b"ICURES[[" && &window[36..40] == b"]]**" { | ||
let mut bytes: [u8; 40] = [0; 40]; | ||
bytes.copy_from_slice(window); | ||
let resc_key = match ResourceKey::from_repr_c(bytes) { | ||
Some(k) => k, | ||
None => continue | ||
}; | ||
output.push(resc_key); | ||
} | ||
} | ||
reader.consume(len); | ||
working_buffer.copy_within(len.., 0); | ||
} | ||
output.sort(); | ||
output.dedup(); | ||
Ok(output) | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use super::*; | ||
use crate::resource_key; | ||
|
||
const GOLDEN_BYTES: &[u8] = b"\x00\x00ICURES[[\x02\x00\x00\x00\x00\x00\x00\x00skeletons\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00]]**ICURES[[\x02\x00\x00\x00\x00\x00\x00\x00symbols\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00]]**\x00\x00"; | ||
|
||
#[test] | ||
fn test_extract_golden() { | ||
let keys = extract_keys_from_byte_stream(&*GOLDEN_BYTES).unwrap(); | ||
assert_eq!(keys, vec![ | ||
resource_key!(DateTime, "skeletons", 1), | ||
resource_key!(DateTime, "symbols", 1), | ||
]); | ||
} | ||
|
||
#[test] | ||
fn test_extract_large() { | ||
let keys: Vec<ResourceKey> = (0u8..=255u8).map(|i| resource_key!(Core, "demo", i)).collect(); | ||
let mut buffer: Vec<u8> = Vec::new(); | ||
for key in keys.iter() { | ||
// Insert some garbage | ||
buffer.extend(b"\x00ICURES[[\x00\x00]]**\x00\x00"); | ||
// This is safe because we are transmuting to a POD type | ||
let key_bytes: [u8; 40] = unsafe { core::mem::transmute(*key) }; | ||
buffer.extend(&key_bytes); | ||
} | ||
let extracted_keys = extract_keys_from_byte_stream(&*buffer).unwrap(); | ||
assert_eq!(keys, extracted_keys); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,12 +15,13 @@ use core::default::Default; | |
use core::fmt; | ||
use core::fmt::Write; | ||
use icu_locid::LanguageIdentifier; | ||
use tinystr::{TinyStr16, TinyStr4}; | ||
use tinystr::{tinystr4, tinystr8, TinyStr16, TinyStr4, TinyStr8}; | ||
use writeable::{LengthHint, Writeable}; | ||
|
||
/// A top-level collection of related resource keys. | ||
#[non_exhaustive] | ||
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone, Debug)] | ||
#[repr(C)] | ||
pub enum ResourceCategory { | ||
Core, | ||
Calendar, | ||
|
@@ -79,10 +80,64 @@ impl writeable::Writeable for ResourceCategory { | |
/// | ||
/// Use [`resource_key!`] as a shortcut to create resource keys in code. | ||
#[derive(PartialEq, Eq, PartialOrd, Ord, Copy, Clone)] | ||
#[repr(C)] | ||
pub struct ResourceKey { | ||
_tag0: TinyStr8, | ||
pub category: ResourceCategory, | ||
pub sub_category: TinyStr16, | ||
pub version: u16, | ||
pub version: u8, | ||
_tag1: TinyStr4, | ||
} | ||
Comment on lines
+83
to
+90
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we want to put these changes behind a feature? If this is going to be transparent to the client, we can also set the feature correctly and avoid the tags in production code. |
||
|
||
impl ResourceKey { | ||
/// Creates a new [`ResourceKey`]. | ||
pub const fn new( | ||
category: ResourceCategory, | ||
sub_category: TinyStr16, | ||
version: u8, | ||
) -> ResourceKey { | ||
ResourceKey { | ||
_tag0: tinystr8!("ICURES[["), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this going to work for a BE binary? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will work so long as the fields are endian-agnostic. I changed the version field to be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For TinyStr in particular, there's a conversation about it. Short answer is that it actually is endian-safe. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For ResourceCategory it will be fine as long as it's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ResourceCategory is dataful, which is why it is hard. The other problem is that I intend to remove ResourceCategory, so I don't want to spend a lot of time solving it. Also, even if ResourceCategory were What do you suggest as the short-term solution?
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So Regarless of the answer to that question, I think it would be better to use byte arrays (and byte string literals). They are a primitive, are endian-agnostic, and don't require a future reader to be familiar with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I did not realize ResourceCategory is dataful. I'm okay with having this code for now but I'd prioritize fixing this up. @robertbastian yes, because it has the same bit pattern as the string, and strings are endianness-independent. In fact I've thought about making tinystr internally use byte arrays in the first place. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I still think it overcomplicates things to write a |
||
category, | ||
sub_category, | ||
version, | ||
_tag1: tinystr4!("]]**"), | ||
} | ||
} | ||
|
||
/// Recovers a [`ResourceKey`] from its `repr(C)` bytes. | ||
/// | ||
/// Returns `None` if the bytes are not a valid [`ResourceKey`]. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use icu_provider::prelude::*; | ||
/// | ||
/// let demo_key = icu_provider::resource_key!(Core, "demo", 1); | ||
/// // This is safe because we are transmuting to a POD type | ||
/// let repr_c_bytes: [u8; 40] = unsafe { | ||
/// core::mem::transmute(demo_key) | ||
/// }; | ||
/// let recovered_key = ResourceKey::from_repr_c(repr_c_bytes) | ||
/// .expect("The bytes are valid"); | ||
/// | ||
/// assert_eq!(demo_key, recovered_key); | ||
/// ``` | ||
pub fn from_repr_c(bytes: [u8; 40]) -> Option<ResourceKey> { | ||
// Smoke check | ||
if &bytes[0..8] != b"ICURES[[" || &bytes[36..40] != b"]]**" { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
return None; | ||
} | ||
|
||
// TODO(#1457): Use a ULE-like code path here. | ||
// TODO(#1457): This is not safe! | ||
// - We can't currently verify the ResourceCategory! | ||
// - TinyStr does not currently have a function that takes a byte *array* (with NULs). | ||
unsafe { | ||
Some(core::mem::transmute(bytes)) | ||
} | ||
} | ||
} | ||
|
||
/// Shortcut to construct a const resource identifier. | ||
|
@@ -119,11 +174,11 @@ macro_rules! resource_key { | |
) | ||
}; | ||
($category:expr, $sub_category:literal, $version:tt) => { | ||
$crate::ResourceKey { | ||
category: $category, | ||
sub_category: $crate::internal::tinystr16!($sub_category), | ||
version: $version, | ||
} | ||
$crate::ResourceKey::new( | ||
$category, | ||
$crate::internal::tinystr16!($sub_category), | ||
$version, | ||
) | ||
}; | ||
} | ||
|
||
|
@@ -433,20 +488,20 @@ mod tests { | |
expected: "core/cardinal@1", | ||
}, | ||
KeyTestCase { | ||
resc_key: ResourceKey { | ||
category: ResourceCategory::PrivateUse(tinystr4!("priv")), | ||
sub_category: tinystr::tinystr16!("cardinal"), | ||
version: 1, | ||
}, | ||
resc_key: ResourceKey::new( | ||
ResourceCategory::PrivateUse(tinystr4!("priv")), | ||
tinystr::tinystr16!("cardinal"), | ||
1, | ||
), | ||
expected: "x-priv/cardinal@1", | ||
}, | ||
KeyTestCase { | ||
resc_key: resource_key!(Core, "maxlengthsubcatg", 1), | ||
expected: "core/maxlengthsubcatg@1", | ||
}, | ||
KeyTestCase { | ||
resc_key: resource_key!(Core, "cardinal", 65535), | ||
expected: "core/cardinal@65535", | ||
resc_key: resource_key!(Core, "cardinal", 255), | ||
expected: "core/cardinal@255", | ||
}, | ||
] | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thought: not a huge fan of this, but I guess it's no big deal and it does seem to be the best way to achieve this