Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

split_at audit #5970

Merged
merged 4 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions components/collator/src/elements.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1888,11 +1888,9 @@ where
while !remaining.is_empty() {
// Numeric CEs are generated for segments of
// up to 254 digits.
let (head, tail) = if remaining.len() > 254 {
remaining.split_at(254)
} else {
(remaining, &b""[..])
};
let (head, tail) = remaining
.split_at_checked(254)
.unwrap_or((remaining, b""));
remaining = tail;
// From ICU4C CollationIterator::appendNumericSegmentCEs
if head.len() <= 7 {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,10 +122,9 @@ impl SubdivisionId {
})
.ok_or(ParseError::InvalidExtension)?;
let region_len = if is_alpha { 2 } else { 3 };
if code_units.len() < region_len + 1 {
return Err(ParseError::InvalidExtension);
}
let (region_code_units, suffix_code_units) = code_units.split_at(region_len);
let (region_code_units, suffix_code_units) = code_units
.split_at_checked(region_len)
.ok_or(ParseError::InvalidExtension)?;
let region =
Region::try_from_utf8(region_code_units).map_err(|_| ParseError::InvalidExtension)?;
let suffix = SubdivisionSuffix::try_from_utf8(suffix_code_units)?;
Expand Down
11 changes: 2 additions & 9 deletions components/locale_core/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@ const fn skip_before_separator(slice: &[u8]) -> &[u8] {
}

// Notice: this slice may be empty for cases like `"en-"` or `"en--US"`
// MSRV 1.71/1.79: Use split_at/split_at_unchecked
// SAFETY: end < slice.len() by while loop
unsafe { core::slice::from_raw_parts(slice.as_ptr(), end) }
unsafe { slice.split_at_unchecked(end).0 }
sffc marked this conversation as resolved.
Show resolved Hide resolved
}

// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
Expand Down Expand Up @@ -66,14 +65,8 @@ impl<'a> SubtagIterator<'a> {

self.current = if result.len() < self.remaining.len() {
// If there is more after `result`, by construction `current` starts with a separator
// MSRV 1.79: Use split_at_unchecked
// SAFETY: `self.remaining` is strictly longer than `result`
self.remaining = unsafe {
core::slice::from_raw_parts(
self.remaining.as_ptr().add(result.len() + 1),
self.remaining.len() - (result.len() + 1),
)
};
self.remaining = unsafe { self.remaining.split_at_unchecked(result.len() + 1).1 };
Some(skip_before_separator(self.remaining))
} else {
None
Expand Down
29 changes: 10 additions & 19 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1454,14 +1454,11 @@ macro_rules! normalizer_methods {
/// is the normalization of the whole input.
pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
let up_to = self.is_normalized_up_to(text);
if up_to <= text.len() {
// not using split_at_checked due to MSRV
text.split_at(up_to)
} else {
text.split_at_checked(up_to).unwrap_or_else(|| {
// Internal bug, not even GIGO, never supposed to happen
debug_assert!(false);
("", text)
}
})
}

/// Return the index a string slice is normalized up to.
Expand Down Expand Up @@ -1502,14 +1499,11 @@ macro_rules! normalizer_methods {
#[cfg(feature = "utf16_iter")]
pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
let up_to = self.is_normalized_utf16_up_to(text);
if up_to <= text.len() {
// not using split_at_checked due to MSRV
text.split_at(up_to)
} else {
text.split_at_checked(up_to).unwrap_or_else(|| {
// Internal bug, not even GIGO, never supposed to happen
debug_assert!(false);
(&[], text)
}
})
}

/// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
Expand Down Expand Up @@ -1559,17 +1553,14 @@ macro_rules! normalizer_methods {
#[cfg(feature = "utf8_iter")]
pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
let up_to = self.is_normalized_utf8_up_to(text);
if up_to <= text.len() {
// not using split_at_checked due to MSRV
let (head, tail) = text.split_at(up_to);
// SAFETY: The normalization check also checks for
// UTF-8 well-formedness.
(unsafe { core::str::from_utf8_unchecked(head) }, tail)
} else {
let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
// Internal bug, not even GIGO, never supposed to happen
debug_assert!(false);
("", text)
}
(&[], text)
});
// SAFETY: The normalization check also checks for
// UTF-8 well-formedness.
(unsafe { core::str::from_utf8_unchecked(head) }, tail)
}

/// Return the index a slice of potentially-invalid UTF-8 is normalized up to
Expand Down
25 changes: 9 additions & 16 deletions components/pattern/src/multi_named.rs
Original file line number Diff line number Diff line change
Expand Up @@ -434,38 +434,31 @@ impl<'a> MultiNamedPlaceholderPatternIterator<'a> {
match self.store.find(|x| (x as usize) <= 0x07) {
Some(0) => {
// Placeholder
let Some(&[lead, trail]) = self.store.as_bytes().get(0..2) else {
let Some((&[lead, trail], remainder)) = self
.store
.split_at_checked(2)
.map(|(a, b)| (a.as_bytes(), b))
else {
return Err(MultiNamedPlaceholderError::InvalidStore);
};
robertbastian marked this conversation as resolved.
Show resolved Hide resolved
debug_assert!(lead <= 7);
if trail > 7 {
return Err(MultiNamedPlaceholderError::InvalidStore);
}
let placeholder_len = (lead << 3) + trail;
let boundary = (placeholder_len as usize) + 2;
// TODO: use .split_at_checked() when available:
// https://github.com/rust-lang/rust/issues/119128
let Some(placeholder_name) = self.store.get(2..boundary) else {
let Some((placeholder_name, remainder)) =
remainder.split_at_checked(placeholder_len as usize)
else {
return Err(MultiNamedPlaceholderError::InvalidStore);
};
robertbastian marked this conversation as resolved.
Show resolved Hide resolved
let Some(remainder) = self.store.get(boundary..) else {
debug_assert!(false, "should be a perfect slice");
return Err(MultiNamedPlaceholderError::Unreachable);
};
self.store = remainder;
Ok(Some(PatternItem::Placeholder(MultiNamedPlaceholderKey(
placeholder_name,
))))
}
Some(i) => {
// Literal
// TODO: use .split_at_checked() when available:
// https://github.com/rust-lang/rust/issues/119128
let Some(literal) = self.store.get(..i) else {
debug_assert!(false, "should be a perfect slice");
return Err(MultiNamedPlaceholderError::Unreachable);
};
let Some(remainder) = self.store.get(i..) else {
let Some((literal, remainder)) = self.store.split_at_checked(i) else {
debug_assert!(false, "should be a perfect slice");
return Err(MultiNamedPlaceholderError::Unreachable);
};
Expand Down
12 changes: 5 additions & 7 deletions components/plurals/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -451,12 +451,7 @@ where
})
} else {
let (second_byte, remainder) = remainder.split_first()?;
// TODO in Rust 1.80: use split_at_checked
let v_length = *second_byte as usize;
if remainder.len() < v_length {
return None;
}
let (v_bytes, remainder) = remainder.split_at(v_length);
let (v_bytes, remainder) = remainder.split_at_checked(*second_byte as usize)?;
Some(PluralElementsUnpackedBytes {
lead_byte: *lead_byte,
v_bytes,
Expand Down Expand Up @@ -824,7 +819,10 @@ where
panic!("other value too long to be packed: {self:?}")
}
};
let (v_bytes, specials_bytes) = remainder.split_at_mut(*second_byte as usize);
#[allow(clippy::unwrap_used)] // for now okay since it is mostly only during datagen
let (v_bytes, specials_bytes) = remainder
.split_at_mut_checked(*second_byte as usize)
.unwrap();
builder.default.1.encode_var_ule_write(v_bytes);
EncodeAsVarULE::<PluralElementsTupleSliceVarULE<V>>::encode_var_ule_write(
&specials,
Expand Down
19 changes: 1 addition & 18 deletions documents/process/style_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -801,24 +801,7 @@ See also: the [Panics](#Panics--required) section of this document.

#### Special Case: `split_at`

The standard library functions such as `slice::split_at` are panicky, but they are not covered by our Clippy lints.

Instead of using `slice::split_at` directly, it is recommended to use a helper function, which can be saved in the file in which it is being used. Example:

```rust
/// Like slice::split_at but returns an Option instead of panicking
#[inline]
fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
if mid > slice.len() {
debug_assert!(false, "debug_split_at: index expected to be in range");
None
} else {
// Note: We're trusting the compiler to inline this and remove the assertion
// hiding on the top of slice::split_at: `assert(mid <= self.len())`
Some(slice.split_at(mid))
}
}
```
The standard library functions such as `slice::split_at` are panicky, but they are not covered by our Clippy lints. Be careful to use `slice::split_at_checked` instead.

#### Exception: Poison

Expand Down
3 changes: 2 additions & 1 deletion utils/writeable/src/cmp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ impl fmt::Write for WriteComparator<'_> {
return Ok(());
}
let cmp_len = core::cmp::min(other.len(), self.code_units.len());
let (this, remainder) = self.code_units.split_at(cmp_len);
// SAFETY: cmp_len is at most self.code_units.len()
let (this, remainder) = unsafe { self.code_units.split_at_unchecked(cmp_len) };
robertbastian marked this conversation as resolved.
Show resolved Hide resolved
self.code_units = remainder;
self.result = this.cmp(other.as_bytes());
Ok(())
Expand Down
23 changes: 3 additions & 20 deletions utils/zerotrie/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,18 @@
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

pub(crate) trait MaybeSplitAt<T> {
/// Like slice::split_at but returns an Option instead of panicking
/// if the index is out of range.
fn maybe_split_at(&self, mid: usize) -> Option<(&Self, &Self)>;
/// Like slice::split_at but debug-panics and returns an empty second slice
/// if the index is out of range.
fn debug_split_at(&self, mid: usize) -> (&Self, &Self);
}

impl<T> MaybeSplitAt<T> for [T] {
#[inline]
fn maybe_split_at(&self, mid: usize) -> Option<(&Self, &Self)> {
if mid > self.len() {
None
} else {
// Note: We're trusting the compiler to inline this and remove the assertion
// hiding on the top of slice::split_at: `assert(mid <= self.len())`
Some(self.split_at(mid))
}
}
#[inline]
fn debug_split_at(&self, mid: usize) -> (&Self, &Self) {
if mid > self.len() {
debug_assert!(false, "debug_split_at: index expected to be in range");
self.split_at_checked(mid).unwrap_or_else(|| {
debug_assert!(false, "debug_split_at: {mid} expected to be in range");
(self, &[])
} else {
// Note: We're trusting the compiler to inline this and remove the assertion
// hiding on the top of slice::split_at: `assert(mid <= self.len())`
self.split_at(mid)
}
})
}
}

Expand Down
2 changes: 1 addition & 1 deletion utils/zerotrie/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ pub(crate) fn get_parameterized<T: ZeroTrieWithOptions + ?Sized>(
{
let (trie_span, ascii_span);
(trie_span, trie) = trie.debug_split_at(x);
(ascii_span, ascii) = ascii.maybe_split_at(x)?;
(ascii_span, ascii) = ascii.split_at_checked(x)?;
if trie_span == ascii_span {
// Matched a byte span
continue;
Expand Down
14 changes: 4 additions & 10 deletions utils/zerovec/src/ule/vartuple.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,22 +111,16 @@ where
V: VarULE + ?Sized,
{
fn validate_bytes(bytes: &[u8]) -> Result<(), UleError> {
// TODO: use split_first_chunk_mut in 1.77
if bytes.len() < size_of::<A::ULE>() {
return Err(UleError::length::<Self>(bytes.len()));
}
let (sized_chunk, variable_chunk) = bytes.split_at(size_of::<A::ULE>());
let (sized_chunk, variable_chunk) = bytes
.split_at_checked(size_of::<A::ULE>())
.ok_or(UleError::length::<Self>(bytes.len()))?;
A::ULE::validate_bytes(sized_chunk)?;
V::validate_bytes(variable_chunk)?;
Ok(())
}

unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self {
#[allow(clippy::panic)] // panic is documented in function contract
if bytes.len() < size_of::<A::ULE>() {
panic!("from_bytes_unchecked called with short slice")
}
let (_sized_chunk, variable_chunk) = bytes.split_at(size_of::<A::ULE>());
let (_sized_chunk, variable_chunk) = bytes.split_at_unchecked(size_of::<A::ULE>());
// Safety: variable_chunk is a valid V because of this function's precondition: bytes is a valid Self,
// and a valid Self contains a valid V after the space needed for A::ULE.
let variable_ref = V::from_bytes_unchecked(variable_chunk);
Expand Down
5 changes: 2 additions & 3 deletions utils/zerovec/src/varzerovec/components.rs
Original file line number Diff line number Diff line change
Expand Up @@ -316,8 +316,7 @@ impl<'a, T: VarULE + ?Sized, F: VarZeroVecFormat> VarZeroVecComponents<'a, T, F>
marker: PhantomData,
};
}
// MSRV Rust 1.79: Use split_at_unchecked
let len_bytes = slice.get_unchecked(0..F::Len::SIZE);
let (len_bytes, data_bytes) = unsafe { slice.split_at_unchecked(F::Len::SIZE) };
// Safety: F::Len allows all byte sequences
let len_ule = F::Len::slice_from_bytes_unchecked(len_bytes);

Expand All @@ -328,7 +327,7 @@ impl<'a, T: VarULE + ?Sized, F: VarZeroVecFormat> VarZeroVecComponents<'a, T, F>
// whereas we're calling something that asks for `parse_bytes_with_length()`.
// The two methods perform similar validation, with parse_bytes() validating an additional
// 4-byte `length` header.
Self::from_bytes_unchecked_with_length(len_u32, slice.get_unchecked(F::Len::SIZE..))
Self::from_bytes_unchecked_with_length(len_u32, data_bytes)
}

/// Construct a [`VarZeroVecComponents`] from a byte slice that has previously
Expand Down
Loading