diff --git a/form_urlencoded/src/lib.rs b/form_urlencoded/src/lib.rs index e1cf571a6..c57a09b23 100644 --- a/form_urlencoded/src/lib.rs +++ b/form_urlencoded/src/lib.rs @@ -150,6 +150,10 @@ impl<'a> Iterator for ByteSerialize<'a> { None => (self.bytes, &[][..]), }; self.bytes = remaining; + // This unsafe is appropriate because we have already checked these + // bytes in byte_serialized_unchanged, which checks for a subset + // of UTF-8. So we know these bytes are valid UTF-8, and doing + // another UTF-8 check would be wasteful. Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) } else { None diff --git a/form_urlencoded/src/query_encoding.rs b/form_urlencoded/src/query_encoding.rs index 76aed15a7..f7ad3348b 100644 --- a/form_urlencoded/src/query_encoding.rs +++ b/form_urlencoded/src/query_encoding.rs @@ -18,18 +18,34 @@ pub(crate) fn encode<'a>(encoding_override: EncodingOverride, input: &'a str) -> } pub(crate) fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { + // Note: This function is duplicated in `percent_encoding/lib.rs`. match input { Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), Cow::Owned(bytes) => { - let raw_utf8: *const [u8]; match String::from_utf8_lossy(&bytes) { - Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), - Cow::Owned(s) => return s.into(), + Cow::Borrowed(utf8) => { + // If from_utf8_lossy returns a Cow::Borrowed, then we can + // be sure our original bytes were valid UTF-8. This is because + // if the bytes were invalid UTF-8 from_utf8_lossy would have + // to allocate a new owned string to back the Cow so it could + // replace invalid bytes with a placeholder. + + // First we do a debug_assert to confirm our description above. + let raw_utf8: *const [u8]; + raw_utf8 = utf8.as_bytes(); + debug_assert!(raw_utf8 == &*bytes as *const [u8]); + + // Given we know the original input bytes are valid UTF-8, + // and we have ownership of those bytes, we re-use them and + // return a Cow::Owned here. Ideally we'd put our return statement + // right below this line, but to support the old lexically scoped + // borrow checker the return must be moved to outside the match + // statement. + }, + Cow::Owned(s) => return Cow::Owned(s), } - // from_utf8_lossy returned a borrow of `bytes` unchanged. - debug_assert!(raw_utf8 == &*bytes as *const [u8]); - // Reuse the existing `Vec` allocation. - unsafe { String::from_utf8_unchecked(bytes) }.into() + + Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) }) } } } diff --git a/idna/src/punycode.rs b/idna/src/punycode.rs index 829684b14..424fec733 100644 --- a/idna/src/punycode.rs +++ b/idna/src/punycode.rs @@ -143,11 +143,10 @@ pub fn encode_str(input: &str) -> Option { /// 63 encoded bytes, the DNS limit on domain name labels. pub fn encode(input: &[char]) -> Option { // Handle "basic" (ASCII) code points. They are encoded as-is. - let output_bytes = input + let mut output : String = input .iter() - .filter_map(|&c| if c.is_ascii() { Some(c as u8) } else { None }) + .filter(|&c| c.is_ascii()) .collect(); - let mut output = unsafe { String::from_utf8_unchecked(output_bytes) }; let basic_length = output.len() as u32; if basic_length > 0 { output.push_str("-") diff --git a/percent_encoding/lib.rs b/percent_encoding/lib.rs index 27eaf6740..dbfbd3549 100644 --- a/percent_encoding/lib.rs +++ b/percent_encoding/lib.rs @@ -252,6 +252,8 @@ impl<'a> Iterator for PercentEncode<'a> { self.bytes = remaining; Some(percent_encode_byte(first_byte)) } else { + // The unsafe blocks here are appropriate because the bytes are + // confirmed as a subset of UTF-8 in should_percent_encode. for (i, &byte) in remaining.iter().enumerate() { if self.ascii_set.should_percent_encode(byte) { // 1 for first_byte + i for previous iterations of this loop @@ -425,18 +427,34 @@ impl<'a> PercentDecode<'a> { } fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { + // Note: This function is duplicated in `form_urlencoded/src/query_encoding.rs`. match input { Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), Cow::Owned(bytes) => { - let raw_utf8: *const [u8]; match String::from_utf8_lossy(&bytes) { - Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), - Cow::Owned(s) => return s.into(), + Cow::Borrowed(utf8) => { + // If from_utf8_lossy returns a Cow::Borrowed, then we can + // be sure our original bytes were valid UTF-8. This is because + // if the bytes were invalid UTF-8 from_utf8_lossy would have + // to allocate a new owned string to back the Cow so it could + // replace invalid bytes with a placeholder. + + // First we do a debug_assert to confirm our description above. + let raw_utf8: *const [u8]; + raw_utf8 = utf8.as_bytes(); + debug_assert!(raw_utf8 == &*bytes as *const [u8]); + + // Given we know the original input bytes are valid UTF-8, + // and we have ownership of those bytes, we re-use them and + // return a Cow::Owned here. Ideally we'd put our return statement + // right below this line, but to support the old lexically scoped + // borrow checker the return must be moved to outside the match + // statement. + }, + Cow::Owned(s) => return Cow::Owned(s), } - // from_utf8_lossy returned a borrow of `bytes` unchanged. - debug_assert!(raw_utf8 == &*bytes as *const [u8]); - // Reuse the existing `Vec` allocation. - unsafe { String::from_utf8_unchecked(bytes) }.into() + + Cow::Owned(unsafe { String::from_utf8_unchecked(bytes) }) } } }