Skip to content

Commit

Permalink
Restructure UTF-16 to UTF-8 encode to avoid unsafe.
Browse files Browse the repository at this point in the history
Per suggestion by @RReverser in rust-lang/rust#55147

This change also makes the output buffer size requirement for
UTF-16 to UTF-8 encode normal: the number of input code units times
three. Previously the requirement was the number of input code units
times three plus one, where the last output code unit was never
written into but had to be there for the space checks to pass.
  • Loading branch information
hsivonen committed Oct 19, 2018
1 parent 7de454e commit a30a3d4
Showing 1 changed file with 81 additions and 58 deletions.
139 changes: 81 additions & 58 deletions src/utf_8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -664,88 +664,111 @@ impl Utf8Encoder {
'inner: loop {
// The following loop is only broken out of as a goto forward.
loop {
// Unfortunately, this check isn't enough for the compiler to elide
// the bound checks on writes to dst, which is why they are manually
// elided, which makes a measurable difference.
if written.checked_add(4).unwrap() > dst.len() {
return (EncoderResult::OutputFull, read, written);
}
read += 1;
// Note that `read` hasn't yet been updated to reflect
// that `unit` has been read. We commit to updating `read`
// only when we know we are able to update `written`, too.
// This is in stark contrast to the general structure
// of encoding_rs but makes UTF-16 to UTF-8 conversion's
// output buffer space requirements more intuitive.
if unit < 0x800 {
unsafe {
*(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
written += 1;
*(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
written += 1;
// The pattern here allows bound checks to be elided
// from the writes into `tail`.
if written <= dst.len() {
let tail = &mut dst[written..];
if tail.len() >= 2 {
tail[0] = (unit >> 6) as u8 | 0xC0;
tail[1] = (unit & 0x3F) as u8 | 0x80;

read += 1;
written += 2;
break;
}
}
break;
return (EncoderResult::OutputFull, read, written);
}
let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
if unsafe { likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) } {
unsafe {
*(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
written += 1;
*(dst.get_unchecked_mut(written)) =
((unit & 0xFC0) >> 6) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
written += 1;
// The pattern here allows bound checks to be elided
// from the writes into `tail`.
if written <= dst.len() {
let tail = &mut dst[written..];
if tail.len() >= 3 {
tail[0] = (unit >> 12) as u8 | 0xE0;
tail[1] = ((unit & 0xFC0) >> 6) as u8 | 0x80;
tail[2] = (unit & 0x3F) as u8 | 0x80;

read += 1;
written += 3;
break;
}
}
break;
return (EncoderResult::OutputFull, read, written);
}
if unsafe { likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) } {
// high surrogate
// read > src.len() is impossible, but using
let next_read = read + 1;
// next_read > src.len() is impossible, but using
// >= instead of == allows the compiler to elide a bound check.
if read >= src.len() {
debug_assert_eq!(read, src.len());
if next_read >= src.len() {
debug_assert_eq!(next_read, src.len());
// Unpaired surrogate at the end of the buffer.
unsafe {
*(dst.get_unchecked_mut(written)) = 0xEFu8;
written += 1;
*(dst.get_unchecked_mut(written)) = 0xBFu8;
written += 1;
*(dst.get_unchecked_mut(written)) = 0xBDu8;
written += 1;
// The pattern here allows bound checks to be elided
// from the writes into `tail`.
if written <= dst.len() {
let tail = &mut dst[written..];
if tail.len() >= 3 {
tail[0] = 0xEFu8;
tail[1] = 0xBFu8;
tail[2] = 0xBDu8;

read += 1;
written += 3;
return (EncoderResult::InputEmpty, read, written);
}
}
return (EncoderResult::InputEmpty, read, written);
return (EncoderResult::OutputFull, read, written);
}
let second = src[read];
let second = src[next_read];
let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) }
{
// The next code unit is a low surrogate. Advance position.
read += 1;
let astral = (u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
unsafe {
*(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
written += 1;
*(dst.get_unchecked_mut(written)) =
((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) =
((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
written += 1;
*(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
written += 1;
// The pattern here allows bound checks to be elided
// from the writes into `tail`.
if written <= dst.len() {
let tail = &mut dst[written..];
if tail.len() >= 4 {
let astral = (u32::from(unit) << 10) + u32::from(second)
- (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
tail[0] = (astral >> 18) as u8 | 0xF0;
tail[1] = ((astral & 0x3F000u32) >> 12) as u8 | 0x80;
tail[2] = ((astral & 0xFC0u32) >> 6) as u8 | 0x80;
tail[3] = (astral & 0x3F) as u8 | 0x80;

read += 2;
written += 4;
break;
}
}
break;
return (EncoderResult::OutputFull, read, written);
}
// The next code unit is not a low surrogate. Don't advance
// position and treat the high surrogate as unpaired.
// Fall through
}
// Unpaired low surrogate
unsafe {
*(dst.get_unchecked_mut(written)) = 0xEFu8;
written += 1;
*(dst.get_unchecked_mut(written)) = 0xBFu8;
written += 1;
*(dst.get_unchecked_mut(written)) = 0xBDu8;
written += 1;
if written <= dst.len() {
let tail = &mut dst[written..];
if tail.len() >= 3 {
tail[0] = 0xEFu8;
tail[1] = 0xBFu8;
tail[2] = 0xBDu8;

read += 1;
written += 3;
break;
}
}
break;
return (EncoderResult::OutputFull, read, written);
}
// Now see if the next unit is Basic Latin
// read > src.len() is impossible, but using
Expand Down

3 comments on commit a30a3d4

@hsivonen
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance was worse with this changeset than without it.

@RReverser
Copy link

@RReverser RReverser commented on a30a3d4 Oct 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The performance was worse with this changeset than without it.

Hmm, how big of a difference? Instruction-wise the generated code is even simpler than checked_add + unsafe, so this is surprising.

@hsivonen
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Up to 30% slow-down:

 bench_encode_from_utf16_ar                         29,154 (790 MB/s)        37,820 (609 MB/s)             8,666   29.72%   x 0.77 
 bench_encode_from_utf16_cs                         60,380 (802 MB/s)        57,870 (837 MB/s)            -2,510   -4.16%   x 1.04 
 bench_encode_from_utf16_de                         19,900 (3865 MB/s)       18,949 (4059 MB/s)             -951   -4.78%   x 1.05 
 bench_encode_from_utf16_el                         40,668 (797 MB/s)        52,530 (617 MB/s)            11,862   29.17%   x 0.77 
 bench_encode_from_utf16_en                         5,672 (11410 MB/s)       5,475 (11820 MB/s)             -197   -3.47%   x 1.04 
 bench_encode_from_utf16_fr                         110,489 (1292 MB/s)      108,223 (1319 MB/s)          -2,266   -2.05%   x 1.02 
 bench_encode_from_utf16_he                         73,867 (670 MB/s)        88,136 (562 MB/s)            14,269   19.32%   x 0.84 
 bench_encode_from_utf16_ja                         20,890 (1564 MB/s)       23,582 (1385 MB/s)            2,692   12.89%   x 0.89 
 bench_encode_from_utf16_jquery                     5,747 (15087 MB/s)       5,868 (14776 MB/s)              121    2.11%   x 0.98 
 bench_encode_from_utf16_ko                         24,455 (816 MB/s)        27,973 (713 MB/s)             3,518   14.39%   x 0.87 
 bench_encode_from_utf16_pt                         33,006 (2059 MB/s)       32,558 (2087 MB/s)             -448   -1.36%   x 1.01 
 bench_encode_from_utf16_ru                         165,369 (707 MB/s)       193,806 (603 MB/s)           28,437   17.20%   x 0.85 
 bench_encode_from_utf16_th                         131,109 (1416 MB/s)      146,074 (1271 MB/s)          14,965   11.41%   x 0.90 
 bench_encode_from_utf16_tr                         72,349 (775 MB/s)        69,089 (811 MB/s)            -3,260   -4.51%   x 1.05 
 bench_encode_from_utf16_vi                         184,639 (469 MB/s)       187,317 (463 MB/s)            2,678    1.45%   x 0.99 
 bench_encode_from_utf16_zh_cn                      20,577 (1500 MB/s)       23,475 (1315 MB/s)            2,898   14.08%   x 0.88 
 bench_encode_from_utf16_zh_tw                      20,816 (1486 MB/s)       23,537 (1314 MB/s)            2,721   13.07%   x 0.88 

Another attempt at a rewrite required the 2018 edition (non-lexical lifetimes) and was better than the above but still worse than the original. Results in comparison to the original unsafe-using code:

 bench_encode_from_utf16_ar                         29,154 (790 MB/s)        34,606 (665 MB/s)                 5,452   18.70%   x 0.84 
 bench_encode_from_utf16_cs                         60,380 (802 MB/s)        60,153 (805 MB/s)                  -227   -0.38%   x 1.00 
 bench_encode_from_utf16_de                         19,900 (3865 MB/s)       20,613 (3732 MB/s)                  713    3.58%   x 0.97 
 bench_encode_from_utf16_el                         40,668 (797 MB/s)        48,475 (669 MB/s)                 7,807   19.20%   x 0.84 
 bench_encode_from_utf16_en                         5,672 (11410 MB/s)       5,748 (11259 MB/s)                   76    1.34%   x 0.99 
 bench_encode_from_utf16_fr                         110,489 (1292 MB/s)      109,013 (1310 MB/s)              -1,476   -1.34%   x 1.01 
 bench_encode_from_utf16_he                         73,867 (670 MB/s)        83,974 (590 MB/s)                10,107   13.68%   x 0.88 
 bench_encode_from_utf16_ja                         20,890 (1564 MB/s)       23,208 (1407 MB/s)                2,318   11.10%   x 0.90 
 bench_encode_from_utf16_jquery                     5,747 (15087 MB/s)       5,824 (14888 MB/s)                   77    1.34%   x 0.99 
 bench_encode_from_utf16_ko                         24,455 (816 MB/s)        24,424 (817 MB/s)                   -31   -0.13%   x 1.00 
 bench_encode_from_utf16_pt                         33,006 (2059 MB/s)       34,487 (1971 MB/s)                1,481    4.49%   x 0.96 
 bench_encode_from_utf16_ru                         165,369 (707 MB/s)       183,053 (639 MB/s)               17,684   10.69%   x 0.90 
 bench_encode_from_utf16_th                         131,109 (1416 MB/s)      146,268 (1269 MB/s)              15,159   11.56%   x 0.90 
 bench_encode_from_utf16_tr                         72,349 (775 MB/s)        72,117 (777 MB/s)                  -232   -0.32%   x 1.00 
 bench_encode_from_utf16_vi                         184,639 (469 MB/s)       186,843 (464 MB/s)                2,204    1.19%   x 0.99 
 bench_encode_from_utf16_zh_cn                      20,577 (1500 MB/s)       22,726 (1359 MB/s)                2,149   10.44%   x 0.91 
 bench_encode_from_utf16_zh_tw                      20,816 (1486 MB/s)       22,889 (1352 MB/s)                2,073    9.96%   x 0.91 

Please sign in to comment.