From 5578f18e10c255779ea6d8e80970202c9157b9d8 Mon Sep 17 00:00:00 2001 From: mattiZed <mattized@users.noreply.github.com> Date: Sun, 28 Apr 2024 20:08:48 +0200 Subject: [PATCH 1/5] refactor: encode uses LUT, also work on string buffer reference directly refactor: decode uses LUT, improve delta computation --- src/lib.rs | 101 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 72 insertions(+), 29 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 96cfedc..c712393 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,23 +46,26 @@ where } } -fn encode(current: f64, previous: f64, factor: i32) -> Result<String, String> { - let current = scale(current, factor); - let previous = scale(previous, factor); - let mut coordinate = (current - previous) << 1; - if (current - previous) < 0 { - coordinate = !coordinate; +fn encode(delta: i64, encoded: &mut String) { + let mut value = delta << 1; + if value < 0 { + value = !value; } - let mut output: String = "".to_string(); - while coordinate >= 0x20 { - let from_char = char::from_u32(((0x20 | (coordinate & 0x1f)) + 63) as u32) - .ok_or("Couldn't convert character")?; - output.push(from_char); - coordinate >>= 5; + _encode(value as u64, encoded); +} + +fn _encode(mut value: u64, result: &mut String) { + const ENCODING_TABLE: &str = + "?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; + + while value >= 0x20 { + let pos = (value & 0x1F) | 0x20; + let from_char = ENCODING_TABLE.as_bytes()[pos as usize] as char; + result.push(from_char); + value >>= 5; } - let from_char = char::from_u32((coordinate + 63) as u32).ok_or("Couldn't convert character")?; - output.push(from_char); - Ok(output) + let from_char = ENCODING_TABLE.as_bytes()[value as usize] as char; + result.push(from_char); } /// Encodes a Google Encoded Polyline. @@ -82,20 +85,23 @@ where { let base: i32 = 10; let factor: i32 = base.pow(precision); + let mut encoded: String = "".to_string(); - let mut output = "".to_string(); - let mut b = Coord { x: 0.0, y: 0.0 }; + let mut current: Coord<i64> = Coord { x: 0, y: 0 }; + let mut previous: Coord<i64> = Coord { x: 0, y: 0 }; for (i, a) in coordinates.into_iter().enumerate() { + current.x = scale(a.x, factor); + current.y = scale(a.y, factor); check(a.y, (MIN_LATITUDE, MAX_LATITUDE)) .map_err(|e| format!("Latitude error at position {0}: {1}", i, e))?; check(a.x, (MIN_LONGITUDE, MAX_LONGITUDE)) .map_err(|e| format!("Longitude error at position {0}: {1}", i, e))?; - output = output + &encode(a.y, b.y, factor)?; - output = output + &encode(a.x, b.x, factor)?; - b = a; + encode(current.y - previous.y, &mut encoded); + encode(current.x - previous.x, &mut encoded); + previous = current; } - Ok(output) + Ok(encoded) } /// Decodes a Google Encoded Polyline. @@ -135,16 +141,46 @@ pub fn decode_polyline(polyline: &str, precision: u32) -> Result<LineString<f64> } fn trans(chars: &[u8], mut index: usize) -> Result<(i64, usize), String> { + #[rustfmt::skip] + const DECODING_TABLE: &[i8] = &[ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, + 57, 58, 59, 60, 61, 62, 63, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, + ]; + let mut shift = 0; let mut result = 0; let mut byte; loop { - byte = chars[index] as u64; - if byte < 63 || (shift > 64 - 5) { + byte = DECODING_TABLE[chars[index] as usize]; + if byte < 1 { return Err(format!("Cannot decode character at index {}", index)); } - byte -= 63; - result |= (byte & 0x1f) << shift; + result |= (byte as u64 & 0x1f) << shift; index += 1; shift += 5; if byte < 0x20 { @@ -237,15 +273,22 @@ mod tests { // TODO: handle this case in the future? fn broken_string() { let s = "_p~iF~ps|U_u🗑lLnnqC_mqNvxq`@"; - let res = vec![[-120.2, 38.5], [-120.95, 2306360.53104], [-126.453, 2306363.08304]].into(); - assert_eq!(decode_polyline(s, 5).unwrap(), res); + let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); + assert_eq!(decode_polyline(s, 5), expected); } #[test] - #[should_panic] fn invalid_string() { let s = "invalid_polyline_that_should_be_handled_gracefully"; - decode_polyline(s, 6).unwrap(); + let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); + assert_eq!(decode_polyline(s, 5), expected); + } + + #[test] + fn another_invalid_string() { + let s = "ugh_ugh"; + let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); + assert_eq!(decode_polyline(s, 5), expected); } #[test] From 10ef75507c91f32044dbedfcca30f6422a3fbae3 Mon Sep 17 00:00:00 2001 From: mattiZed <mattiZed@users.noreply.github.com> Date: Mon, 29 Apr 2024 10:27:15 +0200 Subject: [PATCH 2/5] fix: differentiate character and overflow errors feat: provide naive and LUT-based algorithms for both encode and decode --- src/lib.rs | 122 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 93 insertions(+), 29 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index c712393..5da49f9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,26 +46,14 @@ where } } -fn encode(delta: i64, encoded: &mut String) { - let mut value = delta << 1; - if value < 0 { - value = !value; +// Compute the maximum shift allowed/needed for representing longitude/latitude deltas, +// based on precision +fn max_shift(factor: i64, lat: bool) -> u8 { + if lat { + ((180 * factor).ilog2() + 1) as u8 + } else { + ((360 * factor).ilog2() + 1) as u8 } - _encode(value as u64, encoded); -} - -fn _encode(mut value: u64, result: &mut String) { - const ENCODING_TABLE: &str = - "?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; - - while value >= 0x20 { - let pos = (value & 0x1F) | 0x20; - let from_char = ENCODING_TABLE.as_bytes()[pos as usize] as char; - result.push(from_char); - value >>= 5; - } - let from_char = ENCODING_TABLE.as_bytes()[value as usize] as char; - result.push(from_char); } /// Encodes a Google Encoded Polyline. @@ -104,6 +92,44 @@ where Ok(encoded) } +// Polyline-encoder entrypoint, expects a scaled (and valid) longitude/latitude delta. +fn encode(delta: i64, encoded: &mut String) { + let mut value = delta << 1; + if value < 0 { + value = !value; + } + // value can be encoded using lookup table (_encode() vs _encode_lut). + // At least on my machine, plain _encode() is slightly faster (-20%). + _encode(value as u64, encoded); +} + +fn _encode(mut value: u64, encoded: &mut String) { + while value >= 0x20 { + // apply 5bit mask to value and set 6th bit to indicate there is another chunk + // needed to encode "value" + let byte = ((value & 0b11111) | 0b100000) as u8; + encoded.push((byte + 63) as char); + value >>= 5; + } + // encode the remainder + encoded.push((value as u8 + 63) as char); +} + +// Approach for optimizing _encode(), utilizing a lookup table. +fn _encode_lut(mut value: u64, encoded: &mut String) { + const ENCODING_TABLE: &str = + "?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; + + while value >= 0b100000 { + let pos = (value & 0b11111) | 0b100000; + let from_char = ENCODING_TABLE.as_bytes()[pos as usize] as char; + encoded.push(from_char); + value >>= 5; + } + let from_char = ENCODING_TABLE.as_bytes()[value as usize] as char; + encoded.push(from_char); +} + /// Decodes a Google Encoded Polyline. /// /// # Examples @@ -123,12 +149,15 @@ pub fn decode_polyline(polyline: &str, precision: u32) -> Result<LineString<f64> let chars = polyline.as_bytes(); + // polyline can be decoded using lookup table (_decode() vs _decode_lut). + // At least on my machine, _decode_lut() is slightly faster (~20-25%). while index < chars.len() { - let (latitude_change, new_index) = trans(chars, index)?; + let (latitude_change, new_index) = _decode_lut(chars, index, max_shift(factor, true))?; if new_index >= chars.len() { break; } - let (longitude_change, new_index) = trans(chars, new_index)?; + let (longitude_change, new_index) = + _decode_lut(chars, new_index, max_shift(factor, false))?; index = new_index; lat += latitude_change; @@ -140,7 +169,40 @@ pub fn decode_polyline(polyline: &str, precision: u32) -> Result<LineString<f64> Ok(coordinates.into()) } -fn trans(chars: &[u8], mut index: usize) -> Result<(i64, usize), String> { +fn _decode(chars: &[u8], mut index: usize, max_shift: u8) -> Result<(i64, usize), String> { + let mut shift = 0; + let mut value = 0; + let mut byte; + + loop { + byte = chars[index] as u64; + if byte < 63 || byte > 127 { + return Err(format!("Cannot decode character at index {index}")); + } + byte -= 63; + // apply 5bit mask to byte and reconstruct value + value |= (byte & 0b11111) << shift; + shift += 5; + index += 1; + // if the 6th bit is not set, we are done + if byte < 0b100000 { + break; + } else if shift > max_shift { + // break-criterion didn't hit and next iteration would provoke + // an out-of-bounds longitude/latitude. + return Err(format!("Overflow due to character at index {}", index - 1)); + } + } + + let delta = if (value & 1) > 0 { + !(value >> 1) + } else { + value >> 1 + } as i64; + Ok((delta, index)) +} + +fn _decode_lut(chars: &[u8], mut index: usize, max_shift: u8) -> Result<(i64, usize), String> { #[rustfmt::skip] const DECODING_TABLE: &[i8] = &[ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, @@ -177,14 +239,18 @@ fn trans(chars: &[u8], mut index: usize) -> Result<(i64, usize), String> { let mut byte; loop { byte = DECODING_TABLE[chars[index] as usize]; - if byte < 1 { + if byte < 0 { return Err(format!("Cannot decode character at index {}", index)); } result |= (byte as u64 & 0x1f) << shift; - index += 1; shift += 5; - if byte < 0x20 { + index += 1; + if byte < 0b100000 { break; + } else if shift > max_shift { + // break-criterion didn't hit and next iteration would provoke + // an out-of-bounds longitude/latitude. + return Err(format!("Overflow due to character at index {}", index - 1)); } } @@ -269,8 +335,6 @@ mod tests { } #[test] - // emoji is decodable but messes up data - // TODO: handle this case in the future? fn broken_string() { let s = "_p~iF~ps|U_u🗑lLnnqC_mqNvxq`@"; let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); @@ -280,14 +344,14 @@ mod tests { #[test] fn invalid_string() { let s = "invalid_polyline_that_should_be_handled_gracefully"; - let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); + let expected: Result<_, _> = Err("Overflow due to character at index 5".to_string()); assert_eq!(decode_polyline(s, 5), expected); } #[test] fn another_invalid_string() { let s = "ugh_ugh"; - let expected: Result<_, _> = Err("Cannot decode character at index 12".to_string()); + let expected: Result<_, _> = Err("Overflow due to character at index 5".to_string()); assert_eq!(decode_polyline(s, 5), expected); } From 66c8af59e63679ba833b00e339fc2eb5a9da4cc4 Mon Sep 17 00:00:00 2001 From: mattiZed <mattiZed@users.noreply.github.com> Date: Mon, 29 Apr 2024 10:36:41 +0200 Subject: [PATCH 3/5] chore: update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 02f5aba..a4d4c5a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ ## Unreleased +* inspired by HERE's [flexpolyline](https://github.com/heremaps/flexible-polyline): + * provide alt. implementation of both encode() and decode() based on LUT + * rework encoder delta-computation (performance improvement) +* correctly differentiate character and overflow errors when decoding + ## 0.10.2 * fix decoder crashing with out-of-bounds error (https://github.com/georust/polyline/pull/37): From 094810848227997aa934253df5a90dc454396829 Mon Sep 17 00:00:00 2001 From: mattiZed <mattized@users.noreply.github.com> Date: Mon, 29 Apr 2024 20:07:05 +0200 Subject: [PATCH 4/5] style: also use binary literal in decode_lut --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 5da49f9..b29b55f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -242,7 +242,7 @@ fn _decode_lut(chars: &[u8], mut index: usize, max_shift: u8) -> Result<(i64, us if byte < 0 { return Err(format!("Cannot decode character at index {}", index)); } - result |= (byte as u64 & 0x1f) << shift; + result |= (byte as u64 & 0b11111) << shift; shift += 5; index += 1; if byte < 0b100000 { From e11dd90861c7ae419adce8e645ae85db32c38d82 Mon Sep 17 00:00:00 2001 From: mattiZed <mattiZed@users.noreply.github.com> Date: Thu, 2 May 2024 07:21:29 +0200 Subject: [PATCH 5/5] revert: remove string allocation/concatenation improvement for better traceability of performance gains --- CHANGELOG.md | 2 +- src/lib.rs | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a4d4c5a..7a418b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ * inspired by HERE's [flexpolyline](https://github.com/heremaps/flexible-polyline): * provide alt. implementation of both encode() and decode() based on LUT - * rework encoder delta-computation (performance improvement) + * rework encoder delta-computation * correctly differentiate character and overflow errors when decoding ## 0.10.2 diff --git a/src/lib.rs b/src/lib.rs index b29b55f..c7745cd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -85,25 +85,26 @@ where .map_err(|e| format!("Latitude error at position {0}: {1}", i, e))?; check(a.x, (MIN_LONGITUDE, MAX_LONGITUDE)) .map_err(|e| format!("Longitude error at position {0}: {1}", i, e))?; - encode(current.y - previous.y, &mut encoded); - encode(current.x - previous.x, &mut encoded); + encoded = encoded + &encode(current.y - previous.y); + encoded = encoded + &encode(current.x - previous.x); previous = current; } Ok(encoded) } // Polyline-encoder entrypoint, expects a scaled (and valid) longitude/latitude delta. -fn encode(delta: i64, encoded: &mut String) { +fn encode(delta: i64) -> String { let mut value = delta << 1; if value < 0 { value = !value; } // value can be encoded using lookup table (_encode() vs _encode_lut). // At least on my machine, plain _encode() is slightly faster (-20%). - _encode(value as u64, encoded); + return _encode(value as u64); } -fn _encode(mut value: u64, encoded: &mut String) { +fn _encode(mut value: u64) -> String { + let mut encoded: String = String::new(); while value >= 0x20 { // apply 5bit mask to value and set 6th bit to indicate there is another chunk // needed to encode "value" @@ -113,10 +114,12 @@ fn _encode(mut value: u64, encoded: &mut String) { } // encode the remainder encoded.push((value as u8 + 63) as char); + return encoded } // Approach for optimizing _encode(), utilizing a lookup table. -fn _encode_lut(mut value: u64, encoded: &mut String) { +fn _encode_lut(mut value: u64) -> String { + let mut encoded: String = String::new(); const ENCODING_TABLE: &str = "?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"; @@ -128,6 +131,7 @@ fn _encode_lut(mut value: u64, encoded: &mut String) { } let from_char = ENCODING_TABLE.as_bytes()[value as usize] as char; encoded.push(from_char); + return encoded } /// Decodes a Google Encoded Polyline.