From 56584ad9c4fd2091dbe94196a88ab70c5741f7ba Mon Sep 17 00:00:00 2001 From: Joe Neeman Date: Mon, 15 Aug 2016 15:47:30 +0200 Subject: [PATCH] Modify Teddy's match verification. Once Teddy finds a fingerprint, it needs to check if there is really a match. The old approach was to iterate over the set bits of both 64-bit halves of the fingerprint-checking vector. The new approach first extracts a bitfield that tells which bytes of the fingerprint-checking vector are non-zero. Then it iterates over those bytes. This seems to be faster (up to about 10% in some benchmarks). It seems like the main reason that this approach is faster is that most matched fingerprints only match in one place. The new code narrows in on the important place more quickly, whereas the old code wasted time unnecessarily examining an empty u64. The gain seems to be larger with AVX2 support (which is not included in this patch). Presumably it would be even larger with AVX512. --- src/simd_accel/teddy128.rs | 93 +++++++++++++++----------------------- 1 file changed, 37 insertions(+), 56 deletions(-) diff --git a/src/simd_accel/teddy128.rs b/src/simd_accel/teddy128.rs index a2b34bc0ee..a15e4401a6 100644 --- a/src/simd_accel/teddy128.rs +++ b/src/simd_accel/teddy128.rs @@ -86,7 +86,7 @@ better. Namely: 1. Teddy's core algorithm scans the haystack in 16 byte chunks. 16 is significant because it corresponds to the number of bytes in a SIMD vector. If one used AVX2 instructions, then we could scan the haystack in 32 byte - chunks. Similarly, if one used AVX512 instructions, we could sca the + chunks. Similarly, if one used AVX512 instructions, we could scan the haystack in 64 byte chunks. Hyperscan implements SIMD + AVX2, we only implement SIMD for the moment. (The author doesn't have a CPU with AVX2 support... yet.) @@ -326,11 +326,10 @@ References // TODO: Make the inner loop do aligned loads. use std::cmp; -use std::mem::transmute; use std::ptr; use simd::u8x16; -use simd::x86::sse2::u64x2; +use simd::x86::sse2::Sse2Bool8ix16; use simd::x86::ssse3::Ssse3U8x16; use syntax; @@ -456,8 +455,9 @@ impl Teddy { // N.B. `res0` is our `C` in the module documentation. let res0 = self.masks.members1(h); // Only do expensive verification if there are any non-zero bits. - if res0.ne(zero).any() { - if let Some(m) = self.verify_128(haystack, pos, res0) { + let bitfield = res0.ne(zero).move_mask(); + if bitfield != 0 { + if let Some(m) = self.verify(haystack, pos, res0, bitfield) { return Some(m); } } @@ -510,9 +510,11 @@ impl Teddy { // `AND`'s our `C` values together. let res = res0prev0 & res1; prev0 = res0; - if res.ne(zero).any() { + + let bitfield = res.ne(zero).move_mask(); + if bitfield != 0 { let pos = pos.checked_sub(1).unwrap(); - if let Some(m) = self.verify_128(haystack, pos, res) { + if let Some(m) = self.verify(haystack, pos, res, bitfield) { return Some(m); } } @@ -568,9 +570,11 @@ impl Teddy { prev0 = res0; prev1 = res1; - if res.ne(zero).any() { + + let bitfield = res.ne(zero).move_mask(); + if bitfield != 0 { let pos = pos.checked_sub(2).unwrap(); - if let Some(m) = self.verify_128(haystack, pos, res) { + if let Some(m) = self.verify(haystack, pos, res, bitfield) { return Some(m); } } @@ -585,63 +589,40 @@ impl Teddy { /// Runs the verification procedure on `res` (i.e., `C` from the module /// documentation), where the haystack block starts at `pos` in - /// `haystack`. + /// `haystack`. `bitfield` has ones in the bit positions that `res` has + /// non-zero bytes. /// /// If a match exists, it returns the first one. #[inline(always)] - fn verify_128( + fn verify( &self, haystack: &[u8], pos: usize, res: u8x16, + mut bitfield: u32, ) -> Option { - // The verification procedure is more amenable to standard 64 bit - // values, so get those. - let res64: u64x2 = unsafe { transmute(res) }; - let reshi = res64.extract(0); - let reslo = res64.extract(1); - if let Some(m) = self.verify_64(haystack, pos, reshi, 0) { - return Some(m); - } - if let Some(m) = self.verify_64(haystack, pos, reslo, 8) { - return Some(m); - } - None - } - - /// Runs the verification procedure on half of `C`. - /// - /// If a match exists, it returns the first one. - /// - /// `offset` is an additional byte offset to add to the position before - /// substring match verification. - #[inline(always)] - fn verify_64( - &self, - haystack: &[u8], - pos: usize, - mut res: u64, - offset: usize, - ) -> Option { - // There's a possible match so long as there's at least one bit set. - while res != 0 { - // The next possible match is at the least significant bit. - let bit = res.trailing_zeros(); - // The position of the bit in its corresponding lane gives us the - // corresponding bucket. - let bucket = (bit % 8) as usize; - // The lane that the bit is in gives us its offset. - let bytei = (bit / 8) as usize; - // Compute the start of where a substring would start. - let start = pos + offset + bytei; - // Kill off this bit. If we couldn't match anything, we'll go to - // the next bit. - res &= !(1 << bit); - // Actual substring search verification. - if let Some(m) = self.verify_bucket(haystack, bucket, start) { - return Some(m); + while bitfield != 0 { + // The next offset, relative to pos, where some fingerprint matched. + let byte_pos = bitfield.trailing_zeros(); + bitfield &= !(1 << byte_pos); + + // Offset relative to the beginning of the haystack. + let start = pos + byte_pos as usize; + + // The bitfield telling us which patterns had fingerprints that match at this starting + // position. + let mut patterns = res.extract(byte_pos); + while patterns != 0 { + let bucket = patterns.trailing_zeros() as usize; + patterns &= !(1 << bucket); + + // Actual substring search verification. + if let Some(m) = self.verify_bucket(haystack, bucket, start) { + return Some(m); + } } } + None }