-
Notifications
You must be signed in to change notification settings - Fork 157
/
Copy pathSearch.hs
132 lines (117 loc) · 4.83 KB
/
Search.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}
-- |
-- Module : Data.Text.Lazy.Search
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
--
-- License : BSD-style
-- Maintainer : [email protected]
-- Stability : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh. Adapted from the strict
-- implementation.
module Data.Text.Internal.Lazy.Search
(
indices
) where
import Data.Bits (unsafeShiftL, (.|.), (.&.))
import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word8, Word64)
import qualified Data.Text.Internal as T
import qualified Data.Text as T (concat, isPrefixOf)
import Data.Text.Internal.ArrayUtils (memchr)
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldrChunks)
-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text -- ^ Substring to search for (@needle@)
-> Text -- ^ Text to search in (@haystack@)
-> [Int64]
indices needle
| nlen <= 0 = const []
| nlen == 1 = indicesOne (A.unsafeIndex narr noff) 0
| otherwise = advance 0 0
where
T.Text narr noff nlen = T.concat (foldrChunks (:) [] needle)
advance !_ !_ Empty = []
advance !(g :: Int64) !(i :: Int) xxs@(Chunk x@(T.Text xarr@(A.ByteArray xarr#) xoff l) xs)
| i >= l = advance g (i - l) xs
| lackingHay (i + nlen) x xs = []
| c == z && candidateMatch = g : advance (g + intToInt64 nlen) (i + nlen) xxs
| otherwise = advance (g + intToInt64 delta) (i + delta) xxs
where
c = index xxs (i + nlast)
delta | nextInPattern = nlen + 1
| c == z = skip + 1
| l >= i + nlen = case
memchr xarr# (xoff + i + nlen) (l - i - nlen) z of
-1 -> max 1 (l - i - nlen)
s -> s + 1
| otherwise = 1
nextInPattern = mask .&. swizzle (index xxs (i + nlen)) == 0
candidateMatch
| i + nlen <= l = A.equal narr noff xarr (xoff + i) nlen
| otherwise = A.equal narr noff xarr (xoff + i) (l - i) &&
T.Text narr (noff + l - i) (nlen - l + i) `isPrefixOf` xs
nlast = nlen - 1
z = A.unsafeIndex narr (noff + nlen - 1)
(mask :: Word64) :*: skip = buildTable 0 0 0 (nlen-2)
swizzle :: Word8 -> Word64
swizzle w = 1 `unsafeShiftL` (word8ToInt w .&. 0x3f)
buildTable !g !i !msk !skp
| i >= nlast = (msk .|. swizzle z) :*: skp
| otherwise = buildTable (g+1) (i+1) msk' skp'
where c = A.unsafeIndex narr (noff+i)
msk' = msk .|. swizzle c
skp' | c == z = nlen - g - 2
| otherwise = skp
-- | Check whether an attempt to index into the haystack at the
-- given offset would fail.
lackingHay :: Int -> T.Text -> Text -> Bool
lackingHay q (T.Text _ _ l) ps = l < q && case ps of
Empty -> True
Chunk r rs -> lackingHay (q - l) r rs
-- | Fast index into a partly unpacked 'Text'. We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: Text -> Int -> Word8
index Empty !_ = 0
index (Chunk (T.Text arr off len) xs) !i
| i < len = A.unsafeIndex arr (off + i)
| otherwise = index xs (i - len)
-- | A variant of 'indices' that scans linearly for a single 'Word8'.
indicesOne :: Word8 -> Int64 -> Text -> [Int64]
indicesOne c = chunk
where
chunk :: Int64 -> Text -> [Int64]
chunk !_ Empty = []
chunk !i (Chunk (T.Text oarr ooff olen) os) = go 0
where
go h | h >= olen = chunk (i+intToInt64 olen) os
| on == c = i + intToInt64 h : go (h+1)
| otherwise = go (h+1)
where on = A.unsafeIndex oarr (ooff+h)
-- | First argument is a strict Text, and second is a lazy one.
isPrefixOf :: T.Text -> Text -> Bool
isPrefixOf (T.Text _ _ xlen) Empty = xlen == 0
isPrefixOf x@(T.Text xarr xoff xlen) (Chunk y@(T.Text _ _ ylen) ys)
| xlen <= ylen = x `T.isPrefixOf` y
| otherwise = y `T.isPrefixOf` x && T.Text xarr (xoff + ylen) (xlen - ylen) `isPrefixOf` ys
intToInt64 :: Int -> Int64
intToInt64 = fromIntegral
word8ToInt :: Word8 -> Int
word8ToInt = fromIntegral