cache.py
#
# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# When you publish or redistribute any data created with ScanCode or any ScanCode
# derivative work, you must accompany this data with the following acknowledgment:
#
# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
# ScanCode is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
from __future__ import absolute_import, print_function
from array import array
from hashlib import md5
from os.path import exists
from os.path import getmtime
from os.path import getsize
from os.path import join
import yg.lockfile # @UnresolvedImport
from commoncode.fileutils import create_dir
from commoncode.fileutils import file_iter
from licensedcode import src_dir
from licensedcode import license_index_cache_dir
from licensedcode import license_matches_cache_dir
"""
Caching on-disk of LicenseIndex and LicenseMatches:
"""
"""
An on-disk persistent cache of LicenseIndex. The index is pickled and
invalidated if there are any changes in the code or licenses text or rules.
Loading and dumping the cached index is safe to use across multiple processes
using lock files.
"""
index_lock_file = join(license_index_cache_dir, 'lockfile')
tree_checksum_file = join(license_index_cache_dir, 'tree_checksums')
index_cache_file = join(license_index_cache_dir, 'index_cache')
def tree_checksum(base_dir=src_dir):
    """
    Return a checksum computed from a file tree using the file paths, sizes
    and last modified time stamps.
    The purpose is to detect if there has been any modification to source
    code, compiled code, licenses or rule files and to use this as a proxy to
    verify the cache consistency.
    """
    hashable = [''.join([loc, str(getmtime(loc)), str(getsize(loc))])
                for loc in file_iter(base_dir)]
    return md5(''.join(hashable)).hexdigest()
# maximum wait (in seconds) to acquire the index lock file
LICENSE_INDEX_LOCK_TIMEOUT = 60 * 3


def get_or_build_index_from_cache(force_clear=False):
    """
    Return a LicenseIndex loaded from cache. If the index is stale or does not
    exist, build a new index and cache it. Clear or purge the LicenseMatch
    cache as needed.
    """
    from licensedcode.index import LicenseIndex
    from licensedcode.models import get_rules

    try:
        # acquire a lock, waiting up to the timeout, or die
        with yg.lockfile.FileLock(index_lock_file, timeout=LICENSE_INDEX_LOCK_TIMEOUT):
            if force_clear:
                license_matches_cache.clear(0)

            # if we have a saved cached index
            if exists(tree_checksum_file) and exists(index_cache_file):
                # load the saved tree_checksum and compare with the current tree_checksum
                with open(tree_checksum_file, 'rb') as etcs:
                    existing_checksum = etcs.read()
                current_checksum = tree_checksum()
                if current_checksum == existing_checksum:
                    # the cache is consistent with the latest code and data:
                    # load the index from cache
                    with open(index_cache_file, 'rb') as ifc:
                        # Note: loads() is much faster (twice or more?) than load()
                        idx = LicenseIndex.loads(ifc.read())
                    return idx

            # Here the cache is not consistent with the latest code and data:
            # it is either stale or non-existing and we need to clean up and
            # regenerate. Clear the LicenseMatch cache entirely.
            license_matches_cache.clear(0)

            # regenerate the index
            idx = LicenseIndex(get_rules())
            with open(index_cache_file, 'wb') as ifc:
                ifc.write(idx.dumps())

            # save the new tree checksum
            with open(tree_checksum_file, 'wb') as ctcs:
                ctcs.write(tree_checksum())
            return idx

    except yg.lockfile.FileLockTimeout:
        # TODO: handle the failure to acquire the lock in a nicer way
        raise
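
# Typical usage, as a minimal sketch: build or load the index once, then reuse
# it for every scanned file. The `idx.match(location=...)` call below is an
# assumed LicenseIndex method shown only for illustration; see
# licensedcode.index for the actual matching API.
#
#   idx = get_or_build_index_from_cache()
#   matches = idx.match(location='path/to/some/file')
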
"""
A cache of recent matches from queries and query runs.
Several files in the same project or codebase are highly likely have repeated
identical license headers, texts or notices. Another common pattern is multiple
copies of a complete (and possibly long) license text. By caching and returning
the cached matches right away, we can avoid doing the same matching over and
over.
The approach is to use the hash of a sequence of token ids as a cache key either
for a whole query or a query run and to ignore the actual start position.
As values we cache a list of LicenseMatch objects for this sequence of tokens.
When we have a cache hit, the returned cached LicenseMatch are adjusted for
their query and line positions. This way we can have cache hits for the same
sequence of tokens eventually starting at different positions in different
queries.
The cached list of LicenseMatch may be empty: this way we also cache the absence
of matches for a sequence of tokens. This absence of matches can be as costly to
compute initially than an actual matches.
"""
# marker passed to LicenseMatch.rebase() for matches served from the cache
MATCH_CACHE = '0-cached'
class LicenseMatchCache(object):
    """
    A file-based cache for license matches.
    This is NOT thread-safe, but is multi-process safe.
    """

    def __init__(self, cache_dir):
        self.cache_dir = cache_dir
        create_dir(cache_dir)
        from diskcache import Cache
        self.cache = Cache(cache_dir)

    def key(self, tokens):
        """
        Return a cache key computed from a sequence of query `tokens` numeric
        ids.
        """
        return md5(array('h', tokens).tostring()).hexdigest()

    def get(self, query_run):
        """
        Return a sequence of cached LicenseMatch objects if found in the cache
        or None. May return an empty sequence if an empty value was cached.
        """
        cache_key = self.key(query_run.tokens)
        cached = self.cache.get(cache_key)
        # either we did not get a hit, or we got a hit to nothing (an empty
        # list), which is a valid cached value
        if not cached:
            return cached
        qrs = query_run.start
        qre = query_run.end
        return [lm.rebase(qrs, qre, MATCH_CACHE) for lm in cached]

    def put(self, query_run, matches):
        """
        Cache a license `matches` sequence for a `query_run` tokens sequence
        and return the cache key.
        """
        cache_key = self.key(query_run.tokens)
        self.cache[cache_key] = matches
        return cache_key

    def clear(self, *args):
        """
        Purge the whole cache. Any arguments are accepted for compatibility
        and ignored.
        Raise an exception if a write lock cannot be acquired.
        """
        self.cache.clear()
# global cache
license_matches_cache = LicenseMatchCache(cache_dir=license_matches_cache_dir)
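

if __name__ == '__main__':
    # Minimal smoke-test sketch, assuming a writable cache directory and the
    # diskcache dependency: exercise the key/put/get round trip with a tiny
    # stand-in for a real query run. Real query runs come from
    # licensedcode.query and carry token ids plus start/end positions.
    from collections import namedtuple
    FakeQueryRun = namedtuple('FakeQueryRun', 'tokens start end')

    qr = FakeQueryRun(tokens=[1, 2, 3, 4], start=0, end=3)
    # caching an empty matches list is valid: it records the absence of
    # matches for this exact token sequence
    cache_key = license_matches_cache.put(qr, [])
    print('cached under key:', cache_key)
    print('cache hit returns:', license_matches_cache.get(qr))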