-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathsourceloader.py
210 lines (189 loc) · 8.31 KB
/
sourceloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# sourceloader.py
# SourceLoader class
#
# Jiyong Jang, 2012
#
import sys
import os
import re
import time
from collections import defaultdict
import common
try:
import bitarray
except ImportError as err:
print err
sys.exit(-1)
class SourceLoader(object):
    '''
    Collect source files that may contain one of the loaded patches.

    Each text file is normalized (comments and whitespace removed,
    lowercased), its n-grams of consecutive lines are hashed into a Bloom
    filter, and every patch of the same file type whose hashes are all set
    in the filter is recorded as a possible match for that file.
    '''

    def __init__(self):
        self._patch_list = []
        self._npatch = 0
        self._source_list = []
        self._nsource = 0
        # patch index -> list of indices into _source_list
        self._match_dict = defaultdict(list)
        self._nmatch = 0
        # Contents are undefined until setall(0) runs in _query_bloomfilter.
        self._bit_vector = bitarray.bitarray(common.bloomfilter_size)

    def traverse(self, source_path, patch):
        '''
        Traverse source files

        source_path -- a single file, or a directory that is walked
                       recursively; any other path scans nothing.
        patch       -- object providing items() (indexable patch records
                       with file_ext and hash_list attributes) and
                       length().

        Returns the number of matches recorded so far; the counter is not
        reset between calls.
        '''
        print('[+] traversing source files')
        start_time = time.time()
        self._patch_list = patch.items()
        self._npatch = patch.length()
        if os.path.isfile(source_path):
            self._scan_file(source_path)
        elif os.path.isdir(source_path):
            for root, _dirs, names in os.walk(source_path):
                for name in names:
                    self._scan_file(os.path.join(root, name))
        elapsed_time = time.time() - start_time
        print('[+] %d possible matches ... %.1fs\n' % (self._nmatch, elapsed_time))
        return self._nmatch

    def _scan_file(self, file_path):
        '''
        Process one file if its detected type starts with "text"
        '''
        magic_type = common.file_type(file_path)
        common.verbose_print(' [-] %s: %s' % (file_path, magic_type))
        if magic_type.startswith('text'):
            sub_type = self._mime_sub_type(magic_type)
            self._process(file_path, self._get_file_type(sub_type))

    @staticmethod
    def _mime_sub_type(magic_type):
        '''
        Return the part of magic_type after the first "/" ("" if none)

        partition() never raises, so a type string without exactly one
        "/" falls through to the generic text handling instead of
        aborting the traversal with a ValueError.
        '''
        return magic_type.partition('/')[2]

    def _process(self, source_path, magic_ext):
        '''
        Normalize a source file and build a Bloom filter for queries

        The file is kept (original and normalized lines) only when at
        least one patch matches it.
        '''
        # The context manager closes the file even if read() raises.
        with open(source_path, 'r') as source_file:
            source_orig_lines = source_file.read()
        source_norm_lines = self._normalize(source_orig_lines, magic_ext)
        if self._query_bloomfilter(source_norm_lines, magic_ext):
            self._source_list.append(common.SourceInfo(
                source_path, magic_ext,
                source_orig_lines.split('\n'),
                source_norm_lines.split('\n')))
            # Incremented only after the query: the match entries recorded
            # by the query use the index this source has just been given.
            self._nsource += 1

    def _normalize(self, source, ext):
        '''
        Normalize a source file

        Strips comments for the known file types, removes whitespace via
        common.whitespaces_regex, and returns the lowercased text.
        '''
        # Language-specific optimization
        file_ext = common.FileExt
        if ext == file_ext.C or ext == file_ext.Java:
            source = self._strip_comments(common.c_regex, source, True)
        elif ext == file_ext.ShellScript or ext == file_ext.Python:
            source = self._strip_comments(common.shellscript_regex, source, False)
        elif ext == file_ext.Perl:
            source = self._strip_comments(common.perl_regex, source, False)
        elif ext == file_ext.PHP:
            source = self._strip_comments(common.php_regex, source, True)
        elif ext == file_ext.Ruby:
            source = self._strip_comments(common.ruby_regex, source, True)
        # Remove whitespaces except newlines
        source = common.whitespaces_regex.sub("", source)
        # Convert into lowercases
        return source.lower()

    @staticmethod
    def _strip_comments(regex, source, keep_comment_newlines):
        '''
        Join the "noncomment" pieces that regex finds in source

        When keep_comment_newlines is true, each "multilinecomment" match
        is replaced by as many newlines as it contained. The
        "multilinecomment" group is only looked up in that case, so the
        regex needs to define it only then.
        '''
        kept = []
        for match in regex.finditer(source):
            code = match.group('noncomment')
            if code:
                kept.append(code)
            elif keep_comment_newlines:
                comment = match.group('multilinecomment')
                if comment:
                    kept.append('\n' * comment.count('\n'))
        return ''.join(kept)

    def _query_bloomfilter(self, source_norm_lines, magic_ext):
        '''
        Hash the n-grams of a normalized source and test every patch

        Returns True when at least one patch of the same file type has
        all of its hashes set in the filter. Sources with fewer lines
        than common.ngram_size are skipped and return False.
        '''
        lines = source_norm_lines.split()
        ngram_size = common.ngram_size
        if len(lines) < ngram_size:
            common.verbose_print(' - skipped (%d lines)' % len(lines))
            return False
        # Loop invariants, hoisted out of the n-gram loop.
        # NOTE(review): masking with size-1 presumably relies on
        # bloomfilter_size being a power of two -- confirm in common.
        mask = common.bloomfilter_size - 1
        max_ngrams = common.bloomfilter_size / common.min_mn_ratio
        self._bit_vector.setall(0)
        is_vuln_source = False
        num_ngram_processed = 0
        for i in range(0, len(lines) - ngram_size + 1):
            if num_ngram_processed > max_ngrams:
                # Too many n-grams for one filter: test the patches
                # against what has been hashed so far, then start over
                # with an empty filter.
                common.verbose_print(' - split Bloom filters (%d n-grams)' % num_ngram_processed)
                if self._match_patches(magic_ext):
                    is_vuln_source = True
                num_ngram_processed = 0
                self._bit_vector.setall(0)
            ngram = ''.join(lines[i:i + ngram_size])
            self._bit_vector[common.fnv1a_hash(ngram) & mask] = 1
            self._bit_vector[common.djb2_hash(ngram) & mask] = 1
            self._bit_vector[common.sdbm_hash(ngram) & mask] = 1
            num_ngram_processed += 1
        if self._match_patches(magic_ext):
            is_vuln_source = True
        return is_vuln_source

    def _match_patches(self, magic_ext):
        '''
        Record every patch of type magic_ext present in the current filter

        A patch matches when all entries of its hash_list are set in the
        bit vector. Each match is appended to the match dictionary under
        the patch index, together with the index of the source being
        processed. Returns True if any patch matched.
        '''
        bits = self._bit_vector
        matched = False
        for patch_id in range(0, self._npatch):
            patch_info = self._patch_list[patch_id]
            if magic_ext != patch_info.file_ext:
                continue
            if all(bits[h] for h in patch_info.hash_list):
                matched = True
                self._match_dict[patch_id].append(self._nsource)
                common.verbose_print(' - match (patch #%d : source #%d)' % (patch_id, self._nsource))
                self._nmatch += 1
        return matched

    def _get_file_type(self, sub_type):
        '''
        Determine a file type based upon sub_type (magic module)

        Any sub_type starting with "x-c" is treated as C; unknown
        sub_types map to common.FileExt.Text.
        '''
        if sub_type.startswith('x-c'):
            return common.FileExt.C
        if sub_type == 'x-java':
            return common.FileExt.Java
        if sub_type == 'x-shellscript':
            return common.FileExt.ShellScript
        if sub_type == 'x-perl':
            return common.FileExt.Perl
        if sub_type == 'x-python':
            return common.FileExt.Python
        if sub_type == 'x-php':
            return common.FileExt.PHP
        if sub_type == 'x-ruby':
            return common.FileExt.Ruby
        return common.FileExt.Text

    def items(self):
        '''
        Return the list of matched sources
        '''
        return self._source_list

    def length(self):
        '''
        Return the number of matched sources
        '''
        return self._nsource

    def match_items(self):
        '''
        Return the mapping of patch index to matched source indices
        '''
        return self._match_dict