# helpers.py
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from glob import glob
import json
import yaml
import re
import os

# Derive stop words and stemmer once
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()


def write_json(content, filename):
    with open(filename, "w") as fd:
        fd.write(json.dumps(content, indent=4))


def read_json(filename):
    with open(filename, "r") as fd:
        content = json.loads(fd.read())
    return content


def read_yaml(filename):
    with open(filename, "r") as fd:
        content = yaml.safe_load(fd)
    return content


def read_errors(datadir):
    errors = []

    # Add in dinos errors, use the spec hash as the id
    dinodir = os.path.join(datadir, "dinos")
    if os.path.exists(dinodir):
        for filename in glob(os.path.join(dinodir, "*error.json")):
            entries = read_json(filename)
            for entry in entries:
                entry["label"] = "dinos-error"
                errors.append(entry)

    print("Found a total of %s errors" % len(errors))

    # Replace source file None with empty
    for error in errors:
        if error["source_file"] in [[None], (None,)]:
            error["source_file"] = ""
    return errors


def process_text(text):
    """
    Process text, including:

    1. Lowercase
    2. Remove numbers and punctuation
    3. Strip whitespace
    4. Tokenize and stop word removal
    5. Stemming
    """
    # Make lowercase
    text = text.lower()

    # Remove numbers and punctuation (but leave path separator for now)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s\/]", "", text)

    # Strip whitespace
    text = text.strip()

    # Tokenize and stop word removal; reduce path-like tokens to their basename
    words = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        if os.sep in token:
            token = os.path.basename(token)
        words.append(token)

    # Don't do stemming here - the error messages are usually hard coded / consistent
    # words = [stemmer.stem(t) for t in tokens]
    return words
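

# A minimal usage sketch (not part of the original module): assuming a data
# directory laid out as <datadir>/dinos/*error.json, this shows how the helpers
# above might be combined to tokenize error messages. The "data" directory and
# the "text" field on each error entry are hypothetical examples; only
# "source_file" and "label" are known fields from the code above.
if __name__ == "__main__":
    errors = read_errors("data")
    for error in errors:
        tokens = process_text(error.get("text", ""))
        print(tokens)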