augmentation.py
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from better_profanity import profanity


def sentiment_scores(sentence, neg, neu, pos, compound):
    """Append VADER polarity scores for `sentence` to the running feature lists."""
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)
    # print("Overall sentiment dictionary is : ", sentiment_dict)
    neg.append(sentiment_dict["neg"])
    neu.append(sentiment_dict["neu"])
    pos.append(sentiment_dict["pos"])
    compound.append(sentiment_dict["compound"])


# Driver code
if __name__ == "__main__":
    data = pd.read_csv('data/old/dataset.csv')
    content = data['content']
    annotation = data['annotation']

    # Custom profanity list, one word per line.
    cuss_word = []
    with open('cuss_words.txt', 'r') as f:
        for line in f:
            cuss_word.append(line.strip())

    d1 = pd.DataFrame(data)
    neg = []
    neu = []
    pos = []
    sentence_len = []
    compound = []
    punctuation_count = []
    contain_profanity = []
    num_profanity = []

    for sentence in content:
        # Profanity features: 1 if the sentence contains profanity, 0 otherwise,
        # plus the number of words found in the custom cuss-word list.
        count = 0
        if profanity.contains_profanity(sentence):
            contain_profanity.append(1)
            for word in sentence.split():
                if word in cuss_word:
                    count = count + 1
            num_profanity.append(count)
        else:
            contain_profanity.append(0)
            num_profanity.append(count)

        # Length of the sentence in characters.
        sentence_len.append(len(sentence))

        # Punctuation ratio: punctuation characters divided by sentence length.
        count = 0
        for ch in sentence:
            if ch in ('!', ",", "'", ";", "\"", ".", "-", "?"):
                count = count + 1
        punctuation_count.append(count / len(sentence))

        # VADER sentiment scores.
        sentiment_scores(sentence, neg, neu, pos, compound)

    # Attach the engineered feature columns and write the augmented dataset.
    d1['sentence_length'] = sentence_len
    d1["neg"] = neg
    d1["neu"] = neu
    d1["pos"] = pos
    d1['compound'] = compound
    d1['punctuation_count'] = punctuation_count
    d1['contain_profanity'] = contain_profanity
    d1['num_profanity'] = num_profanity
    d1.to_csv('data/dataset.csv', index=True)
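
After the script runs, the augmented CSV can be loaded back to sanity-check the new feature columns. The snippet below is a minimal, illustrative sketch (not part of augmentation.py), assuming the output path data/dataset.csv written above; the column names match those added by the script.

import pandas as pd

# Load the augmented dataset written by augmentation.py.
df = pd.read_csv('data/dataset.csv', index_col=0)

# Feature columns added alongside the original data.
feature_cols = ['sentence_length', 'neg', 'neu', 'pos', 'compound',
                'punctuation_count', 'contain_profanity', 'num_profanity']

# Quick checks: value ranges of the engineered features and the
# balance of the binary profanity flag.
print(df[feature_cols].describe())
print(df['contain_profanity'].value_counts())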