-
Notifications
You must be signed in to change notification settings - Fork 1
/
wordlevel.js
153 lines (134 loc) · 4.33 KB
/
wordlevel.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// functions for checking word frequency level
'use strict'
var nlp = require('compromise') // for parsing and NLP
// var _ = require('underscore')
var removePunctuation = require('remove-punctuation')
var Lemmatizer_en = require("javascript-lemmatizer")
var freq_en = require('./freq_list_en')
/**
 * Word frequency-level checker.
 *
 * Reduces words to a lemma, looks the lemma up in a word list and reports its
 * position in that list ("frequency") and that position as a percentage of
 * the number of indexed lemmas ("level").
 *
 * Only English ('en') has a lemmatizer and a word list wired in; 'ar' and
 * 'fa' are accepted but every word then normalizes to '' and scores 0.
 */
class Wordlevel {
  /**
   * @param {string} [lang='en'] - One of 'en', 'ar' or 'fa'.
   * @throws {string} The string 'Language not supported' for any other value.
   */
  constructor(lang) {
    if (!lang) lang = 'en';
    // NOTE(review): this throws a bare string, not an Error. Kept as-is so
    // existing callers that compare the thrown value keep working.
    if (['en', 'ar', 'fa'].indexOf(lang) === -1) throw('Language not supported');
    this.lang = lang;
    if (lang === 'en') {
      this.lemmatizer = new Lemmatizer_en();
      this.list = freq_en;
    } else {
      // Arabic and Farsi: placeholders until a lemmatizer and list exist.
      this.lemmatizer = {};
      this.list = [];
    }
  }

  /**
   * Strip punctuation, lower-case and trim a word, then reduce it to a lemma.
   *
   * @param {string} word - Raw word.
   * @param {string} [pos] - Part-of-speech hint, compared against the second
   *   element of each candidate returned by the lemmatizer.
   * @returns {string} The lemma, or '' when there is no candidate or the
   *   language is not English.
   */
  normalize_word(word, pos) {
    const cleaned = removePunctuation(word).toLowerCase().trim();
    // TODO: Arabic
    // TODO: Farsi
    if (this.lang !== 'en') return '';

    // Each candidate is used as a [lemma, partOfSpeech] pair.
    const candidates = this.lemmatizer.lemmas(cleaned);
    if (candidates.length === 0) return '';
    if (candidates.length === 1) return candidates[0][0];

    if (pos) {
      const matching = candidates.filter((candidate) => candidate[1] === pos);
      if (matching.length > 0) return matching[0][0];
    }
    // No hint, or the hint matched nothing: fall back to the second
    // candidate, which the original author treats as the noun form
    // (statistically most likely) - presumably relies on the lemmatizer's
    // ordering; verify. Previously the "hint matched nothing" path dropped
    // this value and returned ''.
    return candidates[1][0];
  }

  /**
   * Split text into terms and describe each one.
   *
   * @param {string} str - Text to parse.
   * @returns {Array<{discard: string[], pos: string, lemma: string, word: string}>}
   *   One entry per term: `discard` holds the tags that are not one of the
   *   four recognised parts of speech, `pos` is the first recognised one
   *   mapped to 'noun' | 'verb' | 'adj' | 'adv' (or ''), `lemma` is the
   *   normalized form and `word` the original text.
   */
  parse_str(str) {
    const posOptions = {Noun: 'noun', Verb: 'verb', Adjective: 'adj', Adverb: 'adv'};
    // Own-property test so that a tag named like an Object.prototype member
    // is never mistaken for a part of speech.
    const isPos = (tag) => Object.prototype.hasOwnProperty.call(posOptions, tag);

    return nlp(str).out('terms').map((term) => {
      const recognised = term.tags.filter(isPos).map((tag) => posOptions[tag]);
      const entry = {};
      entry.discard = term.tags.filter((tag) => !isPos(tag));
      entry.pos = recognised[0] || '';
      entry.lemma = this.normalize_word(term.normal, entry.pos);
      entry.word = term.text;
      return entry;
    });
  }

  /**
   * Position of a word's lemma in the word list.
   *
   * The lemma index is built lazily on first use and cached on the instance
   * as `lemmafreq` (with its key count in `lemmacount`).
   *
   * @param {string} word - Raw word.
   * @param {string} [pos] - Part-of-speech hint passed to normalize_word.
   * @returns {number} Index of the lemma in the list, or 0 when the word has
   *   no lemma or the lemma is not indexed.
   */
  frequency(word, pos) {
    if (!this.lemmafreq) {
      this.lemmafreq = this._prepare_lema_index();
      this.lemmacount = Object.keys(this.lemmafreq).length;
    }
    const lemma = this.normalize_word(word, pos);
    if (lemma.length === 0) return 0;
    // NOTE(review): an unknown lemma and the lemma at index 0 both yield 0.
    return this.lemmafreq[lemma] || 0;
  }

  /**
   * Frequency of a word as a percentage of the number of indexed lemmas,
   * rounded to one decimal place.
   *
   * @param {string} word - Raw word.
   * @param {string} [pos] - Part-of-speech hint.
   * @returns {number} The level; 0 when the lemma index is empty (instead of
   *   the NaN that 0/0 would produce).
   */
  level(word, pos) {
    // frequency() must run first: it populates this.lemmacount.
    const rank = this.frequency(word, pos);
    if (!this.lemmacount) return 0;
    return Math.round(((rank / this.lemmacount) * 100) * 10) / 10;
  }

  /**
   * Level reached by 98% of the distinct lemmas in a text.
   *
   * @param {string} str - Text to analyse.
   * @returns {number} The level of the entry at the 98% mark of the
   *   level-sorted list; 0 when the text yields no terms.
   */
  block_level(str) {
    const words = this.parse_frequency_list(str);
    const wordcount = words.length;
    if (wordcount === 0) return 0;
    const top = wordcount - Math.round(wordcount / 50);
    return words[top - 1].level;
  }

  /**
   * Highest-level entries of a text: the entry at the 98% mark and
   * everything above it.
   *
   * @param {string} str - Text to analyse.
   * @returns {Array<{word: string, count: number, level: number}>}
   */
  topwords(str) {
    const words = this.parse_frequency_list(str);
    const wordcount = words.length;
    const top = wordcount - Math.round(wordcount / 50);
    return words.slice(top - 1);
  }

  /**
   * Count the distinct lemmas in a text and score each one.
   *
   * @param {string} str - Text to analyse.
   * @returns {Array<{word: string, count: number, level: number}>} One entry
   *   per distinct lemma, sorted by ascending level. The level is computed
   *   from the first occurrence of the lemma.
   */
  parse_frequency_list(str) {
    // Prototype-less dictionary: lemmas such as "constructor" must not
    // collide with members inherited from Object.prototype.
    const byLemma = Object.create(null);
    this.parse_str(str).forEach((word) => {
      const seen = byLemma[word.lemma];
      if (seen) {
        seen.count++;
        return;
      }
      byLemma[word.lemma] = {
        word: word.lemma,
        count: 1,
        level: this.level(word.lemma, word.pos)
      };
    });
    return Object.keys(byLemma)
      .map((lemma) => byLemma[lemma])
      .sort((a, b) => a.level - b.level);
  }

  /** internal functionality */

  /**
   * Build the lemma -> list-index lookup from `this.list`.
   *
   * @returns {Object<string, number>} Prototype-less map from lemma to index.
   */
  _prepare_lema_index() {
    const result = Object.create(null);
    this.list.forEach((word, index) => {
      const lemma = this.normalize_word(word);
      // NOTE(review): when several list entries share a lemma the last index
      // wins - confirm that this is intended.
      if (lemma) result[lemma] = index;
    });
    return result;
  }
}
// ES module equivalent: export default Wordlevel;
module.exports = Wordlevel