-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnmf.py
95 lines (75 loc) · 2.86 KB
/
nmf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 2 14:35:55 2021
@author: aluenen
"""
#We should also look at this: https://github.com/derekgreene/topic-model-tutorial/blob/master/3%20-%20Parameter%20Selection%20for%20NMF.ipynb
#https://github.com/derekgreene/topic-model-tutorial
import joblib
import numpy as np
from sklearn import decomposition
class NonNegativeMatrixFactorisation:
'''This class contains all functions to execute NMF'''
def openjoblib(self, name):
"""
Opens a given joblib file file
Args: A path to a joblib file (str)
Returns: A document/term matrix (A), a list of feature names (terms), \
and a list of snippets of articles
"""
(A,terms,snippets) = joblib.load(name)
print( "Loaded %d X %d document-term matrix" % (A.shape[0], A.shape[1]) )
return A, terms, snippets
def decomposition(self, k, A):
'''
Runs topic model
Parameters
----------
k : number of topics (int)
A : document/term matrix
Returns
-------
W : document membership weights of each of the topics
H : term weights to each of the topics
'''
model = decomposition.NMF( init="nndsvd", n_components=k )
W = model.fit_transform( A )
H = model.components_
return W, H
def get_descriptor(self, terms, H, topic_index, top):
'''
Extract top terms for each topic
Parameters
----------
terms : list of tokens
H : term weights to each of the topics
topic_index : index of topic in H
top : number of terms per topic
Returns
-------
top_terms : list of top terms per topic
'''
# reverse sort the values to sort the indices
top_indices = np.argsort( H[topic_index,:] )[::-1]
# now get the terms corresponding to the top-ranked indices
top_terms = []
for term_index in top_indices[0:top]:
top_terms.append( terms[term_index] )
return top_terms
def printtopics(self, k, H, terms):
descriptors = []
for topic_index in range(k):
descriptors.append(self.get_descriptor( terms, H, topic_index, 10))
str_descriptor = ", ".join( descriptors[topic_index] )
print("Topic %02d: %s" % ( topic_index+1, str_descriptor ) )
def savemodel(self, W, H, terms, snippets ):
joblib.dump((W,H,terms,snippets), "articles-model-nmf-k%02d.pkl" % k)
def main():
nmf = NonNegativeMatrixFactorisation()
k = 12 #number of topics
A, terms, snippets = nmf.openjoblib("D:/Newsdata/Original/corpuskanker-tfidf.pkl")
W, H = nmf.decomposition(k, A)
nmf.printtopics(k, H, terms)
#nmf.savemodel(W, H, terms, snippets)
if __name__ == "__main__":
main()