-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractivesummarization.py
145 lines (100 loc) · 3.77 KB
/
extractivesummarization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
"""ExtractiveSummarization.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/github/dlagur/ProjectHanium/blob/main/ExtractiveSummarization.ipynb
"""
# Colab shell magics are not valid Python syntax and stop this file from being
# parsed as a module. Run these from a shell (or a notebook cell) instead.
# !pip install jpype1
# !pip install konlpy
from konlpy.tag import Kkma
from konlpy.tag import Twitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
import numpy as np
# Shared Kkma instance; text2sentences() calls kkma.sentences() on it.
kkma = Kkma() # splits text into sentences
def text2sentences(text):
    """Split *text* into sentences and fold short fragments into the previous one.

    The text is split with the module-level ``kkma`` instance. Any sentence of
    10 characters or fewer is appended (space-separated) to the most recent
    sentence that still holds text, and its own slot is replaced by ``''`` so
    the list keeps its original length.

    Args:
        text: The raw text to split.

    Returns:
        The list returned by ``kkma.sentences`` with short fragments merged
        backwards and their slots blanked.
    """
    sentences = kkma.sentences(text)
    # Index of the latest entry that was kept. Tracking it explicitly avoids
    # two problems of using ``idx - 1``: a short first sentence would wrap
    # around to ``sentences[-1]``, and a fragment following another fragment
    # would be merged into a slot that was just blanked.
    last_kept = None
    for idx, sentence in enumerate(sentences):
        if len(sentence) <= 10 and last_kept is not None:
            sentences[last_kept] += ' ' + sentence
            sentences[idx] = ''
        else:
            # Long enough to stand alone, or there is nothing before it to
            # merge into (short leading sentence): keep it as is.
            last_kept = idx
    return sentences
import pandas as pd
# Load the input CSV; the 'answer' column is read below.
text = pd.read_csv('/content/IT_0.csv')
text  # bare expression left over from a notebook cell; no effect in a script
# Split the first row's 'answer' value into sentences.
sentences = text2sentences(text['answer'][0])
sentences[0]  # bare expression left over from a notebook cell
twitter = Twitter()  # instance whose nouns() method get_nouns() calls
# Stopwords: nouns that get_nouns() drops from every sentence.
# The last entry previously carried a trailing space ('때문 '), so it could
# never equal an extracted noun and was silently ineffective.
stopwords = ['글자수', '000', '또한', '저는', '있습니다', '그리고', '이러한', '거나', '자수', '통해', '이후', '당시', '대한', '때문']
def get_nouns(sentences):
    """Build one space-joined noun string per non-empty sentence.

    Each sentence is passed through ``twitter.nouns``; results that appear in
    the module-level ``stopwords`` or that are a single character long are
    discarded. Entries equal to ``''`` are skipped entirely, so the returned
    list can be shorter than *sentences*.

    Args:
        sentences: Iterable of sentences (converted with ``str`` before tagging).

    Returns:
        List of strings, one per non-empty sentence, in input order.
    """
    nouns = []
    for sentence in sentences:
        # Compare by value. ``is not ''`` tests object identity, which only
        # works by accident of string interning and triggers a SyntaxWarning.
        if sentence != '':
            kept = [
                noun
                for noun in twitter.nouns(str(sentence))
                if noun not in stopwords and len(noun) > 1
            ]
            nouns.append(' '.join(kept))
    return nouns
# Noun strings, one per non-empty sentence.
nouns = get_nouns(sentences)
nouns  # bare expression left over from a notebook cell
nouns  # bare expression left over from a notebook cell (repeated)
# The string below is a notebook markdown heading:
# "Create the TF-IDF model and build the graphs".
"""## TF-IDF 모델 생성 및 그래프 생성"""
# Vectorizers used by build_sent_graph() and build_words_graph() respectively.
tfidf = TfidfVectorizer()
cnt_vec = CountVectorizer()
# NOTE(review): never read in the visible code; build_sent_graph() assigns a
# local variable with the same name instead.
graph_sentence = []
def build_sent_graph(sentence):
    """Return the sentence-by-sentence weight matrix for the given documents.

    Fits the module-level ``tfidf`` vectorizer on *sentence* (an iterable of
    strings, one per sentence) and multiplies the dense TF-IDF matrix by its
    own transpose, so entry ``[i, j]`` is the dot product of the TF-IDF rows
    of sentences ``i`` and ``j``.
    """
    weights = tfidf.fit_transform(sentence).toarray()
    return weights.dot(weights.T)
sent_graph = build_sent_graph(nouns) # build the sentence graph over the sentence units to be summarized
def build_words_graph(sentence):
    """Return the word-by-word weight matrix and an index-to-word lookup.

    Fits the module-level ``cnt_vec`` vectorizer on *sentence* (an iterable of
    strings), converts the counts to a dense float matrix, normalizes it along
    axis 0 and multiplies its transpose by itself.

    Returns:
        A tuple ``(graph, idx2word)`` where ``graph`` is the square word matrix
        and ``idx2word`` maps each vocabulary column index to its word.
    """
    counts = cnt_vec.fit_transform(sentence).toarray().astype(float)
    scaled = normalize(counts, axis=0)
    index_to_word = {index: word for word, index in cnt_vec.vocabulary_.items()}
    return np.dot(scaled.T, scaled), index_to_word
# Word graph plus the column-index -> word lookup used by keywords().
words_graph, idx2word = build_words_graph(nouns)
words_graph  # bare expression left over from a notebook cell
idx2word  # bare expression left over from a notebook cell
# The string below is a notebook markdown cell. In English:
#   Steps
#   1. Feed the weight graph computed above into the TextRank algorithm to score it
#   2. Sort by rank, highest first
#   3. Print as many sentences as the requested summary length
"""### 순서
1. 위에서 구한 가중치 그래프 이용 TextRank 알고리즘에 넣어 점수 매기기
2. 순위가 높은 순서대로 정렬
3. 요약할 문장의 개수만큼 출력
"""
def get_ranks(graph, d=0.85):
    """Score every node of a weighted graph with the TextRank linear system.

    Solves ``A x = (1 - d) * 1`` where ``A`` is built from *graph* by zeroing
    the diagonal, dividing each column by its sum (columns that sum to zero
    are left as zeros), multiplying by ``-d`` and finally setting the diagonal
    to 1.

    Args:
        graph: Square 2-D array-like of edge weights. It is copied, so the
            caller's matrix is never modified; integer input is accepted.
        d: Damping factor.

    Returns:
        dict mapping each node index to its rank value.

    Raises:
        numpy.linalg.LinAlgError: If the resulting system cannot be solved.
    """
    # Work on a float copy: the in-place version overwrote the caller's graph
    # and failed on integer arrays at the in-place true division.
    A = np.array(graph, dtype=float)
    np.fill_diagonal(A, 0.0)  # ignore self-links when normalizing
    col_sums = A.sum(axis=0)
    linked = col_sums != 0  # columns with at least one outgoing weight
    A[:, linked] /= col_sums[linked]
    A *= -d
    np.fill_diagonal(A, 1.0)
    B = (1 - d) * np.ones((A.shape[0], 1))
    ranks = np.linalg.solve(A, B)  # solve the linear system A x = B
    return {idx: r[0] for idx, r in enumerate(ranks)}
# Rank sentences and words with TextRank. The sentence ranking was commented
# out, which left sorted_sent_rank_idx undefined and made summarize() raise
# NameError when called.
sent_rank_idx = get_ranks(sent_graph)  # sent_graph = sentence weight graph
word_rank_idx = get_ranks(words_graph)
# Node indices ordered from highest to lowest rank.
sorted_sent_rank_idx = sorted(sent_rank_idx, key=lambda k: sent_rank_idx[k], reverse=True)
sorted_word_rank_idx = sorted(word_rank_idx, key=lambda k: word_rank_idx[k], reverse=True)
# Given the number of sentences wanted, summarize the text into that many sentences.
def summarize(sent_num=5):
    """Print the *sent_num* top-ranked sentences in their original order.

    Reads the module-level ``sorted_sent_rank_idx`` and ``sentences``. By
    default the text is summarized into 5 sentences.

    Args:
        sent_num: Number of sentences to print.
    """
    # Rank indices are rows of the sentence graph, which is built from the
    # output of get_nouns(); that function skips '' entries. Index into the
    # same filtered view, otherwise every blanked slot in ``sentences`` shifts
    # the lookup and the wrong (or an empty) sentence is printed.
    ranked_pool = [sentence for sentence in sentences if sentence != '']
    # Restore document order among the selected sentences.
    chosen = sorted(sorted_sent_rank_idx[:sent_num])
    for idx in chosen:
        print(ranked_pool[idx])
        # NOTE(review): source indentation was lost; the separator print is
        # assumed to belong inside the loop (one gap after each sentence).
        print('\n')
text['answer'][0]  # bare expression left over from a notebook cell: the first row's 'answer' text
def keywords(word_num=10):
    """Print the *word_num* highest-ranked words as a list, best first.

    Reads the module-level ``sorted_word_rank_idx`` for the ranking order and
    ``idx2word`` to turn each ranked index back into its word.
    """
    top_words = [idx2word[idx] for idx in sorted_word_rank_idx[:word_num]]
    print(top_words)
keywords()  # print the keyword list using the default word_num of 10