# logic.py (102 lines, 64 loc, 2.67 KB)
# Forked from EgorShatsky/SortApp.
import pandas as pd
import re
def get_data_from_xlsx(path="file.xlsx"):
    """Load an Excel workbook into a DataFrame without a header row.

    Args:
        path: Location of the workbook to read. Defaults to ``"file.xlsx"``
            (resolved against the current working directory), which is the
            file this function always read before the parameter existed.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_excel`` with
        ``header=None``: no row is consumed as column names, so the sheet's
        first row stays in the data and columns are labelled by integers.
    """
    # read_excel already hands back a DataFrame for a single sheet, so there
    # is no need to wrap the result in another DataFrame(...) call.
    return pd.read_excel(path, header=None)
def delete_header_row(df):
    """Return a copy of *df* with its first row removed.

    The row is removed by position rather than by index label. Dropping by
    label (``df.drop(df.index[0])``) removes *every* row that shares the
    first row's label and raises ``IndexError`` when the frame is empty;
    slicing by position does neither.

    Args:
        df: The DataFrame whose first row should be discarded. It is not
            modified.

    Returns:
        A new DataFrame holding all rows of *df* except the first, with
        their original index labels. An empty frame yields an empty frame.
    """
    # .copy() keeps the result independent of the input frame, matching the
    # "new object" behaviour callers got from DataFrame.drop.
    return df.iloc[1:].copy()
def get_grouped_sentences(df, index):
    """Split every cell of one column into its list of sentences.

    Args:
        df: Table to read from; ``df[index]`` must support ``.tolist()``.
        index: Label of the column that holds the dialogs.

    Returns:
        A list with one entry per cell, in column order. Each entry is the
        list of sentences that ``get_dialog_sentences`` extracts from that
        cell.
    """
    # Each cell is passed through str() first, so non-string values are
    # handled as their textual form (a float NaN becomes the text "nan").
    return [get_dialog_sentences(str(cell)) for cell in df[index].tolist()]
def get_dialog_sentences(dialog):
    """Extract the client's sentences from one dialog transcript.

    Text between a ``CLIENT:`` marker and the next ``BOT:`` marker is
    treated as client speech. Those fragments are normalised by
    ``get_client_dialogs`` and then split on ``"."``. When the transcript
    has no such fragment, the whole transcript is used as one sentence.

    Args:
        dialog: The transcript text.

    Returns:
        A list of sentences with surrounding whitespace removed and empty
        entries discarded.
    """
    # The extra "BOT:" appended to the text acts as a terminator so that a
    # client turn at the very end of the transcript is still captured.
    client_turns = re.findall(r'CLIENT:(.+?)BOT:', dialog + "BOT:", re.DOTALL)

    if client_turns:
        pieces = []
        for turn in get_client_dialogs(client_turns):
            pieces.extend(turn.split("."))
    else:
        # No client turn found: fall back to the untouched transcript.
        pieces = [dialog]

    trimmed = (piece.strip() for piece in pieces)
    return [piece for piece in trimmed if piece != '']
def get_client_dialogs(dialogs):
    """Normalise raw client fragments so they can be split on ``"."``.

    For each fragment, in this order: every sentence delimiter (``?``,
    ``!``, ``.``, newline and a nested ``CLIENT:`` marker) is replaced by
    ``"."``; surrounding whitespace is stripped; a fixed set of punctuation
    characters is deleted; the text is lower-cased.

    Args:
        dialogs: Iterable of raw client text fragments.

    Returns:
        A list of the normalised fragments, in input order.
    """

    def normalise(text):
        # "CLIENT:" must be replaced before the punctuation pass below,
        # because that pass deletes ':' and would break the marker.
        for marker in ('?', '!', '.', '\n', 'CLIENT:'):
            text = text.replace(marker, ".")
        text = text.strip()
        return re.sub(r"[:,'\";<>\\/`~#%^&*()+]", "", text).lower()

    return [normalise(dialog) for dialog in dialogs]
def get_words_from_sentence(sentence):
    """Split a sentence into words on the space character.

    Only ``' '`` separates words; other whitespace such as tabs stays
    inside a word. Runs of spaces do not produce empty words.

    Args:
        sentence: The text to split.

    Returns:
        A list of the non-empty, space-free pieces of *sentence*, in order.
    """
    # Pieces produced by split(' ') never contain a space, so dropping the
    # empty ones is all the cleanup that is needed.
    return [token for token in sentence.split(' ') if token != '']
def get_phrase_count_dict(grouped_sentences, phrase_word_count):
    """Count how many groups contain each phrase of a given word length.

    A phrase is a run of ``phrase_word_count`` consecutive words inside a
    single sentence (words come from ``get_words_from_sentence``). A phrase
    is counted at most once per group, however often it occurs in that
    group's sentences.

    Args:
        grouped_sentences: Iterable of groups, each an iterable of sentence
            strings.
        phrase_word_count: Number of words per phrase. Values below 1
            describe no phrase at all and yield an empty result.

    Returns:
        A dict mapping each phrase to the number of groups it appears in.
        Keys are inserted in first-seen order, so the result (and the tie
        order after a stable sort) is the same on every run.
    """
    # A zero length used to yield the empty phrase "" and a negative length
    # produced arbitrary slices; neither is a meaningful phrase.
    if phrase_word_count < 1:
        return {}

    phrase_count = {}
    for group_sentences in grouped_sentences:
        # A dict de-duplicates like a set but keeps insertion order; set
        # iteration order for strings changes with hash randomisation.
        group_phrases = {}
        for sentence in group_sentences:
            words = get_words_from_sentence(sentence)
            # The range is empty when the sentence has too few words.
            for start in range(len(words) - phrase_word_count + 1):
                phrase = " ".join(words[start:start + phrase_word_count])
                group_phrases[phrase] = None
        for phrase in group_phrases:
            phrase_count[phrase] = phrase_count.get(phrase, 0) + 1
    return phrase_count
def sort_phrases(phrase_counts, sort_type="max"):
    """Order phrases by their counts.

    Args:
        phrase_counts: Mapping of phrase to count.
        sort_type: ``"min"`` sorts from the smallest count upwards; any
            other value, including the default ``"max"``, sorts from the
            largest count downwards.

    Returns:
        A list of ``(phrase, count)`` tuples in the requested order.
        Phrases with equal counts keep the order they had in
        *phrase_counts*.
    """
    ranked = list(phrase_counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=(sort_type != "min"))
    return ranked