app.py
import streamlit as st
import nltk

# Download the NLTK resources the app needs (corpus, tokenizer, stopword list)
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import string


def preprocess_text(text):
    """Lowercase and tokenize the text, then drop stopwords and punctuation."""
    tokens = nltk.word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens
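
# Illustrative example (assuming the 'punkt' and 'stopwords' data are available):
#   preprocess_text("The cat sat on the mat.")  ->  ['cat', 'sat', 'mat']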


def plot_most_common_words(text):
    """Plot a bar chart of the ten most frequent tokens."""
    tokens = preprocess_text(text)
    word_freq = nltk.FreqDist(tokens)
    most_common_words = word_freq.most_common(10)
    words, counts = zip(*most_common_words)
    fig = plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Most Common Words')
    plt.xticks(rotation=45)
    st.pyplot(fig)


def plot_repeated_words(text):
    """Plot a bar chart of up to ten tokens that occur more than once."""
    tokens = preprocess_text(text)
    word_freq = nltk.FreqDist(tokens)
    repeated_words = [word for word, count in word_freq.items() if count > 1][:10]
    if not repeated_words:
        st.info("No repeated words found in the text.")
        return
    words, counts = zip(*[(word, word_freq[word]) for word in repeated_words])
    fig = plt.figure(figsize=(10, 6))
    plt.bar(words, counts)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.title('Repeated Words')
    plt.xticks(rotation=45)
    st.pyplot(fig)


def calculate_perplexity(text, model):
    """Score the text with the language model and return its perplexity."""
    tokens = preprocess_text(text)
    padded_tokens = ['<s>'] + tokens + ['</s>']
    ngrams_sequence = list(ngrams(padded_tokens, model.order))
    perplexity = model.perplexity(ngrams_sequence)
    return perplexity
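
# Caveat: MLE assigns zero probability to any token it never saw in training
# (including the '<s>'/'</s>' pad symbols, which the unigram pipeline below never
# adds to the vocabulary), so the reported perplexity can easily come out as inf.
# A smoothed model such as nltk.lm.Laplace would return finite values instead.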


def calculate_burstiness(text):
    """Return the variance-to-mean-squared ratio of word frequencies."""
    tokens = preprocess_text(text)
    word_freq = nltk.FreqDist(tokens)
    if not word_freq:
        return 0.0
    avg_freq = sum(word_freq.values()) / len(word_freq)
    variance = sum((freq - avg_freq) ** 2 for freq in word_freq.values()) / len(word_freq)
    burstiness_score = variance / (avg_freq ** 2)
    return burstiness_score
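
# Worked example (hypothetical counts): with word frequencies {'cat': 3, 'mat': 1},
# avg_freq = 2.0, variance = ((3 - 2)**2 + (1 - 2)**2) / 2 = 1.0,
# and burstiness = 1.0 / 2.0**2 = 0.25.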


def is_generated_text(perplexity, burstiness_score):
    if perplexity < 100 and burstiness_score < 0.5:
        return "Likely generated by a language model"
    else:
        return "Not likely generated by a language model"


def main():
    st.title("Language Model Text Analysis")
    text = st.text_area("Enter the text you want to analyze", height=200)
    if st.button("Analyze"):
        if text:
            # Load or train your language model.
            # In this example, we train a simple unigram MLE model on the Brown
            # corpus; padded_everygram_pipeline expects tokenized sentences, so
            # we pass brown.sents(). You can use any corpus of your choice.
            train_sents = nltk.corpus.brown.sents()
            train_data, padded_vocab = padded_everygram_pipeline(1, train_sents)
            model = MLE(1)
            model.fit(train_data, padded_vocab)

            # Calculate perplexity
            perplexity = calculate_perplexity(text, model)
            st.write("Perplexity:", perplexity)

            # Calculate burstiness score
            burstiness_score = calculate_burstiness(text)
            st.write("Burstiness Score:", burstiness_score)

            # Check if the text is likely generated by a language model
            generated_cue = is_generated_text(perplexity, burstiness_score)
            st.write("Text Analysis Result:", generated_cue)

            # Plot most common words
            plot_most_common_words(text)

            # Plot repeated words
            plot_repeated_words(text)
        else:
            st.warning("Please enter some text to analyze.")


if __name__ == "__main__":
    main()
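
# To try the app locally (assuming streamlit, nltk, and matplotlib are installed,
# e.g. via `pip install streamlit nltk matplotlib`):
#   streamlit run app.py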