# app.py
# CheckGPT: Streamlit app that scores uploaded PDF/Word documents with GPT-2
# perplexity and burstiness to estimate whether the text was AI-generated.
import numpy as np
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re
import streamlit as st
import pdfplumber
import pandas as pd
import base64
from docx import Document
import streamlit.components.v1 as components
# Define the device, model, and tokenizer.
# `device` is where the model weights and the encoded inputs are placed.
device = "cpu"
# device = "mps" # for Apple Silicon devices
# device ="cuda" # for CUDA supported devices
# Pretrained GPT-2 language model (moved to `device`) and its fast tokenizer;
# both are loaded once at module import and shared by get_perplexity().
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Maximum number of tokens kept when encoding a text and maximum width of a
# scoring window in get_perplexity().
max_length = 1024
# Step, in tokens, between the start positions of consecutive scoring windows.
stride = 256
# A line whose perplexity is below this value counts as AI-written, and a
# document whose mean perplexity is at or below it is labelled "AI".
ai_perplexity_threshold = 55
# A document whose mean perplexity is above the AI threshold but at or below
# this value is labelled "Human + AI"; anything higher is labelled "Human".
human_ai_perplexity_threshold = 80
def get_perplexity(sentence):
    """
    Calculate the perplexity of a given sentence using the GPT-2 model.

    The text is encoded (truncated to ``max_length`` tokens) and scored in
    windows whose start advances by ``stride`` tokens. Each token is scored
    exactly once: tokens already covered by a previous window are kept as
    context only (label ``-100``), so overlapping windows are not
    double-counted. The per-window mean loss is weighted by the number of
    tokens that were actually predicted in that window.

    Args:
        sentence: Text to score.

    Returns:
        ``exp`` of the mean negative log-likelihood per predicted token,
        rounded to two decimals. ``float('inf')`` is returned when nothing can
        be scored, i.e. the text encodes to fewer than two tokens (the first
        token has no preceding context to be predicted from).
    """
    # Encode the sentence using the tokenizer
    input_ids = tokenizer.encode(
        sentence,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)
    seq_len = input_ids.shape[1]

    total_nll = 0
    total_tokens = 0
    prev_end = 0

    # Pure inference: do not build an autograd graph for every forward pass.
    with torch.no_grad():
        for start_pos in range(0, seq_len, stride):
            # Determine the end position of the current window
            end_pos = min(start_pos + max_length, seq_len)
            # Only tokens past the previous window's end are new targets
            target_len = end_pos - prev_end

            window_ids = input_ids[:, start_pos:end_pos]
            # clone() so masking the labels never writes through to input_ids
            target_ids = window_ids.clone()
            target_ids[:, :-target_len] = -100

            # The model shifts labels by one internally, so the loss is a mean
            # over the non-ignored labels from position 1 onwards.
            scored = int((target_ids[:, 1:] != -100).sum())
            if scored > 0:
                # A window without any scored token would yield a NaN loss,
                # so it is skipped instead of poisoning the running total.
                outputs = model(window_ids, labels=target_ids)
                total_nll = total_nll + outputs.loss * scored
                total_tokens += scored

            prev_end = end_pos
            if end_pos == seq_len:
                # The whole sequence has been covered; later windows would
                # only re-score the tail with less context.
                break

    if total_tokens == 0:
        # Assign infinity perplexity as a default value
        return float('inf')
    return round(float(torch.exp(total_nll / total_tokens)), 2)
def _insufficient_content():
    """Build the result returned when a text cannot be analysed."""
    return {
        "Label": -1,
        "Output": "Insufficient Content",
        "Percent_ai": "-",
        "Perplexity": "-",
        "Burstiness": "-",
    }


def analyze_text(sentence):
    """
    Analyze the given text and determine the perplexity and label of the text.

    The text is split into lines/sentences, each one is scored with
    ``get_perplexity`` and the scores are aggregated.

    Args:
        sentence: Full text of the document.

    Returns:
        dict with the keys:
          * ``Label``: -1 insufficient content, 0 AI, 1 Human + AI, 2 Human.
          * ``Output``: human-readable form of ``Label``.
          * ``Percent_ai``: share of scored characters that belong to lines
            whose perplexity is below ``ai_perplexity_threshold``, formatted
            as ``"<number>%"``.
          * ``Perplexity``: mean of the line perplexities (2 decimals).
          * ``Burstiness``: variance of the line perplexities (2 decimals).
        When the text has fewer than 200 alphanumeric characters, or no line
        yields a finite perplexity, the last three are ``"-"``.
    """
    # Count the total number of valid characters in the sentence
    total_valid_char = sum(
        len(chunk) for chunk in re.findall(r"[a-zA-Z0-9]+", sentence))
    if total_valid_char < 200:
        return _insufficient_content()

    # Split the sentence into lines based on punctuation and newlines,
    # keeping only the pieces that contain at least one alphanumeric character
    lines = re.split(r'(?<=[.?!][ \[\(])|(?<=\n)\s*', sentence)
    lines = [line for line in lines
             if re.search(r"[a-zA-Z0-9]+", line) is not None]

    perplexities = []
    total_characters = 0
    ai_characters = 0
    for line in lines:
        perplexity = get_perplexity(line)
        if not np.isfinite(perplexity):
            # A line that could not be scored (NaN/inf) would turn the mean
            # into NaN/inf and silently force the "Human" label, so it is
            # left out of every aggregate.
            continue
        perplexities.append(perplexity)
        total_characters += len(line)
        if perplexity < ai_perplexity_threshold:
            ai_characters += len(line)

    if not perplexities:
        # Nothing could be scored; avoid dividing by zero below.
        return _insufficient_content()

    results = {}
    results["Percent_ai"] = str(
        round((ai_characters / total_characters) * 100, 2)) + "%"
    results["Perplexity"] = round(sum(perplexities) / len(perplexities), 2)
    results["Burstiness"] = round(np.var(perplexities), 2)

    if results["Perplexity"] <= ai_perplexity_threshold:
        results["Label"] = 0
        results["Output"] = "AI"
    elif results["Perplexity"] <= human_ai_perplexity_threshold:
        results["Label"] = 1
        results["Output"] = "Human + AI"
    else:
        results["Label"] = 2
        results["Output"] = "Human"
    return results
def process_text_file(file):
    """
    Process the input text file (PDF or Word) and analyze the content.

    Args:
        file: Uploaded file object exposing a ``type`` attribute holding its
            MIME type; it is passed to ``pdfplumber.open`` or ``Document``.

    Returns:
        The result dict produced by ``analyze_text``, or ``None`` (after
        showing a Streamlit error) when the MIME type is neither PDF nor
        DOCX.
    """
    if file.type == "application/pdf":
        with pdfplumber.open(file) as pdf:
            # extract_text() may return None for a page; treat it as empty
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Join with a newline so the last word of one page is not glued to
        # the first word of the next, which would hide the sentence boundary
        # from analyze_text.
        text = "\n".join(page_texts)
    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(file)
        # Same reasoning as for pages: keep paragraph boundaries visible
        text = "\n".join(para.text for para in doc.paragraphs)
    else:
        st.error("Unsupported file format. Please upload a PDF or Word document.")
        return None
    return analyze_text(text)
def main():
    """
    Render the Streamlit page: upload documents, analyze them, show results.

    The page shows a title and description, a multi-file uploader for PDF and
    DOCX files, a "Start Checking" button and a fixed footer. When the button
    is pressed every uploaded file is run through ``process_text_file``; the
    collected results are shown as a styled table together with a CSV
    download link and a legend describing the columns.
    """
    # NOTE(review): the page title says "ChatGPT" while the app is titled
    # "CheckGPT" - looks like a typo; left unchanged, confirm with the author.
    st.set_page_config(page_title='ChatGPT - AI-powered text analysis')
    st.title("CheckGPT")
    st.write("CheckGPT is an AI-powered text analysis tool that predicts the content generated by AI by evaluating the perplexity and burstiness scores of GPT model, and provides insights for investigating text authenticity.")
    st.write("Checkout the code on GitHub <a href='https://github.com/Ank-Cha/CheckGPT' target='_blank'><img src='https://cdn-icons-png.flaticon.com/512/536/536452.png' alt='GitHub Logo' width='25' height='25' style='margin-left: 5px;'></a>", unsafe_allow_html=True)

    # Create an empty placeholder for the uploaded files
    uploaded_files_placeholder = st.empty()
    results_list = []

    # Process the files only when the "Start" button is pressed
    uploaded_files = uploaded_files_placeholder.file_uploader(
        "Upload PDF or Word documents", type=["pdf", "docx"], accept_multiple_files=True)

    # Create a button to start processing
    start_button = st.button("Start Checking")

    st.markdown(
        """
<style>
.footer {
position: fixed;
bottom: 0;
left: 0;
width: 100%;
text-align: center;
padding: 10px;
background-color: #0A2742;
color: white;
}
</style>
<div class="footer">
Made by Ankush
<a href="https://www.linkedin.com/in/ankush-chaudhari/" target="_blank">
<img src="https://cdn-icons-png.flaticon.com/512/174/174857.png" alt="LinkedIn" width="20" height="20">
</a>
</div>
""",
        unsafe_allow_html=True
    )

    if start_button:
        with st.spinner("Processing..."):
            # `or []` guards against the uploader handing back None
            for uploaded_file in uploaded_files or []:
                results = process_text_file(uploaded_file)
                if results is None:
                    # Unsupported file: process_text_file already reported the
                    # error, so skip it instead of failing on None["file_name"].
                    continue
                results["file_name"] = uploaded_file.name
                results_list.append(results)

    if results_list:
        df = pd.DataFrame(results_list)
        df = df[["file_name", "Percent_ai",
                 "Perplexity", "Burstiness", "Output"]]
        df = df.astype(str)
        df = df.rename(columns={"file_name": "File Name", "Percent_ai": "Predicted AI percent",
                                "Perplexity": "Perplexity Score", "Output": "Predicted Output"})
        st.write("Results:")

        # Apply conditional formatting to the "Output" cell only.
        # Styler.applymap is deprecated in favour of Styler.map in newer
        # pandas; prefer map when available and fall back otherwise.
        styler = df.style
        style_cells = getattr(styler, "map", None) or styler.applymap
        df_styled = style_cells(
            lambda value: "color: grey" if value == "Insufficient Content" else
            "color: green" if value == "Human" else
            "color: DarkOrange" if value == "Human + AI" else
            "color: red",
            subset=["Predicted Output"]
        )
        st.dataframe(df_styled)

        # Add a link to download the results as a CSV file
        csv_data = df.to_csv(index=False)
        b64 = base64.b64encode(csv_data.encode()).decode()
        href = f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download CSV</a>'
        st.markdown(href, unsafe_allow_html=True)

        # Display the description of columns and disclaimer.
        # NOTE(review): assumed to belong with the results table (shown only
        # when there are results) - confirm against the intended layout.
        st.markdown(
            """
<div class="small-text">
<strong>Column Descriptions:</strong><br>
- <strong>Predicted AI percent:</strong> Percentage of the text predicted to be generated by AI.<br>
- <strong>Perplexity Score:</strong> Measurement the model's confidence in generating the text.<br>
- <strong>Burstiness:</strong> Measurment of variation in perplexity scores for the analyzed text.<br>
- <strong>Predicted Output:</strong> The predicted label for the text: 'AI', 'Human + AI', 'Human', or 'Insufficient Content'.<br><br>
<strong>Disclaimer:</strong><br>
These results are generated by an AI model and may not be 100% accurate. Please use them for investigation purposes and exercise caution when making decisions based on the results.
</div>
""",
            unsafe_allow_html=True
        )
# Build the page only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()