# app.py

import numpy as np
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import re
import streamlit as st
import pdfplumber
import pandas as pd
import base64
from docx import Document
import streamlit.components.v1 as components


# Define the device, model, and tokenizer.
# These module-level names are read by get_perplexity() below.
device = "cpu"
# device = "mps" # for Apple Silicon devices
# device ="cuda" # for CUDA supported devices

# Load the pretrained GPT-2 language model and move it to the chosen device.
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


# Maximum number of tokens kept when encoding a text and the size of one
# scoring window in get_perplexity().
max_length = 1024
# Step (in tokens) between the start positions of consecutive scoring windows.
stride = 256
# Perplexity below/at which text is attributed to AI in analyze_text().
ai_perplexity_threshold = 55
# Perplexity up to which text is labelled "Human + AI"; above it, "Human".
human_ai_perplexity_threshold = 80


def get_perplexity(sentence):
    """
    Calculate the perplexity of a given sentence using the GPT-2 model.

    The text is tokenized (truncated to ``max_length`` tokens) and scored
    with a sliding window of ``max_length`` tokens advanced by ``stride``.
    Each token is scored exactly once: tokens already covered by an earlier
    window are only used as context (their labels are set to -100).

    Args:
        sentence: The text to score.

    Returns:
        The perplexity rounded to two decimals, or ``float('inf')`` when no
        token can be scored (empty input or a single token, which has no
        preceding context to predict it from).
    """
    # Encode the sentence using the tokenizer
    input_ids = tokenizer.encode(
        sentence,
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    ).to(device)

    seq_len = input_ids.shape[1]
    total_nll = 0
    total_tokens = 0
    prev_end = 0

    # NOTE: with the truncation above the sequence never exceeds max_length,
    # so one window covers it; the loop stays correct if that cap is lifted.
    for start_pos in range(0, seq_len, stride):
        end_pos = min(start_pos + max_length, seq_len)
        # Only the tokens that no earlier window has scored are targets here
        target_len = end_pos - prev_end

        window_ids = input_ids[:, start_pos:end_pos]
        # clone() so that masking the labels never writes into input_ids
        target_ids = window_ids.clone()
        target_ids[:, :-target_len] = -100

        # The model shifts labels by one internally, so the first position
        # of a window is never predicted; count what is actually scored.
        num_scored = int((target_ids[:, 1:] != -100).sum())
        if num_scored > 0:
            # Inference only: no autograd graph is needed
            with torch.no_grad():
                outputs = model(window_ids, labels=target_ids)
            # outputs.loss is the mean NLL over the scored tokens
            total_nll += outputs.loss * num_scored
            total_tokens += num_scored

        prev_end = end_pos
        if end_pos == seq_len:
            # The window reached the end of the text; further windows would
            # only re-score tokens that were already counted.
            break

    if total_tokens == 0:
        # Assign infinity perplexity as a default value
        return float('inf')

    return round(float(torch.exp(total_nll / total_tokens)), 2)


def _insufficient_content_result():
    """Build the placeholder result used when a text cannot be scored."""
    return {
        "Label": -1,
        "Output": "Insufficient Content",
        "Percent_ai": "-",
        "Perplexity": "-",
        "Burstiness": "-",
    }


def analyze_text(sentence):
    """
    Analyze the given text and determine the perplexity and label of the text.

    Args:
        sentence: The full text to analyze.

    Returns:
        A dict with the keys:
          - "Label": -1 (insufficient content), 0 (AI), 1 (Human + AI)
            or 2 (Human).
          - "Output": the human-readable form of the label.
          - "Percent_ai": share of characters in lines whose perplexity is
            below ``ai_perplexity_threshold``, formatted like "12.5%".
          - "Perplexity": mean of the per-line perplexities.
          - "Burstiness": variance of the per-line perplexities.
        For insufficient content the three metrics are the string "-".
    """
    # Count the total number of valid characters in the sentence
    total_valid_char = sum(len(x)
                           for x in re.findall(r"[a-zA-Z0-9]+", sentence))

    if total_valid_char < 200:
        return _insufficient_content_result()

    # Split the sentence into lines based on punctuation and newlines
    lines = re.split(r'(?<=[.?!][ \[\(])|(?<=\n)\s*', sentence)
    lines = [line for line in lines if re.search(
        r"[a-zA-Z0-9]+", line) is not None]

    perplexities = []
    total_characters = 0
    ai_characters = 0
    for line in lines:
        total_characters += len(line)
        perplexity = get_perplexity(line)
        # A line that cannot be scored yields inf/NaN; keeping it would turn
        # the mean and the variance below into inf/NaN for the whole text.
        # Such a line still counts in total_characters (as not AI).
        if not np.isfinite(perplexity):
            continue
        perplexities.append(perplexity)
        if perplexity < ai_perplexity_threshold:
            ai_characters += len(line)

    if not perplexities:
        # Nothing could be scored, so there is no basis for a verdict
        return _insufficient_content_result()

    results = {}
    results["Percent_ai"] = str(
        round((ai_characters/total_characters)*100, 2))+"%"
    results["Perplexity"] = round(sum(perplexities) / len(perplexities), 2)
    results["Burstiness"] = round(np.var(perplexities), 2)

    if results["Perplexity"] <= ai_perplexity_threshold:
        results["Label"] = 0
        results["Output"] = "AI"
    elif results["Perplexity"] <= human_ai_perplexity_threshold:
        results["Label"] = 1
        results["Output"] = "Human + AI"
    else:
        results["Label"] = 2
        results["Output"] = "Human"

    return results


def process_text_file(file):
    """
    Process the input text file (PDF or Word) and analyze the content.

    Args:
        file: An uploaded file object whose ``type`` attribute holds its
            MIME type (presumably a Streamlit UploadedFile; confirm against
            the caller).

    Returns:
        The result dict produced by ``analyze_text`` for the extracted text,
        or ``None`` when the MIME type is neither PDF nor .docx (an error is
        shown in the UI in that case).
    """
    if file.type == "application/pdf":
        with pdfplumber.open(file) as pdf:
            # A page may yield no text (None); treat it as empty.
            page_texts = [page.extract_text() or "" for page in pdf.pages]
        # Separate pages with a newline so the last word of one page does not
        # fuse with the first word of the next and sentences can be split.
        text = "\n".join(page_texts)

    elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        doc = Document(file)
        # Separate paragraphs with a newline for the same reason: without it
        # "...end.Next..." is never split into two sentences.
        text = "\n".join(para.text for para in doc.paragraphs)
    else:
        st.error("Unsupported file format. Please upload a PDF or Word document.")
        return None

    return analyze_text(text)


def main():
    """
    Render the CheckGPT page and analyze the uploaded documents on demand.

    Shows the title, description, file uploader and footer. When the
    "Start Checking" button is pressed, every uploaded file is passed to
    ``process_text_file``; the collected results are displayed as a styled
    table together with a CSV download link and a column legend.
    """
    st.set_page_config(page_title='CheckGPT - AI-powered text analysis')
    st.title("CheckGPT")
    st.write("CheckGPT is an AI-powered text analysis tool that predicts the content generated by AI by evaluating the perplexity and burstiness scores of GPT model, and provides insights for investigating text authenticity.")
    st.write("Checkout the code on GitHub <a href='https://github.com/Ank-Cha/CheckGPT' target='_blank'><img src='https://cdn-icons-png.flaticon.com/512/536/536452.png' alt='GitHub Logo' width='25' height='25' style='margin-left: 5px;'></a>", unsafe_allow_html=True)
    # Create an empty placeholder for the uploaded files
    uploaded_files_placeholder = st.empty()

    results_list = []

    uploaded_files = uploaded_files_placeholder.file_uploader(
        "Upload PDF or Word documents", type=["pdf", "docx"], accept_multiple_files=True)

    # Create a button to start processing; the files are processed only
    # when it is pressed
    start_button = st.button("Start Checking")

    st.markdown(
        """
        <style>
        .footer {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            text-align: center;
            padding: 10px;
            background-color: #0A2742;
            color: white;
        }
        </style>
       <div class="footer">
    Made by Ankush &nbsp;&nbsp;&nbsp;
    <a href="https://www.linkedin.com/in/ankush-chaudhari/" target="_blank">
    <img src="https://cdn-icons-png.flaticon.com/512/174/174857.png" alt="LinkedIn" width="20" height="20">
    </a>
    </div>
        """,
        unsafe_allow_html=True
    )

    if start_button:
        with st.spinner("Processing..."):
            for uploaded_file in uploaded_files:
                results = process_text_file(uploaded_file)
                if results is None:
                    # Unsupported file: process_text_file already reported
                    # the error, so skip it instead of failing on None.
                    continue
                results["file_name"] = uploaded_file.name
                results_list.append(results)

        if results_list:
            df = pd.DataFrame(results_list)
            df = df[["file_name", "Percent_ai",
                     "Perplexity", "Burstiness", "Output"]]
            df = df.astype(str)
            df = df.rename(columns={"file_name": "File Name", "Percent_ai": "Predicted AI percent",
                           "Perplexity": "Perplexity Score", "Output": "Predicted Output"})
            st.write("Results:")

            # Apply conditional formatting to the "Output" cell only;
            # any value not listed here (i.e. "AI") is shown in red
            output_colors = {
                "Insufficient Content": "color: grey",
                "Human": "color: green",
                "Human + AI": "color: DarkOrange",
            }
            styler = df.style
            # Styler.applymap was renamed to Styler.map in pandas 2.1;
            # prefer the new name and fall back on older pandas versions
            style_cells = getattr(styler, "map", None) or styler.applymap
            df_styled = style_cells(
                lambda value: output_colors.get(value, "color: red"),
                subset=["Predicted Output"]
            )

            st.dataframe(df_styled)

            # Add a link to download the results as a CSV file
            csv_data = df.to_csv(index=False)
            b64 = base64.b64encode(csv_data.encode()).decode()
            href = f'<a href="data:file/csv;base64,{b64}" download="results.csv">Download CSV</a>'
            st.markdown(href, unsafe_allow_html=True)

            # Display the description of columns and disclaimer
            st.markdown(
                """
            <div class="small-text">
            <strong>Column Descriptions:</strong><br>
            - <strong>Predicted AI percent:</strong> Percentage of the text predicted to be generated by AI.<br>
            - <strong>Perplexity Score:</strong> Measurement of the model's confidence in generating the text.<br>
            - <strong>Burstiness:</strong> Measurement of variation in perplexity scores for the analyzed text.<br>
            - <strong>Predicted Output:</strong> The predicted label for the text: 'AI', 'Human + AI', 'Human', or 'Insufficient Content'.<br><br>
            
            <strong>Disclaimer:</strong><br>
            These results are generated by an AI model and may not be 100% accurate. Please use them for investigation purposes and exercise caution when making decisions based on the results.
            </div>
            """,
                unsafe_allow_html=True
            )


# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()