Commit
upload data and results
Weixin-Liang authored and Weixin-Liang committed May 3, 2023
1 parent c697bf9 commit 8cc05ff
Showing 97 changed files with 99,866 additions and 5 deletions.
Binary file added .DS_Store
58 changes: 58 additions & 0 deletions Code/baselines.py
@@ -0,0 +1,58 @@
import numpy as np
from tqdm import tqdm
import os
import requests
from multiprocessing.pool import ThreadPool
from math import ceil

class PXDetector:
    """Scores each document by its average per-token log-likelihood under the surrogate model."""
    def __init__(self):
        pass

    def score(self, data, model, batch_size=8, **kwargs):
        print("Scoring")
        n_samples = len(data)
        scores = []
        for batch in tqdm(range(ceil(n_samples / batch_size)), desc="Computing PX"):
            original_text = data[batch * batch_size:(batch + 1) * batch_size]
            loglikelihood = model.get_batch_loglikelihood(original_text)
            scores.append(loglikelihood)
        scores = np.concatenate(scores)
        return scores

class GPT0:
    """Scores documents by querying the GPTZero API."""
    def __init__(self, api_key=""):
        self.api_key = api_key
        self.base_url = 'https://api.gptzero.me/v2/predict'

    def score(self, documents, n_p=16):
        # Query the API for each document in parallel and return the raw JSON responses.
        pool = ThreadPool(n_p)
        out_scores = pool.map(self.text_predict, documents)
        return out_scores

    def text_predict(self, document):
        url = f'{self.base_url}/text'
        headers = {
            'accept': 'application/json',
            'X-Api-Key': self.api_key,
            'Content-Type': 'application/json'
        }
        data = {
            'document': document
        }
        response = requests.post(url, headers=headers, json=data)
        return response.json()
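
For reference, a minimal usage sketch of the two detectors above (hypothetical, not part of the committed file; it assumes HFModel from Code/hf_models.py is importable, a CUDA device is available, and a GPTZero API key is at hand):

# Hypothetical usage sketch.
from hf_models import HFModel  # assumes Code/ is on PYTHONPATH

texts = ["First document to score.", "Second document to score."]

# Average per-token log-likelihood of each document under a surrogate LM.
px_scores = PXDetector().score(texts, HFModel(model_name="gpt2", cache_dir=None), batch_size=2)
print(px_scores)

# Raw GPTZero API responses, one JSON object per document.
gptzero_out = GPT0(api_key="<Your API Key>").score(texts)
print(gptzero_out[0])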
103 changes: 103 additions & 0 deletions Code/extract_scores.py
@@ -0,0 +1,103 @@
import os
import torch
import json
import argparse
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from models import get_model
from detectors import get_detector

def config():
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default="gpt2-xl")
parser.add_argument("--dataset_name", type=str, default="toefl")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--detector_name", type=str, default="detectgpt")
parser.add_argument("--cache_dir", type=str, default="/dfs/scratch1/merty/.cache")
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--root-folder", type=str, default="./project_data/")
return parser.parse_args()


args = config()
model = get_model(args.model_name)
detector = get_detector(args.detector_name)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

import gdown
import glob
import pandas as pd

def pubs_dataset(cache_dir="/dfs/scratch2/merty/", first_k=500):
if not os.path.exists(os.path.join(cache_dir, "ScientificDiscovery/")):
        raise ValueError(f"Expected a ScientificDiscovery/ folder under {cache_dir}")

all_csvs = glob.glob(os.path.join(cache_dir, "ScientificDiscovery/", "*.csv"))

dfs = [pd.read_csv(csv)[['abstract', 'venue']] for csv in all_csvs[:1]]
concatted = pd.concat(dfs).iloc[:first_k]
abstracts = concatted['abstract'].tolist()
venues = concatted['venue'].tolist()
return concatted, abstracts, venues


class DetectionDataset:
def __init__(self, name, root="/afs/cs.stanford.edu/u/merty/projects/chatgpt-detector-eval/project_data/"):
self.data_path = os.path.join(root, name)
self.name = json.load(open(os.path.join(self.data_path, "name.json")))
self.data = json.load(open(os.path.join(self.data_path, "data.json")))
self.results = None
self.collect_results()
def __len__(self):
return len(self.data)
def describe(self):
return f"{self.name['name']} Mean Length: {np.mean([len(t['document'].split(' ')) for t in self.data])}"
def __getitem__(self, idx):
return self.data[idx]["document"]
def collect_results(self):
results_files = [f for f in os.listdir(self.data_path) if f not in ["name.json", "data.json"]]
print(results_files)
results_records = [json.load(open(os.path.join(self.data_path, f))) for f in results_files]
detector_names = [f.split(".")[0] for f in results_files]
results_records = {n: {"Detector": n, "Scores": r} for n,r in zip(detector_names, results_records)}
self.results = results_records
return results_records
def get_results(self, detector_name):
if detector_name not in self.results:
return None
else:
return self.results[detector_name]
def save_result(self, detector_name, scores):
with open(os.path.join(self.data_path, f"{detector_name}.json"), "w") as f:
json.dump(scores, f)


root_folder = args.root_folder
for folder in os.listdir(root_folder):
    ds = DetectionDataset(folder, root=root_folder)
    print(ds[0])
    print(ds.describe())
    results = ds.collect_results()
    print(results.keys())
    print()

    # Results are saved under "<detector_name>-<model_name>.json", so look them up by the same key.
    detector_results = ds.get_results(f"{args.detector_name}-{args.model_name}")
    if detector_results is None:
        texts = [t["document"] for t in ds.data]
        print(ds.describe())
        dataset_scores = detector.score(texts, model, batch_size=args.batch_size)
        all_results = []
        for d, s in zip(texts, dataset_scores):
            all_results.append({"document": d, "score": float(s)})

        ds.save_result(f"{args.detector_name}-{args.model_name}", all_results)
        print("Saved")
    else:
        print("Already exists: {}".format(detector_results))
33 changes: 33 additions & 0 deletions Code/hf_detector.py
@@ -0,0 +1,33 @@
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np
import torch
from scipy.special import softmax



class HFDetector:
def __init__(self, model_name="Hello-SimpleAI/chatgpt-detector-roberta" , cache_dir="/dfs/scratch1/merty/.cache", device="cuda"):
self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, output_hidden_states=True)
self.model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir, output_hidden_states=True)
self.model = self.model.to(device)
self.model = self.model.eval()
self.device = device

    @torch.no_grad()
    def score(self, texts, model, batch_size=8):
        all_scores = []
        # Round the batch count up so the final partial batch is not dropped.
        n_batches = (len(texts) + batch_size - 1) // batch_size
        for batch in tqdm(range(n_batches), desc="Running ChatGPT Detector"):
            original_text = texts[batch * batch_size:(batch + 1) * batch_size]
            inputs = self.tokenizer(original_text, padding="max_length", truncation=True, return_tensors="pt").to(self.device)
            outputs = self.model(**inputs)
            logits = outputs["logits"].detach().cpu().numpy()
            probs = softmax(logits, axis=1)
            all_scores.append(probs[:, 1:])
        all_scores = np.concatenate(all_scores, axis=0)
        return all_scores
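
A minimal sketch of how HFDetector might be invoked (hypothetical; it downloads the Hello-SimpleAI/chatgpt-detector-roberta checkpoint on first use and assumes a CUDA device):

# Hypothetical usage sketch.
detector = HFDetector(cache_dir=None, device="cuda")
texts = ["A human-written paragraph ...", "A model-written paragraph ..."]
scores = detector.score(texts, model=None, batch_size=2)
print(scores.shape)  # (2, 1): the positive-class probability kept by score() above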





57 changes: 57 additions & 0 deletions Code/hf_models.py
@@ -0,0 +1,57 @@
import torch
from multiprocessing.pool import ThreadPool
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F


class HFModel:
def __init__(self, model_name="gpt2" , cache_dir="/dfs/scratch1/merty/.cache", device="cuda"):
self.model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.model = self.model.to(device)
self.model = self.model.eval()
self.device = device

@torch.no_grad()
def sample(self, texts, prompt_tokens=30, min_words=55, do_top_p=True, min_length=150):
all_encoded = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
all_encoded = {key: value[:, :prompt_tokens] for key, value in all_encoded.items()}

decoded = ['' for _ in range(len(texts))]

# sample from the model until we get a sample with at least min_words words for each example
# this is an inefficient way to do this (since we regenerate for all inputs if just one is too short), but it works
tries = 0
while (m := min(len(x.split()) for x in decoded)) < min_words:
if tries != 0:
print()
print(f"min words: {m}, needed {min_words}, regenerating (try {tries})")

sampling_kwargs = {}
if do_top_p:
sampling_kwargs['top_p'] = 0.9

outputs = self.model.generate(**all_encoded, min_length=min_length, max_length=200, do_sample=True, **sampling_kwargs, pad_token_id=self.tokenizer.eos_token_id, eos_token_id=self.tokenizer.eos_token_id)
decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
tries += 1
return decoded

@torch.no_grad()
def get_batch_loglikelihood(self, texts):
tokenized = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
labels = tokenized.input_ids
outputs = self.model(**tokenized, labels=labels)
logits = outputs.logits.cpu()
labels = labels.cpu()

        # Average per-token log-likelihood, ignoring padding positions.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
        loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="none")
        ll_per_sample = -loss.view(shift_logits.shape[0], shift_logits.shape[1])
        nonpad_per_row = (shift_labels != -100).sum(dim=1)
        ll_per_sample = ll_per_sample.sum(dim=1) / nonpad_per_row
        return ll_per_sample.cpu().numpy()
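
A minimal sketch exercising both entry points of HFModel (hypothetical values; assumes a CUDA device):

# Hypothetical usage sketch.
lm = HFModel(model_name="gpt2", cache_dir=None, device="cuda")

docs = ["The growth of large language models has made machine-generated text increasingly common in everyday writing.",
        "Essays were collected from both native and non-native English speakers for this comparison."]

# Average per-token log-likelihood of each document under GPT-2.
print(lm.get_batch_loglikelihood(docs))

# Regenerate a continuation from the first 30 tokens of each document.
print(lm.sample(docs, prompt_tokens=30, min_words=55)[0])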


47 changes: 47 additions & 0 deletions Code/openai.py
@@ -0,0 +1,47 @@
import openai
import numpy as np
from transformers import AutoTokenizer
from multiprocessing.pool import ThreadPool


class OpenAIModel:
def __init__(self, model_name="text-davinci-003", cache_dir="/dfs/scratch1/merty/.cache", device="cuda",
top_p=0.96, do_top_p=True):
self.openai_model = model_name
openai.api_key = '<Your API Key>'
self.tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=cache_dir)
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.device = device
self.top_p = top_p
self.do_top_p = do_top_p

def _sample_api(self, text):
# sample from the openai model
kwargs = {"engine": self.openai_model, "max_tokens": 200 }
if self.do_top_p:
kwargs['top_p'] = self.top_p

r = openai.Completion.create(prompt=f"{text}", **kwargs)
return text + r['choices'][0].text

def sample(self, texts, prompt_tokens=30, min_words=55, do_top_p=True, min_length=150):
all_encoded = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
all_encoded = {key: value[:, :prompt_tokens] for key, value in all_encoded.items()}

prefixes = self.tokenizer.batch_decode(all_encoded['input_ids'], skip_special_tokens=True)
pool = ThreadPool(8)
decoded = pool.map(self._sample_api, prefixes)
return decoded

def get_ll(self, text):
kwargs = { "engine": self.openai_model, "temperature": 0, "max_tokens": 0, "echo": True, "logprobs": 0}
r = openai.Completion.create(prompt=f"<|endoftext|>{text}", **kwargs)
result = r['choices'][0]
tokens, logprobs = result["logprobs"]["tokens"][1:], result["logprobs"]["token_logprobs"][1:]
return np.mean(logprobs)

def get_batch_loglikelihood(self, texts):
# get the loglikelihood of each text in the batch
ll_per_sample = np.array(ThreadPool(8).map(self.get_ll, texts))
return ll_per_sample
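
A minimal sketch of how OpenAIModel might be used (hypothetical; it assumes a valid API key has replaced the placeholder, and note that the file name Code/openai.py can shadow the installed openai package unless imported carefully):

# Hypothetical usage sketch.
lm = OpenAIModel(model_name="text-davinci-003", device="cpu")

docs = ["This abstract describes a study of GPT detectors evaluated on essays from native and non-native writers."]

# Average token log-probability of each document from the Completions API.
print(lm.get_batch_loglikelihood(docs))

# Continue each document from its first 30 GPT-2 tokens.
print(lm.sample(docs, prompt_tokens=30)[0])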

Binary file added Data_and_Results/.DS_Store
