Commit 8cc05ff, authored and committed by Weixin-Liang on May 3, 2023 (1 parent: c697bf9).
Showing 97 changed files with 99,866 additions and 5 deletions.
@@ -0,0 +1,58 @@
import numpy as np
from tqdm import tqdm
import os
import requests
from multiprocessing.pool import ThreadPool
from math import ceil


class PXDetector:
    def __init__(self):
        pass

    def score(self, data, model, batch_size=8, **kwargs):
        # One score per document: the mean per-token log-likelihood under the scoring model.
        print("Scoring")
        n_samples = len(data)
        scores = []
        for batch in tqdm(range(ceil(n_samples / batch_size)), desc="Computing PX"):
            original_text = data[batch * batch_size:(batch + 1) * batch_size]
            loglikelihood = model.get_batch_loglikelihood(original_text)
            scores.append(loglikelihood)
        scores = np.concatenate(scores)
        return scores


class GPT0:
    def __init__(self, api_key=""):
        self.api_key = api_key
        self.base_url = 'https://api.gptzero.me/v2/predict'

    # NOTE: this likelihood-based score is shadowed by the API-based score defined below,
    # and its integer division also drops the final partial batch.
    def score(self, data, model, batch_size=8, **kwargs):
        print("Scoring")
        n_samples = len(data)
        scores = []
        for batch in tqdm(range(n_samples // batch_size), desc="Computing PX"):
            original_text = data[batch * batch_size:(batch + 1) * batch_size]
            loglikelihood = model.get_batch_loglikelihood(original_text)
            scores.append(loglikelihood)
        scores = np.concatenate(scores)
        return scores

    def score(self, documents, n_p=16):
        # Query the GPTZero API for each document in parallel.
        pool = ThreadPool(n_p)
        out_scores = pool.map(self.text_predict, documents)
        return out_scores

    def text_predict(self, document):
        url = f'{self.base_url}/text'
        headers = {
            'accept': 'application/json',
            'X-Api-Key': self.api_key,
            'Content-Type': 'application/json'
        }
        data = {
            'document': document
        }
        response = requests.post(url, headers=headers, json=data)
        return response.json()
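For orientation, a minimal usage sketch for the two detector classes above. The stand-in model, documents, and API key are placeholders and are not part of the commit; the only requirement the code imposes is an object exposing get_batch_loglikelihood(texts), which the HFModel and OpenAIModel classes later in this commit both provide.

    # Usage sketch (not part of the commit).
    class _FakeLM:
        # stand-in scoring model for illustration; returns one dummy score per text
        def get_batch_loglikelihood(self, texts):
            return np.array([-3.2 for _ in texts])

    docs = ["An example human-written abstract.", "An example model-written abstract."]
    px_scores = PXDetector().score(docs, _FakeLM(), batch_size=2)   # one mean log-likelihood per document

    gptzero = GPT0(api_key="<Your API Key>")   # placeholder key; requires a GPTZero account
    responses = gptzero.score(docs, n_p=2)     # one raw GPTZero JSON response per document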
@@ -0,0 +1,103 @@
import os
import torch
import json
import argparse
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

from models import get_model
from detectors import get_detector


def config():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="gpt2-xl")
    parser.add_argument("--dataset_name", type=str, default="toefl")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--detector_name", type=str, default="detectgpt")
    parser.add_argument("--cache_dir", type=str, default="/dfs/scratch1/merty/.cache")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--root-folder", type=str, default="./project_data/")
    return parser.parse_args()


args = config()
model = get_model(args.model_name)
detector = get_detector(args.detector_name)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

import gdown
import os
import glob
import pandas as pd


def pubs_dataset(cache_dir="/dfs/scratch2/merty/", first_k=500):
    if not os.path.exists(os.path.join(cache_dir, "ScientificDiscovery/")):
        raise ValueError()

    all_csvs = glob.glob(os.path.join(cache_dir, "ScientificDiscovery/", "*.csv"))

    dfs = [pd.read_csv(csv)[['abstract', 'venue']] for csv in all_csvs[:1]]
    concatted = pd.concat(dfs).iloc[:first_k]
    abstracts = concatted['abstract'].tolist()
    venues = concatted['venue'].tolist()
    return concatted, abstracts, venues
class DetectionDataset:
    def __init__(self, name, root="/afs/cs.stanford.edu/u/merty/projects/chatgpt-detector-eval/project_data/"):
        self.data_path = os.path.join(root, name)
        self.name = json.load(open(os.path.join(self.data_path, "name.json")))
        self.data = json.load(open(os.path.join(self.data_path, "data.json")))
        self.results = None
        self.collect_results()

    def __len__(self):
        return len(self.data)

    def describe(self):
        return f"{self.name['name']} Mean Length: {np.mean([len(t['document'].split(' ')) for t in self.data])}"

    def __getitem__(self, idx):
        return self.data[idx]["document"]

    def collect_results(self):
        results_files = [f for f in os.listdir(self.data_path) if f not in ["name.json", "data.json"]]
        print(results_files)
        results_records = [json.load(open(os.path.join(self.data_path, f))) for f in results_files]
        detector_names = [f.split(".")[0] for f in results_files]
        results_records = {n: {"Detector": n, "Scores": r} for n, r in zip(detector_names, results_records)}
        self.results = results_records
        return results_records

    def get_results(self, detector_name):
        if detector_name not in self.results:
            return None
        else:
            return self.results[detector_name]

    def save_result(self, detector_name, scores):
        with open(os.path.join(self.data_path, f"{detector_name}.json"), "w") as f:
            json.dump(scores, f)
root_folder = args.root_folder
for folder in os.listdir(root_folder):
    ds = DetectionDataset(folder)
    print(ds[0])
    print(ds.describe())
    results = ds.collect_results()
    print(results.keys())
    print()

    # NOTE: results are saved below under "{detector_name}-{model_name}", but looked up here
    # under "{detector_name}" alone, so previously saved runs are recomputed rather than reused.
    detector_results = ds.get_results(args.detector_name)
    if detector_results is None:
        texts = [t["document"] for t in ds.data]
        print(ds.describe())
        dataset_scores = detector.score(texts, model, batch_size=args.batch_size)
        all_results = []
        for d, s in zip(texts, dataset_scores):
            all_results.append({"document": d, "score": float(s)})

        ds.save_result(f"{args.detector_name}-{args.model_name}", all_results)
        print("Saved")
    else:
        print("Already exists: {}".format(detector_results))
@@ -0,0 +1,33 @@
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np
import torch
from math import ceil
from scipy.special import softmax


class HFDetector:
    def __init__(self, model_name="Hello-SimpleAI/chatgpt-detector-roberta", cache_dir="/dfs/scratch1/merty/.cache", device="cuda"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir, output_hidden_states=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir, output_hidden_states=True)
        self.model = self.model.to(device)
        self.model = self.model.eval()
        self.device = device

    @torch.no_grad()
    def score(self, texts, model, batch_size=8):
        all_scores = []
        # ceil instead of integer division so the final partial batch is not silently dropped
        for batch in tqdm(range(ceil(len(texts) / batch_size)), desc="Running ChatGPT Detector"):
            original_text = texts[batch * batch_size:(batch + 1) * batch_size]
            inputs = self.tokenizer(original_text, padding="max_length", truncation=True, return_tensors="pt").to(self.device)
            outputs = self.model(**inputs)
            logits = outputs["logits"].detach().cpu().numpy()
            probs = softmax(logits, axis=1)
            all_scores.append(probs[:, 1:])
        all_scores = np.concatenate(all_scores, axis=0)
        return all_scores
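A brief usage sketch for the classifier-based detector above; the texts are placeholders, the model argument is unused by the code, and the reading of softmax column 1 follows the probs[:, 1:] slice rather than verified model documentation.

    # Usage sketch (not part of the commit).
    detector = HFDetector(cache_dir=None, device="cpu")   # default HF cache; use device="cuda" if available
    texts = ["A short human-written paragraph.", "A short model-written paragraph."]
    scores = detector.score(texts, model=None, batch_size=2)   # `model` is ignored by HFDetector.score
    print(scores.shape)                                        # (len(texts), 1): softmax column 1 per text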
@@ -0,0 +1,57 @@
import torch
from multiprocessing.pool import ThreadPool
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch.nn.functional as F


class HFModel:
    def __init__(self, model_name="gpt2", cache_dir="/dfs/scratch1/merty/.cache", device="cuda"):
        self.model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.model = self.model.to(device)
        self.model = self.model.eval()
        self.device = device

    @torch.no_grad()
    def sample(self, texts, prompt_tokens=30, min_words=55, do_top_p=True, min_length=150):
        all_encoded = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
        all_encoded = {key: value[:, :prompt_tokens] for key, value in all_encoded.items()}

        decoded = ['' for _ in range(len(texts))]

        # sample from the model until we get a sample with at least min_words words for each example
        # this is an inefficient way to do this (since we regenerate for all inputs if just one is too short), but it works
        tries = 0
        while (m := min(len(x.split()) for x in decoded)) < min_words:
            if tries != 0:
                print()
                print(f"min words: {m}, needed {min_words}, regenerating (try {tries})")

            sampling_kwargs = {}
            if do_top_p:
                sampling_kwargs['top_p'] = 0.9

            outputs = self.model.generate(**all_encoded, min_length=min_length, max_length=200, do_sample=True, **sampling_kwargs, pad_token_id=self.tokenizer.eos_token_id, eos_token_id=self.tokenizer.eos_token_id)
            decoded = self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            tries += 1
        return decoded

    @torch.no_grad()
    def get_batch_loglikelihood(self, texts):
        tokenized = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
        labels = tokenized.input_ids
        outputs = self.model(**tokenized, labels=labels)
        logits = outputs.logits.cpu()
        labels = labels.cpu()

        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        shift_labels[shift_labels == self.tokenizer.pad_token_id] = -100
        loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1), reduction="none")
        ll_per_sample = -loss.view(shift_logits.shape[0], shift_logits.shape[1])
        nonpad_per_row = (shift_labels != -100).sum(dim=1)
        ll_per_sample = ll_per_sample.sum(dim=1) / nonpad_per_row
        return ll_per_sample.cpu().numpy()
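get_batch_loglikelihood returns the mean per-token log-likelihood of each text under the causal LM, which is the quantity PXDetector aggregates over batches. A small sketch, assuming a plain gpt2 checkpoint can be downloaded to the default cache; the example strings are placeholders:

    # Usage sketch (not part of the commit).
    lm = HFModel(model_name="gpt2", cache_dir=None, device="cpu")   # device="cuda" if available
    texts = ["The committee reviewed the proposal in detail.",
             "As an AI language model, I can provide an overview."]
    lls = lm.get_batch_loglikelihood(texts)   # numpy array, one mean token log-likelihood per text
    # Less negative values mean the text is more probable under the scoring model.
    print(dict(zip(texts, lls)))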
@@ -0,0 +1,47 @@
import openai
import numpy as np
from transformers import AutoTokenizer
from multiprocessing.pool import ThreadPool


class OpenAIModel:
    def __init__(self, model_name="text-davinci-003", cache_dir="/dfs/scratch1/merty/.cache", device="cuda",
                 top_p=0.96, do_top_p=True):
        self.openai_model = model_name
        openai.api_key = '<Your API Key>'
        self.tokenizer = AutoTokenizer.from_pretrained("gpt2", cache_dir=cache_dir)
        self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
        self.device = device
        self.top_p = top_p
        self.do_top_p = do_top_p

    def _sample_api(self, text):
        # sample from the openai model
        kwargs = {"engine": self.openai_model, "max_tokens": 200}
        if self.do_top_p:
            kwargs['top_p'] = self.top_p

        r = openai.Completion.create(prompt=f"{text}", **kwargs)
        return text + r['choices'][0].text

    def sample(self, texts, prompt_tokens=30, min_words=55, do_top_p=True, min_length=150):
        all_encoded = self.tokenizer(texts, return_tensors="pt", padding=True).to(self.device)
        all_encoded = {key: value[:, :prompt_tokens] for key, value in all_encoded.items()}

        prefixes = self.tokenizer.batch_decode(all_encoded['input_ids'], skip_special_tokens=True)
        pool = ThreadPool(8)
        decoded = pool.map(self._sample_api, prefixes)
        return decoded

    def get_ll(self, text):
        kwargs = {"engine": self.openai_model, "temperature": 0, "max_tokens": 0, "echo": True, "logprobs": 0}
        r = openai.Completion.create(prompt=f"<|endoftext|>{text}", **kwargs)
        result = r['choices'][0]
        tokens, logprobs = result["logprobs"]["tokens"][1:], result["logprobs"]["token_logprobs"][1:]
        return np.mean(logprobs)

    def get_batch_loglikelihood(self, texts):
        # get the loglikelihood of each text in the batch
        ll_per_sample = np.array(ThreadPool(8).map(self.get_ll, texts))
        return ll_per_sample
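get_ll obtains the same mean token log-probability through the legacy Completions endpoint (echo=True with max_tokens=0 returns logprobs for the prompt itself), so OpenAIModel can stand in for HFModel in PXDetector. A hedged sketch; the API key, account access, and the continued availability of text-davinci-003 (since deprecated by OpenAI) are assumptions:

    # Usage sketch (not part of the commit). Requires a valid key set in OpenAIModel.__init__
    # and an openai package version that still exposes the legacy Completion interface.
    scorer = OpenAIModel(model_name="text-davinci-003")
    texts = ["A short paragraph to score.", "Another short paragraph to score."]
    lls = scorer.get_batch_loglikelihood(texts)   # mean token logprob per text
    print(lls)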