diff --git a/README.md b/README.md
index 3c4ab7b..26e6d21 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,9 @@ huak run main
 
 # pre-processing data
 huak run preproc
+
+# detect AI-generated data
+huak run detect
 ```
 
 ## Format
diff --git a/output/embeddings_w_lsh.csv b/output/preprocessed_data.csv
similarity index 100%
rename from output/embeddings_w_lsh.csv
rename to output/preprocessed_data.csv
diff --git a/pyproject.toml b/pyproject.toml
index 41344f8..5c6dc6b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,3 +23,4 @@ dev = [
 [tool.huak.task]
 main = "python3 src/semantic_hashing_demo/main.py"
 preproc = "python3 src/semantic_hashing_demo/preprocessing.py"
+detect = "python3 src/semantic_hashing_demo/detection.py"
diff --git a/src/semantic_hashing_demo/config.py b/src/semantic_hashing_demo/config.py
index 7f692da..5a9d60e 100644
--- a/src/semantic_hashing_demo/config.py
+++ b/src/semantic_hashing_demo/config.py
@@ -12,6 +12,8 @@
 # So, parse accordingly depending on the computational resources for bucketing.
 data_file = "./data/fine_food_reviews_1k.csv"
 
+preprocessed_data_file = "./output/preprocessed_data.csv"
+
 # no. of text samples
 n = 20
 
diff --git a/src/semantic_hashing_demo/detection.py b/src/semantic_hashing_demo/detection.py
new file mode 100644
index 0000000..21d5646
--- /dev/null
+++ b/src/semantic_hashing_demo/detection.py
@@ -0,0 +1,24 @@
+"""
+The end goal is to detect AI-generated text.
+"""
+import polars as pl
+from config import preprocessed_data_file
+
+
+def main():
+    # Specify data types for multiple columns to be read as strings
+    dtype_spec = {
+        "Hash 8-bit": str,
+        "Hash 16-bit": str,
+        "Hash 32-bit": str,
+        "Hash 64-bit": str,
+        "Hash 128-bit": str,
+    }
+
+    # pull data into polars dataframe
+    df = pl.read_csv(preprocessed_data_file, dtypes=dtype_spec)
+    print(df.head())
+
+
+if __name__ == "__main__":
+    main()