Skip to content

Commit

Permalink
Output data file renamed, Add detection code to just read file
Browse files Browse the repository at this point in the history
  • Loading branch information
abhi3700 committed Feb 27, 2024
1 parent 6281f8e commit f1ddcb7
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 0 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ huak run main

# pre-processing data
huak run preproc

# detect AI-generated data
huak run detect
```

## Format
Expand Down
File renamed without changes.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ dev = [
[tool.huak.task]
main = "python3 src/semantic_hashing_demo/main.py"
preproc = "python3 src/semantic_hashing_demo/preprocessing.py"
detect = "python3 src/semantic_hashing_demo/detection.py"
2 changes: 2 additions & 0 deletions src/semantic_hashing_demo/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# So, parse accordingly depending on the computational resources for bucketing.
data_file = "./data/fine_food_reviews_1k.csv"

preprocessed_data_file = "./output/preprocessed_data.csv"

# no. of text samples
n = 20

Expand Down
24 changes: 24 additions & 0 deletions src/semantic_hashing_demo/detection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
The end goal is to detect AI-generated text.
"""
import polars as pl
from config import preprocessed_data_file


def main():
    """Load the preprocessed CSV into a polars DataFrame and print its head."""
    # Columns holding the hashes at each bit width; each one is forced to be
    # read as a string instead of letting polars infer a type.
    # NOTE(review): presumably this keeps the hash values from being parsed
    # as numbers -- confirm against the preprocessing output.
    hash_columns = (
        "Hash 8-bit",
        "Hash 16-bit",
        "Hash 32-bit",
        "Hash 64-bit",
        "Hash 128-bit",
    )
    column_types = dict.fromkeys(hash_columns, str)

    # Read the file named by `preprocessed_data_file` and preview it.
    frame = pl.read_csv(preprocessed_data_file, dtypes=column_types)
    print(frame.head())


# Run the detection step only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 comments on commit f1ddcb7

Please sign in to comment.