-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathprep-data.py
36 lines (26 loc) · 1.16 KB
/
prep-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
from huggingface_hub import list_repo_files, hf_hub_download
def list_wiki_datasets():
    """Return the dataset names available in the ``weaviate/wiki-sample`` repo.

    Each dataset is identified by its ``0001.parquet`` file, so a dataset
    made of several parquet files is reported only once. The returned names
    are the matching repo paths with the ``/0001.parquet`` part removed.
    """
    repo_paths = list_repo_files("weaviate/wiki-sample", repo_type="dataset")
    # Keep only the first parquet file of each dataset (avoids duplicates)
    # and strip the file name so that just the dataset path remains.
    return [
        path.replace("/0001.parquet", "")
        for path in repo_paths
        if path.endswith("0001.parquet")
    ]
def list_dataset_files(dataset):
    """Return the repo file paths that start with the ``dataset`` prefix.

    The full file listing of the ``weaviate/wiki-sample`` dataset repo is
    fetched and narrowed down with a plain string-prefix match.
    """
    repo_paths = list_repo_files("weaviate/wiki-sample", repo_type="dataset")
    return [path for path in repo_paths if path.startswith(dataset)]
def download_file(file):
    """Fetch one file of the ``weaviate/wiki-sample`` dataset repo.

    ``file`` is the path of the file inside the repo. The download is
    requested with ``local_dir="wiki-data"``; the value returned by
    ``hf_hub_download`` is discarded.
    """
    # Options that are the same for every download from this repo.
    fixed_options = {
        "repo_id": "weaviate/wiki-sample",
        "repo_type": "dataset",
        "local_dir": "wiki-data",
    }
    hf_hub_download(filename=file, **fixed_options)
def download_source_files(dataset="no-vectors", max_files=1000):
    """Download at most ``max_files`` files belonging to ``dataset``.

    Args:
        dataset: Path prefix handed to ``list_dataset_files`` to select
            which repo files to download.
        max_files: Upper bound on the number of files downloaded. ``None``
            removes the limit; zero or a negative value downloads nothing.
    """
    files_to_download = list_dataset_files(dataset)
    if max_files is not None:
        # Apply the limit up front rather than counting down in the loop:
        # a countdown that only stops at exactly 0 never triggers for
        # max_files <= 0 and would end up downloading every file.
        files_to_download = files_to_download[:max(max_files, 0)]

    for file in files_to_download:
        print(f"Downloading {file}")
        download_file(file)
# Script entry: fetch the first 10 files of the OpenAI
# text-embedding-3-small dataset.
download_source_files(
    dataset="openai/text-embedding-3-small",
    max_files=10,
)