Skip to content

Commit

Permalink
Merge pull request #60 from weaviate/nltk-download-dir-fix
Browse files Browse the repository at this point in the history
Fix downloading of nltk data into a local folder
  • Loading branch information
antas-marcin authored May 12, 2023
2 parents 64b36bd + 0de98b1 commit ffe4aba
Show file tree
Hide file tree
Showing 5 changed files with 8 additions and 3 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
models
nltk_data
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
__pycache__
models
nltk_data
4 changes: 2 additions & 2 deletions cicd/build_custom_base.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ function push_main() {
# The ones that are always pushed

tag="$remote_repo:custom-$git_hash"
docker buildx build -f custom.Dockerfile \
docker buildx build --platform=linux/arm64,linux/amd64 -f custom.Dockerfile \
--tag "$tag" \
--push \
.
Expand All @@ -57,7 +57,7 @@ function push_tag() {
echo "Tag & Push $tag, $tag_latest, $tag_git"
docker tag "custom-base" "$tag" && docker push "$tag"

docker buildx build -f custom.Dockerfile \
docker buildx build --platform=linux/arm64,linux/amd64 -f custom.Dockerfile \
--tag "$tag" \
--tag "$tag_latest" \
--tag "$tag_git" \
Expand Down
2 changes: 1 addition & 1 deletion download.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@
model.save_pretrained('./models/model')
tokenizer.save_pretrained('./models/model')

nltk.download('punkt')
nltk.download('punkt', download_dir='./nltk_data')
3 changes: 3 additions & 0 deletions vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import math
from typing import Optional
import torch
import nltk
from nltk.tokenize import sent_tokenize
from pydantic import BaseModel
from transformers import (
Expand Down Expand Up @@ -57,6 +58,8 @@ def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per

self.executor = ThreadPoolExecutor()

nltk.data.path.append('./nltk_data')

def tokenize(self, text:str):
    """Tokenize *text* with this instance's HuggingFace tokenizer.

    Returns the tokenizer's encoding as PyTorch tensors
    (``return_tensors="pt"``), padded, truncated to at most 500
    tokens, and with special tokens added.
    """
    # max_length=500 caps the sequence; padding/truncation keep the
    # output a fixed-shape tensor batch suitable for the model call.
    return self.tokenizer(text, padding=True, truncation=True, max_length=500,
        add_special_tokens = True, return_tensors="pt")
Expand Down

0 comments on commit ffe4aba

Please sign in to comment.