
Commit

Merge pull request #46 from weaviate/optional-sentence-tokenization
Optional direct text tokenization setting
antas-marcin authored Jan 30, 2023
2 parents 4a38a7b + 3a592ed commit 7ff4780
Showing 2 changed files with 32 additions and 14 deletions.
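
Both files hang off a single new environment variable, T2V_TRANSFORMERS_DIRECT_TOKENIZE, which app.py reads at startup. Below is a minimal sketch of how the setting could be supplied to a running container via the Docker SDK for Python; the image tag and port mapping are illustrative assumptions, only the variable name and the recognised values "false" / "0" come from this commit.

```python
# Sketch: starting the inference container with the new setting.
# The image tag and port are assumptions; only the env var name and the
# values "false" / "0" are taken from the diff below.
import docker  # Docker SDK for Python: pip install docker

client = docker.from_env()
client.containers.run(
    "semitechnologies/transformers-inference:custom",  # assumed image tag
    detach=True,
    ports={"8080/tcp": 8080},
    environment={
        # "false" or "0" flips direct_tokenize to False in startup_event()
        "T2V_TRANSFORMERS_DIRECT_TOKENIZE": "false",
    },
)
```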
8 changes: 7 additions & 1 deletion app.py
@@ -37,9 +37,15 @@ def startup_event():
     else:
         logger.info("Running on CPU")
 
+    # Batch text tokenization enabled by default
+    direct_tokenize = True
+    transformers_direct_tokenize = os.getenv("T2V_TRANSFORMERS_DIRECT_TOKENIZE")
+    if transformers_direct_tokenize is not None and transformers_direct_tokenize == "false" or transformers_direct_tokenize == "0":
+        direct_tokenize = False
+
     meta_config = Meta('./models/model')
     vec = Vectorizer('./models/model', cuda_support, cuda_core, cuda_per_process_memory_fraction,
-                     meta_config.getModelType(), meta_config.get_architecture())
+                     meta_config.getModelType(), meta_config.get_architecture(), direct_tokenize)
 
 
 @app.get("/.well-known/live", response_class=Response)
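
One subtlety in the condition added above: Python binds `and` tighter than `or`, so it reads as `(transformers_direct_tokenize is not None and transformers_direct_tokenize == "false") or transformers_direct_tokenize == "0"`; an unset variable keeps the default because `None == "0"` is false. Here is a standalone sketch of the same parsing, with a hypothetical helper name, so the value-to-flag mapping can be checked in isolation.

```python
import os

def read_direct_tokenize_setting() -> bool:
    # Hypothetical helper mirroring the startup_event() logic above:
    # direct_tokenize defaults to True and is switched off only by "false" or "0".
    value = os.getenv("T2V_TRANSFORMERS_DIRECT_TOKENIZE")
    if value is not None and value == "false" or value == "0":
        return False
    return True

# Quick check of the mapping (unset and "true" keep the default):
for raw in (None, "true", "false", "0"):
    if raw is None:
        os.environ.pop("T2V_TRANSFORMERS_DIRECT_TOKENIZE", None)
    else:
        os.environ["T2V_TRANSFORMERS_DIRECT_TOKENIZE"] = raw
    print(raw, read_direct_tokenize_setting())
# None True / true True / false False / 0 False
```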
38 changes: 25 additions & 13 deletions vectorizer.py
@@ -32,12 +32,14 @@ class Vectorizer:
     cuda: bool
     cuda_core: str
     model_type: str
+    direct_tokenize: bool
 
-    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str):
+    def __init__(self, model_path: str, cuda_support: bool, cuda_core: str, cuda_per_process_memory_fraction: float, model_type: str, architecture: str, direct_tokenize: bool):
         self.cuda = cuda_support
         self.cuda_core = cuda_core
         self.cuda_per_process_memory_fraction = cuda_per_process_memory_fraction
         self.model_type = model_type
+        self.direct_tokenize = direct_tokenize
 
         self.model_delegate: HFModel = ModelFactory.model(model_type, architecture)
         self.model = self.model_delegate.create_model(model_path)
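
Since the constructor now takes the flag explicitly, callers such as app.py above pass it as the last argument. A minimal construction sketch follows, assuming a CPU-only setup; every literal value is an illustrative assumption, only the parameter names and their order come from the signature in the diff.

```python
# Sketch: constructing the extended Vectorizer. Parameter names and order are
# from the diff; all argument values below are illustrative assumptions.
from vectorizer import Vectorizer

vec = Vectorizer(
    model_path="./models/model",
    cuda_support=False,                    # CPU-only example
    cuda_core="",                          # only used when cuda_support is True
    cuda_per_process_memory_fraction=1.0,
    model_type="",                         # normally supplied by Meta('./models/model')
    architecture="",                       # normally supplied by Meta('./models/model')
    direct_tokenize=True,                  # the new flag added in this commit
)
```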
@@ -65,20 +67,30 @@ def pool_embedding(self, batch_results, tokens, config):
 
     async def vectorize(self, text: str, config: VectorInputConfig):
         with torch.no_grad():
-            sentences = sent_tokenize(' '.join(text.split(),))
-            num_sentences = len(sentences)
-            number_of_batch_vectors = math.ceil(num_sentences / MAX_BATCH_SIZE)
-            batch_sum_vectors = 0
-            for i in range(0, number_of_batch_vectors):
-                start_index = i * MAX_BATCH_SIZE
-                end_index = start_index + MAX_BATCH_SIZE
-
-                tokens = self.tokenize(sentences[start_index:end_index])
+            if not self.direct_tokenize:
+                # create embeddings without tokenizing text
+                tokens = self.tokenize(text)
                 if self.cuda:
                     tokens.to(self.cuda_core)
-                batch_results = self.get_batch_results(tokens, sentences[start_index:end_index])
-                batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
-            return batch_sum_vectors.detach() / num_sentences
+                batch_results = self.get_batch_results(tokens, text)
+                batch_sum_vectors = self.pool_embedding(batch_results, tokens, config)
+                return batch_sum_vectors.detach()
+            else:
+                # tokenize text
+                sentences = sent_tokenize(' '.join(text.split(),))
+                num_sentences = len(sentences)
+                number_of_batch_vectors = math.ceil(num_sentences / MAX_BATCH_SIZE)
+                batch_sum_vectors = 0
+                for i in range(0, number_of_batch_vectors):
+                    start_index = i * MAX_BATCH_SIZE
+                    end_index = start_index + MAX_BATCH_SIZE
+
+                    tokens = self.tokenize(sentences[start_index:end_index])
+                    if self.cuda:
+                        tokens.to(self.cuda_core)
+                    batch_results = self.get_batch_results(tokens, sentences[start_index:end_index])
+                    batch_sum_vectors += self.pool_embedding(batch_results, tokens, config)
+                return batch_sum_vectors.detach() / num_sentences
 
 
 class HFModel:
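
The else branch keeps the pre-existing behaviour: NLTK's sent_tokenize splits the text into sentences, batches of at most MAX_BATCH_SIZE sentences are vectorized, the pooled per-batch vectors are summed, and the sum is divided by the sentence count. Below is a standalone sketch of just that batching and averaging arithmetic, with a dummy per-sentence "embedding" and an assumed batch size, to make the index handling explicit.

```python
import math
from nltk.tokenize import sent_tokenize  # requires nltk and its punkt data

MAX_BATCH_SIZE = 25  # assumed value; the real constant lives in vectorizer.py

def mean_of_batched_sums(text: str) -> float:
    # Mirror of the else-branch control flow with a dummy "embedding":
    # each sentence contributes its length, batches are summed, then averaged.
    sentences = sent_tokenize(' '.join(text.split()))
    num_sentences = len(sentences)
    number_of_batches = math.ceil(num_sentences / MAX_BATCH_SIZE)
    batch_sum = 0.0
    for i in range(number_of_batches):
        start_index = i * MAX_BATCH_SIZE
        end_index = start_index + MAX_BATCH_SIZE
        batch = sentences[start_index:end_index]   # last batch may be shorter
        batch_sum += sum(len(s) for s in batch)    # stand-in for pool_embedding
    return batch_sum / num_sentences

print(mean_of_batched_sums("First sentence. Second one. And a third."))
```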
