Added GPU Clustering Support #252

Open · wants to merge 2 commits into master
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
+
+.DS_Store
1 change: 1 addition & 0 deletions requirements.txt
@@ -10,3 +10,4 @@ tensorflow_text
 torch
 sentence_transformers
 hnswlib
+# GPU support: RAPIDS is installed with conda rather than pip, e.g.: conda create -n rapids-22.02 -c rapidsai -c nvidia -c conda-forge rapids=22.02 python=3.8 cudatoolkit=11.4 dask-sql
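RAPIDS packages such as cuml and cudf are distributed primarily through conda channels rather than PyPI, hence the conda command; keeping it as a comment documents the GPU setup while letting `pip install -r requirements.txt` still parse the file.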
4 changes: 4 additions & 0 deletions setup.py
@@ -43,6 +43,10 @@
         'torch',
         'sentence_transformers',
     ],
+    'gpu_support': [
+        'cuml',
+        'cudf',
+    ],
     'indexing': [
         'hnswlib',
     ],
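With this extra defined, the optional GPU dependencies can be requested with `pip install top2vec[gpu_support]`, assuming the RAPIDS environment described in requirements.txt is already active.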
Binary file added top2vec/.DS_Store
Binary file not shown.
66 changes: 59 additions & 7 deletions top2vec/Top2Vec.py
@@ -18,6 +18,16 @@
 from sklearn.preprocessing import normalize
 from scipy.special import softmax
 
+# optional GPU libraries: RAPIDS cuML/cuDF for GPU-accelerated UMAP and HDBSCAN
+try:
+    import cuml
+    import cudf
+    from cuml.cluster import HDBSCAN as cuml_hdbscan
+    _HAVE_CUML = True
+except ImportError:
+    _HAVE_CUML = False
+
 try:
     import hnswlib
 
@@ -324,6 +334,11 @@ class Top2Vec:
         functions only document ids will be returned, not the actual
         documents.
 
+    use_gpu: bool (Optional, default False)
+        If set to True, the RAPIDS cuML implementations of UMAP and HDBSCAN
+        are used for dimensionality reduction and clustering. Document
+        vectors are converted to cuDF dataframes, which are optimized for
+        GPU-based parallelism.
+
     workers: int (Optional)
         The amount of worker threads to be used in training the model. Larger
         amount will lead to faster training.
@@ -372,7 +387,8 @@ def __init__(self,
                  use_embedding_model_tokenizer=False,
                  umap_args=None,
                  hdbscan_args=None,
-                 verbose=True
+                 verbose=True,
+                 use_gpu=False
                  ):
 
         if verbose:
@@ -603,18 +619,32 @@ def return_doc(doc):
         umap_args = {'n_neighbors': 15,
                      'n_components': 5,
                      'metric': 'cosine'}
-        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
+        if use_gpu:
+            if not _HAVE_CUML:
+                raise ImportError('use_gpu=True requires the RAPIDS cuml and cudf packages')
+            # cuML's UMAP does not support umap-learn's 'cosine' metric,
+            # so drop it before passing the arguments on
+            umap_args.pop('metric', None)
+            docvecs_cudf = cudf.DataFrame(self._get_document_vectors(norm=False))
+            umap_model = cuml.UMAP(**umap_args).fit(docvecs_cudf)
+        else:
+            umap_model = umap.UMAP(**umap_args).fit(self._get_document_vectors(norm=False))
 
         # find dense areas of document vectors
         logger.info('Finding dense areas of documents')
 
         if hdbscan_args is None:
-            hdbscan_args = {'min_cluster_size': 15,
+            hdbscan_args = {'min_cluster_size': 5,
                             'metric': 'euclidean',
                             'cluster_selection_method': 'eom'}
 
-        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
+        if use_gpu:
+            # cuML HDBSCAN clusters the GPU-resident embedding directly
+            cluster = cuml_hdbscan(**hdbscan_args).fit(umap_model.embedding_)
+        else:
+            cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
 
         # calculate topic vectors from dense areas of documents
         logger.info('Finding topics')
@@ -790,13 +820,35 @@ def _embed_query(self, query):

         return self._l2_normalize(np.array(self.embed([query])[0]))
 
+    def _set_document_vectors(self, document_vectors):
+        if self.embedding_model == 'doc2vec':
+            self.model.docvecs.vectors_docs = document_vectors
+        else:
+            self.document_vectors = document_vectors
+
+    def _get_document_vectors(self, norm=True):
+        if self.embedding_model == 'doc2vec':
+            if norm:
+                self.model.docvecs.init_sims()
+                return self.model.docvecs.vectors_docs_norm
+            else:
+                return self.model.docvecs.vectors_docs
+        else:
+            return self.document_vectors
+
     def _create_topic_vectors(self, cluster_labels):
 
+        # cuML HDBSCAN returns labels as a cudf Series; move them to host
+        # memory before indexing with numpy
+        if hasattr(cluster_labels, 'to_pandas'):
+            cluster_labels = cluster_labels.to_pandas()
         unique_labels = set(cluster_labels)
         if -1 in unique_labels:
             unique_labels.remove(-1)
 
         self.topic_vectors = self._l2_normalize(
-            np.vstack([self.document_vectors[np.where(cluster_labels == label)[0]]
-                       .mean(axis=0) for label in unique_labels]))
+            np.vstack([self._get_document_vectors(norm=False)[np.where(cluster_labels == label)[0]]
+                       .mean(axis=0) for label in unique_labels]))

     def _deduplicate_topics(self):
         core_samples, labels = dbscan(X=self.topic_vectors,
@@ -2640,4 +2692,4 @@ def generate_topic_wordcloud(self, topic_num, background_color="black", reduced=
             WordCloud(width=1600,
                       height=400,
                       background_color=background_color).generate_from_frequencies(word_score_dict))
-    plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)
+    plt.title("Topic " + str(topic_num), loc='left', fontsize=25, pad=20)
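For context, a minimal usage sketch of the new flag. The corpus and embedding model below are stand-ins, and a working RAPIDS environment (per requirements.txt) is assumed:

from top2vec import Top2Vec

# any reasonably large corpus of strings; this repeated list is only a stand-in
documents = ["GPU clustering support speeds up Top2Vec on large corpora."] * 5000

# use_gpu=True routes UMAP and HDBSCAN through RAPIDS cuML;
# the default (False) keeps the original umap-learn/hdbscan CPU path
model = Top2Vec(documents,
                embedding_model='universal-sentence-encoder',
                use_gpu=True)
print(model.get_num_topics())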