Skip to content

Commit

Permalink
(1/n) - Preload datasets in manifold so that subsequent stages of training, indexing and search can use those instead of each trainer or indexer downloading data. (#4034)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: #4034

Preload datasets in manifold so that subsequent stages of training, indexing and search can use those instead of each trainer or indexer downloading data.

Reviewed By: kuarora

Differential Revision: D65926898

fbshipit-source-id: 9341d2676fd2a50027887e821ec95768e829af31
  • Loading branch information
satymish authored and facebook-github-bot committed Nov 20, 2024
1 parent 98d335b commit 844e3ce
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions benchs/bench_fw/descriptors.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ class CodecDescriptor(IndexBaseDescriptor):
factory: Optional[str] = None
construction_params: Optional[List[Dict[str, int]]] = None
training_vectors: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xt"

def __post_init__(self):
self.get_name()
Expand Down Expand Up @@ -254,7 +255,7 @@ def name_from_factory(self) -> str:
name += f"d_{self.d}.{self.metric.upper()}."
if self.factory != "Flat":
assert self.training_vectors is not None
name += self.training_vectors.get_filename("xt")
name += self.training_vectors.get_filename(CodecDescriptor.FILENAME_PREFIX)
name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params)
return name

Expand All @@ -278,6 +279,7 @@ def alias(self, benchmark_io: BenchmarkIO):
class IndexDescriptor(IndexBaseDescriptor):
codec_desc: Optional[CodecDescriptor] = None
database_desc: Optional[DatasetDescriptor] = None
FILENAME_PREFIX: str = "xb"

def __hash__(self):
return hash(str(self))
Expand All @@ -290,14 +292,14 @@ def is_built(self):

def get_name(self) -> str:
if self.desc_name is None:
self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix="xb")
self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)

return self.desc_name

def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix="xb")
self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix=IndexDescriptor.FILENAME_PREFIX)
return self.flat_desc_name

# alias is used to refer to the index when it is uploaded to blobstore and referred to again
Expand All @@ -313,6 +315,7 @@ class KnnDescriptor(IndexBaseDescriptor):
query_dataset: Optional[DatasetDescriptor] = None
search_params: Optional[Dict[str, int]] = None
reconstruct: bool = False
FILENAME_PREFIX: str = "q"
# range metric definitions
# key: name
# value: one of the following:
Expand Down Expand Up @@ -340,7 +343,7 @@ def __hash__(self):
def get_name(self):
name = self.index_desc.get_name()
name += IndexBaseDescriptor.param_dict_to_name(self.search_params)
name += self.query_dataset.get_filename("q")
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
Expand All @@ -353,7 +356,7 @@ def flat_name(self):
if self.flat_desc_name is not None:
return self.flat_desc_name
name = self.index_desc.flat_name()
name += self.query_dataset.get_filename("q")
name += self.query_dataset.get_filename(KnnDescriptor.FILENAME_PREFIX)
name += f"k_{self.k}."
name += f"t_{self.num_threads}."
if self.reconstruct:
Expand Down

0 comments on commit 844e3ce

Please sign in to comment.