diff --git a/benchmarks/perf-tool/README.md b/benchmarks/perf-tool/README.md index 2d84df819..52590d22b 100644 --- a/benchmarks/perf-tool/README.md +++ b/benchmarks/perf-tool/README.md @@ -270,6 +270,26 @@ Ingests a dataset of multiple context types into the cluster. | ----------- | ----------- | ----------- | | took | Total time to ingest the dataset into the index.| ms | +#### ingest_nested_field + +Ingests a dataset with nested field into the cluster. + +##### Parameters + +| Parameter Name | Description | Default | +| ----------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| ----------- | +| index_name | Name of index to ingest into | No default | +| field_name | Name of field to ingest into | No default | +| dataset_path | Path to data-set | No default | +| attributes_dataset_name | Name of dataset with additional attributes inside the main dataset | No default | +| attribute_spec | Definition of attributes, format is: [{ name: [name_val], type: [type_val]}] Order is important and must match order of attributes column in dataset file. It should contains { name: 'parent_id', type: 'int'} | No default | + +##### Metrics + +| Metric Name | Description | Unit | +| ----------- | ----------- | ----------- | +| took | Total time to ingest the dataset into the index.| ms | + #### query Runs a set of queries against an index. @@ -330,6 +350,36 @@ Runs a set of queries with filter against an index. | recall@R | ratio of top R results from the ground truth neighbors that are in the K results returned by the plugin | float 0.0-1.0 | | recall@K | ratio of results returned that were ground truth nearest neighbors | float 0.0-1.0 | + +#### query_nested_field + +Runs a set of queries with nested field against an index. + +##### Parameters + +| Parameter Name | Description | Default | +| ----------- |-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------| +| k | Number of neighbors to return on search | 100 | +| r | r value in Recall@R | 1 | +| index_name | Name of index to search | No default | +| field_name | Name field to search | No default | +| calculate_recall | Whether to calculate recall values | False | +| dataset_format | Format the dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | +| dataset_path | Path to dataset | No default | +| neighbors_format | Format the neighbors dataset is in. Currently hdf5 and bigann is supported. The hdf5 file must be organized in the same way that the ann-benchmarks organizes theirs. | 'hdf5' | +| neighbors_path | Path to neighbors dataset | No default | +| neighbors_dataset | Name of filter dataset inside the neighbors dataset | No default | +| query_count | Number of queries to create from data-set | Size of the data-set | + +##### Metrics + +| Metric Name | Description | Unit | +| ----------- | ----------- | ----------- | +| took | Took times returned per query aggregated as total, p50, p90 and p99 (when applicable) | ms | +| memory_kb | Native memory k-NN is using at the end of the query workload | KB | +| recall@R | ratio of top R results from the ground truth neighbors that are in the K results returned by the plugin | float 0.0-1.0 | +| recall@K | ratio of results returned that were ground truth nearest neighbors | float 0.0-1.0 | + #### get_stats Gets the index stats. diff --git a/benchmarks/perf-tool/add-parent-doc-id-to-dataset.py b/benchmarks/perf-tool/add-parent-doc-id-to-dataset.py index aed838f6b..54a40b281 100644 --- a/benchmarks/perf-tool/add-parent-doc-id-to-dataset.py +++ b/benchmarks/perf-tool/add-parent-doc-id-to-dataset.py @@ -152,18 +152,18 @@ def run(self, source_path, target_path) -> None: cpus = multiprocessing.cpu_count() total_clients = min(8, cpus) # 1 # 10 hdf5Data_train = HDF5DataSet(target_path, "train") - train_vectors = hdf5Data_train.read(0, 1000000) + train_vectors = hdf5Data_train.read(0, hdf5Data_train.size()) hdf5Data_train.close() print(f'Train vector size: {len(train_vectors)}') hdf5Data_test = HDF5DataSet(target_path, "test") - total_queries = 10000 # 10000 + total_queries = hdf5Data_test.size() # 10000 dis = [] * total_queries for i in range(total_queries): dis.insert(i, []) - queries_per_client = int(total_queries / total_clients) + queries_per_client = int(total_queries / total_clients + 0.5) if queries_per_client == 0: queries_per_client = total_queries @@ -176,7 +176,7 @@ def run(self, source_path, target_path) -> None: if start_index + queries_per_client <= total_queries: end_index = int(start_index + queries_per_client) else: - end_index = total_queries - start_index + end_index = total_queries print(f'Start Index: {start_index}, end Index: {end_index}') print(f'client is : {client}') @@ -199,7 +199,6 @@ def run(self, source_path, target_path) -> None: i = 0 for d in calculatedDis: if d: - print("Dis is not null") dis[i] = d j = j + 1 i = i + 1 @@ -238,14 +237,12 @@ def queryTask(train_vectors, test_vectors, startIndex, endIndex, process_number, i = startIndex for test in test_vectors: distances = [] - parent_ids = {} values = {} for value in train_vectors: - parent_ids[value.id] = value.parent_id values[value.id] = value distances.append({ "dis": calculateL2Distance(test.vector, value.vector), - "id": value.id + "id": value.parent_id }) distances.sort(key=lambda vector: vector['dis']) @@ -258,15 +255,15 @@ def queryTask(train_vectors, test_vectors, startIndex, endIndex, process_number, for sub_i in range(len(distances)): id = distances[sub_i]['id'] # Check if the number has been seen before - if len(nested) < 1000 and parent_ids[id] not in seen_set_nested: + if len(nested) < 1000 and id not in seen_set_nested: # If not seen before, mark it as seen - seen_set_nested.add(parent_ids[id]) + seen_set_nested.add(id) nested.append(distances[sub_i]) - if len(restricted) < 1000 and parent_ids[id] not in seen_set_restricted and values[id].apply_restricted_filter(): - seen_set_restricted.add(parent_ids[id]) + if len(restricted) < 1000 and id not in seen_set_restricted and values[id].apply_restricted_filter(): + seen_set_restricted.add(id) restricted.append(distances[sub_i]) - if len(relaxed) < 1000 and parent_ids[id] not in seen_set_relaxed and values[id].apply_relaxed_filter(): - seen_set_relaxed.add(parent_ids[id]) + if len(relaxed) < 1000 and id not in seen_set_relaxed and values[id].apply_relaxed_filter(): + seen_set_relaxed.add(id) relaxed.append(distances[sub_i]) all_distances[i]['nested'] = nested diff --git a/benchmarks/perf-tool/dataset/data-nested.hdf5 b/benchmarks/perf-tool/dataset/data-nested.hdf5 new file mode 100644 index 000000000..4223d7281 Binary files /dev/null and b/benchmarks/perf-tool/dataset/data-nested.hdf5 differ diff --git a/benchmarks/perf-tool/dataset/sift-128-euclidean-nested.hdf5 b/benchmarks/perf-tool/dataset/sift-128-euclidean-nested.hdf5 new file mode 100644 index 000000000..4223d7281 Binary files /dev/null and b/benchmarks/perf-tool/dataset/sift-128-euclidean-nested.hdf5 differ diff --git a/benchmarks/perf-tool/okpt/test/steps/factory.py b/benchmarks/perf-tool/okpt/test/steps/factory.py index 77e963921..2033f2672 100644 --- a/benchmarks/perf-tool/okpt/test/steps/factory.py +++ b/benchmarks/perf-tool/okpt/test/steps/factory.py @@ -10,7 +10,7 @@ from okpt.test.steps.steps import CreateIndexStep, DisableRefreshStep, RefreshIndexStep, DeleteIndexStep, \ TrainModelStep, DeleteModelStep, ForceMergeStep, ClearCacheStep, IngestStep, IngestMultiFieldStep, \ - QueryStep, QueryWithFilterStep, GetStatsStep, WarmupStep + IngestNestedFieldStep, QueryStep, QueryWithFilterStep, QueryNestedFieldStep, GetStatsStep, WarmupStep def create_step(step_config: StepConfig) -> Step: @@ -30,10 +30,14 @@ def create_step(step_config: StepConfig) -> Step: return IngestStep(step_config) elif step_config.step_name == IngestMultiFieldStep.label: return IngestMultiFieldStep(step_config) + elif step_config.step_name == IngestNestedFieldStep.label: + return IngestNestedFieldStep(step_config) elif step_config.step_name == QueryStep.label: return QueryStep(step_config) elif step_config.step_name == QueryWithFilterStep.label: return QueryWithFilterStep(step_config) + elif step_config.step_name == QueryNestedFieldStep.label: + return QueryNestedFieldStep(step_config) elif step_config.step_name == ForceMergeStep.label: return ForceMergeStep(step_config) elif step_config.step_name == ClearCacheStep.label: diff --git a/benchmarks/perf-tool/okpt/test/steps/steps.py b/benchmarks/perf-tool/okpt/test/steps/steps.py index 6ec164d9d..c0adba62a 100644 --- a/benchmarks/perf-tool/okpt/test/steps/steps.py +++ b/benchmarks/perf-tool/okpt/test/steps/steps.py @@ -333,7 +333,6 @@ def action(doc_id): for i in range(0, self.doc_count, self.bulk_size): partition = self.dataset.read(self.bulk_size) self._handle_data_bulk(partition, action, i) - self.dataset.reset() return {} @@ -379,6 +378,7 @@ def __init__(self, step_config: StepConfig): step_config.config, {}, []) self.partition_attr = self.attributes_dataset.read(self.doc_count) + self.action_buffer = None def _handle_data_bulk(self, partition, action, i): if partition is None: @@ -429,6 +429,118 @@ def bulk_transform_with_attributes(self, partition: np.ndarray, partition_attr, return actions +class IngestNestedFieldStep(BaseIngestStep): + """See base class.""" + + label = 'ingest_nested_field' + + def __init__(self, step_config: StepConfig): + super().__init__(step_config) + + dataset_path = parse_string_param('dataset_path', step_config.config, + {}, None) + + self.attributes_dataset_name = parse_string_param('attributes_dataset_name', + step_config.config, {}, None) + + self.attributes_dataset = parse_dataset('hdf5', dataset_path, + Context.CUSTOM, self.attributes_dataset_name) + + self.attribute_spec = parse_list_param('attribute_spec', + step_config.config, {}, []) + + self.partition_attr = self.attributes_dataset.read(self.doc_count) + + if self.dataset.size() != self.doc_count: + raise ValueError("custom doc_count is not supported for nested field") + self.action_buffer = None + self.action_parent_id = None + self.count = 0 + + def _handle_data_bulk(self, partition, action, i): + if partition is None: + return + body = self.bulk_transform_with_nested(partition, self.partition_attr, self.field_name, + action, i, self.attribute_spec) + if len(body) > 0: + bulk_index(self.opensearch, self.index_name, body) + + def bulk_transform_with_nested(self, partition: np.ndarray, partition_attr, field_name: str, + action, offset: int, attributes_def) -> List[Dict[str, Any]]: + """Partitions and transforms a list of vectors into OpenSearch's bulk + injection format. + Args: + partition: An array of vectors to transform. + partition_attr: dictionary of additional data to transform + field_name: field name for action + action: Bulk API action. + offset: to start counting from + attributes_def: definition of additional doc fields + Returns: + An array of transformed vectors in bulk format. + """ + # offset is index of start row. We need number of parent doc - 1. + # The number of parent document can be calculated by using partition_attr data. + # We need to keep the last parent doc aside so that additional data can be added later. + parent_id_idx = next((index for (index, d) in enumerate(attributes_def) if d.get('name') == 'parent_id'), None) + if parent_id_idx is None: + raise ValueError("parent_id should be provided as attribute spec") + if attributes_def[parent_id_idx]['type'] != 'int': + raise ValueError("parent_id should be int type") + + first_index = offset + last_index = offset + len(partition) - 1 + num_of_actions = int(partition_attr[last_index][parent_id_idx].decode()) - int(partition_attr[first_index][parent_id_idx].decode()) + if self.action_buffer is None: + self.action_buffer = {"nested_field": []} + self.action_parent_id = int(partition_attr[first_index][parent_id_idx].decode()) + + actions = [] + _ = [ + actions.extend([action(i + self.action_parent_id), None]) + for i in range(num_of_actions) + ] + + idx = 1 + part_list = partition.tolist() + for i in range(len(partition)): + self.count += 1 + nested = {field_name: part_list[i]} + attr_idx = i + offset + attr_def_idx = 0 + current_parent_id = None + for attribute in attributes_def: + attr_def_name = attribute['name'] + attr_def_type = attribute['type'] + if attr_def_name == "parent_id": + current_parent_id = int(partition_attr[attr_idx][attr_def_idx].decode()) + attr_def_idx += 1 + continue + + if attr_def_type == 'str': + val = partition_attr[attr_idx][attr_def_idx].decode() + if val != 'None': + nested[attr_def_name] = val + elif attr_def_type == 'int': + val = int(partition_attr[attr_idx][attr_def_idx].decode()) + nested[attr_def_name] = val + attr_def_idx += 1 + + if self.action_parent_id == current_parent_id: + self.action_buffer["nested_field"].append(nested) + else: + actions.extend([action(self.action_parent_id), self.action_buffer]) + self.action_buffer = {"nested_field": []} + self.action_buffer["nested_field"].append(nested) + self.action_parent_id = current_parent_id + idx += 2 + + if self.count == self.doc_count: + actions.extend([action(self.action_parent_id), self.action_buffer]) + + return actions + + class BaseQueryStep(OpenSearchStep): """See base class.""" @@ -469,7 +581,7 @@ def _action(self): break query_responses.append( query_index(self.opensearch, self.index_name, - self.get_body(query[0]) , [self.field_name])) + self.get_body(query[0]) , self.get_exclude_fields())) results['took'] = [ float(query_response['took']) for query_response in query_responses @@ -506,6 +618,8 @@ def _get_measures(self) -> List[str]: def get_body(self, vec): pass + def get_exclude_fields(self): + return [self.field_name] class QueryStep(BaseQueryStep): """See base class.""" @@ -611,6 +725,43 @@ def get_body(self, vec): else: raise ConfigurationError('Not supported filter type {}'.format(self.filter_type)) +class QueryNestedFieldStep(BaseQueryStep): + """See base class.""" + + label = 'query_nested_field' + + def __init__(self, step_config: StepConfig): + super().__init__(step_config) + + neighbors_dataset = parse_string_param('neighbors_dataset', + step_config.config, {}, None) + + self.neighbors = parse_dataset(self.neighbors_format, self.neighbors_path, + Context.CUSTOM, neighbors_dataset) + + self.implicit_config = step_config.implicit_config + + def get_body(self, vec): + return { + 'size': self.k, + 'query': { + 'nested': { + 'path': 'nested_field', + 'query': { + 'knn': { + 'nested_field.' + self.field_name: { + 'vector': vec, + 'k': self.k + } + } + } + } + } + } + + def get_exclude_fields(self): + return ['nested_field.' + self.field_name] + class GetStatsStep(OpenSearchStep): """See base class.""" diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/index.json b/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/index.json new file mode 100644 index 000000000..a982afc81 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/index.json @@ -0,0 +1,32 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1, + "knn.algo_param.ef_search": 100 + } + }, + "mappings": { + "properties": { + "nested_field": { + "type": "nested", + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/simple-nested-test.yml b/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/simple-nested-test.yml new file mode 100644 index 000000000..151b2014d --- /dev/null +++ b/benchmarks/perf-tool/release-configs/faiss-hnsw/nested/simple/simple-nested-test.yml @@ -0,0 +1,37 @@ +endpoint: [ENDPOINT] +port: [PORT] +test_name: "Faiss HNSW Nested Field Test" +test_id: "Faiss HNSW Nested Field Test" +num_runs: 3 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: release-configs/faiss-hnsw/nested/simple/index.json + - name: ingest_nested_field + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: dataset/sift-128-euclidean-nested.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' }, { name: 'parent_id', type: 'int'} ] + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: warmup_operation + index_name: target_index + - name: query_nested_field + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: dataset/sift-128-euclidean-nested.hdf5 + neighbors_format: hdf5 + neighbors_path: dataset/sift-128-euclidean-nested.hdf5 + neighbors_dataset: neighbour_nested \ No newline at end of file diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/index.json b/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/index.json new file mode 100644 index 000000000..8dc749c39 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/index.json @@ -0,0 +1,31 @@ +{ + "settings": { + "index": { + "knn": true, + "number_of_shards": 24, + "number_of_replicas": 1 + } + }, + "mappings": { + "properties": { + "nested_field": { + "type": "nested", + "properties": { + "target_field": { + "type": "knn_vector", + "dimension": 128, + "method": { + "name": "hnsw", + "space_type": "l2", + "engine": "lucene", + "parameters": { + "ef_construction": 256, + "m": 16 + } + } + } + } + } + } + } +} diff --git a/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/simple-nested-test.yml b/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/simple-nested-test.yml new file mode 100644 index 000000000..cf1e4edc4 --- /dev/null +++ b/benchmarks/perf-tool/release-configs/lucene-hnsw/nested/simple/simple-nested-test.yml @@ -0,0 +1,37 @@ +endpoint: [ENDPOINT] +port: [PORT] +test_name: "Lucene HNSW Nested Field Test" +test_id: "Lucene HNSW Nested Field Test" +num_runs: 3 +show_runs: false +steps: + - name: delete_index + index_name: target_index + - name: create_index + index_name: target_index + index_spec: release-configs/faiss-hnsw/nested/simple/index.json + - name: ingest_nested_field + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: dataset/sift-128-euclidean-nested.hdf5 + attributes_dataset_name: attributes + attribute_spec: [ { name: 'color', type: 'str' }, { name: 'taste', type: 'str' }, { name: 'age', type: 'int' }, { name: 'parent_id', type: 'int'} ] + - name: refresh_index + index_name: target_index + - name: force_merge + index_name: target_index + max_num_segments: 1 + - name: warmup_operation + index_name: target_index + - name: query_nested_field + k: 100 + r: 1 + calculate_recall: true + index_name: target_index + field_name: target_field + dataset_format: hdf5 + dataset_path: dataset/sift-128-euclidean-nested.hdf5 + neighbors_format: hdf5 + neighbors_path: dataset/sift-128-euclidean-nested.hdf5 + neighbors_dataset: neighbour_nested \ No newline at end of file