Skip to content

Commit

Permalink
fix up benchmark and datagen code
Browse files Browse the repository at this point in the history
  • Loading branch information
changhiskhan committed Sep 27, 2022
1 parent 60d7412 commit a1548ea
Show file tree
Hide file tree
Showing 8 changed files with 125 additions and 181 deletions.
24 changes: 11 additions & 13 deletions python/benchmarks/bench_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,13 +116,7 @@ def list_benchmarks(self):

def create_main(self):
@click.command
@click.option(
"-u",
"--base-uri",
required=True,
type=str,
help="Base uri to the benchmark dataset catalog",
)
@click.argument("base_uri")
@click.option(
"-f", "--format", "fmt", help="'lance', 'parquet', or 'raw'. Omit for all"
)
Expand All @@ -147,7 +141,6 @@ def main(base_uri, fmt, flavor, benchmark, repeats, output):
fmt = [fmt]
else:
fmt = KNOWN_FORMATS
base_uri = f"{base_uri}/datasets/{self.name}"

def run_benchmark(bmark):
b = bmark.repeat(repeats or 1)
Expand Down Expand Up @@ -320,12 +313,17 @@ def main(
else:
fmt = known_formats

kwargs = {
"existing_data_behavior": "overwrite_or_ignore",
"max_rows_per_group": group_size,
"max_rows_per_file": max_rows_per_file,
}
for f in fmt:
if f == 'lance':
kwargs = {
"existing_data_behavior": "overwrite_or_ignore",
"max_rows_per_group": group_size,
"max_rows_per_file": max_rows_per_file,
}
elif f == 'parquet':
kwargs = {
'row_group_size': group_size,
}
if embedded:
converter.make_embedded_dataset(df, f, output_path, **kwargs)
else:
Expand Down
52 changes: 34 additions & 18 deletions python/benchmarks/coco/analytics.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
#!/usr/bin/env python3

import sys
from typing import Union

import duckdb
import numpy as np
import pandas as pd
import pyarrow.compute as pc
import pyarrow.dataset as ds

sys.path.append("..")

from bench_utils import BenchmarkSuite, download_uris, get_dataset, get_uri
from parse_coco import CocoConverter
from datagen import CocoConverter

import lance

Expand Down Expand Up @@ -46,33 +50,43 @@ def _label_distribution_raw(base_uri: str):
"""
c = CocoConverter(base_uri)
df = c.read_metadata()
return pd.json_normalize(df.annotations.explode()).name.value_counts()
return pd.json_normalize(df.annotations).name.explode().value_counts()


def _filter_data_raw(base_uri: str, klass="cat", offset=20, limit=50):
    """SELECT image, annotations FROM coco WHERE annotations.label = 'cat' LIMIT 50 OFFSET 20

    Benchmark baseline: answer the filter query from raw COCO metadata
    (no lance/parquet dataset), using pandas only.

    Parameters
    ----------
    base_uri : str
        Root URI of the COCO benchmark dataset catalog.
    klass : str
        Annotation class name to filter on.
    offset, limit : int
        Pagination window applied after filtering.

    Returns
    -------
    pd.DataFrame with columns ["image_uri", "annotations"] for the matching rows.
    """
    # NOTE(review): this span was reconstructed from a markerless unified diff;
    # the superseded pre-change lines (old lambda-based mask) were removed.
    c = CocoConverter(base_uri)
    df = c.read_metadata()
    # json_normalize flattens the per-image annotation struct; "name" holds the
    # list of class names for each image (or a non-list for images w/o annotations).
    ser = pd.json_normalize(df.annotations)["name"]

    def has_klass(names):
        # Guard: rows without annotations normalize to NaN, not a list.
        if isinstance(names, list):
            return (np.array(names) == klass).any()
        return False

    mask = ser.apply(has_klass)
    filtered = df.loc[mask, ["image_uri", "annotations"]]
    limited = filtered[offset: offset + limit]
    # NOTE(review): .assign returns a NEW frame and the result is discarded here,
    # so the downloaded images never land in `limited`. Presumably only the
    # download cost is being benchmarked — confirm, else bind the result.
    limited.assign(image=download_uris(limited.image_uri))
    return limited


def _filter_data_lance(base_uri: str, klass="cat", offset=20, limit=50, flavor=None):
uri = get_uri(base_uri, "coco", "lance", flavor)
index_scanner = lance.scanner(uri, columns=["image_id", "annotations.name"])
# TODO restore after projection bug
index_scanner = lance.dataset(uri)
# index_scanner = index_scanner.scanner(columns=["image_id", "annotations.name"])
query = (
f"SELECT distinct image_id FROM ("
f" SELECT image_id, UNNEST(annotations) as ann FROM index_scanner"
f") WHERE ann.name == '{klass}'"
f" SELECT image_id, UNNEST(annotations.name) as name FROM index_scanner"
f") WHERE name = '{klass}'"
)
filtered_ids = duckdb.query(query).arrow().column("image_id").combine_chunks()
scanner = lance.scanner(
uri,
["image_id", "image", "annotations.name"],
# filter=pc.field("image_id").isin(filtered_ids),
scanner = lance.dataset(uri).scanner(
# TODO restore after projection bug
# columns=["image_id", "image", "annotations.name"],
columns=["image_id", "image", "annotations"],
filter=pc.field("image_id").isin(filtered_ids),
limit=50,
offset=20,
)
Expand All @@ -84,8 +98,8 @@ def _filter_data_parquet(base_uri: str, klass="cat", offset=20, limit=50, flavor
dataset = ds.dataset(uri)
query = (
f"SELECT distinct image_id FROM ("
f" SELECT image_id, UNNEST(annotations) as ann FROM dataset"
f") WHERE ann.name == '{klass}'"
f" SELECT image_id, UNNEST(annotations.name) as name FROM dataset"
f") WHERE name == '{klass}'"
)
filtered_ids = duckdb.query(query).arrow().column("image_id").to_numpy().tolist()
id_string = ",".join([f"'{x}'" for x in filtered_ids])
Expand All @@ -98,14 +112,16 @@ def _filter_data_parquet(base_uri: str, klass="cat", offset=20, limit=50, flavor


def _label_distribution_lance(dataset: ds.Dataset):
    """Compute the annotation-name distribution for a lance dataset.

    Currently passes the whole dataset to duckdb; the narrower projected
    scan is disabled pending an upstream projection bug.
    """
    # NOTE(review): reconstructed from a markerless unified diff; the removed
    # pre-change lines (lance.scanner call + early return) were dropped, since
    # fused together they left unreachable code after the first return.
    # TODO restore after projection bug
    # scanner = dataset.scanner(columns=["annotations.name"])
    # return _label_distribution_duckdb(scanner)
    return _label_distribution_duckdb(dataset)


def _label_distribution_duckdb(arrow_obj: Union[ds.Dataset, ds.Scanner]):
    """GROUP-BY count of annotation class names via duckdb.

    duckdb resolves `arrow_obj` by name from the enclosing scope (its
    replacement-scan mechanism), so the local variable name is significant.

    Parameters
    ----------
    arrow_obj : pyarrow Dataset or Scanner exposing an ``annotations.name``
        list column.

    Returns
    -------
    pd.DataFrame with one row per class name and its occurrence count.
    """
    # NOTE(review): reconstructed from a markerless unified diff — the stale
    # pre-change SQL lines (UNNEST(annotations) as ann / ann.name) were removed;
    # fused with the new lines they formed invalid SQL.
    # Also fixed the annotation: `Union[ds.Dataset | ds.Scanner]` mixed both
    # union syntaxes and raises TypeError on Python < 3.10.
    query = """\
    SELECT name, COUNT(1) FROM (
      SELECT UNNEST(annotations.name) as name FROM arrow_obj
    ) GROUP BY 1
    """
    return duckdb.query(query).to_df()
Expand Down
93 changes: 0 additions & 93 deletions python/benchmarks/coco/coco_classes.csv

This file was deleted.

41 changes: 7 additions & 34 deletions python/benchmarks/coco/datagen.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,44 +110,17 @@ def _convert_field(self, name, typ, col):
arrays = []
for subfield in typ:
sub_arr = native_arr.field(subfield.name)
if name == "annotations" and subfield.name == "name":
converted = self._convert_name_column(
sub_arr, native_arr.field("category_id")
)
else:
converted = self._convert_field(
f"{name}.{subfield.name}",
subfield.type,
sub_arr.to_numpy(zero_copy_only=False),
)
converted = self._convert_field(
f"{name}.{subfield.name}",
subfield.type,
sub_arr.to_numpy(zero_copy_only=False),
)
arrays.append(converted)
return pa.StructArray.from_arrays(arrays, fields=typ)
else:
arr = pa.array(col, type=typ)
return arr

def _convert_name_column(self, name_arr, category_id_arr):
coco_classes = pd.read_csv("coco_classes.csv", header=0, index_col=None)
# let's make sure the actual data matches
check = pd.Series(
dict(
zip(
name_arr.values.to_numpy(False),
category_id_arr.values.to_numpy(False),
)
)
).to_frame(name="check_id")
joined = coco_classes.set_index("name").join(check, how="right")
mask = pd.notnull(joined.check_id)
filtered = joined[mask]
if not (filtered.check_id == filtered.category_id).all():
raise ValueError(f"Category id check failed")
dict_arr = pa.DictionaryArray.from_pandas(
pd.Categorical(name_arr.values.to_numpy(False), coco_classes.name.values)
)
assert not pd.isna(dict_arr.indices.to_numpy()).all()
return pa.ListArray.from_arrays(name_arr.offsets, dict_arr)

def image_uris(self, table):
return table["image_uri"].to_numpy()

Expand Down Expand Up @@ -206,8 +179,8 @@ def _ann_schema():
pa.list_(pa.float32(), 4),
pa.int16(),
pa.int64(),
pa.dictionary(pa.int8(), pa.utf8()),
pa.dictionary(pa.int8(), pa.utf8()),
pa.string(),
pa.string()
]
schema = pa.struct(
[pa.field(name, pa.list_(dtype)) for name, dtype in zip(names, types)]
Expand Down
5 changes: 3 additions & 2 deletions python/benchmarks/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,9 @@ def iou_vectorized(num_boxes: int):
ymin_arr = np.random.randn(num_boxes) + 1
xmax_arr = (np.random.randn(num_boxes) + 10) * 10
ymax_arr = (np.random.randn(num_boxes) + 10) * 10
storage = pa.StructArray.from_arrays(
[xmin_arr, ymin_arr, xmax_arr, ymax_arr], names=["xmin", "ymin", "xmax", "ymax"]
storage = pa.FixedSizeListArray.from_arrays(
np.stack([xmin_arr, ymin_arr, xmax_arr, ymax_arr]).T.reshape(-1),
list_size=4
)
box_arr = Box2dArray.from_storage(Box2dType(), storage)
return box_arr.iou(box_arr)
Expand Down
11 changes: 7 additions & 4 deletions python/benchmarks/oxford_pet/analytics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

import os
import sys
from typing import Optional

import duckdb
Expand All @@ -9,8 +10,11 @@
import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset

sys.path.append("..")

from bench_utils import BenchmarkSuite, download_uris
from parse_pet import OxfordPetConverter
from datagen import OxfordPetConverter

import lance

Expand Down Expand Up @@ -38,8 +42,7 @@ def filter_data(base_uri: str, fmt: str, flavor: Optional[str]):
query = "SELECT image, class FROM ds WHERE class='pug' " "LIMIT 50 OFFSET 20"
return duckdb.query(query).to_df()
elif fmt == "lance":
scanner = lance.scanner(
uri,
scanner = lance.dataset(uri).scanner(
columns=["image", "class"],
filter=pc.field("class") == "pug",
limit=50,
Expand Down Expand Up @@ -77,7 +80,7 @@ def get_pets_filtered_data(base_uri, klass="pug", offset=20, limit=50):
c = OxfordPetConverter(base_uri)
df = c.read_metadata()
filtered = df.loc[df["class"] == klass, ["class", "filename"]]
limited: pd.DataFrame = filtered[offset : offset + limit]
limited: pd.DataFrame = filtered[offset: offset + limit]
uris = [os.path.join(base_uri, f"images/{x}.jpg") for x in limited.filename.values]
return limited.assign(images=download_uris(pd.Series(uris)))

Expand Down
Loading

0 comments on commit a1548ea

Please sign in to comment.