Skip to content

Commit

Permalink
Merge pull request #2 from rahulj51/delta-lake
Browse files Browse the repository at this point in the history
Adding support for delta lake table format
  • Loading branch information
dacort authored Jun 10, 2023
2 parents 6d04262 + c6f04d5 commit e1e0272
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 24 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,15 @@ _youcanevenwritestraighttos3_ 🤭
fake -n 10 pyint,user_name,date_this_year -f parquet -o s3://YOUR_BUCKET/data/sample.parquet
```

### Delta Lake

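Data can also be exported as a Delta Lake table. The `--output`/`-o` option is required for this format and gives the location the table is written to.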

```bash
fake -n 10 pyint,user_name,date_this_year -f deltalake -o sample_data
```


## Templates

Want to generate 1 MILLION S3 Access logs in ~2 minutes? Now you can.
Expand Down
11 changes: 6 additions & 5 deletions faker_cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
from faker_cli.templates import CloudFrontWriter, S3AccessLogs, S3AccessWriter, CloudTrailLogs, CloudFrontLogs

from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter
from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter, DeltaLakeWriter
from typing import List

def infer_column_names(col_names, col_types: str) -> List[str]:
Expand All @@ -20,6 +20,7 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
"csv": CSVWriter,
"json": JSONWriter,
"parquet": ParquetWriter,
"deltalake": DeltaLakeWriter
}

TEMPLATE_MAPPER = {
Expand All @@ -33,7 +34,7 @@ def infer_column_names(col_names, col_types: str) -> List[str]:

@click.command()
@click.option("--num-rows", "-n", default=1, help="Number of rows")
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet"]), default="csv", help="Format of the output")
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet", "deltalake"]), default="csv", help="Format of the output")
@click.option("--output", "-o", type=click.Path(writable=True))
@click.option("--columns", "-c", help="Column names", default=None, required=False)
@click.option("--template", "-t", help="Template to use", type=click.Choice(["s3access", "cloudfront"]), default=None)
Expand All @@ -57,9 +58,9 @@ def main(num_rows, format, output, columns, template, column_types):
)

# Parquet and Delta Lake output require a filename
if format == "parquet" and output is None:
raise click.BadArgumentUsage("parquet format requires --output/-o filename parameter.")
if output is not None and format != "parquet":
if format in ["parquet", "deltalake"] and output is None:
raise click.BadArgumentUsage("parquet | deltalake formats requires --output/-o filename parameter.")
if output is not None and format not in ["parquet", "deltalake"]:
raise click.BadArgumentUsage("output files not supported for csv/json yet.")

# If the user provides a template, we use that provider and writer and exit.
Expand Down
6 changes: 6 additions & 0 deletions faker_cli/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import Optional
import pyarrow as pa
import pyarrow.parquet as pq
import deltalake


class Writer:
Expand Down Expand Up @@ -56,3 +57,8 @@ def write(self, row):

def close(self):
pq.write_table(self.table, self.filename)


class DeltaLakeWriter(ParquetWriter):
    """Writer that saves ``self.table`` as a Delta Lake table.

    Everything except ``close`` is inherited unchanged from
    ``ParquetWriter``; only the final write step differs.
    """

    def close(self):
        """Write ``self.table`` to the Delta table at ``self.filename``."""
        destination = self.filename
        deltalake.write_deltalake(table_or_uri=destination, data=self.table)
47 changes: 29 additions & 18 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ python = "^3.9"
faker = "^18.9.0"
click = "^8.1.3"
pyarrow = "^12.0.0"
deltalake = "^0.9.0"


[tool.poetry.group.dev.dependencies]
Expand Down
17 changes: 16 additions & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from faker_cli.cli import main
from click.testing import CliRunner
import json
import deltalake


# Test that help is provided if the user provides no arguments
Expand Down Expand Up @@ -50,4 +51,18 @@ def test_custom_column_names():
lines = result.output.strip().splitlines()
data: dict = json.loads(lines[0])
assert len(data.keys()) == 2
assert list(data) == ["first", "second"]
assert list(data) == ["first", "second"]

def test_deltalake_output(tmp_path):
    """Check that ``-f deltalake`` produces a readable Delta table."""
    table_dir = tmp_path / 'table'
    cli_args = ["pyint,user_name", "-f", "deltalake", "-o", table_dir]
    outcome = CliRunner().invoke(main, cli_args)
    assert outcome.exit_code == 0

    # Load the result back through deltalake to inspect what was written.
    written = deltalake.DeltaTable(table_dir).to_pyarrow_table()

    # No -n was passed, so the CLI default of a single row applies.
    assert written.num_rows == 1

    assert written.column_names == ["pyint", "user_name"]
    assert written.num_columns == 2

0 comments on commit e1e0272

Please sign in to comment.