Skip to content

Commit

Permalink
feat: Support GCS filesystem for bytewax engine
Browse files Browse the repository at this point in the history
Signed-off-by: Hai Nguyen <[email protected]>
  • Loading branch information
sudohainguyen committed Oct 11, 2023
1 parent 6a728fe commit 050f623
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
from bytewax.dataflow import Dataflow # type: ignore
from bytewax.execution import cluster_main
from bytewax.inputs import ManualInputConfig, distribute
Expand All @@ -29,8 +28,7 @@ def __init__(
self._run_dataflow()

def process_path(self, path):
fs = s3fs.S3FileSystem()
dataset = pq.ParquetDataset(path, filesystem=fs, use_legacy_dataset=False)
dataset = pq.ParquetDataset(path, use_legacy_dataset=False)
batches = []
for fragment in dataset.fragments:
for batch in fragment.to_table().to_batches():
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
"google-cloud-datastore>=2.1.0,<3",
"google-cloud-storage>=1.34.0,<3",
"google-cloud-bigtable>=2.11.0,<3",
"gcsfs>=2023.3.0,<2024.0.0",
]

REDIS_REQUIRED = [
Expand Down

0 comments on commit 050f623

Please sign in to comment.