-
Notifications
You must be signed in to change notification settings - Fork 161
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Summary: Pull Request resolved: #325 This `MapDataPipe` seems simple enough to add but we should talk about what is the general guideline for adding `MapDataPipe` before actually doing it. Test Plan: Imported from OSS Reviewed By: ejguan Differential Revision: D35979906 Pulled By: NivekT fbshipit-source-id: fce7bad9b6c2dad10c815a3d708f12349519823f
- Loading branch information
1 parent
1171ec2
commit 47e6fcc
Showing
6 changed files
with
176 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,4 +40,5 @@ MapDataPipes | |
Mapper | ||
SequenceWrapper | ||
Shuffler | ||
UnZipper | ||
Zipper |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the BSD-style license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
import unittest | ||
|
||
import expecttest | ||
from torchdata.datapipes.map import MapDataPipe, SequenceWrapper, UnZipper | ||
|
||
|
||
class TestMapDataPipe(expecttest.TestCase):
    """Unit tests for MapDataPipe classes imported from ``torchdata.datapipes.map``."""

    def test_unzipper_mapdatapipe(self) -> None:
        # Expected contents of each column once the source tuples are unzipped.
        col0 = list(range(10))
        col1 = list(range(10, 20))
        col2 = list(range(20, 30))

        source_dp = SequenceWrapper([(i, i + 10, i + 20) for i in range(10)])

        # Functional Test: with `sequence_length` given, every column becomes its own DataPipe
        dp1: MapDataPipe
        dp2: MapDataPipe
        dp3: MapDataPipe
        dp1, dp2, dp3 = UnZipper(source_dp, sequence_length=3)  # type: ignore[misc]
        for expected, child in ((col0, dp1), (col1, dp2), (col2, dp3)):
            self.assertEqual(expected, list(child))

        # Functional Test: columns listed in `columns_to_skip` produce no child DataPipe
        dp2, dp3 = source_dp.unzip(sequence_length=3, columns_to_skip=[0])
        for expected, child in ((col1, dp2), (col2, dp3)):
            self.assertEqual(expected, list(child))

        (dp2,) = source_dp.unzip(sequence_length=3, columns_to_skip=[0, 2])
        self.assertEqual(col1, list(dp2))

        # Same check on 4-tuples, dropping both the first and the last column
        source_dp = SequenceWrapper([(i, i + 10, i + 20, i + 30) for i in range(10)])
        dp2, dp3 = source_dp.unzip(sequence_length=4, columns_to_skip=[0, 3])
        for expected, child in ((col1, dp2), (col2, dp3)):
            self.assertEqual(expected, list(child))

        # __len__ Test: the lengths of child DataPipes are correct
        child_lengths = (len(dp2), len(dp3))
        self.assertEqual((10, 10), child_lengths)
|
||
|
||
if __name__ == "__main__": | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Copyright (c) Meta Platforms, Inc. and affiliates. | ||
# All rights reserved. | ||
# | ||
# This source code is licensed under the BSD-style license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
|
||
from typing import Optional, Sequence, TypeVar | ||
|
||
from torchdata.datapipes import functional_datapipe | ||
from torchdata.datapipes.map import MapDataPipe | ||
|
||
|
||
T = TypeVar("T") | ||
|
||
|
||
@functional_datapipe("unzip")
class UnZipperMapDataPipe(MapDataPipe):
    """
    Takes in a DataPipe of Sequences, unpacks each Sequence, and returns the elements in separate DataPipes
    based on their position in the Sequence (functional name: ``unzip``). The number of instances produced
    equals the ``sequence_length`` minus the number of columns to skip.

    Note:
        Each sequence within the DataPipe should have the same length, specified by
        the input argument ``sequence_length``.

    Args:
        source_datapipe: MapDataPipe with sequences of data
        sequence_length: Length of the sequence within the source_datapipe. All elements should have the same length.
        columns_to_skip: optional indices of columns that the DataPipe should skip (each index should be
            an integer from 0 to sequence_length - 1)

    Returns:
        A list of MapDataPipes, one per column that is not skipped, in ascending column order.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
        RuntimeError: if every column in ``range(sequence_length)`` is listed in ``columns_to_skip``.

    Example:
        >>> from torchdata.datapipes.map import SequenceWrapper
        >>> source_dp = SequenceWrapper([(i, i + 10, i + 20) for i in range(3)])
        >>> dp1, dp2, dp3 = source_dp.unzip(sequence_length=3)
        >>> list(dp1)
        [0, 1, 2]
        >>> list(dp2)
        [10, 11, 12]
        >>> list(dp3)
        [20, 21, 22]
    """

    def __new__(
        cls,
        source_datapipe: MapDataPipe[Sequence[T]],
        sequence_length: int,
        columns_to_skip: Optional[Sequence[int]] = None,
    ):
        if sequence_length < 1:
            raise ValueError(f"Expected `sequence_length` larger than 0, but {sequence_length} is found")
        if columns_to_skip is None:
            instance_ids = list(range(sequence_length))
        else:
            # A set gives O(1) membership tests while filtering the column indices.
            skips = set(columns_to_skip)
            instance_ids = [i for i in range(sequence_length) if i not in skips]

        if not instance_ids:
            # The two literals are concatenated implicitly, so the first one must end with a space.
            raise RuntimeError(
                f"All instances are being filtered out in {cls.__name__}. Please check "
                "the input `sequence_length` and `columns_to_skip`."
            )
        # `__new__` deliberately returns a list of child DataPipes (one per kept column)
        # rather than an instance of this class.
        return [_UnZipperMapDataPipe(source_datapipe, i) for i in instance_ids]
|
||
|
||
class _UnZipperMapDataPipe(MapDataPipe[T]):
    """
    Child DataPipe that exposes a single position of every sequence held by ``main_datapipe``.

    Args:
        main_datapipe: the DataPipe of sequences that lookups are delegated to
        instance_id: position within each sequence that this child returns
    """

    def __init__(self, main_datapipe: MapDataPipe[Sequence[T]], instance_id: int):
        self.instance_id = instance_id
        self.main_datapipe = main_datapipe

    def __getitem__(self, index) -> T:
        # Fetch the whole sequence stored under `index`, then pick this child's position from it.
        row = self.main_datapipe[index]
        return row[self.instance_id]

    def __len__(self) -> int:
        # The length is whatever the wrapped DataPipe reports.
        source = self.main_datapipe
        return len(source)