Small documentation fixes #381

Closed · wants to merge 2 commits
15 changes: 12 additions & 3 deletions docs/source/conf.py
@@ -93,21 +93,30 @@

# TODO: use regex to replace all "T" and "T_co" related signature
signature_replacements = {
"torch.utils.data.dataset.IterDataPipe": "IterDataPipe",
"torch.utils.data.datapipes.datapipe.IterDataPipe": "IterDataPipe",
"abc.IterDataPipe": "IterDataPipe",
"torch.utils.data.dataset.MapDataPipe": "MapDataPipe",
"torch.utils.data.datapipes.datapipe.MapDataPipe": "MapDataPipe",
"abc.MapDataPipe": "MapDataPipe",
"typing.Type[torch.utils.data.sampler.Sampler]": "torch.utils.data.sampler.Sampler",
"<class 'torch.utils.data.sampler.SequentialSampler'>": "SequentialSampler",
"torch.utils.data.datapipes.iter.combining.T_co": "T_co",
"torch.utils.data.datapipes.iter.combinatorics.T_co": "T_co",
"torchdata.datapipes.iter.transform.bucketbatcher.T_co": "T_co",
"<class 'torch.utils.data.dataset.DataChunk'>": "DataChunk",
"torch.utils.data.datapipes.map.grouping.T": "T",
"torch.utils.data.datapipes.map.combining.T_co": "T_co",
"torch.utils.data.datapipes.map.combinatorics.T_co": "T_co",
"torchdata.datapipes.iter.util.cycler.T_co": "T_co",
"torchdata.datapipes.iter.util.paragraphaggregator.T_co": "T_co",
"torchdata.datapipes.map.util.cacheholder.T_co": "T_co",
"Sequence[torchdata.datapipes.map.util.unzipper.T]": "Sequence[T]",
"torchdata.datapipes.iter.util.samplemultiplexer.T_co": "T_co",
"torchdata.datapipes.iter.util.indexadder.K": "K",
"torchdata.datapipes.iter.util.unzipper.T": "T",
"torch.utils.data.datapipes.iter.grouping.T_co": "T_co",
"torchdata.datapipes.iter.util.dataframemaker.T_co": "T_co",
"torchdata.datapipes.iter.util.cacheholder.T_co": "T_co",
"torchdata.datapipes.iter.util.header.T_co": "T_co",
"<class 'torch.utils.data.datapipes.datapipe.DataChunk'>": "List",
"typing.": "",
}
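For context on how a mapping like this is typically consumed, here is a minimal sketch using Sphinx's standard ``autodoc-process-signature`` event (an assumption for illustration; the repo's actual conf.py may wire this up differently):

```python
# Sketch only: rewrite every autodoc signature using the table above.
# Assumes Sphinx's standard "autodoc-process-signature" event; the
# actual hook in this repo may differ.
def process_signature(app, what, name, obj, options, signature, return_annotation):
    if signature is not None:
        for old, new in signature_replacements.items():
            signature = signature.replace(old, new)
    if return_annotation is not None:
        for old, new in signature_replacements.items():
            return_annotation = return_annotation.replace(old, new)
    return signature, return_annotation


def setup(app):
    app.connect("autodoc-process-signature", process_signature)
```

Because a ``dict`` preserves insertion order, the catch-all ``"typing."`` entry at the end only fires after the more specific ``typing.Type[...]`` rule has been applied.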

4 changes: 3 additions & 1 deletion docs/source/torchdata.datapipes.iter.rst
@@ -140,6 +140,8 @@ saving files, and listing the files in directories).
IoPathSaver
OnlineReader
ParquetDataFrameLoader
S3FileLister
S3FileLoader
Saver

Mapping DataPipes
@@ -170,7 +172,7 @@ A miscellaneous set of DataPipes with different functionalities.
HashChecker
InMemoryCacheHolder
IterableWrapper
MapToIterConverter
OnDiskCacheHolder
ShardingFilter

2 changes: 1 addition & 1 deletion docs/source/torchdata.datapipes.map.rst
@@ -37,8 +37,8 @@ MapDataPipes
Batcher
Concater
InMemoryCacheHolder
IterToMapConverter
Mapper
SequenceWrapper
Shuffler
UnZipper
47 changes: 25 additions & 22 deletions torchdata/datapipes/iter/load/s3io.py
@@ -15,27 +15,29 @@

@functional_datapipe("list_file_by_s3")
class S3FileListerIterDataPipe(IterDataPipe[str]):
r""":class:`S3FileListerIterDataPipe`.
r"""
Iterable DataPipe that lists Amazon S3 file URLs with the given prefixes (functional name: ``list_file_by_s3``).
Acceptable prefixes include ``s3://bucket-name``, ``s3://bucket-name/``, ``s3://bucket-name/folder``,
``s3://bucket-name/folder/``, and ``s3://bucket-name/prefix``. You may also set ``length``, ``request_timeout_ms``
(default 3000 ms in aws-sdk-cpp), and ``region``.

Iterable DataPipe that lists URLs with the given prefixes (functional name: ``list_file_by_s3``).
Acceptable prefixes include `s3://bucket-name`, `s3://bucket-name/`, `s3://bucket-name/folder`,
`s3://bucket-name/folder/`, and `s3://bucket-name/prefix`. You may also set `length`, `request_timeout_ms` (default 3000
ms in aws-sdk-cpp), and `region`. Note that:
Note:
1. Input **must** be a list and direct S3 URLs are skipped.

2. ``length`` is `-1` by default, and any call to ``__len__()`` is invalid, because the length is unknown
until all files are iterated.

3. ``request_timeout_ms`` and ``region`` will overwrite settings in the configuration file or
environment variables.

1. Input **must** be a list and direct S3 URLs are skipped.
2. `length` is `-1` by default, and any call to `__len__()` is invalid, because the length is unknown until all files
are iterated.
3. `request_timeout_ms` and `region` will overwrite settings in the configuration file or environment variables.
4. AWS_CPP_SDK is necessary to use the S3 DataPipe(s).

Args:
source_datapipe: a DataPipe that contains URLs/URL prefixes to s3 files
length: Nominal length of the datapipe
requestTimeoutMs: optional, overwrite the default timeout setting for this datapipe
region: optional, overwrite the default region inferred from credentials for this datapipe

Note:
AWS_CPP_SDK is necessary to use the S3 DataPipe(s).

Example:
>>> from torchdata.datapipes.iter import S3FileLister, S3FileLoader
>>> s3_prefixes = ['s3://bucket-name/folder/', ...]
@@ -72,24 +72,25 @@ def __len__(self) -> int:

@functional_datapipe("load_file_by_s3")
class S3FileLoaderIterDataPipe(IterDataPipe[Tuple[str, StreamWrapper]]):
r""":class:`S3FileListerIterDataPipe`.
r"""
Iterable DataPipe that loads Amazon S3 files from the given S3 URLs (functional name: ``load_file_by_s3``).
``S3FileLoader`` iterates all given S3 URLs in ``BytesIO`` format with ``(url, BytesIO)`` tuples.
You may also set ``request_timeout_ms`` (default 3000 ms in aws-sdk-cpp), ``region``,
``buffer_size`` (default 120Mb), and ``multi_part_download`` (default to use multi-part downloading).

Iterable DataPipe that loads S3 files given S3 URLs (functional name: ``load_file_by_s3``).
`S3FileLoader` iterates all given S3 URLs in `BytesIO` format with `(url, BytesIO)` tuples.
You may also set `request_timeout_ms` (default 3000 ms in aws-sdk-cpp), `region`,
`buffer_size` (default 120Mb), and `multi_part_download` (default to use multi-part downloading). Note that:
Note:
1. Input **must** be a list and S3 URLs must be valid.

2. ``request_timeout_ms`` and ``region`` will overwrite settings in the
configuration file or environment variables.

1. Input **must** be a list and S3 URLs must be valid.
2. `request_timeout_ms` and `region` will overwrite settings in the configuration file or environment variables.
3. AWS_CPP_SDK is necessary to use the S3 DataPipe(s).

Args:
source_datapipe: a DataPipe that contains URLs to s3 files
requestTimeoutMs: optional, overwrite the default timeout setting for this datapipe
region: optional, overwrite the default region inferred from credentials for this datapipe

Note:
AWS_CPP_SDK is necessary to use the S3 DataPipe(s).

Example:
>>> from torchdata.datapipes.iter import S3FileLister, S3FileLoader
>>> s3_prefixes = ['s3://bucket-name/folder/', ...]
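For reference, a minimal end-to-end sketch combining the two DataPipes documented above (the bucket and prefix are placeholders, and torchdata must be built with AWS_CPP_SDK):

```python
from torchdata.datapipes.iter import IterableWrapper

# Placeholder bucket/prefix; input must be a list of S3 URLs or prefixes.
s3_urls = IterableWrapper(["s3://bucket-name/folder/"]).list_file_by_s3()
files = s3_urls.load_file_by_s3()  # yields (url, stream) tuples
for url, stream in files:
    payload = stream.read()  # raw bytes of one S3 object
```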
9 changes: 9 additions & 0 deletions torchdata/datapipes/iter/util/converter.py
@@ -26,13 +26,22 @@ class IterToMapConverterMapDataPipe(MapDataPipe):
with exactly two objects. The first object of each item becomes a key in
the new dictionary, and the second object the corresponding value.

For the opposite converter, use :class:`.MapToIterConverter`.

Args:
datapipe: Source IterDataPipe
key_value_fn: Function applied to each item to generate a key-value pair

Note:
If a key being added is already present, the corresponding value
will be replaced by the new value.

Example:
>>> from torchdata.datapipes.iter import IterableWrapper
>>> source_dp = IterableWrapper([(i, i) for i in range(10)])
>>> map_dp = source_dp.to_map_datapipe()
>>> list(map_dp)
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
"""
datapipe: IterDataPipe
key_value_fn: Optional[Callable]
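To complement the docstring example above, a sketch of the ``key_value_fn`` path (the splitting function is illustrative, not part of the library):

```python
from torchdata.datapipes.iter import IterableWrapper

# Illustrative key_value_fn: each element must map to exactly two objects.
source_dp = IterableWrapper(["a=1", "b=2"])
map_dp = source_dp.to_map_datapipe(key_value_fn=lambda line: tuple(line.split("=")))
print(map_dp["a"])  # '1'
```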
2 changes: 2 additions & 0 deletions torchdata/datapipes/iter/util/header.py
@@ -17,6 +17,8 @@
class HeaderIterDataPipe(IterDataPipe[T_co]):
r"""
Yields elements from the source DataPipe from the start, up to the specified limit (functional name: ``header``).
This DataPipe can also be used to manually set the length of a DataPipe to a certain value; you
can do so by calling ``dp.header(desired_len)``.

Args:
source_datapipe: the DataPipe from which elements will be yielded
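A short sketch of the two uses described above, truncating a stream and giving it a usable length (the values are arbitrary):

```python
from torchdata.datapipes.iter import IterableWrapper

dp = IterableWrapper(range(1000))
first_ten = dp.header(10)  # yields only the first 10 elements
print(list(first_ten))     # [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(len(first_ten))      # 10
```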
4 changes: 3 additions & 1 deletion torchdata/datapipes/iter/util/webdataset.py
@@ -47,7 +47,7 @@ class WebDatasetIterDataPipe(IterDataPipe[Dict]):
r"""
Iterable DataPipe that accepts stream of (path, data) tuples, usually,
representing the pathnames and files of a tar archive (functional name:
``webdataset``). This aggregates consecutive items with the same basename
into a single dictionary, using the extensions as keys (WebDataset file
convention). Any text after the first "." in the filename is used as
a key/extension.
@@ -56,8 +56,10 @@ class WebDatasetIterDataPipe(IterDataPipe[Dict]):

Args:
source_datapipe: a DataPipe yielding a stream of (path, data) pairs

Returns:
a DataPipe yielding a stream of dictionaries

Examples:
>>> from torchdata.datapipes.iter import FileLister, FileOpener
>>>
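A hedged sketch of the grouping behavior described above (the archive path is a placeholder; assume it stores files like ``0001.jpg``, ``0001.cls``, ``0002.jpg``, ...):

```python
from torchdata.datapipes.iter import FileLister, FileOpener

# Placeholder tar archive containing e.g. 0001.jpg, 0001.cls, 0002.jpg, ...
archives = FileLister("path/to/data", masks="archive.tar")
samples = FileOpener(archives, mode="b").load_from_tar().webdataset()
for sample in samples:
    # One dict per basename; keys are the "." extensions described above.
    print(sorted(sample.keys()))
```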
2 changes: 2 additions & 0 deletions torchdata/datapipes/map/util/converter.py
@@ -15,6 +15,8 @@ class MapToIterConverterIterDataPipe(IterDataPipe):
Convert a ``MapDataPipe`` to an ``IterDataPipe`` (functional name: ``to_iter_datapipe``). It uses ``indices`` to
iterate through the ``MapDataPipe``, defaults to ``range(len(mapdatapipe))`` if not given.

For the opposite converter, use :class:`.IterToMapConverter`.

Args:
datapipe: source MapDataPipe with data
indices: optional list of indices that will dictate how the datapipe will be iterated over
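A minimal sketch of the conversion described above, including the optional ``indices`` argument:

```python
from torchdata.datapipes.map import SequenceWrapper

map_dp = SequenceWrapper(["a", "b", "c", "d"])
iter_dp = map_dp.to_iter_datapipe(indices=[3, 1])  # iterate only chosen keys
print(list(iter_dp))  # ['d', 'b']
```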