From 7946f8a9f6a020a89f534f4a2b921357935ee975 Mon Sep 17 00:00:00 2001
From: Erjia Guan
Date: Mon, 23 Aug 2021 14:32:56 -0700
Subject: [PATCH] Rename DataPipe to Op-er (#63325)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/63325

Rename each DataPipe to an operation name ending with -er. The functional API should remain a verb, such as `read_from_tar`, `shuffle`, ... (Discussed [here](https://github.com/facebookexternal/torchdata/pull/97#discussion_r688553905))

- Batch -> Batcher
- Collate -> Collator
- Concat -> Concater
- GroupByKey -> ByKeyGrouper ?
- ListDirFiles -> FileLister
- LoadFilesFromDisk -> FileLoader
- Map -> Mapper
- ReadFilesFromTar -> TarArchiveReader
- ReadFilesFromZip -> ZipArchiveReader
- ReadLinesFromFile -> LineReader
- Shuffle -> Shuffler
- ToBytes -> StreamReader
- Transforms -> Transformer
- Zip -> Zipper

Let me know if you have a better name for any of these DataPipes.

Test Plan: Imported from OSS

Reviewed By: mruberry

Differential Revision: D30466950

Pulled By: ejguan

fbshipit-source-id: 72909dca7b3964ab83b965891f96cc1ecf62d049
---
 test/test_datapipe.py                              | 127 +++++++--------------
 torch/utils/data/datapipes/iter/__init__.py        |  74 ++++++------
 torch/utils/data/datapipes/iter/callable.py        |  41 +------
 torch/utils/data/datapipes/iter/combinatorics.py   |   4 +-
 torch/utils/data/datapipes/iter/combining.py       |   8 +-
 .../iter/{listdirfiles.py => filelister.py}        |   4 +-
 .../iter/{loadfilesfromdisk.py => fileloader.py}   |   4 +-
 torch/utils/data/datapipes/iter/grouping.py        |  12 +-
 torch/utils/data/datapipes/iter/linereader.py      |  18 +++
 .../utils/data/datapipes/iter/readlinesfromfile.py |  19 ---
 torch/utils/data/datapipes/iter/selecting.py       |   4 +-
 .../datapipes/iter/{tobytes.py => streamreader.py} |   4 +-
 .../{readfilesfromtar.py => tararchivereader.py}   |   4 +-
 .../{readfilesfromzip.py => ziparchivereader.py}   |   4 +-
 torch/utils/data/datapipes/map/__init__.py         |   7 +-
 torch/utils/data/datapipes/map/callable.py         |   4 +-
 torch/utils/data/datapipes/map/combining.py        |   4 +-
 17 files changed, 128 insertions(+), 214 deletions(-)
 rename torch/utils/data/datapipes/iter/{listdirfiles.py => filelister.py} (93%)
 rename torch/utils/data/datapipes/iter/{loadfilesfromdisk.py => fileloader.py} (93%)
 create mode 100644 torch/utils/data/datapipes/iter/linereader.py
 delete mode 100644 torch/utils/data/datapipes/iter/readlinesfromfile.py
 rename torch/utils/data/datapipes/iter/{tobytes.py => streamreader.py} (85%)
 rename torch/utils/data/datapipes/iter/{readfilesfromtar.py => tararchivereader.py} (96%)
 rename torch/utils/data/datapipes/iter/{readfilesfromzip.py => ziparchivereader.py} (96%)

diff --git a/test/test_datapipe.py b/test/test_datapipe.py
index 9c23801..86e53fa 100644
--- a/test/test_datapipe.py
+++ b/test/test_datapipe.py
@@ -34,7 +34,6 @@ from unittest import skipIf
 
 import numpy as np
 import torch
-import torch.nn as nn
 import torch.utils.data.backward_compatibility
 import torch.utils.data.datapipes as dp
 import torch.utils.data.graph
@@ -55,13 +54,6 @@ from torch.utils.data.datapipes.utils.decoder import (
 )
 
 try:
-    import torchvision.transforms
-    HAS_TORCHVISION = True
-except ImportError:
-    HAS_TORCHVISION = False
-skipIfNoTorchVision = skipIf(not HAS_TORCHVISION, "no torchvision")
-
-try:
     import dill
     # XXX: By default, dill writes the Pickler dispatch table to inject its
    # own logic there. This globally affects the behavior of the standard library
@@ -177,7 +169,7 @@ class TestIterableDataPipeBasic(TestCase):
 
     def test_listdirfiles_iterable_datapipe(self):
         temp_dir = self.temp_dir.name
-        datapipe = dp.iter.ListDirFiles(temp_dir, '')
+        datapipe = dp.iter.FileLister(temp_dir, '')
 
         count = 0
         for pathname in datapipe:
@@ -186,7 +178,7 @@ class TestIterableDataPipeBasic(TestCase):
         self.assertEqual(count, len(self.temp_files))
 
         count = 0
-        datapipe = dp.iter.ListDirFiles(temp_dir, '', recursive=True)
+        datapipe = dp.iter.FileLister(temp_dir, '', recursive=True)
         for pathname in datapipe:
             count = count + 1
             self.assertTrue((pathname in self.temp_files) or (pathname in self.temp_sub_files))
@@ -195,13 +187,13 @@ class TestIterableDataPipeBasic(TestCase):
     def test_loadfilesfromdisk_iterable_datapipe(self):
         # test import datapipe class directly
         from torch.utils.data.datapipes.iter import (
-            ListDirFiles,
-            LoadFilesFromDisk,
+            FileLister,
+            FileLoader,
         )
 
         temp_dir = self.temp_dir.name
-        datapipe1 = ListDirFiles(temp_dir, '')
-        datapipe2 = LoadFilesFromDisk(datapipe1)
+        datapipe1 = FileLister(temp_dir, '')
+        datapipe2 = FileLoader(datapipe1)
 
         count = 0
         for rec in datapipe2:
@@ -220,9 +212,9 @@ class TestIterableDataPipeBasic(TestCase):
             tar.add(self.temp_files[0])
             tar.add(self.temp_files[1])
             tar.add(self.temp_files[2])
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromTar(datapipe2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.tar')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.TarArchiveReader(datapipe2)
         # read extracted files before reaching the end of the tarfile
         for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files):
             self.assertTrue(rec is not None and temp_file is not None)
@@ -247,9 +239,9 @@ class TestIterableDataPipeBasic(TestCase):
             myzip.write(self.temp_files[0])
             myzip.write(self.temp_files[1])
             myzip.write(self.temp_files[2])
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.zip')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromZip(datapipe2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.zip')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.ZipArchiveReader(datapipe2)
         # read extracted files before reaching the end of the zipfile
         for rec, temp_file in itertools.zip_longest(datapipe3, self.temp_files):
             self.assertTrue(rec is not None and temp_file is not None)
@@ -271,8 +263,8 @@ class TestIterableDataPipeBasic(TestCase):
         temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png")
         png_data = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single)
         np.save(temp_pngfile_pathname, png_data)
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, ['*.png', '*.txt'])
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
+        datapipe1 = dp.iter.FileLister(temp_dir, ['*.png', '*.txt'])
+        datapipe2 = dp.iter.FileLoader(datapipe1)
 
         def _png_decoder(extension, data):
             if extension != 'png':
@@ -321,10 +313,10 @@ class TestIterableDataPipeBasic(TestCase):
                     f.write('12345abcde')
                 tar.add(file_pathname)
 
-        datapipe1 = dp.iter.ListDirFiles(temp_dir, '*.tar')
-        datapipe2 = dp.iter.LoadFilesFromDisk(datapipe1)
-        datapipe3 = dp.iter.ReadFilesFromTar(datapipe2)
-        datapipe4 = dp.iter.GroupByKey(datapipe3, group_size=2)
+        datapipe1 = dp.iter.FileLister(temp_dir, '*.tar')
+        datapipe2 = dp.iter.FileLoader(datapipe1)
+        datapipe3 = dp.iter.TarArchiveReader(datapipe2)
+        datapipe4 = dp.iter.ByKeyGrouper(datapipe3, group_size=2)
         expected_result = [("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), (
             "f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.json", "h.txt")]
@@ -447,13 +439,14 @@ class TestIterableDataPipeHttp(TestCase):
                 create_temp_files_for_serving(tmpdir, test_file_count,
                                               test_file_size, file_url_template)
 
-            datapipe_dir_f = dp.iter.ListDirFiles(tmpdir, '*_list')
-            datapipe_f_lines = dp.iter.ReadLinesFromFile(datapipe_dir_f)
+            datapipe_dir_f = dp.iter.FileLister(tmpdir, '*_list')
+            datapipe_stream = dp.iter.FileLoader(datapipe_dir_f)
+            datapipe_f_lines = dp.iter.LineReader(datapipe_stream)
             datapipe_line_url: IterDataPipe[str] = \
-                dp.iter.Map(datapipe_f_lines, _get_data_from_tuple_fn, (1,))
+                dp.iter.Mapper(datapipe_f_lines, _get_data_from_tuple_fn, (1,))
             datapipe_http = dp.iter.HttpReader(datapipe_line_url,
                                                timeout=timeout)
-            datapipe_tob = dp.iter.ToBytes(datapipe_http, chunk=chunk)
+            datapipe_tob = dp.iter.StreamReader(datapipe_http, chunk=chunk)
 
             for (url, data) in datapipe_tob:
                 self.assertGreater(len(url), 0)
@@ -539,18 +532,18 @@ class TestFunctionalIterDataPipe(TestCase):
     def _test_picklable(self):
        arr = range(10)
        picklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [
-            (dp.iter.Map, IDP(arr), (), {}),
-            (dp.iter.Map, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
-            (dp.iter.Collate, IDP(arr), (), {}),
-            (dp.iter.Collate, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
+            (dp.iter.Mapper, IDP(arr), (), {}),
+            (dp.iter.Mapper, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
+            (dp.iter.Collator, IDP(arr), (), {}),
+            (dp.iter.Collator, IDP(arr), (_fake_fn, (0, ), {'test': True}), {}),
             (dp.iter.Filter, IDP(arr), (_fake_filter_fn, (0, ), {'test': True}), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes:
             p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs))  # type: ignore[call-arg]
 
         unpicklable_datapipes: List[Tuple[Type[IterDataPipe], IterDataPipe, Tuple, Dict[str, Any]]] = [
-            (dp.iter.Map, IDP(arr), (lambda x: x, ), {}),
-            (dp.iter.Collate, IDP(arr), (lambda x: x, ), {}),
+            (dp.iter.Mapper, IDP(arr), (lambda x: x, ), {}),
+            (dp.iter.Collator, IDP(arr), (lambda x: x, ), {}),
             (dp.iter.Filter, IDP(arr), (lambda x: x >= 5, ), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes:
@@ -566,10 +559,10 @@ class TestFunctionalIterDataPipe(TestCase):
         input_dp2 = IDP(range(5))
 
         with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"):
-            dp.iter.Concat()
+            dp.iter.Concater()
         with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `IterDataPipe`"):
-            dp.iter.Concat(input_dp1, ())  # type: ignore[arg-type]
+            dp.iter.Concater(input_dp1, ())  # type: ignore[arg-type]
 
         concat_dp = input_dp1.concat(input_dp2)
         self.assertEqual(len(concat_dp), 15)
@@ -913,59 +906,17 @@ class TestFunctionalIterDataPipe(TestCase):
         with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
             len(shuffle_dp_nl)
 
-    @skipIfNoTorchVision
-    def test_transforms_datapipe(self):
-        torch.set_default_dtype(torch.float)
-        # A sequence of numpy random numbers representing 3-channel images
-        w = h = 32
-        inputs = [np.random.randint(0, 255, (h, w, 3), dtype=np.uint8) for i in range(10)]
-        tensor_inputs = [torch.tensor(x, dtype=torch.float).permute(2, 0, 1) / 255. for x in inputs]
-
-        input_dp = IDP(inputs)
-        # Raise TypeError for python function
-        with self.assertRaisesRegex(TypeError, r"`transforms` are required to be"):
-            input_dp.legacy_transforms(_fake_fn)
-
-        # transforms.Compose of several transforms
-        transforms = torchvision.transforms.Compose([
-            torchvision.transforms.ToTensor(),
-            torchvision.transforms.Pad(1, fill=1, padding_mode='constant'),
-        ])
-        tsfm_dp = input_dp.legacy_transforms(transforms)
-        self.assertEqual(len(tsfm_dp), len(input_dp))
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data)
-
-        # nn.Sequential of several transforms (required to be instances of nn.Module)
-        input_dp = IDP(tensor_inputs)
-        transforms = nn.Sequential(
-            torchvision.transforms.Pad(1, fill=1, padding_mode='constant'),
-        )
-        tsfm_dp = input_dp.legacy_transforms(transforms)
-        self.assertEqual(len(tsfm_dp), len(input_dp))
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data[:, 1:(h + 1), 1:(w + 1)], input_data)
-
-        # Single transform
-        input_dp = IDP_NoLen(inputs)  # type: ignore[assignment]
-        transform = torchvision.transforms.ToTensor()
-        tsfm_dp = input_dp.legacy_transforms(transform)
-        with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
-            len(tsfm_dp)
-        for tsfm_data, input_data in zip(tsfm_dp, tensor_inputs):
-            self.assertEqual(tsfm_data, input_data)
-
     def test_zip_datapipe(self):
         with self.assertRaises(TypeError):
-            dp.iter.Zip(IDP(range(10)), list(range(10)))  # type: ignore[arg-type]
+            dp.iter.Zipper(IDP(range(10)), list(range(10)))  # type: ignore[arg-type]
 
-        zipped_dp = dp.iter.Zip(IDP(range(10)), IDP_NoLen(range(5)))  # type: ignore[var-annotated]
+        zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP_NoLen(range(5)))  # type: ignore[var-annotated]
         with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"):
             len(zipped_dp)
         exp = list((i, i) for i in range(5))
         self.assertEqual(list(zipped_dp), exp)
 
-        zipped_dp = dp.iter.Zip(IDP(range(10)), IDP(range(5)))
+        zipped_dp = dp.iter.Zipper(IDP(range(10)), IDP(range(5)))
         self.assertEqual(len(zipped_dp), 5)
         self.assertEqual(list(zipped_dp), exp)
         # Reset
@@ -979,8 +930,8 @@ class TestFunctionalMapDataPipe(TestCase):
         picklable_datapipes: List[
             Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]]
         ] = [
-            (dp.map.Map, MDP(arr), (), {}),
-            (dp.map.Map, MDP(arr), (_fake_fn, (0,), {'test': True}), {}),
+            (dp.map.Mapper, MDP(arr), (), {}),
+            (dp.map.Mapper, MDP(arr), (_fake_fn, (0,), {'test': True}), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in picklable_datapipes:
             p = pickle.dumps(dpipe(input_dp, *dp_args, **dp_kwargs))  # type: ignore[call-arg]
@@ -988,7 +939,7 @@
         unpicklable_datapipes: List[
             Tuple[Type[MapDataPipe], MapDataPipe, Tuple, Dict[str, Any]]
         ] = [
-            (dp.map.Map, MDP(arr), (lambda x: x,), {}),
+            (dp.map.Mapper, MDP(arr), (lambda x: x,), {}),
         ]
         for dpipe, input_dp, dp_args, dp_kwargs in unpicklable_datapipes:
             with warnings.catch_warnings(record=True) as wa:
@@ -1005,10 +956,10 @@
         input_dp2 = MDP(range(5))
 
         with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"):
-            dp.map.Concat()
+            dp.map.Concater()
         with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"):
-            dp.map.Concat(input_dp1, ())  # type: ignore[arg-type]
+            dp.map.Concater(input_dp1, ())  # type: ignore[arg-type]
 
         concat_dp = input_dp1.concat(input_dp2)
         self.assertEqual(len(concat_dp), 15)

diff --git a/torch/utils/data/datapipes/iter/__init__.py b/torch/utils/data/datapipes/iter/__init__.py
index bdaef95..5af2ab6 100644
--- a/torch/utils/data/datapipes/iter/__init__.py
+++ b/torch/utils/data/datapipes/iter/__init__.py
@@ -1,38 +1,31 @@
 from torch.utils.data.datapipes.iter.callable import (
-    CollateIterDataPipe as Collate,
-    MapIterDataPipe as Map,
-    TransformsIterDataPipe as Transforms,
+    CollatorIterDataPipe as Collator,
+    MapperIterDataPipe as Mapper,
 )
 from torch.utils.data.datapipes.iter.combinatorics import (
     SamplerIterDataPipe as Sampler,
-    ShuffleIterDataPipe as Shuffle,
+    ShufflerIterDataPipe as Shuffler,
 )
 from torch.utils.data.datapipes.iter.combining import (
-    ConcatIterDataPipe as Concat,
-    ZipIterDataPipe as Zip,
+    ConcaterIterDataPipe as Concater,
+    ZipperIterDataPipe as Zipper,
+)
+from torch.utils.data.datapipes.iter.filelister import (
+    FileListerIterDataPipe as FileLister,
+)
+from torch.utils.data.datapipes.iter.fileloader import (
+    FileLoaderIterDataPipe as FileLoader,
 )
 from torch.utils.data.datapipes.iter.grouping import (
-    BatchIterDataPipe as Batch,
+    BatcherIterDataPipe as Batcher,
     BucketBatcherIterDataPipe as BucketBatcher,
-    GroupByKeyIterDataPipe as GroupByKey,
+    ByKeyGrouperIterDataPipe as ByKeyGrouper,
 )
 from torch.utils.data.datapipes.iter.httpreader import (
     HTTPReaderIterDataPipe as HttpReader,
 )
-from torch.utils.data.datapipes.iter.listdirfiles import (
-    ListDirFilesIterDataPipe as ListDirFiles,
-)
-from torch.utils.data.datapipes.iter.loadfilesfromdisk import (
-    LoadFilesFromDiskIterDataPipe as LoadFilesFromDisk,
-)
-from torch.utils.data.datapipes.iter.readfilesfromtar import (
-    ReadFilesFromTarIterDataPipe as ReadFilesFromTar,
-)
-from torch.utils.data.datapipes.iter.readfilesfromzip import (
-    ReadFilesFromZipIterDataPipe as ReadFilesFromZip,
-)
-from torch.utils.data.datapipes.iter.readlinesfromfile import (
-    ReadLinesFromFileIterDataPipe as ReadLinesFromFile,
+from torch.utils.data.datapipes.iter.linereader import (
+    LineReaderIterDataPipe as LineReader,
 )
 from torch.utils.data.datapipes.iter.routeddecoder import (
     RoutedDecoderIterDataPipe as RoutedDecoder,
@@ -40,33 +33,38 @@ from torch.utils.data.datapipes.iter.routeddecoder import (
 from torch.utils.data.datapipes.iter.selecting import (
     FilterIterDataPipe as Filter,
 )
-from torch.utils.data.datapipes.iter.tobytes import (
-    ToBytesIterDataPipe as ToBytes,
+from torch.utils.data.datapipes.iter.streamreader import (
+    StreamReaderIterDataPipe as StreamReader,
+)
+from torch.utils.data.datapipes.iter.tararchivereader import (
+    TarArchiveReaderIterDataPipe as TarArchiveReader,
+)
+from torch.utils.data.datapipes.iter.ziparchivereader import (
+    ZipArchiveReaderIterDataPipe as ZipArchiveReader,
 )
 from torch.utils.data.datapipes.iter.utils import (
     IterableAsDataPipeIterDataPipe as IterableAsDataPipe,
 )
 
-__all__ = ['Batch',
+__all__ = ['Batcher',
            'BucketBatcher',
-           'Collate',
-           'Concat',
+           'ByKeyGrouper',
+           'Collator',
+           'Concater',
+           'FileLister',
+           'FileLoader',
            'Filter',
-           'GroupByKey',
            'HttpReader',
            'IterableAsDataPipe',
-           'ListDirFiles',
-           'LoadFilesFromDisk',
-           'Map',
-           'ReadFilesFromTar',
-           'ReadFilesFromZip',
-           'ReadLinesFromFile',
+           'LineReader',
+           'Mapper',
            'RoutedDecoder',
            'Sampler',
-           'Shuffle',
-           'ToBytes',
-           'Transforms',
-           'Zip']
+           'Shuffler',
+           'StreamReader',
+           'TarArchiveReader',
+           'ZipArchiveReader',
+           'Zipper']
 
 # Please keep this list sorted
 assert __all__ == sorted(__all__)
diff --git a/torch/utils/data/datapipes/iter/callable.py b/torch/utils/data/datapipes/iter/callable.py
index cc0f9e1..18f6f17 100644
--- a/torch/utils/data/datapipes/iter/callable.py
+++ b/torch/utils/data/datapipes/iter/callable.py
@@ -1,5 +1,4 @@
 import warnings
-import torch.nn as nn
 
 from torch.utils.data import IterDataPipe, _utils, functional_datapipe, DataChunk
 from typing import Callable, Dict, Iterator, Optional, Sized, Tuple, TypeVar
@@ -26,8 +25,8 @@ def default_fn(data):
 
 
 @functional_datapipe('map')
-class MapIterDataPipe(IterDataPipe[T_co]):
-    r""" :class:`MapIterDataPipe`.
+class MapperIterDataPipe(IterDataPipe[T_co]):
+    r""" :class:`MapperIterDataPipe`.
 
     Iterable DataPipe to run a function over each item from the source DataPipe.
     The function can be any regular python function or partial object. Lambda
@@ -108,8 +107,8 @@ class MapIterDataPipe(IterDataPipe[T_co]):
 
 
 @functional_datapipe('collate')
-class CollateIterDataPipe(MapIterDataPipe):
-    r""" :class:`CollateIterDataPipe`.
+class CollatorIterDataPipe(MapperIterDataPipe):
+    r""" :class:`CollatorIterDataPipe`.
 
     Iterable DataPipe to collate samples from datapipe to Tensor(s) by `util_.collate.default_collate`,
     or customized Data Structure by collate_fn.
@@ -153,35 +152,3 @@ class CollateIterDataPipe(MapIterDataPipe):
                  fn_kwargs: Optional[Dict] = None,
                  ) -> None:
         super().__init__(datapipe, fn=collate_fn, fn_args=fn_args, fn_kwargs=fn_kwargs)
-
-
-@functional_datapipe('legacy_transforms')
-class TransformsIterDataPipe(MapIterDataPipe):
-    r""" :class:`TransformsIterDataPipe`.
-
-    Iterable DataPipe to use transform(s) from torchvision or torchaudio to transform
-    data from datapipe.
-    args:
-        datapipe: Iterable DataPipe being transformed
-        transforms: A transform or a sequence of transforms from torchvision or torchaudio.
-    """
-
-    def __init__(self,
-                 datapipe: IterDataPipe,
-                 transforms: Callable,
-                 ) -> None:
-        # Type checking for transforms
-        transforms_types: Tuple = (nn.Module, )
-        try:
-            # Specific types of transforms other than `nn.Module` from torchvision
-            import torchvision.transforms as tsfm
-            transforms_types += (tsfm.Compose, tsfm.RandomChoice, tsfm.RandomOrder,
-                                 tsfm.ToPILImage, tsfm.ToTensor, tsfm.Lambda)
-        except ImportError:
-            pass
-
-        if not isinstance(transforms, transforms_types):
-            raise TypeError("`transforms` are required to be a callable from "
-                            "torchvision.transforms or torchaudio.transforms")
-
-        super().__init__(datapipe, fn=transforms)
diff --git a/torch/utils/data/datapipes/iter/combinatorics.py b/torch/utils/data/datapipes/iter/combinatorics.py
index a8b1e3d..d1a7dd0 100644
--- a/torch/utils/data/datapipes/iter/combinatorics.py
+++ b/torch/utils/data/datapipes/iter/combinatorics.py
@@ -44,8 +44,8 @@ class SamplerIterDataPipe(IterDataPipe[T_co]):
 
 
 @functional_datapipe('shuffle')
-class ShuffleIterDataPipe(IterDataPipe[T_co]):
-    r""" :class:`ShuffleIterDataPipe`
+class ShufflerIterDataPipe(IterDataPipe[T_co]):
+    r""" :class:`ShufflerIterDataPipe`
 
     Iterable DataPipe to shuffle the input DataPipe with a buffer. The buffer
     with `buffer_size` is filled with elements from the datapipe first. Then,
diff --git a/torch/utils/data/datapipes/iter/combining.py b/torch/utils/data/datapipes/iter/combining.py
index 0693b1f..4b28e09 100644
--- a/torch/utils/data/datapipes/iter/combining.py
+++ b/torch/utils/data/datapipes/iter/combining.py
@@ -7,8 +7,8 @@ T_co = TypeVar('T_co', covariant=True)
 
 
 @functional_datapipe('concat')
-class ConcatIterDataPipe(IterDataPipe):
-    r""" :class:`ConcatIterDataPipe`.
+class ConcaterIterDataPipe(IterDataPipe):
+    r""" :class:`ConcaterIterDataPipe`.
 
     Iterable DataPipe to concatenate multiple Iterable DataPipes.
     args:
@@ -54,7 +54,7 @@ class IterateBuffer(IterDataPipe):
 
 
 @functional_datapipe('fork')
-class ForkIterDataPipe(IterDataPipe):
+class ForkerIterDataPipe(IterDataPipe):
 
     def __new__(cls, datapipe, instances):
         result = []
@@ -96,7 +96,7 @@ class MultiplexerIterDataPipe(IterDataPipe):
 
 
 @functional_datapipe('zip')
-class ZipIterDataPipe(IterDataPipe[Tuple[T_co]]):
-    r""" :class:`ZipIterDataPipe`.
+class ZipperIterDataPipe(IterDataPipe[Tuple[T_co]]):
+    r""" :class:`ZipperIterDataPipe`.
 
     Iterable DataPipe aggregates elements into a tuple from each of
diff --git a/torch/utils/data/datapipes/iter/listdirfiles.py b/torch/utils/data/datapipes/iter/filelister.py
similarity index 93%
rename from torch/utils/data/datapipes/iter/listdirfiles.py
rename to torch/utils/data/datapipes/iter/filelister.py
index 91ef8a3..48fdce9 100644
--- a/torch/utils/data/datapipes/iter/listdirfiles.py
+++ b/torch/utils/data/datapipes/iter/filelister.py
@@ -2,8 +2,8 @@ from torch.utils.data import IterDataPipe
 from torch.utils.data.datapipes.utils.common import get_file_pathnames_from_root
 from typing import List, Union, Iterator
 
-class ListDirFilesIterDataPipe(IterDataPipe[str]):
-    r""" :class:`ListDirFilesIterDataPipe`
+class FileListerIterDataPipe(IterDataPipe[str]):
+    r""" :class:`FileListerIterDataPipe`
 
     Iterable DataPipe to load file pathname(s) (path + filename), yield pathname from given disk root dir.
     args:
diff --git a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py b/torch/utils/data/datapipes/iter/fileloader.py
similarity index 93%
rename from torch/utils/data/datapipes/iter/loadfilesfromdisk.py
rename to torch/utils/data/datapipes/iter/fileloader.py
index c9dd5da..2b73e4e 100644
--- a/torch/utils/data/datapipes/iter/loadfilesfromdisk.py
+++ b/torch/utils/data/datapipes/iter/fileloader.py
@@ -5,8 +5,8 @@ from torch.utils.data import IterDataPipe
 from torch.utils.data.datapipes.utils.common import get_file_binaries_from_pathnames
 
 
-class LoadFilesFromDiskIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
-    r""" :class:`LoadFilesFromDiskIterDataPipe`.
+class FileLoaderIterDataPipe(IterDataPipe[Tuple[str, IOBase]]):
+    r""" :class:`FileLoaderIterDataPipe`.
 
     Iterable Datapipe to load file streams from given pathnames,
     yield pathname and file stream in a tuple.
diff --git a/torch/utils/data/datapipes/iter/grouping.py b/torch/utils/data/datapipes/iter/grouping.py
index e6304c2..5f44948 100644
--- a/torch/utils/data/datapipes/iter/grouping.py
+++ b/torch/utils/data/datapipes/iter/grouping.py
@@ -32,8 +32,8 @@ class ShardingFilterIterDataPipe(IterDataPipe):
 
 
 @functional_datapipe('batch')
-class BatchIterDataPipe(IterDataPipe[DataChunk[T_co]]):
-    r""" :class:`BatchIterDataPipe`.
+class BatcherIterDataPipe(IterDataPipe[DataChunk[T_co]]):
+    r""" :class:`BatcherIterDataPipe`.
 
     Iterable DataPipe to create mini-batches of data. An outer dimension will be added as
     `batch_size` if `drop_last` is set to `True`, or `length % batch_size` for the
@@ -93,8 +93,8 @@ class BatchIterDataPipe(IterDataPipe[DataChunk[T_co]]):
 
 
 @functional_datapipe('unbatch')
-class UnBatchIterDataPipe(IterDataPipe):
-    r""" :class:`UnBatchIterDataPipe`.
+class UnBatcherIterDataPipe(IterDataPipe):
+    r""" :class:`UnBatcherIterDataPipe`.
 
     Iterable DataPipe to undo batching of data. In other words, it flattens the data up to the specified level
     within a batched DataPipe.
@@ -255,7 +255,7 @@ def default_sort_data_fn(datalist: List[Tuple[str, Any]]):
 
 
 @functional_datapipe('groupby')
-class GroupByIterDataPipe(IterDataPipe):
+class GrouperIterDataPipe(IterDataPipe):
     # TODO(VtalyFedyunin): Add inline docs and tests (they are partially available in notebooks)
     def __init__(self,
                  datapipe: IterDataPipe[T_co],
@@ -329,7 +329,7 @@ class GroupByIterDataPipe(IterDataPipe):
 
 
 @functional_datapipe('group_by_key')
-class GroupByKeyIterDataPipe(IterDataPipe[list]):
-    r""" :class:`GroupByKeyIterDataPipe`.
+class ByKeyGrouperIterDataPipe(IterDataPipe[list]):
+    r""" :class:`ByKeyGrouperIterDataPipe`.
 
     Iterable datapipe to group data from input iterable by keys which are generated from `group_key_fn`,
diff --git a/torch/utils/data/datapipes/iter/linereader.py b/torch/utils/data/datapipes/iter/linereader.py
new file mode 100644
index 0000000..2b15b93
--- /dev/null
+++ b/torch/utils/data/datapipes/iter/linereader.py
@@ -0,0 +1,18 @@
+from typing import Tuple
+from torch.utils.data import IterDataPipe
+
+
+class LineReaderIterDataPipe(IterDataPipe[Tuple[str, str]]):
+    r""" :class:`LineReaderIterDataPipe`
+
+    Iterable DataPipe to load file name and stream as source IterDataPipe
+    and yield filename and line(s).
+    """
+
+    def __init__(self, source_datapipe):
+        self.source_datapipe = source_datapipe
+
+    def __iter__(self):
+        for file_name, stream in self.source_datapipe:
+            for line in stream:
+                yield file_name, line
diff --git a/torch/utils/data/datapipes/iter/readlinesfromfile.py b/torch/utils/data/datapipes/iter/readlinesfromfile.py
deleted file mode 100644
index c8366af..0000000
--- a/torch/utils/data/datapipes/iter/readlinesfromfile.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from typing import Tuple
-from torch.utils.data import IterDataPipe
-
-
-class ReadLinesFromFileIterDataPipe(IterDataPipe[Tuple[str, str]]):
-    r""" :class:`ReadLinesFromFileDataPipe`
-
-    Iterable DataPipe to load file names as source iter data pipe
-    and yield filename and line(s).
-    """
-
-    def __init__(self, source_datapipe):
-        self.source_datapipe = source_datapipe
-
-    def __iter__(self):
-        for file_name in self.source_datapipe:
-            with open(file_name) as file:
-                for line in file:
-                    yield (file_name, line)
diff --git a/torch/utils/data/datapipes/iter/selecting.py b/torch/utils/data/datapipes/iter/selecting.py
index 46a613a..83872ce 100644
--- a/torch/utils/data/datapipes/iter/selecting.py
+++ b/torch/utils/data/datapipes/iter/selecting.py
@@ -1,13 +1,13 @@
 from torch.utils.data import IterDataPipe, functional_datapipe, DataChunk
 from typing import Callable, TypeVar, Iterator, Optional, Tuple, Dict
 
-from .callable import MapIterDataPipe
+from .callable import MapperIterDataPipe
 
 T_co = TypeVar('T_co', covariant=True)
 
 
 @functional_datapipe('filter')
-class FilterIterDataPipe(MapIterDataPipe):
+class FilterIterDataPipe(MapperIterDataPipe):
     r""" :class:`FilterIterDataPipe`.
 
     Iterable DataPipe to filter elements from datapipe according to filter_fn.
diff --git a/torch/utils/data/datapipes/iter/tobytes.py b/torch/utils/data/datapipes/iter/streamreader.py
similarity index 85%
rename from torch/utils/data/datapipes/iter/tobytes.py
rename to torch/utils/data/datapipes/iter/streamreader.py
index 21fd82d..f74efe7 100644
--- a/torch/utils/data/datapipes/iter/tobytes.py
+++ b/torch/utils/data/datapipes/iter/streamreader.py
@@ -2,8 +2,8 @@ from typing import Tuple
 from torch.utils.data import IterDataPipe
 
 
-class ToBytesIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
-    r""" :class:`ToBytesIterDataPipe`
+class StreamReaderIterDataPipe(IterDataPipe[Tuple[str, bytes]]):
+    r""" :class:`StreamReaderIterDataPipe`
 
     Iterable DataPipe to load IO stream with label name,
     and to yield bytes with label name in a tuple
diff --git a/torch/utils/data/datapipes/iter/readfilesfromtar.py b/torch/utils/data/datapipes/iter/tararchivereader.py
similarity index 96%
rename from torch/utils/data/datapipes/iter/readfilesfromtar.py
rename to torch/utils/data/datapipes/iter/tararchivereader.py
index f456602..9145f5f 100644
--- a/torch/utils/data/datapipes/iter/readfilesfromtar.py
+++ b/torch/utils/data/datapipes/iter/tararchivereader.py
@@ -7,8 +7,8 @@ import os
 import tarfile
 import warnings
 
-class ReadFilesFromTarIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
-    r""":class:`ReadFilesFromTarIterDataPipe`.
+class TarArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
+    r""" :class:`TarArchiveReaderIterDataPipe`.
 
     Iterable datapipe to extract tar binary streams from input iterable which contains tuples of
     pathname and tar binary stream, yields pathname and extracted binary stream in a tuple.
diff --git a/torch/utils/data/datapipes/iter/readfilesfromzip.py b/torch/utils/data/datapipes/iter/ziparchivereader.py
similarity index 96%
rename from torch/utils/data/datapipes/iter/readfilesfromzip.py
rename to torch/utils/data/datapipes/iter/ziparchivereader.py
index edb8320..e98bd17 100644
--- a/torch/utils/data/datapipes/iter/readfilesfromzip.py
+++ b/torch/utils/data/datapipes/iter/ziparchivereader.py
@@ -8,8 +8,8 @@ import sys
 import zipfile
 import warnings
 
-class ReadFilesFromZipIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
-    r""" :class:`ReadFilesFromZipIterDataPipe`.
+class ZipArchiveReaderIterDataPipe(IterDataPipe[Tuple[str, BufferedIOBase]]):
+    r""" :class:`ZipArchiveReaderIterDataPipe`.
 
     Iterable data pipe to extract zip binary streams from input iterable which contains tuples of
     pathname and zip binary stream, yields pathname and extracted binary stream in a tuple.
diff --git a/torch/utils/data/datapipes/map/__init__.py b/torch/utils/data/datapipes/map/__init__.py
index b760995..5879165 100644
--- a/torch/utils/data/datapipes/map/__init__.py
+++ b/torch/utils/data/datapipes/map/__init__.py
@@ -1,7 +1,6 @@
 # Functional DataPipe
-from torch.utils.data.datapipes.map.callable import MapMapDataPipe as Map
-from torch.utils.data.datapipes.map.combining import \
-    (ConcatMapDataPipe as Concat)
+from torch.utils.data.datapipes.map.callable import MapperMapDataPipe as Mapper
+from torch.utils.data.datapipes.map.combining import ConcaterMapDataPipe as Concater
 
-__all__ = ['Map', 'Concat']
+__all__ = ['Concater', 'Mapper']
diff --git a/torch/utils/data/datapipes/map/callable.py b/torch/utils/data/datapipes/map/callable.py
index 0045729..8dbad95 100644
--- a/torch/utils/data/datapipes/map/callable.py
+++ b/torch/utils/data/datapipes/map/callable.py
@@ -26,8 +26,8 @@ def default_fn(data):
 
 
 @functional_datapipe('map')
-class MapMapDataPipe(MapDataPipe[T_co]):
-    r""":class:`MapMapDataPipe`.
+class MapperMapDataPipe(MapDataPipe[T_co]):
+    r""":class:`MapperMapDataPipe`.
 
     Map DataPipe to run a function over each item from the source DataPipe.
     The function can be any regular python function or partial object. Lambda
diff --git a/torch/utils/data/datapipes/map/combining.py b/torch/utils/data/datapipes/map/combining.py
index 234d453..4743c37 100644
--- a/torch/utils/data/datapipes/map/combining.py
+++ b/torch/utils/data/datapipes/map/combining.py
@@ -5,8 +5,8 @@ T_co = TypeVar('T_co', covariant=True)
 
 
 @functional_datapipe('concat')
-class ConcatMapDataPipe(MapDataPipe):
-    r""" :class:`ConcatMapDataPipe`.
+class ConcaterMapDataPipe(MapDataPipe):
+    r""" :class:`ConcaterMapDataPipe`.
 
     Map DataPipe to concatenate multiple Map DataPipes.
     The actual index of is the cumulative sum of source datapipes.
-- 
2.7.4
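For reference, a minimal sketch of how a pipeline reads under the new names. This snippet is illustrative, not part of the patch: it assumes a PyTorch build that includes this commit, and "/path/to/archives" is a placeholder directory.

    import torch.utils.data.datapipes as dp

    # Old names: ListDirFiles -> LoadFilesFromDisk -> ReadFilesFromTar
    # New names: FileLister   -> FileLoader        -> TarArchiveReader
    datapipe1 = dp.iter.FileLister("/path/to/archives", "*.tar")
    datapipe2 = dp.iter.FileLoader(datapipe1)
    datapipe3 = dp.iter.TarArchiveReader(datapipe2)

    # The functional API keeps its verb form: `shuffle` now constructs a
    # ShufflerIterDataPipe (formerly ShuffleIterDataPipe).
    datapipe4 = datapipe3.shuffle(buffer_size=5)

    # TarArchiveReader yields (pathname, extracted binary stream) tuples.
    for pathname, stream in datapipe4:
        print(pathname)

Only the class names change; the registered functional names (`map`, `shuffle`, `batch`, `filter`, `concat`, `zip`, `groupby`, ...) are untouched, so code written against the functional API keeps working.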