From 58c25aa85c79eae8e9197f7a68bde3385db36a53 Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Wed, 18 May 2022 14:49:44 -0400 Subject: [PATCH] Update on "Adding lock mechanism to prevent on_disk_cache downloading twice" Fixes #144 [ghstack-poisoned] --- torchdata/datapipes/iter/util/cacheholder.py | 5 ++++- torchdata/datapipes/iter/util/saver.py | 4 +--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/torchdata/datapipes/iter/util/cacheholder.py b/torchdata/datapipes/iter/util/cacheholder.py index 38b2a6e09..2d2e9692e 100644 --- a/torchdata/datapipes/iter/util/cacheholder.py +++ b/torchdata/datapipes/iter/util/cacheholder.py @@ -114,7 +114,10 @@ def _hash_check(filepath, hash_dict, hash_type): else: hash_func = hashlib.md5() - with portalocker.Lock(filepath, "rb", flags=portalocker.LockFlags.SHARED) as f: + # with portalocker.Lock(filepath, "rb", flags=portalocker.LockFlags.SHARED) as f: + # TODO(VitalyFedyunin): Line above will require all readers (Win) to obtain proper locks, + # I'm putting it on hold as we need to modify PyTorch core codebase heavily. + with open(filepath, "rb") as f: chunk = f.read(1024 ** 2) while chunk: hash_func.update(chunk) diff --git a/torchdata/datapipes/iter/util/saver.py b/torchdata/datapipes/iter/util/saver.py index 4a947c536..4cd3e2f62 100644 --- a/torchdata/datapipes/iter/util/saver.py +++ b/torchdata/datapipes/iter/util/saver.py @@ -8,8 +8,6 @@ from typing import Any, Callable, Iterator, Optional, Tuple, Union -# import portalocker - from torchdata.datapipes import functional_datapipe from torchdata.datapipes.iter import IterDataPipe @@ -59,7 +57,7 @@ def __iter__(self) -> Iterator[str]: if not os.path.exists(dirname): os.makedirs(dirname) # with portalocker.Lock(filepath, self.mode, flags=portalocker.LockFlags.EXCLUSIVE) as f: - # TODO(VitalyFedyunin): Enabling line above fails TorchText tests, need to investigate race condition + # TODO(VitalyFedyunin): Enabling line above will require all read sites to be updated (Win). with open(filepath, self.mode) as f: f.write(data) yield filepath