Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix line ending in reverse_readfile/readline in Windows #700

Merged
merged 25 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
cf80cb5
format and docstring tweaks
DanielYang59 Jul 23, 2024
25a41e0
use .rstrip()
DanielYang59 Jul 23, 2024
9342718
more docstring tweaks
DanielYang59 Jul 23, 2024
96ee909
some docstring tweaks
DanielYang59 Jul 23, 2024
c4c3ba5
remove guard again pathlib import
DanielYang59 Jul 23, 2024
0c04f09
pre-commit auto-fixes
pre-commit-ci[bot] Jul 23, 2024
0dc6421
format tweak in unit test
DanielYang59 Jul 23, 2024
8a0eef3
use capital global var name test_dir
DanielYang59 Jul 23, 2024
eb81e83
add unit tests for line end
DanielYang59 Jul 23, 2024
6fdfcff
add docstring for tests
DanielYang59 Jul 23, 2024
42ab074
pre-commit auto-fixes
pre-commit-ci[bot] Jul 23, 2024
823edee
use specific ignore tag
DanielYang59 Jul 23, 2024
e6e7842
Merge branch 'fix-reverse-read' of github.com:DanielYang59/monty into…
DanielYang59 Jul 23, 2024
bb8edd9
correct temp test file name
DanielYang59 Jul 23, 2024
aabf7b4
update reverse_readfile to use os.linesep
DanielYang59 Jul 23, 2024
a228a28
pre-commit auto-fixes
pre-commit-ci[bot] Jul 23, 2024
231121c
fix readline func and add unit test
DanielYang59 Jul 23, 2024
b4eb3cf
Merge branch 'fix-reverse-read' of github.com:DanielYang59/monty into…
DanielYang59 Jul 23, 2024
5eb2d9a
Merge branch 'fix-reverse-read' of github.com:DanielYang59/monty into…
DanielYang59 Jul 23, 2024
18d0b50
remove issue from merge
DanielYang59 Jul 23, 2024
44a2302
pre-commit auto-fixes
pre-commit-ci[bot] Jul 23, 2024
7a1ed12
remove debug files
DanielYang59 Jul 23, 2024
73089d4
fix var name
DanielYang59 Jul 23, 2024
8e4817e
Merge branch 'master' into fix-reverse-read
DanielYang59 Jul 24, 2024
2dfec96
remove unused ignore tag
DanielYang59 Jul 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/monty/functools.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def __init__(self, func: Callable) -> None:
func: Function to decorate.
"""
self.__func = func
wraps(self.__func)(self) # type: ignore
wraps(self.__func)(self) # type: ignore[arg-type]

def __get__(self, inst: Any, inst_cls) -> Any:
if inst is None:
Expand All @@ -95,7 +95,7 @@ def __get__(self, inst: Any, inst_cls) -> Any:
f"'{inst_cls.__name__}' object has no attribute '__dict__'"
)

name = self.__name__ # type: ignore # pylint: disable=E1101
name = self.__name__ # type: ignore[attr-defined] # pylint: disable=E1101
if name.startswith("__") and not name.endswith("__"):
name = f"_{inst_cls.__name__}{name}"

Expand Down
137 changes: 73 additions & 64 deletions src/monty/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
try:
import lzma
except ImportError:
lzma = None # type: ignore
lzma = None # type: ignore[assignment]
import mmap
import os
import subprocess
Expand All @@ -27,15 +27,15 @@

def zopen(filename: Union[str, Path], *args, **kwargs) -> IO:
"""
This function wraps around the bz2, gzip, lzma, xz and standard python's open
This function wraps around the bz2, gzip, lzma, xz and standard Python's open
function to deal intelligently with bzipped, gzipped or standard text
files.

Args:
filename (str/Path): filename or pathlib.Path.
*args: Standard args for python open(..). E.g., 'r' for read, 'w' for
*args: Standard args for Python open(..). E.g., 'r' for read, 'w' for
write.
**kwargs: Standard kwargs for python open(..).
**kwargs: Standard kwargs for Python open(..).

Returns:
File-like object. Supports with context.
Expand All @@ -47,39 +47,39 @@ def zopen(filename: Union[str, Path], *args, **kwargs) -> IO:
ext = ext.upper()
if ext == ".BZ2":
return bz2.open(filename, *args, **kwargs)
if ext in (".GZ", ".Z"):
if ext in {".GZ", ".Z"}:
return gzip.open(filename, *args, **kwargs)
if (lzma is not None) and (ext in (".XZ", ".LZMA")):
if lzma is not None and ext in {".XZ", ".LZMA"}:
return lzma.open(filename, *args, **kwargs)
return open(filename, *args, **kwargs) # pylint: disable=R1732
return open(filename, *args, **kwargs)


def reverse_readfile(filename: Union[str, Path]) -> Generator[str, str, None]:
"""
A much faster reverse read of file by using Python's mmap to generate a
memory-mapped file. It is slower for very small files than
reverse_readline, but at least 2x faster for large files (the primary use
of such a method).
of such a function).

Args:
filename (str):
Name of file to read.
filename (str | Path): File to read.

Yields:
Lines from the file in reverse order.
"""
try:
with zopen(filename, "rb") as f:
if isinstance(f, (gzip.GzipFile, bz2.BZ2File)):
for line in reversed(f.readlines()):
yield line.decode("utf-8").rstrip()
with zopen(filename, "rb") as file:
if isinstance(file, (gzip.GzipFile, bz2.BZ2File)):
for line in reversed(file.readlines()):
yield line.decode("utf-8").rstrip(os.linesep)
else:
fm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
n = len(fm)
filemap = mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ)
n = len(filemap)
while n > 0:
i = fm.rfind(b"\n", 0, n)
yield fm[i + 1 : n].decode("utf-8").strip("\n")
i = filemap.rfind(os.linesep.encode(), 0, n)
yield filemap[i + 1 : n].decode("utf-8").rstrip(os.linesep)
n = i

except ValueError:
return

Expand All @@ -88,19 +88,20 @@ def reverse_readline(
m_file, blk_size: int = 4096, max_mem: int = 4000000
) -> Generator[str, str, None]:
"""
Generator method to read a file line-by-line, but backwards. This allows
one to efficiently get data at the end of a file.

Based on code by Peter Astrand <[email protected]>, using modifications by
Raymond Hettinger and Kevin German.
http://code.activestate.com/recipes/439045-read-a-text-file-backwards
-yet-another-implementat/
Generator function to read a file line-by-line, but backwards.
This allows one to efficiently get data at the end of a file.

Reads file forwards and reverses in memory for files smaller than the
Read file forwards and reverse in memory for files smaller than the
max_mem parameter, or for gzip files where reverse seeks are not supported.

Files larger than max_mem are dynamically read backwards.

Reference:
Based on code by Peter Astrand <[email protected]>, using modifications
by Raymond Hettinger and Kevin German.
http://code.activestate.com/recipes/439045-read-a-text-file-backwards
-yet-another-implementat/

Args:
m_file (File): File stream to read (backwards)
blk_size (int): The buffer size. Defaults to 4096.
Expand All @@ -110,8 +111,8 @@ def reverse_readline(
this sets the maximum block size.

Returns:
Generator that returns lines from the file. Similar behavior to the
file.readline() method, except the lines are returned from the back
Generator that yields lines from the file. Behaves similarly to the
file.readline() function, except the lines are returned from the back
of the file.
"""
# Check if the file stream is a bit stream or not
Expand Down Expand Up @@ -141,18 +142,19 @@ def reverse_readline(
m_file.seek(0, 2)
lastchar = m_file.read(1) if is_text else m_file.read(1).decode("utf-8")

trailing_newline = lastchar == "\n"
trailing_newline = lastchar == os.linesep

while 1:
newline_pos = buf.rfind("\n")
while True:
newline_pos = buf.rfind(os.linesep)
pos = m_file.tell()
if newline_pos != -1:
# Found a newline
line = buf[newline_pos + 1 :]
buf = buf[:newline_pos]
if pos or newline_pos or trailing_newline:
line += "\n"
line += os.linesep
yield line

elif pos:
# Need to fill buffer
toread = min(blk_size, pos)
Expand All @@ -163,7 +165,8 @@ def reverse_readline(
buf = m_file.read(toread).decode("utf-8") + buf
m_file.seek(pos - toread, 0)
if pos == toread:
buf = "\n" + buf
buf = os.linesep + buf

else:
# Start-of-file
return
Expand All @@ -178,21 +181,24 @@ class FileLock:
A file locking mechanism that has context-manager support so you can use
it in a with statement. This should be relatively cross-compatible as it
doesn't rely on msvcrt or fcntl for the locking.

Taken from http://www.evanfosmark.com/2009/01/cross-platform-file-locking
-support-in-python/
"""

Error = FileLockException

def __init__(self, file_name: str, timeout: float = 10, delay: float = 0.05):
def __init__(
self, file_name: str, timeout: float = 10, delay: float = 0.05
) -> None:
"""
Prepare the file locker. Specify the file to lock and optionally
the maximum timeout and the delay between each attempt to lock.

Args:
file_name: Name of file to lock.
timeout: Maximum timeout for locking. Defaults to 10.
delay: Delay between each attempt to lock. Defaults to 0.05.
file_name (str): Name of file to lock.
timeout (float): Maximum timeout in seconds for locking. Defaults to 10.
delay (float): Delay in seconds between each attempt to lock. Defaults to 0.05.
"""
self.file_name = os.path.abspath(file_name)
self.lockfile = f"{os.path.abspath(file_name)}.lock"
Expand All @@ -203,6 +209,30 @@ def __init__(self, file_name: str, timeout: float = 10, delay: float = 0.05):
if self.delay > self.timeout or self.delay <= 0 or self.timeout <= 0:
raise ValueError("delay and timeout must be positive with delay <= timeout")

def __enter__(self):
"""
Activated when used in the with statement. Should automatically
acquire a lock to be used in the with block.
"""
if not self.is_locked:
self.acquire()
return self

def __exit__(self, type_, value, traceback):
"""
Activated at the end of the with statement. It automatically releases
the lock if it is locked.
"""
if self.is_locked:
self.release()

def __del__(self):
"""
Make sure that the FileLock instance doesn't leave a lockfile
lying around.
"""
self.release()

def acquire(self) -> None:
"""
Acquire the lock, if possible. If the lock is in use, it checks again
Expand All @@ -215,8 +245,8 @@ def acquire(self) -> None:
try:
self.fd = os.open(self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR)
break
except OSError as e:
if e.errno != errno.EEXIST:
except OSError as exc:
if exc.errno != errno.EEXIST:
raise
if (time.time() - start_time) >= self.timeout:
raise FileLockException(f"{self.lockfile}: Timeout occurred.")
Expand All @@ -235,37 +265,16 @@ def release(self) -> None:
os.unlink(self.lockfile)
self.is_locked = False

def __enter__(self):
"""
Activated when used in the with statement. Should automatically
acquire a lock to be used in the with block.
"""
if not self.is_locked:
self.acquire()
return self

def __exit__(self, type_, value, traceback):
"""
Activated at the end of the with statement. It automatically releases
the lock if it is locked.
"""
if self.is_locked:
self.release()

def __del__(self):
"""
Make sure that the FileLock instance doesn't leave a lockfile
lying around.
"""
self.release()


def get_open_fds() -> int:
"""
Return the number of open file descriptors for current process
Get the number of open file descriptors for current process.

Warnings:
Will only work on UNIX-like OS-es.

Returns:
int: The number of open file descriptors for current process.
"""
pid: int = os.getpid()
procs: bytes = subprocess.check_output(["lsof", "-w", "-Ff", "-p", str(pid)])
Expand Down
2 changes: 1 addition & 1 deletion src/monty/itertools.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
try:
import numpy as np
except ImportError:
np = None # type: ignore
np = None

if TYPE_CHECKING:
from typing import Iterable
Expand Down
4 changes: 2 additions & 2 deletions src/monty/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,10 +637,10 @@ def default(self, o) -> dict:
"data": str(o),
}
try:
module_version = import_module("pint").__version__ # type: ignore
module_version = import_module("pint").__version__
d["@version"] = str(module_version)
except (AttributeError, ImportError):
d["@version"] = None # type: ignore
d["@version"] = None
return d

if bson is not None and isinstance(o, bson.objectid.ObjectId):
Expand Down
4 changes: 2 additions & 2 deletions src/monty/multiprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def imap_tqdm(nprocs: int, func: Callable, iterable: Iterable, *args, **kwargs)
data = []
with Pool(nprocs) as pool:
try:
n = len(iterable) # type: ignore
n = len(iterable) # type: ignore[arg-type]
except TypeError:
n = None # type: ignore
n = None # type: ignore[arg-type]
with tqdm(total=n) as prog_bar:
for d in pool.imap(func, iterable, *args, **kwargs):
prog_bar.update()
Expand Down
2 changes: 1 addition & 1 deletion src/monty/serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
try:
from ruamel.yaml import YAML
except ImportError:
YAML = None # type: ignore
YAML = None # type: ignore[arg-type]

from monty.io import zopen
from monty.json import MontyDecoder, MontyEncoder
Expand Down
2 changes: 1 addition & 1 deletion tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
TODO: Modify module doc.
Unit tests and test files for Monty.
"""

from __future__ import annotations
Expand Down
2 changes: 1 addition & 1 deletion tests/test_collections.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
from monty.collections import AttrDict, FrozenAttrDict, Namespace, frozendict, tree

test_dir = os.path.join(os.path.dirname(__file__), "test_files")
TEST_DIR = os.path.join(os.path.dirname(__file__), "test_files")


class TestFrozenDict:
Expand Down
Loading
Loading