From 7b564580f5836fdaa3113148670ed56648a17e9d Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 07:34:24 +0100 Subject: [PATCH 1/8] lazy import pandas to improve startup time --- monty/json.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/monty/json.py b/monty/json.py index 8440a4225..f0ab1de0a 100644 --- a/monty/json.py +++ b/monty/json.py @@ -19,11 +19,6 @@ except ImportError: np = None # type: ignore -try: - import pandas as pd -except ImportError: - pd = None # type: ignore - try: import pydantic except ImportError: @@ -380,7 +375,9 @@ def default(self, o) -> dict: # pylint: disable=E0202 if isinstance(o, np.generic): return o.item() - if pd is not None: + try: + import pandas as pd + if isinstance(o, pd.DataFrame): return { "@module": "pandas", @@ -393,6 +390,8 @@ def default(self, o) -> dict: # pylint: disable=E0202 "@class": "Series", "data": o.to_json(default_handler=MontyEncoder().encode), } + except ImportError: + pass if bson is not None: if isinstance(o, bson.objectid.ObjectId): From 1540b4237eddda04c92ca345aecf94f6167737af Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 07:35:17 +0100 Subject: [PATCH 2/8] add Path to cd() contextmanager type hint --- monty/os/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/monty/os/__init__.py b/monty/os/__init__.py index edc48e23a..d3d8574e2 100644 --- a/monty/os/__init__.py +++ b/monty/os/__init__.py @@ -5,6 +5,10 @@ import errno import os from contextlib import contextmanager +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path __author__ = "Shyue Ping Ong" __copyright__ = "Copyright 2013, The Materials Project" @@ -15,7 +19,7 @@ @contextmanager -def cd(path): +def cd(path: str | Path): """ A Fabric-inspired cd context that temporarily changes directory for performing some tasks, and returns to the original working directory From ec78d77de2b0d3a011188f87928025b006c142f9 Mon Sep 
17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 07:35:56 +0100 Subject: [PATCH 3/8] snake_case --- tests/test_json.py | 74 +++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/tests/test_json.py b/tests/test_json.py index 985db583e..a7859d426 100644 --- a/tests/test_json.py +++ b/tests/test_json.py @@ -158,13 +158,13 @@ def __init__(self, a, b): self.auto_mson = AutoMSON - def test_to_from_dict(self): + def test_as_from_dict(self): obj = self.good_cls("Hello", "World", "Python") d = obj.as_dict() assert d is not None self.good_cls.from_dict(d) - jsonstr = obj.to_json() - d = json.loads(jsonstr) + json_str = obj.to_json() + d = json.loads(json_str) assert d["@class"], "GoodMSONClass" obj = self.bad_cls("Hello", "World") d = obj.as_dict() @@ -219,7 +219,7 @@ def test_version(self): d = obj.as_dict() assert d["@version"] == tests_version - def test_nested_to_from_dict(self): + def test_nested_as_from_dict(self): GMC = GoodMSONClass a_list = [GMC(1, 1.0, "one"), GMC(2, 2.0, "two")] b_dict = {"first": GMC(3, 3.0, "three"), "second": GMC(4, 4.0, "four")} @@ -313,28 +313,28 @@ def test_as_from_dict(self): def test_torch(self): t = torch.tensor([0, 1, 2]) - jsonstr = json.dumps(t, cls=MontyEncoder) - t2 = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(t, cls=MontyEncoder) + t2 = json.loads(json_str, cls=MontyDecoder) assert isinstance(t2, torch.Tensor) assert t2.type() == t.type() assert np.array_equal(t2, t) t = torch.tensor([1 + 1j, 2 + 1j]) - jsonstr = json.dumps(t, cls=MontyEncoder) - t2 = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(t, cls=MontyEncoder) + t2 = json.loads(json_str, cls=MontyDecoder) assert isinstance(t2, torch.Tensor) assert t2.type() == t.type() assert np.array_equal(t2, t) def test_datetime(self): dt = datetime.datetime.now() - jsonstr = json.dumps(dt, cls=MontyEncoder) - d = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(dt, 
cls=MontyEncoder) + d = json.loads(json_str, cls=MontyDecoder) assert isinstance(d, datetime.datetime) assert dt == d # Test a nested datetime. a = {"dt": dt, "a": 1} - jsonstr = json.dumps(a, cls=MontyEncoder) - d = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(a, cls=MontyEncoder) + d = json.loads(json_str, cls=MontyDecoder) assert isinstance(d["dt"], datetime.datetime) jsanitize(dt, strict=True) @@ -343,33 +343,33 @@ def test_uuid(self): from uuid import UUID, uuid4 uuid = uuid4() - jsonstr = json.dumps(uuid, cls=MontyEncoder) - d = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(uuid, cls=MontyEncoder) + d = json.loads(json_str, cls=MontyDecoder) assert isinstance(d, UUID) assert uuid == d # Test a nested UUID. a = {"uuid": uuid, "a": 1} - jsonstr = json.dumps(a, cls=MontyEncoder) - d = json.loads(jsonstr, cls=MontyDecoder) + json_str = json.dumps(a, cls=MontyEncoder) + d = json.loads(json_str, cls=MontyDecoder) assert isinstance(d["uuid"], UUID) def test_nan(self): x = [float("NaN")] - djson = json.dumps(x, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(x, cls=MontyEncoder) + d = json.loads(dct_json) assert isinstance(d[0], float) def test_numpy(self): x = np.array([1, 2, 3], dtype="int64") with pytest.raises(TypeError): json.dumps(x) - djson = json.dumps(x, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(x, cls=MontyEncoder) + d = json.loads(dct_json) assert d["@class"] == "array" assert d["@module"] == "numpy" assert d["data"], [1, 2 == 3] assert d["dtype"] == "int64" - x = json.loads(djson, cls=MontyDecoder) + x = json.loads(dct_json, cls=MontyDecoder) assert isinstance(x, np.ndarray) x = np.min([1, 2, 3]) > 2 with pytest.raises(TypeError): @@ -378,26 +378,26 @@ def test_numpy(self): x = np.array([1 + 1j, 2 + 1j, 3 + 1j], dtype="complex64") with pytest.raises(TypeError): json.dumps(x) - djson = json.dumps(x, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(x, 
cls=MontyEncoder) + d = json.loads(dct_json) assert d["@class"] == "array" assert d["@module"] == "numpy" assert d["data"], [[1.0, 2.0, 3.0], [1.0, 1.0 == 1.0]] assert d["dtype"] == "complex64" - x = json.loads(djson, cls=MontyDecoder) + x = json.loads(dct_json, cls=MontyDecoder) assert isinstance(x, np.ndarray) assert x.dtype == "complex64" x = np.array([[1 + 1j, 2 + 1j], [3 + 1j, 4 + 1j]], dtype="complex64") with pytest.raises(TypeError): json.dumps(x) - djson = json.dumps(x, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(x, cls=MontyEncoder) + d = json.loads(dct_json) assert d["@class"] == "array" assert d["@module"] == "numpy" assert d["data"], [[[1.0, 2.0], [3.0, 4.0]], [[1.0, 1.0], [1.0 == 1.0]]] assert d["dtype"] == "complex64" - x = json.loads(djson, cls=MontyDecoder) + x = json.loads(dct_json, cls=MontyDecoder) assert isinstance(x, np.ndarray) assert x.dtype == "complex64" @@ -489,22 +489,22 @@ def test_callable(self): ]: with pytest.raises(TypeError): json.dumps(function) - djson = json.dumps(function, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(function, cls=MontyEncoder) + d = json.loads(dct_json) assert "@callable" in d assert "@module" in d - x = json.loads(djson, cls=MontyDecoder) + x = json.loads(dct_json, cls=MontyDecoder) assert x == function # test method bound to instance for function in [instance.method]: with pytest.raises(TypeError): json.dumps(function) - djson = json.dumps(function, cls=MontyEncoder) - d = json.loads(djson) + dct_json = json.dumps(function, cls=MontyEncoder) + d = json.loads(dct_json) assert "@callable" in d assert "@module" in d - x = json.loads(djson, cls=MontyDecoder) + x = json.loads(dct_json, cls=MontyDecoder) # can't just check functions are equal as the instance the function is bound # to will be different. 
Instead, we check that the serialized instance @@ -519,15 +519,15 @@ def test_callable(self): # test that callable MSONable objects still get serialized as the objects # rather than as a callable - djson = json.dumps(instance, cls=MontyEncoder) - assert "@class" in djson + dct_json = json.dumps(instance, cls=MontyEncoder) + assert "@class" in dct_json def test_objectid(self): oid = ObjectId("562e8301218dcbbc3d7d91ce") with pytest.raises(TypeError): json.dumps(oid) - djson = json.dumps(oid, cls=MontyEncoder) - x = json.loads(djson, cls=MontyDecoder) + dct_json = json.dumps(oid, cls=MontyEncoder) + x = json.loads(dct_json, cls=MontyDecoder) assert isinstance(x, ObjectId) def test_jsanitize(self): From a4ff19ccc3dcf4773c681c1be53def21635c16fe Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 23 Jan 2024 06:43:42 +0000 Subject: [PATCH 4/8] pre-commit auto-fixes --- tests/test_files/3000_lines.txt | 2 +- tests/test_files/myfile | 1 - tests/test_files/myfile_txt | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_files/3000_lines.txt b/tests/test_files/3000_lines.txt index 2f8c055bc..1127304b4 100644 --- a/tests/test_files/3000_lines.txt +++ b/tests/test_files/3000_lines.txt @@ -2997,4 +2997,4 @@ 2997 2998 2999 -3000 \ No newline at end of file +3000 diff --git a/tests/test_files/myfile b/tests/test_files/myfile index 1e35ee2e0..459b7cc69 100644 --- a/tests/test_files/myfile +++ b/tests/test_files/myfile @@ -1,2 +1 @@ HelloWorld. - diff --git a/tests/test_files/myfile_txt b/tests/test_files/myfile_txt index 1e35ee2e0..459b7cc69 100644 --- a/tests/test_files/myfile_txt +++ b/tests/test_files/myfile_txt @@ -1,2 +1 @@ HelloWorld. 
- From 3025a1a9296ae68d528db5a50c6885f7cdc1e454 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 07:50:50 +0100 Subject: [PATCH 5/8] future import annotations in monty/os/__init__.py --- monty/os/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/monty/os/__init__.py b/monty/os/__init__.py index d3d8574e2..0568d7f05 100644 --- a/monty/os/__init__.py +++ b/monty/os/__init__.py @@ -2,6 +2,8 @@ Os functions, e.g., cd, makedirs_p. """ +from __future__ import annotations + import errno import os from contextlib import contextmanager From 1376c32131610b3e75d130f21b05e46978eaec63 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 07:55:37 +0100 Subject: [PATCH 6/8] add missing pandas lazy import --- monty/json.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/monty/json.py b/monty/json.py index f0ab1de0a..13c6c3df4 100644 --- a/monty/json.py +++ b/monty/json.py @@ -543,13 +543,18 @@ def process_decoded(self, d): dtype=d["dtype"], ) return np.array(d["data"], dtype=d["dtype"]) - elif pd is not None and modname == "pandas": - if classname == "DataFrame": - decoded_data = MontyDecoder().decode(d["data"]) - return pd.DataFrame(decoded_data) - if classname == "Series": - decoded_data = MontyDecoder().decode(d["data"]) - return pd.Series(decoded_data) + elif modname == "pandas": + try: + import pandas as pd + + if classname == "DataFrame": + decoded_data = MontyDecoder().decode(d["data"]) + return pd.DataFrame(decoded_data) + if classname == "Series": + decoded_data = MontyDecoder().decode(d["data"]) + return pd.Series(decoded_data) + except ImportError: + pass elif ( (bson is not None) and modname == "bson.objectid" From 2724b05652140430f95f946dd2ebd5937d60500f Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 08:00:27 +0100 Subject: [PATCH 7/8] change how jsanitize function handles dataframes and series --- monty/json.py | 3 ++- 1 file changed, 2
insertions(+), 1 deletion(-) diff --git a/monty/json.py b/monty/json.py index 13c6c3df4..eb967680d 100644 --- a/monty/json.py +++ b/monty/json.py @@ -642,7 +642,8 @@ def jsanitize( ] if np is not None and isinstance(obj, np.generic): return obj.item() - if pd is not None and isinstance(obj, (pd.Series, pd.DataFrame)): + if callable(getattr(obj, "to_dict", None)): + # handle dataframes and series. used to check isinstance(obj, (pd.Series, pd.DataFrame)) return obj.to_dict() if isinstance(obj, dict): return { From 127133cca07172a61419a39dfd5fa302c9af7c95 Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Tue, 23 Jan 2024 08:30:06 +0100 Subject: [PATCH 8/8] fix tests --- monty/io.py | 18 +++++++++--------- tests/test_io.py | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/monty/io.py b/monty/io.py index df7f1ffdc..097ca2730 100644 --- a/monty/io.py +++ b/monty/io.py @@ -131,11 +131,11 @@ def reverse_readline( buf = "" m_file.seek(0, 2) if is_text: - lastchar = m_file.read(1) + last_char = m_file.read(1) else: - lastchar = m_file.read(1).decode("utf-8") + last_char = m_file.read(1).decode("utf-8") - trailing_newline = lastchar == "\n" + trailing_newline = last_char == "\n" while 1: newline_pos = buf.rfind("\n") @@ -149,14 +149,14 @@ def reverse_readline( yield line elif pos: # Need to fill buffer - toread = min(blk_size, pos) - m_file.seek(pos - toread, 0) + to_read = min(blk_size, pos) + m_file.seek(pos - to_read, 0) if is_text: - buf = m_file.read(toread) + buf + buf = m_file.read(to_read) + buf else: - buf = m_file.read(toread).decode("utf-8") + buf - m_file.seek(pos - toread, 0) - if pos == toread: + buf = m_file.read(to_read).decode("utf-8") + buf + m_file.seek(pos - to_read, 0) + if pos == to_read: buf = "\n" + buf else: # Start-of-file diff --git a/tests/test_io.py b/tests/test_io.py index 043e67440..4f37a979e 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -28,8 +28,8 @@ def test_reverse_readline(self): order, 
i.e. the first line that is read corresponds to the last line. number """ - with open(os.path.join(test_dir, "3000_lines.txt")) as f: - for idx, line in enumerate(reverse_readline(f)): + with open(os.path.join(test_dir, "3000_lines.txt")) as file: + for idx, line in enumerate(reverse_readline(file)): assert ( int(line) == self.NUMLINES - idx ), "read_backwards read {} whereas it should "( @@ -40,13 +40,13 @@ def test_reverse_readline_fake_big(self): """ Make sure that large textfiles are read properly """ - with open(os.path.join(test_dir, "3000_lines.txt")) as f: - for idx, line in enumerate(reverse_readline(f, max_mem=0)): + with open(os.path.join(test_dir, "3000_lines.txt")) as file: + for idx, line in enumerate(reverse_readline(file, max_mem=0), -1): + if line == "\n": + continue assert ( int(line) == self.NUMLINES - idx - ), "read_backwards read {} whereas it should "( - "have read {" "}" - ).format(int(line), self.NUMLINES - idx) + ), f"read_backwards read {int(line)} whereas it should have read {self.NUMLINES - idx}" def test_reverse_readline_bz2(self): """ @@ -80,7 +80,7 @@ def test_reverse_readfile(self): number """ fname = os.path.join(test_dir, "3000_lines.txt") - for idx, line in enumerate(reverse_readfile(fname)): + for idx, line in enumerate(filter(bool, reverse_readfile(fname))): assert int(line) == self.NUMLINES - idx def test_reverse_readfile_gz(self): @@ -127,7 +127,7 @@ def test_zopen(self): with zopen(os.path.join(test_dir, "myfile_lzma.lzma"), "rt") as f: assert f.read() == "HelloWorld.\n\n" with zopen(os.path.join(test_dir, "myfile"), mode="rt") as f: - assert f.read() == "HelloWorld.\n\n" + assert f.read() == "HelloWorld.\n" @unittest.skipIf(Path is None, "Not Py3k") def test_Path_objects(self):