Lazy pandas import to improve startup time #604

Closed · wants to merge 8 commits
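The motivation for the change: `monty.json` previously did `import pandas as pd` at module level, so anything that imported monty paid the full pandas import cost even if no DataFrame or Series was ever serialized. A rough, non-authoritative way to see that cost on your own machine (assumes monty and pandas are installed; numbers vary and are not from the PR):

```python
import subprocess
import sys

# "-X importtime" is a standard CPython flag that prints a per-module
# import-time breakdown to stderr.
for module in ("monty.json", "pandas"):
    proc = subprocess.run(
        [sys.executable, "-X", "importtime", "-c", f"import {module}"],
        capture_output=True,
        text=True,
    )
    # Assuming the import succeeds, the last stderr line is the cumulative
    # time for the top-level module.
    print(module, proc.stderr.strip().splitlines()[-1])
```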
18 changes: 9 additions & 9 deletions monty/io.py
@@ -131,11 +131,11 @@ def reverse_readline(
         buf = ""
         m_file.seek(0, 2)
         if is_text:
-            lastchar = m_file.read(1)
+            last_char = m_file.read(1)
         else:
-            lastchar = m_file.read(1).decode("utf-8")
+            last_char = m_file.read(1).decode("utf-8")

-        trailing_newline = lastchar == "\n"
+        trailing_newline = last_char == "\n"

         while 1:
             newline_pos = buf.rfind("\n")
@@ -149,14 +149,14 @@ def reverse_readline(
                 yield line
             elif pos:
                 # Need to fill buffer
-                toread = min(blk_size, pos)
-                m_file.seek(pos - toread, 0)
+                to_read = min(blk_size, pos)
+                m_file.seek(pos - to_read, 0)
                 if is_text:
-                    buf = m_file.read(toread) + buf
+                    buf = m_file.read(to_read) + buf
                 else:
-                    buf = m_file.read(toread).decode("utf-8") + buf
-                m_file.seek(pos - toread, 0)
-                if pos == toread:
+                    buf = m_file.read(to_read).decode("utf-8") + buf
+                m_file.seek(pos - to_read, 0)
+                if pos == to_read:
                     buf = "\n" + buf
             else:
                 # Start-of-file
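The monty/io.py hunks only rename `lastchar`/`toread` to `last_char`/`to_read`; behaviour is unchanged. For context, a minimal usage sketch of `reverse_readline` (the file name is hypothetical):

```python
from monty.io import reverse_readline

# Read a file from the end without loading it all into memory; handy for
# grabbing the tail of a large log or output file.
with open("large_output.log") as f:  # hypothetical file
    for line in reverse_readline(f):
        print(line.rstrip())
        break  # only the last line is needed here
```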
33 changes: 19 additions & 14 deletions monty/json.py
@@ -19,11 +19,6 @@
 except ImportError:
     np = None  # type: ignore

-try:
-    import pandas as pd
-except ImportError:
-    pd = None  # type: ignore
-
 try:
     import pydantic
 except ImportError:
@@ -380,7 +375,9 @@
         if isinstance(o, np.generic):
             return o.item()

-        if pd is not None:
+        try:
+            import pandas as pd
+
             if isinstance(o, pd.DataFrame):
                 return {
                     "@module": "pandas",
@@ -393,6 +390,8 @@
                     "@class": "Series",
                     "data": o.to_json(default_handler=MontyEncoder().encode),
                 }
+        except ImportError:
+            pass

         if bson is not None:
             if isinstance(o, bson.objectid.ObjectId):

Codecov / codecov/patch warning: added lines monty/json.py#L393-L394 were not covered by tests.
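The encoder change above is the core of the PR: the module-level `import pandas as pd` is replaced by an import inside `default()`, so pandas is only loaded the first time a DataFrame or Series is actually encoded. A self-contained sketch of the same pattern (class and field names are illustrative, not monty's API):

```python
import json


class LazyPandasEncoder(json.JSONEncoder):
    """Illustrative encoder that defers the pandas import to first use."""

    def default(self, o):
        try:
            # Cheap after the first call: Python caches modules in sys.modules.
            import pandas as pd

            if isinstance(o, (pd.Series, pd.DataFrame)):
                return {"@class": type(o).__name__, "data": o.to_dict()}
        except ImportError:
            # pandas not installed: fall through to the standard TypeError.
            pass
        return super().default(o)
```

The first call that actually sees a pandas object pays the import once; later calls hit `sys.modules` and are effectively free, and callers who never touch pandas never import it.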
@@ -544,13 +543,18 @@
                         dtype=d["dtype"],
                     )
                 return np.array(d["data"], dtype=d["dtype"])
-            elif pd is not None and modname == "pandas":
-                if classname == "DataFrame":
-                    decoded_data = MontyDecoder().decode(d["data"])
-                    return pd.DataFrame(decoded_data)
-                if classname == "Series":
-                    decoded_data = MontyDecoder().decode(d["data"])
-                    return pd.Series(decoded_data)
+            elif modname == "pandas":
+                try:
+                    import pandas as pd
+
+                    if classname == "DataFrame":
+                        decoded_data = MontyDecoder().decode(d["data"])
+                        return pd.DataFrame(decoded_data)
+                    if classname == "Series":
+                        decoded_data = MontyDecoder().decode(d["data"])
+                        return pd.Series(decoded_data)
+                except ImportError:
+                    pass
             elif (
                 (bson is not None)
                 and modname == "bson.objectid"

Codecov / codecov/patch warning: added lines monty/json.py#L556-L557 were not covered by tests.
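With both the encoder and decoder branches in place, pandas objects still round-trip through MontyEncoder/MontyDecoder as before; only the import is deferred. A small sketch (requires monty and pandas; exact index dtypes after the round-trip depend on pandas' `to_json` defaults):

```python
import json

import pandas as pd
from monty.json import MontyDecoder, MontyEncoder

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
text = json.dumps(df, cls=MontyEncoder)        # {"@module": "pandas", "@class": "DataFrame", ...}
restored = json.loads(text, cls=MontyDecoder)

assert isinstance(restored, pd.DataFrame)
assert restored["a"].tolist() == [1, 2]
```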
@@ -638,7 +642,8 @@
         ]
     if np is not None and isinstance(obj, np.generic):
         return obj.item()
-    if pd is not None and isinstance(obj, (pd.Series, pd.DataFrame)):
+    if callable(getattr(obj, "to_dict", None)):
+        # handle dataframes and series. used to check isinstance(obj, (pd.Series, pd.DataFrame))
         return obj.to_dict()
     if isinstance(obj, dict):
         return {
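The `jsanitize` change swaps the pandas `isinstance` check for duck typing: any object with a callable `to_dict()` is converted through it, so pandas never needs to be imported here, and other `to_dict`-providing objects now take the same path. A standalone sketch of that check (the class and function names are made up for illustration):

```python
def sanitize_via_to_dict(obj):
    # Mirrors the `callable(getattr(obj, "to_dict", None))` branch above.
    if callable(getattr(obj, "to_dict", None)):
        return obj.to_dict()
    return obj


class PointLike:
    """Any object exposing to_dict() now takes this path, pandas or not."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def to_dict(self):
        return {"x": self.x, "y": self.y}


print(sanitize_via_to_dict(PointLike(1, 2)))  # {'x': 1, 'y': 2}
print(sanitize_via_to_dict(3.14))             # unchanged: 3.14
```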
8 changes: 7 additions & 1 deletion monty/os/__init__.py
@@ -2,9 +2,15 @@
 Os functions, e.g., cd, makedirs_p.
 """

+from __future__ import annotations
+
 import errno
 import os
 from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path

 __author__ = "Shyue Ping Ong"
 __copyright__ = "Copyright 2013, The Materials Project"

Codecov / codecov/patch warning: added line monty/os/__init__.py#L13 was not covered by tests.
@@ -15,7 +21,7 @@


 @contextmanager
-def cd(path):
+def cd(path: str | Path):
     """
     A Fabric-inspired cd context that temporarily changes directory for
     performing some tasks, and returns to the original working directory
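monty/os now annotates `cd()` as accepting `str | Path`, with the `Path` import guarded by `TYPE_CHECKING` so it costs nothing at runtime. Usage is unchanged; a short sketch (the target directory is arbitrary):

```python
from pathlib import Path

from monty.os import cd

with cd(Path("/tmp")):
    # Work relative to /tmp here; the previous working directory is restored
    # when the block exits, even if an exception is raised.
    print("inside the context")
```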
2 changes: 1 addition & 1 deletion tests/test_files/3000_lines.txt
@@ -2997,4 +2997,4 @@
2997
2998
2999
3000
3000
1 change: 0 additions & 1 deletion tests/test_files/myfile
@@ -1,2 +1 @@
HelloWorld.

1 change: 0 additions & 1 deletion tests/test_files/myfile_txt
@@ -1,2 +1 @@
HelloWorld.

18 changes: 9 additions & 9 deletions tests/test_io.py
@@ -28,8 +28,8 @@ def test_reverse_readline(self):
order, i.e. the first line that is read corresponds to the last line.
number
"""
with open(os.path.join(test_dir, "3000_lines.txt")) as f:
for idx, line in enumerate(reverse_readline(f)):
with open(os.path.join(test_dir, "3000_lines.txt")) as file:
for idx, line in enumerate(reverse_readline(file)):
assert (
int(line) == self.NUMLINES - idx
), "read_backwards read {} whereas it should "(
@@ -40,13 +40,13 @@ def test_reverse_readline_fake_big(self):
"""
Make sure that large textfiles are read properly
"""
with open(os.path.join(test_dir, "3000_lines.txt")) as f:
for idx, line in enumerate(reverse_readline(f, max_mem=0)):
with open(os.path.join(test_dir, "3000_lines.txt")) as file:
for idx, line in enumerate(reverse_readline(file, max_mem=0), -1):
if line == "\n":
continue
assert (
int(line) == self.NUMLINES - idx
), "read_backwards read {} whereas it should "(
"have read {" "}"
).format(int(line), self.NUMLINES - idx)
), f"read_backwards read {int(line)} whereas it should have read {self.NUMLINES - idx}"

def test_reverse_readline_bz2(self):
"""
@@ -80,7 +80,7 @@ def test_reverse_readfile(self):
number
"""
fname = os.path.join(test_dir, "3000_lines.txt")
for idx, line in enumerate(reverse_readfile(fname)):
for idx, line in enumerate(filter(bool, reverse_readfile(fname))):
assert int(line) == self.NUMLINES - idx

def test_reverse_readfile_gz(self):
@@ -127,7 +127,7 @@ def test_zopen(self):
with zopen(os.path.join(test_dir, "myfile_lzma.lzma"), "rt") as f:
assert f.read() == "HelloWorld.\n\n"
with zopen(os.path.join(test_dir, "myfile"), mode="rt") as f:
assert f.read() == "HelloWorld.\n\n"
assert f.read() == "HelloWorld.\n"

@unittest.skipIf(Path is None, "Not Py3k")
def test_Path_objects(self):
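The test updates account for the fact that reading a newline-terminated file backwards can now yield a newline-only item for the trailing newline, hence the `enumerate(..., -1)` offset, the `if line == "\n": continue` guard, and the `filter(bool, ...)` wrapper. A tiny illustration of the same effect using plain string splitting (not monty itself):

```python
content = "1\n2\n3\n"                  # newline-terminated, like 3000_lines.txt
backwards = content.split("\n")[::-1]  # ['', '3', '2', '1'] -- note the empty first item
non_blank = list(filter(bool, backwards))
print(backwards, non_blank)            # ['', '3', '2', '1'] ['3', '2', '1']
```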
74 changes: 37 additions & 37 deletions tests/test_json.py
@@ -158,13 +158,13 @@ def __init__(self, a, b):

self.auto_mson = AutoMSON

def test_to_from_dict(self):
def test_as_from_dict(self):
obj = self.good_cls("Hello", "World", "Python")
d = obj.as_dict()
assert d is not None
self.good_cls.from_dict(d)
jsonstr = obj.to_json()
d = json.loads(jsonstr)
json_str = obj.to_json()
d = json.loads(json_str)
assert d["@class"], "GoodMSONClass"
obj = self.bad_cls("Hello", "World")
d = obj.as_dict()
@@ -219,7 +219,7 @@ def test_version(self):
d = obj.as_dict()
assert d["@version"] == tests_version

def test_nested_to_from_dict(self):
def test_nested_as_from_dict(self):
GMC = GoodMSONClass
a_list = [GMC(1, 1.0, "one"), GMC(2, 2.0, "two")]
b_dict = {"first": GMC(3, 3.0, "three"), "second": GMC(4, 4.0, "four")}
@@ -313,28 +313,28 @@ def test_as_from_dict(self):

def test_torch(self):
t = torch.tensor([0, 1, 2])
jsonstr = json.dumps(t, cls=MontyEncoder)
t2 = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(t, cls=MontyEncoder)
t2 = json.loads(json_str, cls=MontyDecoder)
assert isinstance(t2, torch.Tensor)
assert t2.type() == t.type()
assert np.array_equal(t2, t)
t = torch.tensor([1 + 1j, 2 + 1j])
jsonstr = json.dumps(t, cls=MontyEncoder)
t2 = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(t, cls=MontyEncoder)
t2 = json.loads(json_str, cls=MontyDecoder)
assert isinstance(t2, torch.Tensor)
assert t2.type() == t.type()
assert np.array_equal(t2, t)

def test_datetime(self):
dt = datetime.datetime.now()
jsonstr = json.dumps(dt, cls=MontyEncoder)
d = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(dt, cls=MontyEncoder)
d = json.loads(json_str, cls=MontyDecoder)
assert isinstance(d, datetime.datetime)
assert dt == d
# Test a nested datetime.
a = {"dt": dt, "a": 1}
jsonstr = json.dumps(a, cls=MontyEncoder)
d = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(a, cls=MontyEncoder)
d = json.loads(json_str, cls=MontyDecoder)
assert isinstance(d["dt"], datetime.datetime)

jsanitize(dt, strict=True)
@@ -343,33 +343,33 @@ def test_uuid(self):
from uuid import UUID, uuid4

uuid = uuid4()
jsonstr = json.dumps(uuid, cls=MontyEncoder)
d = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(uuid, cls=MontyEncoder)
d = json.loads(json_str, cls=MontyDecoder)
assert isinstance(d, UUID)
assert uuid == d
# Test a nested UUID.
a = {"uuid": uuid, "a": 1}
jsonstr = json.dumps(a, cls=MontyEncoder)
d = json.loads(jsonstr, cls=MontyDecoder)
json_str = json.dumps(a, cls=MontyEncoder)
d = json.loads(json_str, cls=MontyDecoder)
assert isinstance(d["uuid"], UUID)

def test_nan(self):
x = [float("NaN")]
djson = json.dumps(x, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(x, cls=MontyEncoder)
d = json.loads(dct_json)
assert isinstance(d[0], float)

def test_numpy(self):
x = np.array([1, 2, 3], dtype="int64")
with pytest.raises(TypeError):
json.dumps(x)
djson = json.dumps(x, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(x, cls=MontyEncoder)
d = json.loads(dct_json)
assert d["@class"] == "array"
assert d["@module"] == "numpy"
assert d["data"], [1, 2 == 3]
assert d["dtype"] == "int64"
x = json.loads(djson, cls=MontyDecoder)
x = json.loads(dct_json, cls=MontyDecoder)
assert isinstance(x, np.ndarray)
x = np.min([1, 2, 3]) > 2
with pytest.raises(TypeError):
@@ -378,26 +378,26 @@ def test_numpy(self):
x = np.array([1 + 1j, 2 + 1j, 3 + 1j], dtype="complex64")
with pytest.raises(TypeError):
json.dumps(x)
djson = json.dumps(x, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(x, cls=MontyEncoder)
d = json.loads(dct_json)
assert d["@class"] == "array"
assert d["@module"] == "numpy"
assert d["data"], [[1.0, 2.0, 3.0], [1.0, 1.0 == 1.0]]
assert d["dtype"] == "complex64"
x = json.loads(djson, cls=MontyDecoder)
x = json.loads(dct_json, cls=MontyDecoder)
assert isinstance(x, np.ndarray)
assert x.dtype == "complex64"

x = np.array([[1 + 1j, 2 + 1j], [3 + 1j, 4 + 1j]], dtype="complex64")
with pytest.raises(TypeError):
json.dumps(x)
djson = json.dumps(x, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(x, cls=MontyEncoder)
d = json.loads(dct_json)
assert d["@class"] == "array"
assert d["@module"] == "numpy"
assert d["data"], [[[1.0, 2.0], [3.0, 4.0]], [[1.0, 1.0], [1.0 == 1.0]]]
assert d["dtype"] == "complex64"
x = json.loads(djson, cls=MontyDecoder)
x = json.loads(dct_json, cls=MontyDecoder)
assert isinstance(x, np.ndarray)
assert x.dtype == "complex64"

@@ -489,22 +489,22 @@ def test_callable(self):
]:
with pytest.raises(TypeError):
json.dumps(function)
djson = json.dumps(function, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(function, cls=MontyEncoder)
d = json.loads(dct_json)
assert "@callable" in d
assert "@module" in d
x = json.loads(djson, cls=MontyDecoder)
x = json.loads(dct_json, cls=MontyDecoder)
assert x == function

# test method bound to instance
for function in [instance.method]:
with pytest.raises(TypeError):
json.dumps(function)
djson = json.dumps(function, cls=MontyEncoder)
d = json.loads(djson)
dct_json = json.dumps(function, cls=MontyEncoder)
d = json.loads(dct_json)
assert "@callable" in d
assert "@module" in d
x = json.loads(djson, cls=MontyDecoder)
x = json.loads(dct_json, cls=MontyDecoder)

# can't just check functions are equal as the instance the function is bound
# to will be different. Instead, we check that the serialized instance
Expand All @@ -519,15 +519,15 @@ def test_callable(self):

# test that callable MSONable objects still get serialized as the objects
# rather than as a callable
djson = json.dumps(instance, cls=MontyEncoder)
assert "@class" in djson
dct_json = json.dumps(instance, cls=MontyEncoder)
assert "@class" in dct_json

def test_objectid(self):
oid = ObjectId("562e8301218dcbbc3d7d91ce")
with pytest.raises(TypeError):
json.dumps(oid)
djson = json.dumps(oid, cls=MontyEncoder)
x = json.loads(djson, cls=MontyDecoder)
dct_json = json.dumps(oid, cls=MontyEncoder)
x = json.loads(dct_json, cls=MontyDecoder)
assert isinstance(x, ObjectId)

def test_jsanitize(self):
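Most of the tests/test_json.py diff is a mechanical rename (`jsonstr` to `json_str`, `djson` to `dct_json`); the behaviour under test is unchanged. For reference, a compact sketch of the kind of round-trip these tests exercise (requires monty and numpy; the payload is illustrative):

```python
import datetime
import json
from uuid import UUID, uuid4

import numpy as np
from monty.json import MontyDecoder, MontyEncoder

payload = {"when": datetime.datetime.now(), "id": uuid4(), "arr": np.arange(3)}
text = json.dumps(payload, cls=MontyEncoder)
back = json.loads(text, cls=MontyDecoder)

# Nested values come back as their original types, not plain dicts.
assert isinstance(back["when"], datetime.datetime)
assert isinstance(back["id"], UUID)
assert isinstance(back["arr"], np.ndarray)
```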