forked from biolab/orange3
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ENH: Add pandas_compat.table_from_frame(df)
- Loading branch information
Showing
5 changed files
with
143 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
"""Pandas DataFrame↔Table conversion helpers""" | ||
import numpy as np | ||
import pandas as pd | ||
from pandas.api.types import ( | ||
is_categorical_dtype, is_object_dtype, | ||
is_datetime64_any_dtype, is_numeric_dtype, | ||
) | ||
|
||
from Orange.data import ( | ||
Table, Domain, DiscreteVariable, StringVariable, TimeVariable, | ||
ContinuousVariable, | ||
) | ||
|
||
__all__ = ['table_from_frame'] | ||
|
||
|
||
def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Each column is mapped to one variable, picked in this order:

    * categorical dtype, or object dtype with few distinct values
      (``nunique() < size ** .666``) or with `force_nominal` set
      -> DiscreteVariable (attribute);
    * datetime64 dtype, or object dtype that is not plain numbers and
      that ``pd.to_datetime`` can parse -> TimeVariable (attribute);
    * any other numeric dtype -> ContinuousVariable (attribute);
    * everything else -> StringVariable (meta).

    If the index is not an integer index that is monotonic increasing or
    decreasing, it is moved into the data with ``df.reset_index()`` and
    converted like any other column.

    Parameters
    ----------
    df : pandas.DataFrame
    force_nominal : boolean
        If True, interpret ALL string columns as nominal (DiscreteVariable).

    Returns
    -------
    Table
    """

    def _is_discrete(s):
        # Object columns count as nominal only when forced, or when the
        # number of distinct values is small relative to the column size.
        return (is_categorical_dtype(s) or
                is_object_dtype(s) and (force_nominal or
                                        s.nunique() < s.size**.666))

    def _is_datetime(s):
        if is_datetime64_any_dtype(s):
            return True
        if not is_object_dtype(s):
            return False
        # pd.to_datetime also accepts plain numbers (read as epoch offsets),
        # so an object column holding numbers or numeric strings would be
        # mistaken for a time column. Rule those out before trying dates.
        try:
            pd.to_numeric(s)
            return False
        except (ValueError, TypeError):
            pass  # not plain numbers; may still be dates
        except Exception:  # pylint: disable=broad-except
            return False
        # Best-effort probe: any failure simply means "not a datetime column".
        try:
            pd.to_datetime(s, infer_datetime_format=True)
        except Exception:  # pylint: disable=broad-except
            return False
        return True

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(
                name, discrete.categories.astype(str).tolist()))
            # pandas encodes missing categorical values as code -1
            X.append(discrete.codes.replace(-1, np.nan).values)
        elif _is_datetime(s):
            tvar = TimeVariable(name)
            attrs.append(tvar)
            s = pd.to_datetime(s, infer_datetime_format=True)
            # Go through the string form so the variable's own parser
            # produces the stored values; NaT is turned into NaN first.
            X.append(
                s.astype('str').replace('NaT', np.nan).map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(
        Domain(attrs, None, metas),
        # Keep the row count even when no column became an attribute
        np.column_stack(X) if X else np.empty((df.shape[0], 0)),
        None,
        np.column_stack(M) if M else None)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import unittest | ||
import numpy as np | ||
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable | ||
|
||
try: | ||
import pandas as pd | ||
except ImportError: | ||
pd = None | ||
|
||
@unittest.skipIf(pd is None, "Missing package 'pandas'")
class TestPandasCompat(unittest.TestCase):
    """Tests for the pandas DataFrame -> Table conversion helper."""

    def test_table_from_frame(self):
        """Convert one frame three ways: default, forced nominal, with index."""
        from Orange.data.pandas_compat import table_from_frame

        nan = np.nan
        day_2017 = pd.Timestamp('2017-12-19')
        day_1724 = pd.Timestamp('1724-12-20')
        secs_2017 = day_2017.timestamp()
        secs_1724 = day_1724.timestamp()

        def attribute_names(tbl):
            return [var.name for var in tbl.domain.attributes]

        def attribute_types(tbl):
            return [type(var) for var in tbl.domain.attributes]

        df = pd.DataFrame([['a', 1, day_2017],
                           ['b', 0, day_1724],
                           ['c', 0, day_1724],
                           [nan, nan, nan]])

        # Default conversion: the string column ends up in metas
        result = table_from_frame(df)
        np.testing.assert_equal(result.X,
                                [[1, secs_2017],
                                 [0, secs_1724],
                                 [0, secs_1724],
                                 [nan, nan]])
        np.testing.assert_equal(result.metas.tolist(),
                                [['a'], ['b'], ['c'], [nan]])
        self.assertEqual(attribute_names(result), ['1', '2'])
        self.assertEqual(attribute_types(result),
                         [ContinuousVariable, TimeVariable])

        # Force strings nominal
        result = table_from_frame(df, force_nominal=True)
        np.testing.assert_equal(result.X,
                                [[0, 1, secs_2017],
                                 [1, 0, secs_1724],
                                 [2, 0, secs_1724],
                                 [nan, nan, nan]])
        np.testing.assert_equal(result.metas.tolist(), [[], [], [], []])
        self.assertEqual(attribute_names(result), ['0', '1', '2'])
        self.assertEqual(attribute_types(result),
                         [DiscreteVariable, ContinuousVariable, TimeVariable])

        # Include index
        df.index = list('abaa')
        result = table_from_frame(df)
        np.testing.assert_equal(result.X,
                                [[0, 1, secs_2017],
                                 [1, 0, secs_1724],
                                 [0, 0, secs_1724],
                                 [0, nan, nan]])
        np.testing.assert_equal(result.metas.tolist(),
                                [['a'], ['b'], ['c'], [nan]])
        self.assertEqual(attribute_names(result), ['index', '1', '2'])
        self.assertEqual(attribute_types(result),
                         [DiscreteVariable, ContinuousVariable, TimeVariable])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1 @@ | ||
# This is required for, and only used by, Parallel Coordinates widget. | ||
# Once that is ported to whatever, this can be removed, along with | ||
# Orange/widgets/utils/plot/* | ||
# Optional because it's hard to install everywhere. | ||
qt-graph-helpers>=0.1.3 | ||
pandas |