Skip to content

Commit

Permalink
ENH: Add pandas_compat.table_from_frame(df)
Browse files Browse the repository at this point in the history
  • Loading branch information
kernc committed Dec 20, 2017
1 parent 62ce683 commit e602be2
Show file tree
Hide file tree
Showing 5 changed files with 143 additions and 6 deletions.
1 change: 1 addition & 0 deletions .travis/install_orange.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ foldable pip install -U setuptools pip codecov
cat requirements-core.txt \
requirements-gui.txt \
requirements-dev.txt \
requirements-opt.txt \
requirements-doc.txt |
while read dep; do
dep="${dep%%#*}" # Strip the comment
Expand Down
78 changes: 78 additions & 0 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Pandas DataFrame↔Table conversion helpers"""
import numpy as np
import pandas as pd
from pandas.api.types import (
is_categorical_dtype, is_object_dtype,
is_datetime64_any_dtype, is_numeric_dtype,
)

from Orange.data import (
Table, Domain, DiscreteVariable, StringVariable, TimeVariable,
ContinuousVariable,
)

__all__ = ['table_from_frame']


def table_from_frame(df, *, force_nominal=False):
    """
    Build an Orange.data.Table out of a pandas.DataFrame.

    Each column becomes one variable, chosen by the first rule that matches:

    1. categorical dtype, or object dtype with few distinct values
       (``nunique < size ** .666``), becomes a DiscreteVariable;
    2. datetime64 dtype, or object dtype that ``pd.to_datetime`` accepts,
       becomes a TimeVariable;
    3. numeric dtype becomes a ContinuousVariable;
    4. anything else becomes a StringVariable stored among the metas.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. Unless its index is an integer index that is
        monotonic (increasing or decreasing), the index is first moved
        into the data with ``reset_index()``.
    force_nominal : boolean
        If True, every object-dtype (string) column is interpreted as
        nominal (DiscreteVariable), regardless of how many distinct
        values it holds.

    Returns
    -------
    Table
        Table whose domain has attributes and metas only (the class
        variables passed to ``Domain`` and ``Table.from_numpy`` are None).
    """

    def _looks_nominal(column):
        # Categoricals are always nominal; object columns only when forced
        # or when the number of distinct values is small relative to size.
        if is_categorical_dtype(column):
            return True
        if not is_object_dtype(column):
            return False
        return force_nominal or column.nunique() < column.size**.666

    def _looks_temporal(column):
        if is_datetime64_any_dtype(column):
            return True
        # Object columns count as temporal only if they parse cleanly;
        # any failure while probing simply means "not a datetime".
        try:
            parseable = is_object_dtype(column)
            if parseable:
                pd.to_datetime(column, infer_datetime_format=True)
        except Exception:  # pylint: disable=broad-except
            parseable = False
        return parseable

    # If df index is not a simple RangeIndex (or similar), put it into data
    index = df.index
    if not index.is_integer() or not (index.is_monotonic_increasing or
                                      index.is_monotonic_decreasing):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    for label, column in df.items():
        var_name = str(label)

        if _looks_nominal(column):
            categorical = column.astype('category').cat
            values = categorical.categories.astype(str).tolist()
            attrs.append(DiscreteVariable(var_name, values))
            # Code -1 marks a missing value; store it as NaN instead.
            X.append(categorical.codes.replace(-1, np.nan).values)
            continue

        if _looks_temporal(column):
            tvar = TimeVariable(var_name)
            attrs.append(tvar)
            parsed = pd.to_datetime(column, infer_datetime_format=True)
            # Round-trip through strings so that tvar.parse does the
            # conversion; 'NaT' is swapped for NaN before parsing.
            as_text = parsed.astype('str').replace('NaT', np.nan)
            X.append(as_text.map(tvar.parse).values)
            continue

        if is_numeric_dtype(column):
            attrs.append(ContinuousVariable(var_name))
            X.append(column.values)
            continue

        # Fallback: keep the raw values as a string meta column.
        metas.append(StringVariable(var_name))
        M.append(column.values.astype(object))

    domain = Domain(attrs, None, metas)
    # With no attribute columns, still pass an (n_rows, 0) array for X.
    x_array = np.column_stack(X) if X else np.empty((df.shape[0], 0))
    meta_array = np.column_stack(M) if M else None
    return Table.from_numpy(domain, x_array, None, meta_array)
62 changes: 62 additions & 0 deletions Orange/data/tests/test_pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import unittest
import numpy as np
from Orange.data import ContinuousVariable, DiscreteVariable, TimeVariable

try:
import pandas as pd
except ImportError:
pd = None

@unittest.skipIf(pd is None, "Missing package 'pandas'")
class TestPandasCompat(unittest.TestCase):
    """Checks for the DataFrame -> Table conversion in pandas_compat."""

    @staticmethod
    def _attr_names_and_types(table):
        """Return (names, types) of the attributes in the table's domain."""
        attributes = table.domain.attributes
        return ([var.name for var in attributes],
                [type(var) for var in attributes])

    def test_table_from_frame(self):
        """Convert one frame three ways: default, forced nominal, with index."""
        from Orange.data.pandas_compat import table_from_frame

        nan = np.nan
        day_2017 = pd.Timestamp('2017-12-19')
        day_1724 = pd.Timestamp('1724-12-20')
        ts_2017 = day_2017.timestamp()
        ts_1724 = day_1724.timestamp()
        string_metas = [[value] for value in ('a', 'b', 'c', nan)]

        df = pd.DataFrame([['a', 1, day_2017],
                           ['b', 0, day_1724],
                           ['c', 0, day_1724],
                           [nan, nan, nan]])

        # Default: the string column ends up in metas
        table = table_from_frame(df)
        np.testing.assert_equal(table.X, [[1, ts_2017],
                                          [0, ts_1724],
                                          [0, ts_1724],
                                          [nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), string_metas)
        names, types = self._attr_names_and_types(table)
        self.assertEqual(names, ['1', '2'])
        self.assertEqual(types, [ContinuousVariable, TimeVariable])

        # Force strings nominal
        table = table_from_frame(df, force_nominal=True)
        np.testing.assert_equal(table.X, [[0, 1, ts_2017],
                                          [1, 0, ts_1724],
                                          [2, 0, ts_1724],
                                          [nan, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), [[] for _ in range(4)])
        names, types = self._attr_names_and_types(table)
        self.assertEqual(names, ['0', '1', '2'])
        self.assertEqual(types,
                         [DiscreteVariable, ContinuousVariable, TimeVariable])

        # Include index
        df.index = list('abaa')
        table = table_from_frame(df)
        np.testing.assert_equal(table.X, [[0, 1, ts_2017],
                                          [1, 0, ts_1724],
                                          [0, 0, ts_1724],
                                          [0, nan, nan]])
        np.testing.assert_equal(table.metas.tolist(), string_metas)
        names, types = self._attr_names_and_types(table)
        self.assertEqual(names, ['index', '1', '2'])
        self.assertEqual(types,
                         [DiscreteVariable, ContinuousVariable, TimeVariable])
2 changes: 1 addition & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ environment:
BUILD_GLOBAL_OPTIONS: build -j1
BUILD_ENV: wheel==0.29.0 pip==9.0.1 numpy==1.9.3
# SIP 4.19.4+ with PyQt5==5.9.1+ segfault our tests (GH-2756)
TEST_ENV: sip==4.19.6 PyQt5==5.9.2 numpy==1.12.1 scipy==1.0.0b1 scikit-learn
TEST_ENV: sip==4.19.6 PyQt5==5.9.2 numpy==1.12.1 scipy==1.0.0b1 scikit-learn pandas==0.21.1

matrix:
- PYTHON: C:\Python34
Expand Down
6 changes: 1 addition & 5 deletions requirements-opt.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1 @@
# This is required for, and only used by, Parallel Coordinates widget.
# Once that is ported to whatever, this can be removed, along with
# Orange/widgets/utils/plot/*
# Optional because it's hard to install everywhere.
qt-graph-helpers>=0.1.3
pandas

0 comments on commit e602be2

Please sign in to comment.