def table_from_frame(df, *, force_nominal=False):
    """
    Convert pandas.DataFrame to Orange.data.Table

    Each column is mapped by the first rule that applies:

    1. categorical columns, and object columns that are forced nominal or
       have fewer than ``size ** 0.666`` distinct values, become
       DiscreteVariable attributes;
    2. datetime64 columns, and object columns that pandas can parse as
       datetimes, become TimeVariable attributes;
    3. other numeric columns become ContinuousVariable attributes;
    4. everything else becomes a StringVariable meta.

    If the index is not a monotonic integer index, it is turned into
    ordinary column(s) first so that its values are kept in the data.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to convert. It is not modified.
    force_nominal : boolean
        If True, interpret ALL string (object dtype) columns as nominal
        (DiscreteVariable).

    Returns
    -------
    Table
        Table with the attributes in X and the string columns in metas;
        no class variables are set.
    """

    def _is_discrete(s):
        """Tell whether column `s` should become a DiscreteVariable."""
        if is_categorical_dtype(s):
            return True
        # Heuristic: an object column counts as nominal when it has few
        # distinct values relative to its length.
        return bool(is_object_dtype(s) and
                    (force_nominal or s.nunique() < s.size**.666))

    def _only_numbers(s):
        """Tell whether all non-missing values of `s` are plain numbers."""
        present = s.dropna()
        return len(present) > 0 and all(
            isinstance(value, (int, float, np.number)) for value in present)

    def _as_datetime(s):
        """Return `s` parsed as datetimes, or None if not datetime-like."""
        if is_datetime64_any_dtype(s):
            return pd.to_datetime(s, infer_datetime_format=True)
        if not is_object_dtype(s):
            return None
        # pd.to_datetime happily accepts numbers and reads them as offsets
        # from the epoch, so an object column of e.g. integer IDs would
        # silently turn into timestamps around 1970. Such a column holds
        # no datetime information; let it fall through instead.
        if _only_numbers(s):
            return None
        try:
            return pd.to_datetime(s, infer_datetime_format=True)
        except Exception:  # pylint: disable=broad-except
            # Deliberately best-effort: whatever pandas cannot parse is
            # simply not a datetime column.
            return None

    # If df index is not a simple RangeIndex (or similar), put it into data
    if not (df.index.is_integer() and (df.index.is_monotonic_increasing or
                                       df.index.is_monotonic_decreasing)):
        df = df.reset_index()

    attrs, metas = [], []
    X, M = [], []

    # Iter over columns
    for name, s in df.items():
        name = str(name)
        if _is_discrete(s):
            discrete = s.astype('category').cat
            attrs.append(DiscreteVariable(
                name, discrete.categories.astype(str).tolist()))
            # Category code -1 stands for a missing value
            X.append(discrete.codes.replace(-1, np.nan).values)
            continue

        # Parse once and reuse the result for both detection and conversion
        as_time = _as_datetime(s)
        if as_time is not None:
            tvar = TimeVariable(name)
            attrs.append(tvar)
            # Values go through their string form so that the conversion
            # to numbers is done by the variable's own parser
            X.append(as_time.astype('str').replace('NaT', np.nan)
                     .map(tvar.parse).values)
        elif is_numeric_dtype(s):
            attrs.append(ContinuousVariable(name))
            X.append(s.values)
        else:
            metas.append(StringVariable(name))
            M.append(s.values.astype(object))

    return Table.from_numpy(Domain(attrs, None, metas),
                            np.column_stack(X) if X else np.empty((df.shape[0], 0)),
                            None,
                            np.column_stack(M) if M else None)
@unittest.skipIf(pd is None, "Missing package 'pandas'")
class TestPandasCompat(unittest.TestCase):
    """Checks of the pandas.DataFrame -> Table conversion."""

    def _check_table(self, table, expected_x, expected_metas, expected_attrs):
        """Compare the values and the attributes' names and types of `table`.

        `expected_attrs` is a list of (name, variable class) pairs, one per
        attribute in the table's domain.
        """
        np.testing.assert_equal(table.X, expected_x)
        np.testing.assert_equal(table.metas.tolist(), expected_metas)
        attributes = table.domain.attributes
        self.assertEqual([var.name for var in attributes],
                         [name for name, _ in expected_attrs])
        self.assertEqual([type(var) for var in attributes],
                         [kind for _, kind in expected_attrs])

    def test_table_from_frame(self):
        # Imported here, so that the module still loads (and the test is
        # skipped) when pandas is not installed
        from Orange.data.pandas_compat import table_from_frame

        nan = np.nan
        recent = pd.Timestamp('2017-12-19')
        old = pd.Timestamp('1724-12-20')
        t_recent = recent.timestamp()
        t_old = old.timestamp()

        rows = [['a', 1, recent],
                ['b', 0, old],
                ['c', 0, old],
                [nan, nan, nan]]
        df = pd.DataFrame(rows)
        letters_as_metas = [['a'], ['b'], ['c'], [nan]]

        # Default: the string column ends up in metas
        self._check_table(
            table_from_frame(df),
            [[1, t_recent],
             [0, t_old],
             [0, t_old],
             [nan, nan]],
            letters_as_metas,
            [('1', ContinuousVariable), ('2', TimeVariable)])

        # Force strings nominal
        self._check_table(
            table_from_frame(df, force_nominal=True),
            [[0, 1, t_recent],
             [1, 0, t_old],
             [2, 0, t_old],
             [nan, nan, nan]],
            [[], [], [], []],
            [('0', DiscreteVariable),
             ('1', ContinuousVariable),
             ('2', TimeVariable)])

        # Include index
        df.index = list('abaa')
        self._check_table(
            table_from_frame(df),
            [[0, 1, t_recent],
             [1, 0, t_old],
             [0, 0, t_old],
             [0, nan, nan]],
            letters_as_metas,
            [('index', DiscreteVariable),
             ('1', ContinuousVariable),
             ('2', TimeVariable)])