From e58f18c2e117bc7b14d7be1a2c13b43de50118f8 Mon Sep 17 00:00:00 2001 From: Chris Date: Mon, 28 Sep 2015 22:44:03 -0500 Subject: [PATCH] PERF: vectorized DateOffset with months --- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/tseries/offsets.py | 18 +++---- pandas/tseries/tests/test_timeseries.py | 14 ++--- pandas/tseries/tests/test_tslib.py | 11 ++++ pandas/tslib.pyx | 70 +++++++++++++++++++++++++ 5 files changed, 98 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 03cac12436898..79d0788efad82 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1056,6 +1056,7 @@ Performance Improvements - 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`) - Enable ``infer_datetime_format`` in ``to_datetime`` when date components do not have 0 padding (:issue:`11142`) - Regression from 0.16.1 in constructing ``DataFrame`` from nested dictionary (:issue:`11084`) +- Performance improvements in addition/subtraction operations for ``DateOffset`` with ``Series`` or ``DatetimeIndex`` (:issue:`10744`, :issue:`11205`) .. 
_whatsnew_0170.bug_fixes: diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index e15be45ef305a..0dac09a243d36 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -261,16 +261,12 @@ def apply_index(self, i): # relativedelta/_offset path only valid for base DateOffset if (self._use_relativedelta and set(self.kwds).issubset(relativedelta_fast)): + months = ((self.kwds.get('years', 0) * 12 + self.kwds.get('months', 0)) * self.n) if months: - base = (i.to_period('M') + months).to_timestamp() - time = i.to_perioddelta('D') - days = i.to_perioddelta('M') - time - # minimum prevents month-end from wrapping - day_offset = np.minimum(days, - to_timedelta(base.days_in_month - 1, unit='D')) - i = base + day_offset + time + shifted = tslib.shift_months(i.asi8, months) + i = i._shallow_copy(shifted) weeks = (self.kwds.get('weeks', 0)) * self.n if weeks: @@ -1081,7 +1077,9 @@ def apply(self, other): @apply_index_wraps def apply_index(self, i): - return self._end_apply_index(i, 'M') + months = self.n - 1 if self.n >= 0 else self.n + shifted = tslib.shift_months(i.asi8, months, 'end') + return i._shallow_copy(shifted) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): @@ -1106,7 +1104,9 @@ def apply(self, other): @apply_index_wraps def apply_index(self, i): - return self._beg_apply_index(i, 'M') + months = self.n + 1 if self.n < 0 else self.n + shifted = tslib.shift_months(i.asi8, months, 'start') + return i._shallow_copy(shifted) def onOffset(self, dt): if self.normalize and not _is_normalized(dt): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 957cdcc009e1c..ed174bc285e4f 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2565,32 +2565,32 @@ def test_datetime64_with_DateOffset(self): for klass, assert_func in zip([Series, DatetimeIndex], [self.assert_series_equal, tm.assert_index_equal]): - s = 
klass(date_range('2000-01-01', '2000-01-31')) + s = klass(date_range('2000-01-01', '2000-01-31'), name='a') result = s + pd.DateOffset(years=1) result2 = pd.DateOffset(years=1) + s - exp = klass(date_range('2001-01-01', '2001-01-31')) + exp = klass(date_range('2001-01-01', '2001-01-31'), name='a') assert_func(result, exp) assert_func(result2, exp) result = s - pd.DateOffset(years=1) - exp = klass(date_range('1999-01-01', '1999-01-31')) + exp = klass(date_range('1999-01-01', '1999-01-31'), name='a') assert_func(result, exp) s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')]) + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') result = s + pd.offsets.Day() result2 = pd.offsets.Day() + s exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'), - Timestamp('2000-02-16', tz='US/Central')]) + Timestamp('2000-02-16', tz='US/Central')], name='a') assert_func(result, exp) assert_func(result2, exp) s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'), - pd.Timestamp('2000-02-15', tz='US/Central')]) + pd.Timestamp('2000-02-15', tz='US/Central')], name='a') result = s + pd.offsets.MonthEnd() result2 = pd.offsets.MonthEnd() + s exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'), - Timestamp('2000-02-29', tz='US/Central')]) + Timestamp('2000-02-29', tz='US/Central')], name='a') assert_func(result, exp) assert_func(result2, exp) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index fadad91e6842a..f618b2593597e 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -949,6 +949,17 @@ def compare_local_to_utc(tz_didx, utc_didx): tslib.maybe_get_tz('Asia/Tokyo')) self.assert_numpy_array_equal(result, np.array([tslib.iNaT], dtype=np.int64)) + def test_shift_months(self): + s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp('2000-01-31 00:23:00'), + Timestamp('2000-01-01'), Timestamp('2000-02-29'), 
Timestamp('2000-12-31')]) + for years in [-1, 0, 1]: + for months in [-2, 0, 2]: + actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + months)) + expected = DatetimeIndex([x + offsets.DateOffset(years=years, months=months) for x in s]) + tm.assert_index_equal(actual, expected) + + + class TestTimestampOps(tm.TestCase): def test_timestamp_and_datetime(self): self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1) diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 7b3e404f7504c..398c5f0232de1 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -3847,6 +3847,7 @@ def get_time_micros(ndarray[int64_t] dtindex): return micros + @cython.wraparound(False) def get_date_field(ndarray[int64_t] dtindex, object field): ''' @@ -4386,6 +4387,75 @@ cpdef normalize_date(object dt): raise TypeError('Unrecognized type: %s' % type(dt)) +cdef inline int _year_add_months(pandas_datetimestruct dts, + int months): + '''new year number after shifting pandas_datetimestruct number of months''' + return dts.year + (dts.month + months - 1) / 12 + +cdef inline int _month_add_months(pandas_datetimestruct dts, + int months): + '''new month number after shifting pandas_datetimestruct number of months''' + cdef int new_month = (dts.month + months) % 12 + return 12 if new_month == 0 else new_month + +@cython.wraparound(False) +def shift_months(int64_t[:] dtindex, int months, object day=None): + ''' + Given an int64-based datetime index, shift all elements + specified number of months using DateOffset semantics + + day: {None, 'start', 'end'} + * None: day of month + * 'start' 1st day of month + * 'end' last day of month + ''' + cdef: + Py_ssize_t i + int days_in_month + pandas_datetimestruct dts + int count = len(dtindex) + int64_t[:] out = np.empty(count, dtype='int64') + + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + else: + pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts) + + if day is 
None: + dts.year = _year_add_months(dts, months) + dts.month = _month_add_months(dts, months) + #prevent day from wrapping around month end + days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] + dts.day = min(dts.day, days_in_month) + elif day == 'start': + dts.year = _year_add_months(dts, months) + dts.month = _month_add_months(dts, months) + + # offset semantics - when subtracting if at the start anchor + # point, shift back by one more month + if months <= 0 and dts.day == 1: + dts.year = _year_add_months(dts, -1) + dts.month = _month_add_months(dts, -1) + else: + dts.day = 1 + elif day == 'end': + days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] + dts.year = _year_add_months(dts, months) + dts.month = _month_add_months(dts, months) + + # similar semantics - when adding shift forward by one + # month if already at an end of month + if months >= 0 and dts.day == days_in_month: + dts.year = _year_add_months(dts, 1) + dts.month = _month_add_months(dts, 1) + + days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1] + dts.day = days_in_month + + out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts) + return np.asarray(out) + #---------------------------------------------------------------------- # Don't even ask