apache · ueshin · Feb 9, 2018 · Feb 10, 2018 · Feb 10, 2018 · Feb 10, 2018
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
@@ -3415,7 +3415,9 @@ def setUpClass(cls):
                     (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"),
                      date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2)),
                     (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"),
-                     date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3))]
+                     date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3)),
+                    (u"d", 4, 30, 1.0, 8.0, Decimal("8.0"),
+                     date(2100, 4, 4), datetime(2100, 4, 4, 4, 4, 4))]
 
     @classmethod
     def tearDownClass(cls):
@@ -4124,7 +4126,7 @@ def test_vectorized_udf_timestamps(self):
         data = [(0, datetime(1969, 1, 1, 1, 1, 1)),
                 (1, datetime(2012, 2, 2, 2, 2, 2)),
                 (2, None),
-                (3, datetime(2100, 3, 3, 3, 3, 3))]
+                (3, datetime(2100, 4, 4, 4, 4, 4))]
 
         df = self.spark.createDataFrame(data, schema=schema)
 
@@ -4206,7 +4208,7 @@ def test_vectorized_udf_timestamps_respect_session_timezone(self):
         data = [(1, datetime(1969, 1, 1, 1, 1, 1)),
                 (2, datetime(2012, 2, 2, 2, 2, 2)),
                 (3, None),
-                (4, datetime(2100, 3, 3, 3, 3, 3))]
+                (4, datetime(2100, 4, 4, 4, 4, 4))]
         df = self.spark.createDataFrame(data, schema=schema)
 
         f_timestamp_copy = pandas_udf(lambda ts: ts, TimestampType())

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
@@ -1709,6 +1709,15 @@ def _check_dataframe_convert_date(pdf, schema):
     return pdf
 
 
+def _get_local_timezone():
+    """ Get local timezone from environment vi pytz, or dateutil. """
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    import os
+    return os.environ.get('TZ', 'dateutil/:')
+
+
 def _check_dataframe_localize_timestamps(pdf, timezone):
     """
     Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone
@@ -1721,7 +1730,7 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
     require_minimum_pandas_version()
 
     from pandas.api.types import is_datetime64tz_dtype
-    tz = timezone or 'tzlocal()'
+    tz = timezone or _get_local_timezone()
     for column, series in pdf.iteritems():
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
         if is_datetime64tz_dtype(series.dtype):
@@ -1744,7 +1753,7 @@ def _check_series_convert_timestamps_internal(s, timezone):
     from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64_dtype(s.dtype):
-        tz = timezone or 'tzlocal()'
+        tz = timezone or _get_local_timezone()
         return s.dt.tz_localize(tz).dt.tz_convert('UTC')
     elif is_datetime64tz_dtype(s.dtype):
         return s.dt.tz_convert('UTC')
@@ -1766,15 +1775,13 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
 
     import pandas as pd
     from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
-    from_tz = from_timezone or 'tzlocal()'
-    to_tz = to_timezone or 'tzlocal()'
+    from_tz = from_timezone or _get_local_timezone()
+    to_tz = to_timezone or _get_local_timezone()
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64tz_dtype(s.dtype):
         return s.dt.tz_convert(to_tz).dt.tz_localize(None)
     elif is_datetime64_dtype(s.dtype) and from_tz != to_tz:
-        # `s.dt.tz_localize('tzlocal()')` doesn't work properly when including NaT.
-        return s.apply(lambda ts: ts.tz_localize(from_tz).tz_convert(to_tz).tz_localize(None)
-                       if ts is not pd.NaT else pd.NaT)
+        return s.dt.tz_localize(from_tz).dt.tz_convert(to_tz).dt.tz_localize(None)
     else:
         return s