From 7abb6e0ec64c55023df6fbb0809abe1f8cf6ec3e Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Fri, 26 Jun 2020 15:15:05 +0530 Subject: [PATCH 1/6] feat(bigquery): expose date_as_object parameter for users --- google/cloud/bigquery/job.py | 8 +++++++ google/cloud/bigquery/table.py | 10 ++++++++- tests/unit/test_job.py | 41 +++++++++++++++++++++++++++++++++- 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index b0d2e7517..35893eeb9 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -3320,6 +3320,7 @@ def to_dataframe( dtypes=None, progress_bar_type=None, create_bqstorage_client=True, + date_as_object=True, ): """Return a pandas DataFrame from a QueryJob @@ -3360,6 +3361,12 @@ def to_dataframe( ..versionadded:: 1.24.0 + date_as_object (bool): + Optional. If ``True`` (default), cast dates to objects. + If False, convert to datetime64[ns] dtype. + + ..versionadded:: 1.26.0 + Returns: A :class:`~pandas.DataFrame` populated with row data and column headers from the query results. The column headers are derived @@ -3373,6 +3380,7 @@ def to_dataframe( dtypes=dtypes, progress_bar_type=progress_bar_type, create_bqstorage_client=create_bqstorage_client, + date_as_object=date_as_object, ) def __iter__(self): diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index 5b13cc52a..becc8da27 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1633,6 +1633,7 @@ def to_dataframe( dtypes=None, progress_bar_type=None, create_bqstorage_client=True, + date_as_object=True, ): """Create a pandas DataFrame by loading all pages of a query. @@ -1683,6 +1684,12 @@ def to_dataframe( ..versionadded:: 1.24.0 + date_as_object (bool): + Optional. If ``True`` (default), cast dates to objects. + If False, convert to datetime64[ns] dtype. 
+
+                ..versionadded:: 1.26.0
+
         Returns:
             pandas.DataFrame:
                 A :class:`~pandas.DataFrame` populated with row data and column
@@ -1722,7 +1729,7 @@ def to_dataframe(
                 bqstorage_client=bqstorage_client,
                 create_bqstorage_client=create_bqstorage_client,
             )
-            df = record_batch.to_pandas()
+            df = record_batch.to_pandas(date_as_object=date_as_object)
             for column in dtypes:
                 df[column] = pandas.Series(df[column], dtype=dtypes[column])
             return df
@@ -1799,6 +1806,7 @@ def to_dataframe(
         dtypes=None,
         progress_bar_type=None,
         create_bqstorage_client=True,
+        date_as_object=True,
     ):
         """Create an empty dataframe.
 
diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py
index 9eec9fda3..78a3dcbe9 100644
--- a/tests/unit/test_job.py
+++ b/tests/unit/test_job.py
@@ -5504,7 +5504,15 @@ def test_to_dataframe_column_dtypes(self):
             },
         }
         row_data = [
-            ["1.4338368E9", "420", "1.1", "1.77", "Cash", "true", "1999-12-01"],
+            [
+                "1.4338368E9",
+                "420",
+                "1.1",
+                "1.77",
+                "Cash",
+                "true",
+                "1999-12-01",
+            ],
             ["1.3878117E9", "2580", "17.7", "28.5", "Cash", "false", "1953-06-14"],
             ["1.3855653E9", "2280", "4.4", "7.1", "Credit", "true", "1981-11-04"],
         ]
@@ -5533,6 +5541,37 @@ def test_to_dataframe_column_dtypes(self):
         self.assertEqual(df.complete.dtype.name, "bool")
         self.assertEqual(df.date.dtype.name, "object")
 
+    @unittest.skipIf(pandas is None, "Requires `pandas`")
+    def test_to_dataframe_column_date_dtypes(self):
+        begun_resource = self._make_resource()
+        query_resource = {
+            "jobComplete": True,
+            "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID},
+            "totalRows": "1",
+            "schema": {"fields": [{"name": "date", "type": "DATE"}]},
+        }
+        row_data = [
+            ["1999-12-01"],
+        ]
+        rows = [{"f": [{"v": field} for field in row]} for row in row_data]
+        query_resource["rows"] = rows
+        done_resource = copy.deepcopy(begun_resource)
+        done_resource["status"] = {"state": "DONE"}
+        connection = _make_connection(
+            begun_resource, query_resource, done_resource, query_resource
+        )
+        client 
= _make_client(project=self.PROJECT, connection=connection) + job = self._make_one(self.JOB_ID, self.QUERY, client) + + df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False) + + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 1) # verify the number of rows + exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] + self.assertEqual(list(df), exp_columns) # verify the column names + + self.assertEqual(df.date.dtype.name, "datetime64[ns]") + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm") From c8d19e506a62bb2cecd6c4b82c6dac3ad5aa63a9 Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Mon, 29 Jun 2020 10:50:38 +0530 Subject: [PATCH 2/6] feat(bigquery): nit --- google/cloud/bigquery/job.py | 12 ++++++------ google/cloud/bigquery/table.py | 13 +++++++------ tests/unit/test_job.py | 2 +- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 35893eeb9..9a1af6acc 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -3351,9 +3351,9 @@ def to_dataframe( for details. ..versionadded:: 1.11.0 - create_bqstorage_client (bool): - Optional. If ``True`` (default), create a BigQuery Storage API - client using the default API settings. The BigQuery Storage API + create_bqstorage_client Optional[bool]: + If ``True`` (default), create a BigQuery Storage APIclient + using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information. @@ -3361,9 +3361,9 @@ def to_dataframe( ..versionadded:: 1.24.0 - date_as_object (bool): - Optional. If ``True`` (default), cast dates to objects. - If False, convert to datetime64[ns] dtype. + date_as_object Optional[bool]: + If ``True`` (default), cast dates to objects. 
If ``False``, convert + to datetime64[ns] dtype. ..versionadded:: 1.26.0 diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index becc8da27..ea7f70f7b 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1674,9 +1674,9 @@ def to_dataframe( progress bar as a graphical dialog box. ..versionadded:: 1.11.0 - create_bqstorage_client (bool): - Optional. If ``True`` (default), create a BigQuery Storage API - client using the default API settings. The BigQuery Storage API + create_bqstorage_client Optional[bool]: + If ``True`` (default), create a BigQuery Storage API client + using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information. @@ -1684,9 +1684,9 @@ def to_dataframe( ..versionadded:: 1.24.0 - date_as_object (bool): - Optional. If ``True`` (default), cast dates to objects. - If False, convert to datetime64[ns] dtype. + date_as_object Optional[bool]: + If ``True`` (default), cast dates to objects. If ``False``, convert + to datetime64[ns] dtype. ..versionadded:: 1.26.0 @@ -1815,6 +1815,7 @@ def to_dataframe( dtypes (Any): Ignored. Added for compatibility with RowIterator. progress_bar_type (Any): Ignored. Added for compatibility with RowIterator. create_bqstorage_client (bool): Ignored. Added for compatibility with RowIterator. + date_as_object (bool): Ignored. Added for compatibility with RowIterator. Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. 
diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index 78a3dcbe9..bc7d1765d 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -5541,6 +5541,7 @@ def test_to_dataframe_column_dtypes(self): self.assertEqual(df.complete.dtype.name, "bool") self.assertEqual(df.date.dtype.name, "object") + @unittest.skipIf(pyarrow is None, "Requires `pyarrow`") @unittest.skipIf(pandas is None, "Requires `pandas`") def test_to_dataframe_column_date_dtypes(self): begun_resource = self._make_resource() @@ -5562,7 +5563,6 @@ def test_to_dataframe_column_date_dtypes(self): ) client = _make_client(project=self.PROJECT, connection=connection) job = self._make_one(self.JOB_ID, self.QUERY, client) - df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False) self.assertIsInstance(df, pandas.DataFrame) From ee3002ce93962b1562d40c99380344e263365a6e Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Mon, 29 Jun 2020 11:22:12 +0530 Subject: [PATCH 3/6] feat(bigquery): add unit test for date as object without pyarrow --- tests/unit/test_job.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/unit/test_job.py b/tests/unit/test_job.py index bc7d1765d..733445337 100644 --- a/tests/unit/test_job.py +++ b/tests/unit/test_job.py @@ -5572,6 +5572,38 @@ def test_to_dataframe_column_date_dtypes(self): self.assertEqual(df.date.dtype.name, "datetime64[ns]") + @unittest.skipIf(pandas is None, "Requires `pandas`") + def test_to_dataframe_column_date_dtypes_wo_pyarrow(self): + begun_resource = self._make_resource() + query_resource = { + "jobComplete": True, + "jobReference": {"projectId": self.PROJECT, "jobId": self.JOB_ID}, + "totalRows": "1", + "schema": {"fields": [{"name": "date", "type": "DATE"}]}, + } + row_data = [ + ["1999-12-01"], + ] + rows = [{"f": [{"v": field} for field in row]} for row in row_data] + query_resource["rows"] = rows + done_resource = copy.deepcopy(begun_resource) + done_resource["status"] = {"state": 
"DONE"} + connection = _make_connection( + begun_resource, query_resource, done_resource, query_resource + ) + client = _make_client(project=self.PROJECT, connection=connection) + job = self._make_one(self.JOB_ID, self.QUERY, client) + + with mock.patch("google.cloud.bigquery.table.pyarrow", None): + df = job.to_dataframe(date_as_object=False, create_bqstorage_client=False) + + self.assertIsInstance(df, pandas.DataFrame) + self.assertEqual(len(df), 1) # verify the number of rows + exp_columns = [field["name"] for field in query_resource["schema"]["fields"]] + self.assertEqual(list(df), exp_columns) # verify the column names + + self.assertEqual(df.date.dtype.name, "object") + @unittest.skipIf(pandas is None, "Requires `pandas`") @unittest.skipIf(tqdm is None, "Requires `tqdm`") @mock.patch("tqdm.tqdm") From b7001d3a440c8d8a1ec35f73025abe862fbc9d18 Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Mon, 29 Jun 2020 12:25:17 +0530 Subject: [PATCH 4/6] feat(bigquery): docs fixed --- google/cloud/bigquery/job.py | 6 +++--- google/cloud/bigquery/table.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 9a1af6acc..60209db69 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -3351,7 +3351,7 @@ def to_dataframe( for details. ..versionadded:: 1.11.0 - create_bqstorage_client Optional[bool]: + create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage APIclient using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the @@ -3361,7 +3361,7 @@ def to_dataframe( ..versionadded:: 1.24.0 - date_as_object Optional[bool]: + date_as_object (Optional[bool]): If ``True`` (default), cast dates to objects. If ``False``, convert to datetime64[ns] dtype. @@ -3437,7 +3437,7 @@ def from_api_repr(cls, resource): resource(Dict[str: object]): ExplainQueryStage representation returned from API. 
- Returns: + Returns:bigquery_issue_11 google.cloud.bigquery.QueryPlanEntry: Query plan entry parsed from ``resource``. """ diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index ea7f70f7b..5f557d28a 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -1674,7 +1674,7 @@ def to_dataframe( progress bar as a graphical dialog box. ..versionadded:: 1.11.0 - create_bqstorage_client Optional[bool]: + create_bqstorage_client (Optional[bool]): If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the @@ -1684,7 +1684,7 @@ def to_dataframe( ..versionadded:: 1.24.0 - date_as_object Optional[bool]: + date_as_object (Optional[bool]): If ``True`` (default), cast dates to objects. If ``False``, convert to datetime64[ns] dtype. From b3b1c142f86369fa58a9cb19229c22ea491d68b5 Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Mon, 29 Jun 2020 13:01:02 +0530 Subject: [PATCH 5/6] feat(bigquery): nit --- google/cloud/bigquery/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 60209db69..40b802839 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -3437,7 +3437,7 @@ def from_api_repr(cls, resource): resource(Dict[str: object]): ExplainQueryStage representation returned from API. - Returns:bigquery_issue_11 + Returns: google.cloud.bigquery.QueryPlanEntry: Query plan entry parsed from ``resource``. 
""" From 3ed7f1bd35e984a0d33fe18f8111cb0a4d3a28cf Mon Sep 17 00:00:00 2001 From: HemangChothani Date: Mon, 29 Jun 2020 13:17:06 +0530 Subject: [PATCH 6/6] feat(bigquery): nit --- google/cloud/bigquery/job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/google/cloud/bigquery/job.py b/google/cloud/bigquery/job.py index 40b802839..930dc413d 100644 --- a/google/cloud/bigquery/job.py +++ b/google/cloud/bigquery/job.py @@ -3352,7 +3352,7 @@ def to_dataframe( ..versionadded:: 1.11.0 create_bqstorage_client (Optional[bool]): - If ``True`` (default), create a BigQuery Storage APIclient + If ``True`` (default), create a BigQuery Storage API client using the default API settings. The BigQuery Storage API is a faster way to fetch rows from BigQuery. See the ``bqstorage_client`` parameter for more information.