From 4d0fcdd8fa028dc6e3f96b9cd01be998bd10f710 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 6 Apr 2023 15:18:31 -0700 Subject: [PATCH 01/31] done --- dev/sparktestsupport/modules.py | 5 + python/pyspark/sql/connect/streaming/query.py | 29 +- .../sql/connect/streaming/readwriter.py | 102 ++++- python/pyspark/sql/streaming/query.py | 7 +- python/pyspark/sql/streaming/readwriter.py | 80 ++-- .../streaming/test_parity_streaming.py | 69 ++++ .../sql/tests/streaming/test_streaming.py | 376 +++--------------- .../test_streaming_foreach_family.py | 369 +++++++++++++++++ 8 files changed, 656 insertions(+), 381 deletions(-) create mode 100644 python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py create mode 100644 python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 1a28a644e550c..d946783c8126a 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -495,6 +495,7 @@ def __hash__(self): "pyspark.sql.tests.test_serde", "pyspark.sql.tests.test_session", "pyspark.sql.tests.streaming.test_streaming", + "pyspark.sql.tests.streaming.test_streaming_foreach_family", "pyspark.sql.tests.streaming.test_streaming_listener", "pyspark.sql.tests.test_types", "pyspark.sql.tests.test_udf", @@ -749,6 +750,8 @@ def __hash__(self): "pyspark.sql.connect.dataframe", "pyspark.sql.connect.functions", "pyspark.sql.connect.avro.functions", + "pyspark.sql.connect.streaming.readwriter", + "pyspark.sql.connect.streaming.query", # sql unittests "pyspark.sql.tests.connect.test_client", "pyspark.sql.tests.connect.test_connect_plan", @@ -777,6 +780,8 @@ def __hash__(self): "pyspark.ml.connect.functions", # ml unittests "pyspark.ml.tests.connect.test_connect_function", + # streaming unittests + "pyspark.sql.tests.connect.streaming.test_parity_streaming", ], excluded_python_implementations=[ "PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 2866945d161fe..3e8c679a9bca8 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -16,6 +16,7 @@ # import json +import sys from typing import TYPE_CHECKING, Any, cast, Dict, List, Optional from pyspark.errors import StreamingQueryException @@ -149,10 +150,32 @@ def _execute_streaming_query_cmd( def _test() -> None: - # TODO(SPARK-43031): port _test() from legacy query.py. - pass + import doctest + import os + from pyspark.sql import SparkSession as PySparkSession + import pyspark.sql.connect.streaming.query + from py4j.protocol import Py4JError + + os.chdir(os.environ["SPARK_HOME"]) + + globs = pyspark.sql.connect.streaming.query.__dict__.copy() + + globs["spark"] = ( + PySparkSession.builder.appName("sql.connect.streaming.query tests") + .remote("local[4]") + .getOrCreate() + ) + + (failure_count, test_count) = doctest.testmod( + pyspark.sql.connect.streaming.query, + globs=globs, + optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, + ) + globs["spark"].stop() + + if failure_count: + sys.exit(-1) if __name__ == "__main__": - # TODO(SPARK-43031): Add this file dev/sparktestsupport/modules.py to enable testing in CI. 
_test() diff --git a/python/pyspark/sql/connect/streaming/readwriter.py b/python/pyspark/sql/connect/streaming/readwriter.py index b266f485c96c4..6246d24a09282 100644 --- a/python/pyspark/sql/connect/streaming/readwriter.py +++ b/python/pyspark/sql/connect/streaming/readwriter.py @@ -168,9 +168,75 @@ def json( json.__doc__ = PySparkDataStreamReader.json.__doc__ - # def orc() TODO - # def parquet() TODO - # def text() TODO + def orc( + self, + path: str, + mergeSchema: Optional[bool] = None, + pathGlobFilter: Optional[Union[bool, str]] = None, + recursiveFileLookup: Optional[Union[bool, str]] = None, + ) -> "DataFrame": + self._set_opts( + mergeSchema=mergeSchema, + pathGlobFilter=pathGlobFilter, + recursiveFileLookup=recursiveFileLookup, + ) + if isinstance(path, str): + return self.load(path=path, format="orc") + else: + raise TypeError("path can be only a single string") + + orc.__doc__ = PySparkDataStreamReader.orc.__doc__ + + def parquet( + self, + path: str, + mergeSchema: Optional[bool] = None, + pathGlobFilter: Optional[Union[bool, str]] = None, + recursiveFileLookup: Optional[Union[bool, str]] = None, + datetimeRebaseMode: Optional[Union[bool, str]] = None, + int96RebaseMode: Optional[Union[bool, str]] = None, + ) -> "DataFrame": + self._set_opts( + mergeSchema=mergeSchema, + pathGlobFilter=pathGlobFilter, + recursiveFileLookup=recursiveFileLookup, + datetimeRebaseMode=datetimeRebaseMode, + int96RebaseMode=int96RebaseMode, + ) + self._set_opts( + mergeSchema=mergeSchema, + pathGlobFilter=pathGlobFilter, + recursiveFileLookup=recursiveFileLookup, + datetimeRebaseMode=datetimeRebaseMode, + int96RebaseMode=int96RebaseMode, + ) + if isinstance(path, str): + return self.load(path=path, format="parquet") + else: + raise TypeError("path can be only a single string") + + parquet.__doc__ = PySparkDataStreamReader.parquet.__doc__ + + def text( + self, + path: str, + wholetext: bool = False, + lineSep: Optional[str] = None, + pathGlobFilter: Optional[Union[bool, str]] = None, + recursiveFileLookup: Optional[Union[bool, str]] = None, + ) -> "DataFrame": + self._set_opts( + wholetext=wholetext, + lineSep=lineSep, + pathGlobFilter=pathGlobFilter, + recursiveFileLookup=recursiveFileLookup, + ) + if isinstance(path, str): + return self.load(path=path, format="text") + else: + raise TypeError("path can be only a single string") + + text.__doc__ = PySparkDataStreamReader.text.__doc__ def csv( self, @@ -245,7 +311,7 @@ def csv( csv.__doc__ = PySparkDataStreamReader.csv.__doc__ - # def table() TODO. Use Read(table_name) relation. + # def table() TODO(SPARK-43042). Use Read(table_name) relation. DataStreamReader.__doc__ = PySparkDataStreamReader.__doc__ @@ -460,10 +526,32 @@ def toTable( def _test() -> None: - # TODO(SPARK-43031): port _test() from legacy query.py. - pass + import sys + import doctest + from pyspark.sql import SparkSession as PySparkSession + import pyspark.sql.connect.streaming.readwriter + + globs = pyspark.sql.connect.readwriter.__dict__.copy() + + globs["spark"] = ( + PySparkSession.builder.appName("sql.connect.streaming.readwriter tests") + .remote("local[4]") + .getOrCreate() + ) + + (failure_count, test_count) = doctest.testmod( + pyspark.sql.connect.streaming.readwriter, + globs=globs, + optionflags=doctest.ELLIPSIS + | doctest.NORMALIZE_WHITESPACE + | doctest.IGNORE_EXCEPTION_DETAIL, + ) + + globs["spark"].stop() + + if failure_count: + sys.exit(-1) if __name__ == "__main__": - # TODO(SPARK-43031): Add this file dev/sparktestsupport/modules.py to enable testing in CI. 
_test() diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index 3c43628bf3780..0268de2da6ec1 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -188,7 +188,8 @@ def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: Return whether the query has terminated or not within 5 seconds - >>> sq.awaitTermination(5) + TODO(SPARK-42960): remove the SKIP flag below + >>> sq.awaitTermination(5) # doctest: +SKIP False >>> sq.stop() @@ -330,7 +331,9 @@ def stop(self) -> None: Stop streaming query >>> sq.stop() - >>> sq.isActive + + # TODO(SPARK-42940): remove the SKIP flag below + >>> sq.isActive # doctest: +SKIP False """ self._jsq.stop() diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index c58848dc50850..16c44ddbbcbf8 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -49,8 +49,8 @@ class DataStreamReader(OptionUtils): Examples -------- - >>> spark.readStream - + >>> spark.readStream # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> The example below uses Rate source that generates rows continuously. After that, we operate a modulo by 3, and then writes the stream out to the console. @@ -89,8 +89,8 @@ def format(self, source: str) -> "DataStreamReader": Examples -------- - >>> spark.readStream.format("text") - + >>> spark.readStream.format("text") # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> This API allows to configure other sources to read. The example below writes a small text file, and reads it back via Text source. @@ -132,10 +132,10 @@ def schema(self, schema: Union[StructType, str]) -> "DataStreamReader": Examples -------- >>> from pyspark.sql.types import StructField, StructType, StringType - >>> spark.readStream.schema(StructType([StructField("data", StringType(), True)])) - - >>> spark.readStream.schema("col0 INT, col1 DOUBLE") - + >>> spark.readStream.schema(StructType([StructField("data", StringType(), True)])) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> + >>> spark.readStream.schema("col0 INT, col1 DOUBLE") # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> The example below specifies a different schema to CSV file. @@ -171,8 +171,8 @@ def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamReader" Examples -------- - >>> spark.readStream.option("x", 1) - + >>> spark.readStream.option("x", 1) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> The example below specifies 'rowsPerSecond' option to Rate source in order to generate 10 rows every second. @@ -197,8 +197,8 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataStreamReader": Examples -------- - >>> spark.readStream.options(x="1", y=2) - + >>> spark.readStream.options(x="1", y=2) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamReader object ...> The example below specifies 'rowsPerSecond' and 'numPartitions' options to Rate source in order to generate 10 rows with 10 partitions every second. 
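
For orientation, the reader-side pieces touched above (the new orc/parquet/text wrappers on the Connect DataStreamReader and the schema/option builders whose doctests are adjusted here) all funnel into the same load() call. A minimal usage sketch, not part of any patch in this series, written against the classic PySpark API and assuming a local session plus the python/test_support/sql/streaming sample directory the doctests already use:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, StringType

    spark = SparkSession.builder.appName("streaming-reader-sketch").getOrCreate()

    # schema() and option() return the same DataStreamReader, so calls chain;
    # the text()/orc()/parquet() shortcuts are just load(path=..., format=...) with options pre-set.
    schema = StructType([StructField("data", StringType(), True)])
    lines = (
        spark.readStream.schema(schema)
        .option("maxFilesPerTrigger", 1)  # standard file-source option, not specific to this patch
        .format("text")
        .load("python/test_support/sql/streaming")
    )
    assert lines.isStreaming
    assert lines.schema.simpleString() == "struct<data:string>"

    spark.stop()
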
@@ -763,8 +763,8 @@ def outputMode(self, outputMode: str) -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.outputMode('append') - + >>> df.writeStream.outputMode('append') # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> The example below uses Complete mode that the entire aggregated counts are printed out. @@ -797,8 +797,8 @@ def format(self, source: str) -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.format("text") - + >>> df.writeStream.format("text") # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> This API allows to configure the source to write. The example below writes a CSV file from Rate source in a streaming manner. @@ -831,8 +831,8 @@ def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamWriter" Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.option("x", 1) - + >>> df.writeStream.option("x", 1) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> The example below specifies 'numRows' option to Console source in order to print 3 rows for every batch. @@ -859,8 +859,8 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.option("x", 1) - + >>> df.writeStream.option("x", 1) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> The example below specifies 'numRows' and 'truncate' options to Console source in order to print 3 rows for every batch without truncating the results. @@ -904,8 +904,8 @@ def partitionBy(self, *cols: str) -> "DataStreamWriter": # type: ignore[misc] Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.partitionBy("value") - + >>> df.writeStream.partitionBy("value") # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> Partition-by timestamp column from Rate source. @@ -1014,18 +1014,18 @@ def trigger( Trigger the query for execution every 5 seconds - >>> df.writeStream.trigger(processingTime='5 seconds') - + >>> df.writeStream.trigger(processingTime='5 seconds') # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> Trigger the query for execution every 5 seconds - >>> df.writeStream.trigger(continuous='5 seconds') - + >>> df.writeStream.trigger(continuous='5 seconds') # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> Trigger the query for reading all available data with multiple batches - >>> df.writeStream.trigger(availableNow=True) - + >>> df.writeStream.trigger(availableNow=True) # doctest: +ELLIPSIS + <...streaming.readwriter.DataStreamWriter object ...> """ params = [processingTime, once, continuous, availableNow] @@ -1150,6 +1150,7 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt This API is evolving. Examples + TODO(SPARK-43054): remove the SKIP flags below -------- >>> import time >>> df = spark.readStream.format("rate").load() @@ -1159,9 +1160,9 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt >>> def print_row(row): ... print(row) ... 
- >>> q = df.writeStream.foreach(print_row).start() - >>> time.sleep(3) - >>> q.stop() + >>> q = df.writeStream.foreach(print_row).start() # doctest: +SKIP + >>> time.sleep(3) # doctest: +SKIP + >>> q.stop() # doctest: +SKIP Print every row using a object with process() method @@ -1176,9 +1177,9 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt ... def close(self, error): ... print("Closed with error: %s" % str(error)) ... - >>> q = df.writeStream.foreach(print_row).start() - >>> time.sleep(3) - >>> q.stop() + >>> q = df.writeStream.foreach(print_row).start() # doctest: +SKIP + >>> time.sleep(3) # doctest: +SKIP + >>> q.stop() # doctest: +SKIP """ from pyspark.rdd import _wrap_function @@ -1280,14 +1281,15 @@ def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamW Examples -------- + # TODO(SPARK-42944): remove the SKIP flags below >>> import time >>> df = spark.readStream.format("rate").load() >>> def func(batch_df, batch_id): ... batch_df.collect() ... - >>> q = df.writeStream.foreachBatch(func).start() - >>> time.sleep(3) - >>> q.stop() + >>> q = df.writeStream.foreachBatch(func).start() # doctest: +SKIP + >>> time.sleep(3) # doctest: +SKIP + >>> q.stop() # doctest: +SKIP """ from pyspark.java_gateway import ensure_callback_server_started @@ -1359,7 +1361,9 @@ def start( >>> q.name 'this_query' >>> q.stop() - >>> q.isActive + + # TODO(SPARK-42940): remove the SKIP flag below + >>> q.isActive # doctest: +SKIP False Example with using other parameters with a trigger. diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py new file mode 100644 index 0000000000000..d28d2c0524a9b --- /dev/null +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import unittest + +from pyspark.testing.connectutils import should_test_connect +from pyspark.sql.tests.streaming.test_streaming import StreamingTestsMixin +from pyspark.testing.connectutils import ReusedConnectTestCase + + +class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): + @unittest.skip("Will be supported with SPARK-42960.") + def test_stream_await_termination(self): + super().test_stream_await_termination() + + @unittest.skip("Will be supported with SPARK-42960.") + def test_stream_exception(self): + super().test_stream_exception() + + @unittest.skip("Query manager API will be supported later with SPARK-43032.") + def test_stream_status_and_progress(self): + super().test_stream_status_and_progress() + + @unittest.skip("Query manager API will be supported later with SPARK-43032.") + def test_query_manager_await_termination(self): + super().test_query_manager_await_termination() + + @unittest.skip("table API will be supported later with SPARK-43042.") + def test_streaming_read_from_table(self): + super().test_streaming_read_from_table() + + @unittest.skip("table API will be supported later with SPARK-43042.") + def test_streaming_write_to_table(self): + super().test_streaming_write_to_table() + + @unittest.skip("Query manager API will be supported later with SPARK-43032.") + def test_stream_save_options(self): + super().test_stream_save_options() + + @unittest.skip("Query manager API will be supported later with SPARK-43032.") + def test_stream_save_options_overwrite(self): + super().test_stream_save_options_overwrite() + + +if __name__ == "__main__": + import unittest + from pyspark.sql.tests.connect.streaming.test_parity_streaming import * # noqa: F401 + + try: + import xmlrunner # type: ignore[import] + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 9f02ae848bf67..2b3903a855aef 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -26,7 +26,39 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase -class StreamingTests(ReusedSQLTestCase): +class StreamingTestsMixin: + def test_streaming_query_functions_sanity(self): + df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load() + query = ( + df.writeStream.format("memory") + .queryName("test_streaming_query_functions_sanity") + .start() + ) + try: + self.assertEquals(query.name, "test_streaming_query_functions_sanity") + self.assertTrue(isinstance(query.id, str)) + self.assertTrue(isinstance(query.runId, str)) + self.assertTrue(query.isActive) + # TODO: Will be uncommented with [SPARK-42960] + # self.assertEqual(query.exception(), None) + # self.assertFalse(query.awaitTermination(1)) + query.processAllAvailable() + recentProgress = query.recentProgress + lastProgress = query.lastProgress + self.assertEqual(lastProgress["name"], query.name) + self.assertEqual(lastProgress["id"], query.id) + self.assertTrue(any(p == lastProgress for p in recentProgress)) + query.explain() + + except Exception as e: + self.fail( + "Streaming query functions sanity check shouldn't throw any error. 
" + "Error message: " + str(e) + ) + + finally: + query.stop() + def test_stream_trigger(self): df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") @@ -77,8 +109,14 @@ def test_stream_read_options_overwrite(self): .schema(bad_schema) .load(path="python/test_support/sql/streaming", schema=schema, format="text") ) - self.assertTrue(df.isStreaming) - self.assertEqual(df.schema.simpleString(), "struct") + # TODO: Moving this outside of with block will trigger the following error, + # which doesn't happen in non-connect + # pyspark.errors.exceptions.connect.AnalysisException: + # There is a 'path' option set and load() is called with a path parameter. + # Either remove the path option, or call load() without the parameter. + # To ignore this check, set 'spark.sql.legacy.pathOptionBehavior.enabled' to 'true'. + self.assertTrue(df.isStreaming) + self.assertEqual(df.schema.simpleString(), "struct") def test_stream_save_options(self): df = ( @@ -295,334 +333,6 @@ def test_query_manager_await_termination(self): q.stop() shutil.rmtree(tmpPath) - class ForeachWriterTester: - def __init__(self, spark): - self.spark = spark - - def write_open_event(self, partitionId, epochId): - self._write_event(self.open_events_dir, {"partition": partitionId, "epoch": epochId}) - - def write_process_event(self, row): - self._write_event(self.process_events_dir, {"value": "text"}) - - def write_close_event(self, error): - self._write_event(self.close_events_dir, {"error": str(error)}) - - def write_input_file(self): - self._write_event(self.input_dir, "text") - - def open_events(self): - return self._read_events(self.open_events_dir, "partition INT, epoch INT") - - def process_events(self): - return self._read_events(self.process_events_dir, "value STRING") - - def close_events(self): - return self._read_events(self.close_events_dir, "error STRING") - - def run_streaming_query_on_writer(self, writer, num_files): - self._reset() - try: - sdf = self.spark.readStream.format("text").load(self.input_dir) - sq = sdf.writeStream.foreach(writer).start() - for i in range(num_files): - self.write_input_file() - sq.processAllAvailable() - finally: - self.stop_all() - - def assert_invalid_writer(self, writer, msg=None): - self._reset() - try: - sdf = self.spark.readStream.format("text").load(self.input_dir) - sq = sdf.writeStream.foreach(writer).start() - self.write_input_file() - sq.processAllAvailable() - self.fail("invalid writer %s did not fail the query" % str(writer)) # not expected - except Exception as e: - if msg: - assert msg in str(e), "%s not in %s" % (msg, str(e)) - - finally: - self.stop_all() - - def stop_all(self): - for q in self.spark.streams.active: - q.stop() - - def _reset(self): - self.input_dir = tempfile.mkdtemp() - self.open_events_dir = tempfile.mkdtemp() - self.process_events_dir = tempfile.mkdtemp() - self.close_events_dir = tempfile.mkdtemp() - - def _read_events(self, dir, json): - rows = self.spark.read.schema(json).json(dir).collect() - dicts = [row.asDict() for row in rows] - return dicts - - def _write_event(self, dir, event): - import uuid - - with open(os.path.join(dir, str(uuid.uuid4())), "w") as f: - f.write("%s\n" % str(event)) - - def __getstate__(self): - return (self.open_events_dir, self.process_events_dir, self.close_events_dir) - - def __setstate__(self, state): - self.open_events_dir, self.process_events_dir, self.close_events_dir = state - - # Those foreach tests are failed in macOS High Sierra by defined rules - # at 
http://sealiesoftware.com/blog/archive/2017/6/5/Objective-C_and_fork_in_macOS_1013.html - # To work around this, OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES. - def test_streaming_foreach_with_simple_function(self): - tester = self.ForeachWriterTester(self.spark) - - def foreach_func(row): - tester.write_process_event(row) - - tester.run_streaming_query_on_writer(foreach_func, 2) - self.assertEqual(len(tester.process_events()), 2) - - def test_streaming_foreach_with_basic_open_process_close(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partitionId, epochId): - tester.write_open_event(partitionId, epochId) - return True - - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - - open_events = tester.open_events() - self.assertEqual(len(open_events), 2) - self.assertSetEqual(set([e["epoch"] for e in open_events]), {0, 1}) - - self.assertEqual(len(tester.process_events()), 2) - - close_events = tester.close_events() - self.assertEqual(len(close_events), 2) - self.assertSetEqual(set([e["error"] for e in close_events]), {"None"}) - - def test_streaming_foreach_with_open_returning_false(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partition_id, epoch_id): - tester.write_open_event(partition_id, epoch_id) - return False - - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - - self.assertEqual(len(tester.open_events()), 2) - - self.assertEqual(len(tester.process_events()), 0) # no row was processed - - close_events = tester.close_events() - self.assertEqual(len(close_events), 2) - self.assertSetEqual(set([e["error"] for e in close_events]), {"None"}) - - def test_streaming_foreach_without_open_method(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def process(self, row): - tester.write_process_event(row) - - def close(self, error): - tester.write_close_event(error) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 0) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 2) - - def test_streaming_foreach_without_close_method(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def open(self, partition_id, epoch_id): - tester.write_open_event(partition_id, epoch_id) - return True - - def process(self, row): - tester.write_process_event(row) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 2) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 0) - - def test_streaming_foreach_without_open_and_close_methods(self): - tester = self.ForeachWriterTester(self.spark) - - class ForeachWriter: - def process(self, row): - tester.write_process_event(row) - - tester.run_streaming_query_on_writer(ForeachWriter(), 2) - self.assertEqual(len(tester.open_events()), 0) # no open events - self.assertEqual(len(tester.process_events()), 2) - self.assertEqual(len(tester.close_events()), 0) - - def test_streaming_foreach_with_process_throwing_error(self): - from pyspark.errors import StreamingQueryException - - tester = self.ForeachWriterTester(self.spark) - - class 
ForeachWriter: - def process(self, row): - raise RuntimeError("test error") - - def close(self, error): - tester.write_close_event(error) - - try: - tester.run_streaming_query_on_writer(ForeachWriter(), 1) - self.fail("bad writer did not fail the query") # this is not expected - except StreamingQueryException: - # TODO: Verify whether original error message is inside the exception - pass - - self.assertEqual(len(tester.process_events()), 0) # no row was processed - close_events = tester.close_events() - self.assertEqual(len(close_events), 1) - # TODO: Verify whether original error message is inside the exception - - def test_streaming_foreach_with_invalid_writers(self): - - tester = self.ForeachWriterTester(self.spark) - - def func_with_iterator_input(iter): - for x in iter: - print(x) - - tester.assert_invalid_writer(func_with_iterator_input) - - class WriterWithoutProcess: - def open(self, partition): - pass - - tester.assert_invalid_writer(WriterWithoutProcess(), "does not have a 'process'") - - class WriterWithNonCallableProcess: - process = True - - tester.assert_invalid_writer( - WriterWithNonCallableProcess(), "'process' in provided object is not callable" - ) - - class WriterWithNoParamProcess: - def process(self): - pass - - tester.assert_invalid_writer(WriterWithNoParamProcess()) - - # Abstract class for tests below - class WithProcess: - def process(self, row): - pass - - class WriterWithNonCallableOpen(WithProcess): - open = True - - tester.assert_invalid_writer( - WriterWithNonCallableOpen(), "'open' in provided object is not callable" - ) - - class WriterWithNoParamOpen(WithProcess): - def open(self): - pass - - tester.assert_invalid_writer(WriterWithNoParamOpen()) - - class WriterWithNonCallableClose(WithProcess): - close = True - - tester.assert_invalid_writer( - WriterWithNonCallableClose(), "'close' in provided object is not callable" - ) - - def test_streaming_foreachBatch(self): - q = None - collected = dict() - - def collectBatch(batch_df, batch_id): - collected[batch_id] = batch_df.collect() - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.assertTrue(0 in collected) - self.assertTrue(len(collected[0]), 2) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_tempview(self): - q = None - collected = dict() - - def collectBatch(batch_df, batch_id): - batch_df.createOrReplaceTempView("updates") - # it should use the spark session within given DataFrame, as microbatch execution will - # clone the session which is no longer same with the session used to start the - # streaming query - collected[batch_id] = batch_df.sparkSession.sql("SELECT * FROM updates").collect() - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.assertTrue(0 in collected) - self.assertTrue(len(collected[0]), 2) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_propagates_python_errors(self): - from pyspark.errors import StreamingQueryException - - q = None - - def collectBatch(df, id): - raise RuntimeError("this should fail the query") - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.fail("Expected a failure") - except StreamingQueryException as e: - self.assertTrue("this should 
fail" in str(e)) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_graceful_stop(self): - # SPARK-39218: Make foreachBatch streaming query stop gracefully - def func(batch_df, _): - batch_df.sparkSession._jvm.java.lang.Thread.sleep(10000) - - q = self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start() - time.sleep(3) # 'rowsPerSecond' defaults to 1. Waits 3 secs out for the input. - q.stop() - self.assertIsNone(q.exception(), "No exception has to be propagated.") - def test_streaming_read_from_table(self): with self.table("input_table", "this_query"): self.spark.sql("CREATE TABLE input_table (value string) USING parquet") @@ -648,6 +358,10 @@ def test_streaming_write_to_table(self): self.assertTrue(len(result) > 0) +class StreamingTests(StreamingTestsMixin, ReusedSQLTestCase): + pass + + if __name__ == "__main__": import unittest from pyspark.sql.tests.streaming.test_streaming import * # noqa: F401 diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py new file mode 100644 index 0000000000000..86ecfebf6fe08 --- /dev/null +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py @@ -0,0 +1,369 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os +import tempfile +import time + +from pyspark.testing.sqlutils import ReusedSQLTestCase + + +class StreamingTestsForeachFamilyMixin: + class ForeachWriterTester: + def __init__(self, spark): + self.spark = spark + + def write_open_event(self, partitionId, epochId): + self._write_event(self.open_events_dir, {"partition": partitionId, "epoch": epochId}) + + def write_process_event(self, row): + self._write_event(self.process_events_dir, {"value": "text"}) + + def write_close_event(self, error): + self._write_event(self.close_events_dir, {"error": str(error)}) + + def write_input_file(self): + self._write_event(self.input_dir, "text") + + def open_events(self): + return self._read_events(self.open_events_dir, "partition INT, epoch INT") + + def process_events(self): + return self._read_events(self.process_events_dir, "value STRING") + + def close_events(self): + return self._read_events(self.close_events_dir, "error STRING") + + def run_streaming_query_on_writer(self, writer, num_files): + self._reset() + try: + sdf = self.spark.readStream.format("text").load(self.input_dir) + sq = sdf.writeStream.foreach(writer).start() + for i in range(num_files): + self.write_input_file() + sq.processAllAvailable() + finally: + self.stop_all() + + def assert_invalid_writer(self, writer, msg=None): + self._reset() + try: + sdf = self.spark.readStream.format("text").load(self.input_dir) + sq = sdf.writeStream.foreach(writer).start() + self.write_input_file() + sq.processAllAvailable() + self.fail("invalid writer %s did not fail the query" % str(writer)) # not expected + except Exception as e: + if msg: + assert msg in str(e), "%s not in %s" % (msg, str(e)) + + finally: + self.stop_all() + + def stop_all(self): + for q in self.spark.streams.active: + q.stop() + + def _reset(self): + self.input_dir = tempfile.mkdtemp() + self.open_events_dir = tempfile.mkdtemp() + self.process_events_dir = tempfile.mkdtemp() + self.close_events_dir = tempfile.mkdtemp() + + def _read_events(self, dir, json): + rows = self.spark.read.schema(json).json(dir).collect() + dicts = [row.asDict() for row in rows] + return dicts + + def _write_event(self, dir, event): + import uuid + + with open(os.path.join(dir, str(uuid.uuid4())), "w") as f: + f.write("%s\n" % str(event)) + + def __getstate__(self): + return (self.open_events_dir, self.process_events_dir, self.close_events_dir) + + def __setstate__(self, state): + self.open_events_dir, self.process_events_dir, self.close_events_dir = state + + # Those foreach tests are failed in macOS High Sierra by defined rules + # at http://sealiesoftware.com/blog/archive/2017/6/5/Objective-C_and_fork_in_macOS_1013.html + # To work around this, OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES. 
+ def test_streaming_foreach_with_simple_function(self): + tester = self.ForeachWriterTester(self.spark) + + def foreach_func(row): + tester.write_process_event(row) + + tester.run_streaming_query_on_writer(foreach_func, 2) + self.assertEqual(len(tester.process_events()), 2) + + def test_streaming_foreach_with_basic_open_process_close(self): + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def open(self, partitionId, epochId): + tester.write_open_event(partitionId, epochId) + return True + + def process(self, row): + tester.write_process_event(row) + + def close(self, error): + tester.write_close_event(error) + + tester.run_streaming_query_on_writer(ForeachWriter(), 2) + + open_events = tester.open_events() + self.assertEqual(len(open_events), 2) + self.assertSetEqual(set([e["epoch"] for e in open_events]), {0, 1}) + + self.assertEqual(len(tester.process_events()), 2) + + close_events = tester.close_events() + self.assertEqual(len(close_events), 2) + self.assertSetEqual(set([e["error"] for e in close_events]), {"None"}) + + def test_streaming_foreach_with_open_returning_false(self): + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def open(self, partition_id, epoch_id): + tester.write_open_event(partition_id, epoch_id) + return False + + def process(self, row): + tester.write_process_event(row) + + def close(self, error): + tester.write_close_event(error) + + tester.run_streaming_query_on_writer(ForeachWriter(), 2) + + self.assertEqual(len(tester.open_events()), 2) + + self.assertEqual(len(tester.process_events()), 0) # no row was processed + + close_events = tester.close_events() + self.assertEqual(len(close_events), 2) + self.assertSetEqual(set([e["error"] for e in close_events]), {"None"}) + + def test_streaming_foreach_without_open_method(self): + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def process(self, row): + tester.write_process_event(row) + + def close(self, error): + tester.write_close_event(error) + + tester.run_streaming_query_on_writer(ForeachWriter(), 2) + self.assertEqual(len(tester.open_events()), 0) # no open events + self.assertEqual(len(tester.process_events()), 2) + self.assertEqual(len(tester.close_events()), 2) + + def test_streaming_foreach_without_close_method(self): + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def open(self, partition_id, epoch_id): + tester.write_open_event(partition_id, epoch_id) + return True + + def process(self, row): + tester.write_process_event(row) + + tester.run_streaming_query_on_writer(ForeachWriter(), 2) + self.assertEqual(len(tester.open_events()), 2) # no open events + self.assertEqual(len(tester.process_events()), 2) + self.assertEqual(len(tester.close_events()), 0) + + def test_streaming_foreach_without_open_and_close_methods(self): + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def process(self, row): + tester.write_process_event(row) + + tester.run_streaming_query_on_writer(ForeachWriter(), 2) + self.assertEqual(len(tester.open_events()), 0) # no open events + self.assertEqual(len(tester.process_events()), 2) + self.assertEqual(len(tester.close_events()), 0) + + def test_streaming_foreach_with_process_throwing_error(self): + from pyspark.errors import StreamingQueryException + + tester = self.ForeachWriterTester(self.spark) + + class ForeachWriter: + def process(self, row): + raise RuntimeError("test error") + + def close(self, error): + tester.write_close_event(error) + + try: + 
tester.run_streaming_query_on_writer(ForeachWriter(), 1) + self.fail("bad writer did not fail the query") # this is not expected + except StreamingQueryException: + # TODO: Verify whether original error message is inside the exception + pass + + self.assertEqual(len(tester.process_events()), 0) # no row was processed + close_events = tester.close_events() + self.assertEqual(len(close_events), 1) + # TODO: Verify whether original error message is inside the exception + + def test_streaming_foreach_with_invalid_writers(self): + + tester = self.ForeachWriterTester(self.spark) + + def func_with_iterator_input(iter): + for x in iter: + print(x) + + tester.assert_invalid_writer(func_with_iterator_input) + + class WriterWithoutProcess: + def open(self, partition): + pass + + tester.assert_invalid_writer(WriterWithoutProcess(), "does not have a 'process'") + + class WriterWithNonCallableProcess: + process = True + + tester.assert_invalid_writer( + WriterWithNonCallableProcess(), "'process' in provided object is not callable" + ) + + class WriterWithNoParamProcess: + def process(self): + pass + + tester.assert_invalid_writer(WriterWithNoParamProcess()) + + # Abstract class for tests below + class WithProcess: + def process(self, row): + pass + + class WriterWithNonCallableOpen(WithProcess): + open = True + + tester.assert_invalid_writer( + WriterWithNonCallableOpen(), "'open' in provided object is not callable" + ) + + class WriterWithNoParamOpen(WithProcess): + def open(self): + pass + + tester.assert_invalid_writer(WriterWithNoParamOpen()) + + class WriterWithNonCallableClose(WithProcess): + close = True + + tester.assert_invalid_writer( + WriterWithNonCallableClose(), "'close' in provided object is not callable" + ) + + def test_streaming_foreachBatch(self): + q = None + collected = dict() + + def collectBatch(batch_df, batch_id): + collected[batch_id] = batch_df.collect() + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.assertTrue(0 in collected) + self.assertTrue(len(collected[0]), 2) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_tempview(self): + q = None + collected = dict() + + def collectBatch(batch_df, batch_id): + batch_df.createOrReplaceTempView("updates") + # it should use the spark session within given DataFrame, as microbatch execution will + # clone the session which is no longer same with the session used to start the + # streaming query + collected[batch_id] = batch_df.sparkSession.sql("SELECT * FROM updates").collect() + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.assertTrue(0 in collected) + self.assertTrue(len(collected[0]), 2) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_propagates_python_errors(self): + from pyspark.errors import StreamingQueryException + + q = None + + def collectBatch(df, id): + raise RuntimeError("this should fail the query") + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.fail("Expected a failure") + except StreamingQueryException as e: + self.assertTrue("this should fail" in str(e)) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_graceful_stop(self): + # SPARK-39218: Make foreachBatch streaming 
query stop gracefully + def func(batch_df, _): + batch_df.sparkSession._jvm.java.lang.Thread.sleep(10000) + + q = self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start() + time.sleep(3) # 'rowsPerSecond' defaults to 1. Waits 3 secs out for the input. + q.stop() + self.assertIsNone(q.exception(), "No exception has to be propagated.") + + +class StreamingTestsForeachFamily(StreamingTestsForeachFamilyMixin, ReusedSQLTestCase): + pass + + +if __name__ == "__main__": + import unittest + from pyspark.sql.tests.streaming.test_streaming_foreach_family import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) From 0ae7e33929ae09112b6dbd31e33f90f36ef71a2a Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 6 Apr 2023 17:59:00 -0700 Subject: [PATCH 02/31] add versionchanged to query and readwriter --- python/pyspark/sql/streaming/query.py | 3 +++ python/pyspark/sql/streaming/readwriter.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index 0268de2da6ec1..ca83fcddf5226 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -36,6 +36,9 @@ class StreamingQuery: All these methods are thread-safe. .. versionadded:: 2.0.0 + + .. versionchanged:: 3.5.0 + Supports Spark Connect. Notes ----- diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index 16c44ddbbcbf8..359d8cf7cd48f 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -42,6 +42,9 @@ class DataStreamReader(OptionUtils): Use :attr:`SparkSession.readStream ` to access this. .. versionadded:: 2.0.0 + + .. versionchanged:: 3.5.0 + Supports Spark Connect. Notes ----- From 17720b7121eee050a546126fad7ce8229fe6bda2 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 11:21:52 -0700 Subject: [PATCH 03/31] style --- python/pyspark/sql/streaming/query.py | 2 +- python/pyspark/sql/streaming/readwriter.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index 75dafa19b7c10..d909eba0a60dd 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -36,7 +36,7 @@ class StreamingQuery: All these methods are thread-safe. .. versionadded:: 2.0.0 - + .. versionchanged:: 3.5.0 Supports Spark Connect. diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index 9fa5ee993df18..793c4f4cdd1a2 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -42,7 +42,7 @@ class DataStreamReader(OptionUtils): Use :attr:`SparkSession.readStream ` to access this. .. versionadded:: 2.0.0 - + .. versionchanged:: 3.5.0 Supports Spark Connect. 
From 1e68a3c212fa7ae39b958d79eec7af77b4884859 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 12:42:14 -0700 Subject: [PATCH 04/31] comments --- python/pyspark/sql/tests/streaming/test_streaming.py | 12 +++--------- .../tests/streaming/test_streaming_foreach_family.py | 6 +----- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 2b3903a855aef..838d413a0cc3c 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -27,15 +27,15 @@ class StreamingTestsMixin: - def test_streaming_query_functions_sanity(self): + def test_streaming_query_functions_basic(self): df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load() query = ( df.writeStream.format("memory") - .queryName("test_streaming_query_functions_sanity") + .queryName("test_streaming_query_functions_basic") .start() ) try: - self.assertEquals(query.name, "test_streaming_query_functions_sanity") + self.assertEquals(query.name, "test_streaming_query_functions_basic") self.assertTrue(isinstance(query.id, str)) self.assertTrue(isinstance(query.runId, str)) self.assertTrue(query.isActive) @@ -109,12 +109,6 @@ def test_stream_read_options_overwrite(self): .schema(bad_schema) .load(path="python/test_support/sql/streaming", schema=schema, format="text") ) - # TODO: Moving this outside of with block will trigger the following error, - # which doesn't happen in non-connect - # pyspark.errors.exceptions.connect.AnalysisException: - # There is a 'path' option set and load() is called with a path parameter. - # Either remove the path option, or call load() without the parameter. - # To ignore this check, set 'spark.sql.legacy.pathOptionBehavior.enabled' to 'true'. 
self.assertTrue(df.isStreaming) self.assertEqual(df.schema.simpleString(), "struct") diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py index 86ecfebf6fe08..89fb32e5c0347 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py @@ -22,7 +22,7 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase -class StreamingTestsForeachFamilyMixin: +class StreamingTestsForeachFamily(ReusedSQLTestCase): class ForeachWriterTester: def __init__(self, spark): self.spark = spark @@ -352,10 +352,6 @@ def func(batch_df, _): self.assertIsNone(q.exception(), "No exception has to be propagated.") -class StreamingTestsForeachFamily(StreamingTestsForeachFamilyMixin, ReusedSQLTestCase): - pass - - if __name__ == "__main__": import unittest from pyspark.sql.tests.streaming.test_streaming_foreach_family import * # noqa: F401 From dc05be8c79b7a3eea47f008dc6f5c137349203a1 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 14:07:29 -0700 Subject: [PATCH 05/31] address comments, add a new foreachBatch test class, remove all ELLIPSIS flag as it's already in test options --- dev/sparktestsupport/modules.py | 3 +- python/pyspark/sql/dataframe.py | 2 +- python/pyspark/sql/streaming/query.py | 8 +- python/pyspark/sql/streaming/readwriter.py | 28 ++--- ...ch_family.py => test_streaming_foreach.py} | 4 +- .../streaming/test_streaming_foreachBatch.py | 104 ++++++++++++++++++ 6 files changed, 127 insertions(+), 22 deletions(-) rename python/pyspark/sql/tests/streaming/{test_streaming_foreach_family.py => test_streaming_foreach.py} (98%) create mode 100644 python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index a65789c1da8e2..08924a86fd7c5 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -495,7 +495,8 @@ def __hash__(self): "pyspark.sql.tests.test_serde", "pyspark.sql.tests.test_session", "pyspark.sql.tests.streaming.test_streaming", - "pyspark.sql.tests.streaming.test_streaming_foreach_family", + "pyspark.sql.tests.streaming.test_streaming_foreach", + "pyspark.sql.tests.streaming.test_streaming_foreachBatch", "pyspark.sql.tests.streaming.test_streaming_listener", "pyspark.sql.tests.test_types", "pyspark.sql.tests.test_udf", diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py index e7df25d20fcb0..542b898015b40 100644 --- a/python/pyspark/sql/dataframe.py +++ b/python/pyspark/sql/dataframe.py @@ -535,7 +535,7 @@ def writeStream(self) -> DataStreamWriter: >>> with tempfile.TemporaryDirectory() as d: ... # Create a table with Rate source. ... df.writeStream.toTable( - ... "my_table", checkpointLocation=d) # doctest: +ELLIPSIS + ... "my_table", checkpointLocation=d) <...streaming.query.StreamingQuery object at 0x...> """ return DataStreamWriter(self) diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index d909eba0a60dd..fd0318f821b5a 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -71,7 +71,7 @@ def id(self) -> str: Get the unique id of this query that persists across restarts from checkpoint data - >>> sq.id # doctest: +ELLIPSIS + >>> sq.id '...' 
>>> sq.stop() @@ -98,7 +98,7 @@ def runId(self) -> str: Get the unique id of this query that does not persist across restarts - >>> sq.runId # doctest: +ELLIPSIS + >>> sq.runId '...' >>> sq.stop() @@ -223,7 +223,7 @@ def status(self) -> Dict[str, Any]: Get the current status of the query - >>> sq.status # doctest: +ELLIPSIS + >>> sq.status {'message': '...', 'isDataAvailable': ..., 'isTriggerActive': ...} >>> sq.stop() @@ -252,7 +252,7 @@ def recentProgress(self) -> List[Dict[str, Any]]: Get an array of the most recent query progress updates for this query - >>> sq.recentProgress # doctest: +ELLIPSIS + >>> sq.recentProgress [...] >>> sq.stop() diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index 793c4f4cdd1a2..df21b257f8b23 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -52,7 +52,7 @@ class DataStreamReader(OptionUtils): Examples -------- - >>> spark.readStream # doctest: +ELLIPSIS + >>> spark.readStream <...streaming.readwriter.DataStreamReader object ...> The example below uses Rate source that generates rows continuously. @@ -92,7 +92,7 @@ def format(self, source: str) -> "DataStreamReader": Examples -------- - >>> spark.readStream.format("text") # doctest: +ELLIPSIS + >>> spark.readStream.format("text") <...streaming.readwriter.DataStreamReader object ...> This API allows to configure other sources to read. The example below writes a small text @@ -135,9 +135,9 @@ def schema(self, schema: Union[StructType, str]) -> "DataStreamReader": Examples -------- >>> from pyspark.sql.types import StructField, StructType, StringType - >>> spark.readStream.schema(StructType([StructField("data", StringType(), True)])) # doctest: +ELLIPSIS + >>> spark.readStream.schema(StructType([StructField("data", StringType(), True)])) <...streaming.readwriter.DataStreamReader object ...> - >>> spark.readStream.schema("col0 INT, col1 DOUBLE") # doctest: +ELLIPSIS + >>> spark.readStream.schema("col0 INT, col1 DOUBLE") <...streaming.readwriter.DataStreamReader object ...> The example below specifies a different schema to CSV file. @@ -174,7 +174,7 @@ def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamReader" Examples -------- - >>> spark.readStream.option("x", 1) # doctest: +ELLIPSIS + >>> spark.readStream.option("x", 1) <...streaming.readwriter.DataStreamReader object ...> The example below specifies 'rowsPerSecond' option to Rate source in order to generate @@ -200,7 +200,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataStreamReader": Examples -------- - >>> spark.readStream.options(x="1", y=2) # doctest: +ELLIPSIS + >>> spark.readStream.options(x="1", y=2) <...streaming.readwriter.DataStreamReader object ...> The example below specifies 'rowsPerSecond' and 'numPartitions' options to @@ -766,7 +766,7 @@ def outputMode(self, outputMode: str) -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.outputMode('append') # doctest: +ELLIPSIS + >>> df.writeStream.outputMode('append') <...streaming.readwriter.DataStreamWriter object ...> The example below uses Complete mode that the entire aggregated counts are printed out. 
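
The writer-side hunks in this patch only drop redundant doctest flags, but the builders they touch compose into a single chain. A hedged sketch of that chain against the classic API, using the console sink and the same options the surrounding doctests mention (numRows, truncate, processingTime); the app name and timeout are arbitrary:

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("writer-builder-sketch").getOrCreate()

    df = spark.readStream.format("rate").load()
    query = (
        df.writeStream.outputMode("append")   # each builder returns the same DataStreamWriter
        .format("console")
        .option("numRows", 3)
        .option("truncate", False)
        .trigger(processingTime="5 seconds")
        .start()
    )
    query.awaitTermination(5)  # returns False if the query is still running after 5 seconds
    query.stop()
    spark.stop()
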
@@ -800,7 +800,7 @@ def format(self, source: str) -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.format("text") # doctest: +ELLIPSIS + >>> df.writeStream.format("text") <...streaming.readwriter.DataStreamWriter object ...> This API allows to configure the source to write. The example below writes a CSV @@ -834,7 +834,7 @@ def option(self, key: str, value: "OptionalPrimitiveType") -> "DataStreamWriter" Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.option("x", 1) # doctest: +ELLIPSIS + >>> df.writeStream.option("x", 1) <...streaming.readwriter.DataStreamWriter object ...> The example below specifies 'numRows' option to Console source in order to print @@ -862,7 +862,7 @@ def options(self, **options: "OptionalPrimitiveType") -> "DataStreamWriter": Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.option("x", 1) # doctest: +ELLIPSIS + >>> df.writeStream.option("x", 1) <...streaming.readwriter.DataStreamWriter object ...> The example below specifies 'numRows' and 'truncate' options to Console source in order @@ -907,7 +907,7 @@ def partitionBy(self, *cols: str) -> "DataStreamWriter": # type: ignore[misc] Examples -------- >>> df = spark.readStream.format("rate").load() - >>> df.writeStream.partitionBy("value") # doctest: +ELLIPSIS + >>> df.writeStream.partitionBy("value") <...streaming.readwriter.DataStreamWriter object ...> Partition-by timestamp column from Rate source. @@ -1017,17 +1017,17 @@ def trigger( Trigger the query for execution every 5 seconds - >>> df.writeStream.trigger(processingTime='5 seconds') # doctest: +ELLIPSIS + >>> df.writeStream.trigger(processingTime='5 seconds') <...streaming.readwriter.DataStreamWriter object ...> Trigger the query for execution every 5 seconds - >>> df.writeStream.trigger(continuous='5 seconds') # doctest: +ELLIPSIS + >>> df.writeStream.trigger(continuous='5 seconds') <...streaming.readwriter.DataStreamWriter object ...> Trigger the query for reading all available data with multiple batches - >>> df.writeStream.trigger(availableNow=True) # doctest: +ELLIPSIS + >>> df.writeStream.trigger(availableNow=True) <...streaming.readwriter.DataStreamWriter object ...> """ params = [processingTime, once, continuous, availableNow] diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py similarity index 98% rename from python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py rename to python/pyspark/sql/tests/streaming/test_streaming_foreach.py index 89fb32e5c0347..bac0c45e83039 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach_family.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py @@ -22,7 +22,7 @@ from pyspark.testing.sqlutils import ReusedSQLTestCase -class StreamingTestsForeachFamily(ReusedSQLTestCase): +class StreamingTestsForeach(ReusedSQLTestCase): class ForeachWriterTester: def __init__(self, spark): self.spark = spark @@ -354,7 +354,7 @@ def func(batch_df, _): if __name__ == "__main__": import unittest - from pyspark.sql.tests.streaming.test_streaming_foreach_family import * # noqa: F401 + from pyspark.sql.tests.streaming.test_streaming_foreach import * # noqa: F401 try: import xmlrunner diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py new file mode 100644 index 
0000000000000..7d56804c3353a --- /dev/null +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py @@ -0,0 +1,104 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +import tempfile +import time + +from pyspark.testing.sqlutils import ReusedSQLTestCase + + +class StreamingTestsForeachBatch(ReusedSQLTestCase): + def test_streaming_foreachBatch(self): + q = None + collected = dict() + + def collectBatch(batch_df, batch_id): + collected[batch_id] = batch_df.collect() + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.assertTrue(0 in collected) + self.assertTrue(len(collected[0]), 2) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_tempview(self): + q = None + collected = dict() + + def collectBatch(batch_df, batch_id): + batch_df.createOrReplaceTempView("updates") + # it should use the spark session within given DataFrame, as microbatch execution will + # clone the session which is no longer same with the session used to start the + # streaming query + collected[batch_id] = batch_df.sparkSession.sql("SELECT * FROM updates").collect() + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.assertTrue(0 in collected) + self.assertTrue(len(collected[0]), 2) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_propagates_python_errors(self): + from pyspark.errors import StreamingQueryException + + q = None + + def collectBatch(df, id): + raise RuntimeError("this should fail the query") + + try: + df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") + q = df.writeStream.foreachBatch(collectBatch).start() + q.processAllAvailable() + self.fail("Expected a failure") + except StreamingQueryException as e: + self.assertTrue("this should fail" in str(e)) + finally: + if q: + q.stop() + + def test_streaming_foreachBatch_graceful_stop(self): + # SPARK-39218: Make foreachBatch streaming query stop gracefully + def func(batch_df, _): + batch_df.sparkSession._jvm.java.lang.Thread.sleep(10000) + + q = self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start() + time.sleep(3) # 'rowsPerSecond' defaults to 1. Waits 3 secs out for the input. 
+ q.stop() + self.assertIsNone(q.exception(), "No exception has to be propagated.") + + +if __name__ == "__main__": + import unittest + from pyspark.sql.tests.streaming.test_streaming_foreachBatch import * # noqa: F401 + + try: + import xmlrunner + + testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2) + except ImportError: + testRunner = None + unittest.main(testRunner=testRunner, verbosity=2) From c1674ebb85fafb42fc5453832ad7807e3807ba4b Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 14:08:56 -0700 Subject: [PATCH 06/31] minor --- .../tests/streaming/test_streaming_foreach.py | 67 ------------------- 1 file changed, 67 deletions(-) diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py index bac0c45e83039..ffaedd0a18fcd 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py @@ -284,73 +284,6 @@ class WriterWithNonCallableClose(WithProcess): WriterWithNonCallableClose(), "'close' in provided object is not callable" ) - def test_streaming_foreachBatch(self): - q = None - collected = dict() - - def collectBatch(batch_df, batch_id): - collected[batch_id] = batch_df.collect() - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.assertTrue(0 in collected) - self.assertTrue(len(collected[0]), 2) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_tempview(self): - q = None - collected = dict() - - def collectBatch(batch_df, batch_id): - batch_df.createOrReplaceTempView("updates") - # it should use the spark session within given DataFrame, as microbatch execution will - # clone the session which is no longer same with the session used to start the - # streaming query - collected[batch_id] = batch_df.sparkSession.sql("SELECT * FROM updates").collect() - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.assertTrue(0 in collected) - self.assertTrue(len(collected[0]), 2) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_propagates_python_errors(self): - from pyspark.errors import StreamingQueryException - - q = None - - def collectBatch(df, id): - raise RuntimeError("this should fail the query") - - try: - df = self.spark.readStream.format("text").load("python/test_support/sql/streaming") - q = df.writeStream.foreachBatch(collectBatch).start() - q.processAllAvailable() - self.fail("Expected a failure") - except StreamingQueryException as e: - self.assertTrue("this should fail" in str(e)) - finally: - if q: - q.stop() - - def test_streaming_foreachBatch_graceful_stop(self): - # SPARK-39218: Make foreachBatch streaming query stop gracefully - def func(batch_df, _): - batch_df.sparkSession._jvm.java.lang.Thread.sleep(10000) - - q = self.spark.readStream.format("rate").load().writeStream.foreachBatch(func).start() - time.sleep(3) # 'rowsPerSecond' defaults to 1. Waits 3 secs out for the input. 
- q.stop() - self.assertIsNone(q.exception(), "No exception has to be propagated.") - if __name__ == "__main__": import unittest From 23b9c93e4ad16f050695cff82648861786d9832c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 14:10:16 -0700 Subject: [PATCH 07/31] minor --- python/pyspark/sql/streaming/query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index fd0318f821b5a..6d14cc7560e81 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -16,7 +16,6 @@ # import json -import sys from typing import Any, Dict, List, Optional from py4j.java_gateway import JavaObject, java_import @@ -638,6 +637,7 @@ def removeListener(self, listener: StreamingQueryListener) -> None: def _test() -> None: import doctest import os + import sys from pyspark.sql import SparkSession import pyspark.sql.streaming.query from py4j.protocol import Py4JError From 60ddd01191aaa0487bf54c5b07b1f04ae214bc18 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 10 Apr 2023 22:35:20 -0700 Subject: [PATCH 08/31] lint --- python/pyspark/sql/connect/streaming/query.py | 1 - .../sql/tests/connect/streaming/test_parity_streaming.py | 1 - python/pyspark/sql/tests/streaming/test_streaming_foreach.py | 1 - .../pyspark/sql/tests/streaming/test_streaming_foreachBatch.py | 2 -- 4 files changed, 5 deletions(-) diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 3e8c679a9bca8..64455e8b394ea 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -154,7 +154,6 @@ def _test() -> None: import os from pyspark.sql import SparkSession as PySparkSession import pyspark.sql.connect.streaming.query - from py4j.protocol import Py4JError os.chdir(os.environ["SPARK_HOME"]) diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index d28d2c0524a9b..6b4460bab5216 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -17,7 +17,6 @@ import unittest -from pyspark.testing.connectutils import should_test_connect from pyspark.sql.tests.streaming.test_streaming import StreamingTestsMixin from pyspark.testing.connectutils import ReusedConnectTestCase diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py index ffaedd0a18fcd..8bd36020c9ad7 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreach.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreach.py @@ -17,7 +17,6 @@ import os import tempfile -import time from pyspark.testing.sqlutils import ReusedSQLTestCase diff --git a/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py b/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py index 7d56804c3353a..7e5720e429990 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py +++ b/python/pyspark/sql/tests/streaming/test_streaming_foreachBatch.py @@ -15,8 +15,6 @@ # limitations under the License. 
# -import os -import tempfile import time from pyspark.testing.sqlutils import ReusedSQLTestCase From e576821a78c65b8d721ea0bddfda326c47ee99ce Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 11 Apr 2023 10:52:13 -0700 Subject: [PATCH 09/31] remove empty line --- python/pyspark/sql/streaming/readwriter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index df21b257f8b23..e4c38dc079251 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -1364,7 +1364,6 @@ def start( >>> q.name 'this_query' >>> q.stop() - >>> q.isActive # doctest: +SKIP False From 304d01e06be0f70ee77a90120b85e0ce53c3dc89 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 11 Apr 2023 17:15:24 -0700 Subject: [PATCH 10/31] wip --- .../protobuf/spark/connect/commands.proto | 19 ++++++++++++++-- .../connect/planner/SparkConnectPlanner.scala | 22 +++++++++++++++++++ python/pyspark/sql/connect/streaming/query.py | 8 ++++++- 3 files changed, 46 insertions(+), 3 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index cb5ec05f97058..311de7b9a1409 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -257,8 +257,10 @@ message StreamingQueryCommand { bool process_all_available = 6; // explain() API. Returns logical and physical plans. ExplainCommand explain = 7; - - // TODO(SPARK-42960) Add more commands: await_termination(), exception() etc. + // exception() API. Returns the exception in the query if any. + bool exception = 8; + // awaitTermination() API. Waits for the termination of the query. + AwaitTerminationCommand await_termination = 9; } message ExplainCommand { @@ -266,6 +268,10 @@ message StreamingQueryCommand { // We can not do this right now since it base.proto imports this file. bool extended = 1; } + + message AwaitTerminationCommand { + int64 timeout_ms = 0; + } } // Response for commands on a streaming query. 
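On the Python client, these two new command variants are intended to back StreamingQuery.exception() and StreamingQuery.awaitTermination(). A rough usage sketch, assuming a Spark Connect session; the endpoint, rate source, and console sink below are placeholders and not part of this change:

    from pyspark.sql import SparkSession

    # Placeholder endpoint; any Spark Connect server behaves the same way.
    spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

    # Start a trivial streaming query so there is something to interrogate.
    q = (
        spark.readStream.format("rate")
        .load()
        .writeStream.format("console")
        .start()
    )

    # Backed by the `await_termination` command with timeout_ms set; returns
    # False if the query is still running when the timeout (in seconds) expires.
    still_running = not q.awaitTermination(5)

    # Backed by the `exception` command; None as long as the query is healthy.
    assert q.exception() is None

    q.stop()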
@@ -277,6 +283,7 @@ message StreamingQueryCommandResult { StatusResult status = 2; RecentProgressResult recent_progress = 3; ExplainResult explain = 4; + ExceptionResult exception = 5; } message StatusResult { @@ -296,5 +303,13 @@ message StreamingQueryCommandResult { // Logical and physical plans as string string result = 1; } + + message ExceptionResult { + // Exception as string + bool has_exception = 1; + optional string message = 2; + optional repeated string stack_trace = 3; + optional string cause = 4; + } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index e1ea48d0da722..ba929ec3a1c4f 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2219,6 +2219,28 @@ class SparkConnectPlanner(val session: SparkSession) { .build() respBuilder.setExplain(explain) + case StreamingQueryCommand.CommandCase.EXCEPTION => + val result = query.exception() + val exception = result match { + case Some(e) => + StreamingQueryCommandResult.ExceptionResult + .newBuilder() + .setHasException(true) + .setMessage(e.getMessage) + .addAllStackTraceList(e.getStackTrace.map(_.toString)) + .setCause(e.getCause.toString) + .build() + case None => + StreamingQueryCommandResult.ExceptionResult + .newBuilder() + .setHasException(false) + .build() + } + respBuilder.setException(exception) + + case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => + query.restart() + case StreamingQueryCommand.CommandCase.COMMAND_NOT_SET => throw new IllegalArgumentException("Missing command in StreamingQueryCommand") } diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 2866945d161fe..118e724915184 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -125,7 +125,13 @@ def explain(self, extended: bool = False) -> None: explain.__doc__ = PySparkStreamingQuery.explain.__doc__ def exception(self) -> Optional[StreamingQueryException]: - raise NotImplementedError() + cmd = pb2.StreamingQueryCommand() + cmd.exception = True + result = self._execute_streaming_query_cmd(cmd).exception.result + if result == "": + return None + else: + return result exception.__doc__ = PySparkStreamingQuery.exception.__doc__ From 26e2488274e07ce9b7e8808740c70aa3ec7f6610 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 11 Apr 2023 17:35:47 -0700 Subject: [PATCH 11/31] remove several docs in connect readwriter.py and query.py to pass doc test --- dev/sparktestsupport/modules.py | 2 -- python/pyspark/sql/connect/streaming/query.py | 7 ++++-- .../sql/connect/streaming/readwriter.py | 12 ++++++++-- python/pyspark/sql/streaming/query.py | 6 ++--- python/pyspark/sql/streaming/readwriter.py | 22 +++++++++---------- 5 files changed, 27 insertions(+), 22 deletions(-) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 08924a86fd7c5..7b1d57b95d5ba 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -751,8 +751,6 @@ def __hash__(self): "pyspark.sql.connect.dataframe", "pyspark.sql.connect.functions", "pyspark.sql.connect.avro.functions", - "pyspark.sql.connect.streaming.readwriter", - "pyspark.sql.connect.streaming.query", # sql unittests 
"pyspark.sql.tests.connect.test_client", "pyspark.sql.tests.connect.test_connect_plan", diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 64455e8b394ea..aebab9fc69fd1 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -66,10 +66,11 @@ def isActive(self) -> bool: isActive.__doc__ = PySparkStreamingQuery.isActive.__doc__ + # TODO (SPARK-42960): Implement and uncomment the doc def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: raise NotImplementedError() - awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__ + # awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__ @property def status(self) -> Dict[str, Any]: @@ -115,7 +116,8 @@ def stop(self) -> None: cmd.stop = True self._execute_streaming_query_cmd(cmd) - stop.__doc__ = PySparkStreamingQuery.stop.__doc__ + # TODO (SPARK-42962): uncomment below + # stop.__doc__ = PySparkStreamingQuery.stop.__doc__ def explain(self, extended: bool = False) -> None: cmd = pb2.StreamingQueryCommand() @@ -125,6 +127,7 @@ def explain(self, extended: bool = False) -> None: explain.__doc__ = PySparkStreamingQuery.explain.__doc__ + # TODO (SPARK-42960): Implement and uncomment the doc def exception(self) -> Optional[StreamingQueryException]: raise NotImplementedError() diff --git a/python/pyspark/sql/connect/streaming/readwriter.py b/python/pyspark/sql/connect/streaming/readwriter.py index 6246d24a09282..b89a6db1a9d78 100644 --- a/python/pyspark/sql/connect/streaming/readwriter.py +++ b/python/pyspark/sql/connect/streaming/readwriter.py @@ -432,6 +432,7 @@ def trigger( trigger.__doc__ = PySparkDataStreamWriter.trigger.__doc__ + # TODO (SPARK-43054): Implement and uncomment the doc @overload def foreach(self, f: Callable[[Row], None]) -> "DataStreamWriter": ... 
@@ -443,7 +444,13 @@ def foreach(self, f: "SupportsProcess") -> "DataStreamWriter": def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataStreamWriter": raise NotImplementedError("foreach() is not implemented.") - foreach.__doc__ = PySparkDataStreamWriter.foreach.__doc__ + # foreach.__doc__ = PySparkDataStreamWriter.foreach.__doc__ + + # TODO (SPARK-42944): Implement and uncomment the doc + def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter": + raise NotImplementedError("foreachBatch() is not implemented.") + + # foreachBatch.__doc__ = PySparkDataStreamWriter.foreachBatch.__doc__ def _start_internal( self, @@ -501,7 +508,8 @@ def start( **options, ) - start.__doc__ = PySparkDataStreamWriter.start.__doc__ + # TODO (SPARK-42962): uncomment below + # start.__doc__ = PySparkDataStreamWriter.start.__doc__ def toTable( self, diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index 6d14cc7560e81..b902f0514fce9 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -156,7 +156,6 @@ def isActive(self) -> bool: """ return self._jsq.isActive() - # TODO(SPARK-42960): remove the doctest: +SKIP flag below def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: """ Waits for the termination of `this` query, either by :func:`query.stop()` or by an @@ -191,7 +190,7 @@ def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: Return whether the query has terminated or not within 5 seconds - >>> sq.awaitTermination(5) # doctest: +SKIP + >>> sq.awaitTermination(5) False >>> sq.stop() @@ -317,7 +316,6 @@ def processAllAvailable(self) -> None: """ return self._jsq.processAllAvailable() - # TODO(SPARK-42940): remove the doctest: +SKIP flag below def stop(self) -> None: """ Stop this streaming query. @@ -335,7 +333,7 @@ def stop(self) -> None: >>> sq.stop() - >>> sq.isActive # doctest: +SKIP + >>> sq.isActive False """ self._jsq.stop() diff --git a/python/pyspark/sql/streaming/readwriter.py b/python/pyspark/sql/streaming/readwriter.py index e4c38dc079251..529e3aeb60d9d 100644 --- a/python/pyspark/sql/streaming/readwriter.py +++ b/python/pyspark/sql/streaming/readwriter.py @@ -1162,9 +1162,9 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt >>> def print_row(row): ... print(row) ... - >>> q = df.writeStream.foreach(print_row).start() # doctest: +SKIP - >>> time.sleep(3) # doctest: +SKIP - >>> q.stop() # doctest: +SKIP + >>> q = df.writeStream.foreach(print_row).start() + >>> time.sleep(3) + >>> q.stop() Print every row using a object with process() method @@ -1179,9 +1179,9 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt ... def close(self, error): ... print("Closed with error: %s" % str(error)) ... 
- >>> q = df.writeStream.foreach(print_row).start() # doctest: +SKIP - >>> time.sleep(3) # doctest: +SKIP - >>> q.stop() # doctest: +SKIP + >>> q = df.writeStream.foreach(print_row).start() + >>> time.sleep(3) + >>> q.stop() """ from pyspark.rdd import _wrap_function @@ -1264,7 +1264,6 @@ def func_with_open_process_close(partition_id: Any, iterator: Iterator) -> Itera self._jwrite.foreach(jForeachWriter) return self - # TODO(SPARK-42944): remove the doctest: +SKIP flag below def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter": """ Sets the output of the streaming query to be processed using the provided @@ -1289,9 +1288,9 @@ def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamW >>> def func(batch_df, batch_id): ... batch_df.collect() ... - >>> q = df.writeStream.foreachBatch(func).start() # doctest: +SKIP - >>> time.sleep(3) # doctest: +SKIP - >>> q.stop() # doctest: +SKIP + >>> q = df.writeStream.foreachBatch(func).start() + >>> time.sleep(3) + >>> q.stop() """ from pyspark.java_gateway import ensure_callback_server_started @@ -1305,7 +1304,6 @@ def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamW ensure_callback_server_started(gw) return self - # TODO(SPARK-42940): remove the doctest: +SKIP flag below def start( self, path: Optional[str] = None, @@ -1364,7 +1362,7 @@ def start( >>> q.name 'this_query' >>> q.stop() - >>> q.isActive # doctest: +SKIP + >>> q.isActive False Example with using other parameters with a trigger. From e25f7e6c27fa105b30bb8bf907ee7457fe12fe28 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 11 Apr 2023 18:01:17 -0700 Subject: [PATCH 12/31] minor, add back doc tests in module.py --- dev/sparktestsupport/modules.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py index 7b1d57b95d5ba..08924a86fd7c5 100644 --- a/dev/sparktestsupport/modules.py +++ b/dev/sparktestsupport/modules.py @@ -751,6 +751,8 @@ def __hash__(self): "pyspark.sql.connect.dataframe", "pyspark.sql.connect.functions", "pyspark.sql.connect.avro.functions", + "pyspark.sql.connect.streaming.readwriter", + "pyspark.sql.connect.streaming.query", # sql unittests "pyspark.sql.tests.connect.test_client", "pyspark.sql.tests.connect.test_connect_plan", From aa1d4c288d5a7a9b9d083365e478e7838a69e3b7 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 11 Apr 2023 22:39:32 -0700 Subject: [PATCH 13/31] style --- python/pyspark/sql/connect/streaming/readwriter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/connect/streaming/readwriter.py b/python/pyspark/sql/connect/streaming/readwriter.py index b89a6db1a9d78..e702b3523a4a7 100644 --- a/python/pyspark/sql/connect/streaming/readwriter.py +++ b/python/pyspark/sql/connect/streaming/readwriter.py @@ -449,7 +449,7 @@ def foreach(self, f: Union[Callable[[Row], None], "SupportsProcess"]) -> "DataSt # TODO (SPARK-42944): Implement and uncomment the doc def foreachBatch(self, func: Callable[["DataFrame", int], None]) -> "DataStreamWriter": raise NotImplementedError("foreachBatch() is not implemented.") - + # foreachBatch.__doc__ = PySparkDataStreamWriter.foreachBatch.__doc__ def _start_internal( From e82678e07fb4a83df6e8347723d6b7baefb437fe Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 12 Apr 2023 14:59:23 -0700 Subject: [PATCH 14/31] wip --- .../protobuf/spark/connect/commands.proto | 10 +- .../connect/planner/SparkConnectPlanner.scala | 11 +- 
.../pyspark/sql/connect/proto/commands_pb2.py | 69 +++++++-- .../sql/connect/proto/commands_pb2.pyi | 136 +++++++++++++++++- python/pyspark/sql/connect/streaming/query.py | 35 ++++- 5 files changed, 238 insertions(+), 23 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index 311de7b9a1409..c12f5a2816523 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -270,7 +270,7 @@ message StreamingQueryCommand { } message AwaitTerminationCommand { - int64 timeout_ms = 0; + int64 timeout_ms = 1; } } @@ -284,6 +284,7 @@ message StreamingQueryCommandResult { RecentProgressResult recent_progress = 3; ExplainResult explain = 4; ExceptionResult exception = 5; + AwaitTerminationResult await_termination = 6; } message StatusResult { @@ -308,8 +309,11 @@ message StreamingQueryCommandResult { // Exception as string bool has_exception = 1; optional string message = 2; - optional repeated string stack_trace = 3; + repeated string stack_trace = 3; optional string cause = 4; } -} + message AwaitTerminationResult { + bool terminated = 1; + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index ba929ec3a1c4f..dd1d58d611708 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2220,14 +2220,14 @@ class SparkConnectPlanner(val session: SparkSession) { respBuilder.setExplain(explain) case StreamingQueryCommand.CommandCase.EXCEPTION => - val result = query.exception() + val result = query.exception val exception = result match { case Some(e) => StreamingQueryCommandResult.ExceptionResult .newBuilder() .setHasException(true) .setMessage(e.getMessage) - .addAllStackTraceList(e.getStackTrace.map(_.toString)) + .addAllStackTrace(e.getStackTrace.map(_.toString).toIterable.asJava) .setCause(e.getCause.toString) .build() case None => @@ -2239,7 +2239,12 @@ class SparkConnectPlanner(val session: SparkSession) { respBuilder.setException(exception) case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => - query.restart() + val terminated = query.awaitTermination(command.getAwaitTermination.getTimeoutMs) + val terminatedResult = StreamingQueryCommandResult.AwaitTerminationResult + .newBuilder() + .setTerminated(terminated) + .build() + respBuilder.setAwaitTermination(terminatedResult) case StreamingQueryCommand.CommandCase.COMMAND_NOT_SET => throw new IllegalArgumentException("Missing command in StreamingQueryCommand") diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index 7b0a389c17034..e32f348beea89 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -35,7 +35,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xb6\x05\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 
\x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 
\x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\x9d\x03\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtendedB\t\n\x07\x63ommand"\xa5\x05\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 
\x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06resultB\r\n\x0bresult_typeB"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' + b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\xb6\x05\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 
\x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xe4\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 
\x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1a\x38\n\x17\x41waitTerminationCommand\x12\x1d\n\ntimeout_ms\x18\x01 \x01(\x03R\ttimeoutMsB\t\n\x07\x63ommand"\xd7\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1a\xa7\x01\n\x0f\x45xceptionResult\x12#\n\rhas_exception\x18\x01 \x01(\x08R\x0chasException\x12\x1d\n\x07message\x18\x02 \x01(\tH\x00R\x07message\x88\x01\x01\x12\x1f\n\x0bstack_trace\x18\x03 \x03(\tR\nstackTrace\x12\x19\n\x05\x63\x61use\x18\x04 \x01(\tH\x01R\x05\x63\x61use\x88\x01\x01\x42\n\n\x08_messageB\x08\n\x06_cause\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_typeB"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' ) @@ -64,6 +64,9 @@ _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND = _STREAMINGQUERYCOMMAND.nested_types_by_name[ "ExplainCommand" ] +_STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND = _STREAMINGQUERYCOMMAND.nested_types_by_name[ + "AwaitTerminationCommand" +] _STREAMINGQUERYCOMMANDRESULT = DESCRIPTOR.message_types_by_name["StreamingQueryCommandResult"] _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT = _STREAMINGQUERYCOMMANDRESULT.nested_types_by_name[ "StatusResult" @@ -74,6 +77,12 @@ _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT = _STREAMINGQUERYCOMMANDRESULT.nested_types_by_name[ "ExplainResult" ] +_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT = _STREAMINGQUERYCOMMANDRESULT.nested_types_by_name[ + "ExceptionResult" +] +_STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT = ( + _STREAMINGQUERYCOMMANDRESULT.nested_types_by_name["AwaitTerminationResult"] +) _WRITEOPERATION_SAVETABLE_TABLESAVEMETHOD = _WRITEOPERATION_SAVETABLE.enum_types_by_name[ "TableSaveMethod" ] @@ -250,6 +259,15 @@ # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommand.ExplainCommand) }, ), + "AwaitTerminationCommand": _reflection.GeneratedProtocolMessageType( + "AwaitTerminationCommand", + (_message.Message,), + { + "DESCRIPTOR": _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND, + "__module__": "spark.connect.commands_pb2" + # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommand.AwaitTerminationCommand) + }, + ), "DESCRIPTOR": _STREAMINGQUERYCOMMAND, 
"__module__": "spark.connect.commands_pb2" # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommand) @@ -257,6 +275,7 @@ ) _sym_db.RegisterMessage(StreamingQueryCommand) _sym_db.RegisterMessage(StreamingQueryCommand.ExplainCommand) +_sym_db.RegisterMessage(StreamingQueryCommand.AwaitTerminationCommand) StreamingQueryCommandResult = _reflection.GeneratedProtocolMessageType( "StreamingQueryCommandResult", @@ -289,6 +308,24 @@ # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommandResult.ExplainResult) }, ), + "ExceptionResult": _reflection.GeneratedProtocolMessageType( + "ExceptionResult", + (_message.Message,), + { + "DESCRIPTOR": _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT, + "__module__": "spark.connect.commands_pb2" + # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommandResult.ExceptionResult) + }, + ), + "AwaitTerminationResult": _reflection.GeneratedProtocolMessageType( + "AwaitTerminationResult", + (_message.Message,), + { + "DESCRIPTOR": _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT, + "__module__": "spark.connect.commands_pb2" + # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommandResult.AwaitTerminationResult) + }, + ), "DESCRIPTOR": _STREAMINGQUERYCOMMANDRESULT, "__module__": "spark.connect.commands_pb2" # @@protoc_insertion_point(class_scope:spark.connect.StreamingQueryCommandResult) @@ -298,6 +335,8 @@ _sym_db.RegisterMessage(StreamingQueryCommandResult.StatusResult) _sym_db.RegisterMessage(StreamingQueryCommandResult.RecentProgressResult) _sym_db.RegisterMessage(StreamingQueryCommandResult.ExplainResult) +_sym_db.RegisterMessage(StreamingQueryCommandResult.ExceptionResult) +_sym_db.RegisterMessage(StreamingQueryCommandResult.AwaitTerminationResult) if _descriptor._USE_C_DESCRIPTORS == False: @@ -350,15 +389,21 @@ _STREAMINGQUERYINSTANCEID._serialized_start = 3808 _STREAMINGQUERYINSTANCEID._serialized_end = 3873 _STREAMINGQUERYCOMMAND._serialized_start = 3876 - _STREAMINGQUERYCOMMAND._serialized_end = 4289 - _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 4234 - _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 4278 - _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4292 - _STREAMINGQUERYCOMMANDRESULT._serialized_end = 4969 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 4669 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 4839 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 4841 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 4913 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 4915 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 4954 + _STREAMINGQUERYCOMMAND._serialized_end = 4488 + _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 4375 + _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 4419 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 4421 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4477 + _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4491 + _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5602 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5074 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5244 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5246 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 5318 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 5320 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 
5359 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5362 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5529 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5531 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5587 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index 52e4b272b25fb..bcf560d85667f 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -866,6 +866,20 @@ class StreamingQueryCommand(google.protobuf.message.Message): self, field_name: typing_extensions.Literal["extended", b"extended"] ) -> None: ... + class AwaitTerminationCommand(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + TIMEOUT_MS_FIELD_NUMBER: builtins.int + timeout_ms: builtins.int + def __init__( + self, + *, + timeout_ms: builtins.int = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["timeout_ms", b"timeout_ms"] + ) -> None: ... + QUERY_ID_FIELD_NUMBER: builtins.int STATUS_FIELD_NUMBER: builtins.int LAST_PROGRESS_FIELD_NUMBER: builtins.int @@ -873,6 +887,8 @@ class StreamingQueryCommand(google.protobuf.message.Message): STOP_FIELD_NUMBER: builtins.int PROCESS_ALL_AVAILABLE_FIELD_NUMBER: builtins.int EXPLAIN_FIELD_NUMBER: builtins.int + EXCEPTION_FIELD_NUMBER: builtins.int + AWAIT_TERMINATION_FIELD_NUMBER: builtins.int @property def query_id(self) -> global___StreamingQueryInstanceId: """(Required) Query instance. See `StreamingQueryInstanceId`.""" @@ -889,6 +905,11 @@ class StreamingQueryCommand(google.protobuf.message.Message): @property def explain(self) -> global___StreamingQueryCommand.ExplainCommand: """explain() API. Returns logical and physical plans.""" + exception: builtins.bool + """exception() API. Returns the exception in the query if any.""" + @property + def await_termination(self) -> global___StreamingQueryCommand.AwaitTerminationCommand: + """awaitTermination() API. Waits for the termination of the query.""" def __init__( self, *, @@ -899,12 +920,18 @@ class StreamingQueryCommand(google.protobuf.message.Message): stop: builtins.bool = ..., process_all_available: builtins.bool = ..., explain: global___StreamingQueryCommand.ExplainCommand | None = ..., + exception: builtins.bool = ..., + await_termination: global___StreamingQueryCommand.AwaitTerminationCommand | None = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ + "await_termination", + b"await_termination", "command", b"command", + "exception", + b"exception", "explain", b"explain", "last_progress", @@ -924,8 +951,12 @@ class StreamingQueryCommand(google.protobuf.message.Message): def ClearField( self, field_name: typing_extensions.Literal[ + "await_termination", + b"await_termination", "command", b"command", + "exception", + b"exception", "explain", b"explain", "last_progress", @@ -945,7 +976,14 @@ class StreamingQueryCommand(google.protobuf.message.Message): def WhichOneof( self, oneof_group: typing_extensions.Literal["command", b"command"] ) -> typing_extensions.Literal[ - "status", "last_progress", "recent_progress", "stop", "process_all_available", "explain" + "status", + "last_progress", + "recent_progress", + "stop", + "process_all_available", + "explain", + "exception", + "await_termination", ] | None: ... 
global___StreamingQueryCommand = StreamingQueryCommand @@ -1023,10 +1061,88 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): self, field_name: typing_extensions.Literal["result", b"result"] ) -> None: ... + class ExceptionResult(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + HAS_EXCEPTION_FIELD_NUMBER: builtins.int + MESSAGE_FIELD_NUMBER: builtins.int + STACK_TRACE_FIELD_NUMBER: builtins.int + CAUSE_FIELD_NUMBER: builtins.int + has_exception: builtins.bool + """Exception as string""" + message: builtins.str + @property + def stack_trace( + self, + ) -> google.protobuf.internal.containers.RepeatedScalarFieldContainer[builtins.str]: ... + cause: builtins.str + def __init__( + self, + *, + has_exception: builtins.bool = ..., + message: builtins.str | None = ..., + stack_trace: collections.abc.Iterable[builtins.str] | None = ..., + cause: builtins.str | None = ..., + ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "_cause", + b"_cause", + "_message", + b"_message", + "cause", + b"cause", + "message", + b"message", + ], + ) -> builtins.bool: ... + def ClearField( + self, + field_name: typing_extensions.Literal[ + "_cause", + b"_cause", + "_message", + b"_message", + "cause", + b"cause", + "has_exception", + b"has_exception", + "message", + b"message", + "stack_trace", + b"stack_trace", + ], + ) -> None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_cause", b"_cause"] + ) -> typing_extensions.Literal["cause"] | None: ... + @typing.overload + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_message", b"_message"] + ) -> typing_extensions.Literal["message"] | None: ... + + class AwaitTerminationResult(google.protobuf.message.Message): + DESCRIPTOR: google.protobuf.descriptor.Descriptor + + TERMINATED_FIELD_NUMBER: builtins.int + terminated: builtins.bool + def __init__( + self, + *, + terminated: builtins.bool = ..., + ) -> None: ... + def ClearField( + self, field_name: typing_extensions.Literal["terminated", b"terminated"] + ) -> None: ... + QUERY_ID_FIELD_NUMBER: builtins.int STATUS_FIELD_NUMBER: builtins.int RECENT_PROGRESS_FIELD_NUMBER: builtins.int EXPLAIN_FIELD_NUMBER: builtins.int + EXCEPTION_FIELD_NUMBER: builtins.int + AWAIT_TERMINATION_FIELD_NUMBER: builtins.int @property def query_id(self) -> global___StreamingQueryInstanceId: """(Required) Query instance id. See `StreamingQueryInstanceId`.""" @@ -1036,6 +1152,10 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): def recent_progress(self) -> global___StreamingQueryCommandResult.RecentProgressResult: ... @property def explain(self) -> global___StreamingQueryCommandResult.ExplainResult: ... + @property + def exception(self) -> global___StreamingQueryCommandResult.ExceptionResult: ... + @property + def await_termination(self) -> global___StreamingQueryCommandResult.AwaitTerminationResult: ... def __init__( self, *, @@ -1043,10 +1163,16 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): status: global___StreamingQueryCommandResult.StatusResult | None = ..., recent_progress: global___StreamingQueryCommandResult.RecentProgressResult | None = ..., explain: global___StreamingQueryCommandResult.ExplainResult | None = ..., + exception: global___StreamingQueryCommandResult.ExceptionResult | None = ..., + await_termination: global___StreamingQueryCommandResult.AwaitTerminationResult | None = ..., ) -> None: ... 
def HasField( self, field_name: typing_extensions.Literal[ + "await_termination", + b"await_termination", + "exception", + b"exception", "explain", b"explain", "query_id", @@ -1062,6 +1188,10 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): def ClearField( self, field_name: typing_extensions.Literal[ + "await_termination", + b"await_termination", + "exception", + b"exception", "explain", b"explain", "query_id", @@ -1076,6 +1206,8 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): ) -> None: ... def WhichOneof( self, oneof_group: typing_extensions.Literal["result_type", b"result_type"] - ) -> typing_extensions.Literal["status", "recent_progress", "explain"] | None: ... + ) -> typing_extensions.Literal[ + "status", "recent_progress", "explain", "exception", "await_termination" + ] | None: ... global___StreamingQueryCommandResult = StreamingQueryCommandResult diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 118e724915184..4c5a3ff10f5be 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -16,6 +16,7 @@ # import json +import time from typing import TYPE_CHECKING, Any, cast, Dict, List, Optional from pyspark.errors import StreamingQueryException @@ -23,6 +24,9 @@ from pyspark.sql.streaming.query import ( StreamingQuery as PySparkStreamingQuery, ) +from pyspark.errors.exceptions.captured import ( + StreamingQueryException as CapturedStreamingQueryException, +) __all__ = [ "StreamingQuery", # TODO(SPARK-43032): "StreamingQueryManager" @@ -65,8 +69,31 @@ def isActive(self) -> bool: isActive.__doc__ = PySparkStreamingQuery.isActive.__doc__ + def _execute_await_termination_cmd(self, timeout: int = 10) -> bool: + cmd = pb2.StreamingQueryCommand() + cmd.await_termination.timeout_ms = timeout + terminated = self._execute_streaming_query_cmd(cmd).await_termination.terminated + return terminated + + def _await_termination(self, timeoutMs: Optional[int]) -> Optional[bool]: + terminated = False + if timeoutMs is None: + while not terminated: + terminated = self._execute_await_termination_cmd() + else: + reqTimeoutMs = min(timeoutMs, 10) + while timeoutMs > 0 and not terminated: + start = time.time() + terminated = self._execute_await_termination_cmd(reqTimeoutMs) + end = time.time() + timeoutMs = (end - start) * 1000 + return terminated + def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: - raise NotImplementedError() + if timeout is not None: + if not isinstance(timeout, (int, float)) or timeout < 0: + raise ValueError("timeout must be a positive integer or float. 
Got %s" % timeout) + return self._await_termination(int(timeout * 1000)) awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__ @@ -128,10 +155,12 @@ def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True result = self._execute_streaming_query_cmd(cmd).exception.result - if result == "": + if not result.has_exception: return None else: - return result + msg = result.message.split(": ", 1)[1] # Drop the Java StreamingQueryException type info + stackTrace = "\n\t at ".join(map(lambda x: x.toString(), result.stack_trace)) + return CapturedStreamingQueryException(msg, stackTrace, result.cause) exception.__doc__ = PySparkStreamingQuery.exception.__doc__ From 1f3ba9454b8d10ce566efb210428f50402353465 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 13 Apr 2023 14:10:19 -0700 Subject: [PATCH 15/31] done --- .../main/protobuf/spark/connect/commands.proto | 4 +--- .../connect/planner/SparkConnectPlanner.scala | 10 ++++------ .../connect/service/SparkConnectService.scala | 8 +++++++- python/pyspark/sql/connect/client.py | 2 +- python/pyspark/sql/connect/streaming/query.py | 16 +++++++--------- .../connect/streaming/test_parity_streaming.py | 6 +----- .../sql/tests/streaming/test_streaming.py | 16 +++++++++++++--- .../sql/streaming/StreamingQueryManager.scala | 6 +++--- 8 files changed, 37 insertions(+), 31 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index c12f5a2816523..43dfea78889ce 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -308,9 +308,7 @@ message StreamingQueryCommandResult { message ExceptionResult { // Exception as string bool has_exception = 1; - optional string message = 2; - repeated string stack_trace = 3; - optional string cause = 4; + optional string error_message = 2; } message AwaitTerminationResult { diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index dd1d58d611708..9d4401207a97a 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import com.google.common.collect.{Lists, Maps} -import com.google.protobuf.{Any => ProtoAny, ByteString} +import com.google.protobuf.{ByteString, Any => ProtoAny} import io.grpc.stub.StreamObserver import org.apache.spark.{Partition, SparkEnv, TaskContext} @@ -39,7 +39,7 @@ import org.apache.spark.connect.proto.WriteStreamOperationStartResult import org.apache.spark.ml.{functions => MLFunctions} import org.apache.spark.sql.{Column, Dataset, Encoders, SparkSession} import org.apache.spark.sql.avro.{AvroDataToCatalyst, CatalystDataToAvro} -import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier} +import org.apache.spark.sql.catalyst.{AliasIdentifier, FunctionIdentifier, expressions} import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, ParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDeserializer, UnresolvedExtractValue, 
UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ @@ -52,7 +52,7 @@ import org.apache.spark.sql.connect.artifact.SparkConnectArtifactManager import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, InvalidPlanInput, LiteralValueProtoConverter, StorageLevelProtoConverter, UdfPacket} import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_SIZE import org.apache.spark.sql.connect.plugin.SparkConnectPluginRegistry -import org.apache.spark.sql.connect.service.SparkConnectStreamHandler +import org.apache.spark.sql.connect.service.{SparkConnectService, SparkConnectStreamHandler} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.arrow.ArrowConverters @@ -2226,9 +2226,7 @@ class SparkConnectPlanner(val session: SparkSession) { StreamingQueryCommandResult.ExceptionResult .newBuilder() .setHasException(true) - .setMessage(e.getMessage) - .addAllStackTrace(e.getStackTrace.map(_.toString).toIterable.asJava) - .setCause(e.getCause.toString) + .setErrorMessage(SparkConnectService.extractErrorMessage(e)) .build() case None => StreamingQueryCommandResult.ExceptionResult diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala index a4474ac64c14b..6752c476e6e52 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -56,6 +56,8 @@ class SparkConnectService(debug: Boolean) extends proto.SparkConnectServiceGrpc.SparkConnectServiceImplBase with Logging { + import SparkConnectService._ + private def allClasses(cl: Class[_]): Seq[Class[_]] = { val classes = ArrayBuffer.empty[Class[_]] if (cl != null && !cl.equals(classOf[java.lang.Object])) { @@ -85,7 +87,7 @@ class SparkConnectService(debug: Boolean) .setDomain("org.apache.spark") .putMetadata("classes", compact(render(allClasses(st.getClass).map(_.getName)))) .build())) - .setMessage(StringUtils.abbreviate(st.getMessage, 2048)) + .setMessage(extractErrorMessage(st)) .build() } @@ -294,4 +296,8 @@ object SparkConnectService { } } } + + def extractErrorMessage(st: Throwable): String = { + StringUtils.abbreviate(st.getMessage, 2048) + } } diff --git a/python/pyspark/sql/connect/client.py b/python/pyspark/sql/connect/client.py index 0f7b506b4fb12..87cdfc3d17260 100644 --- a/python/pyspark/sql/connect/client.py +++ b/python/pyspark/sql/connect/client.py @@ -1090,13 +1090,13 @@ def _handle_rpc_error(self, rpc_error: grpc.RpcError) -> NoReturn: # We have to cast the value here because, a RpcError is a Call as well. 
# https://grpc.github.io/grpc/python/grpc.html#grpc.UnaryUnaryMultiCallable.__call__ status = rpc_status.from_call(cast(grpc.Call, rpc_error)) + print("wei===1") if status: for d in status.details: if d.Is(error_details_pb2.ErrorInfo.DESCRIPTOR): info = error_details_pb2.ErrorInfo() d.Unpack(info) raise convert_exception(info, status.message) from None - raise SparkConnectGrpcException(status.message) from None else: raise SparkConnectGrpcException(str(rpc_error)) from None diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index f9f0300448c93..d4f8f3f2ce59b 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -25,9 +25,7 @@ from pyspark.sql.streaming.query import ( StreamingQuery as PySparkStreamingQuery, ) -from pyspark.errors.exceptions.captured import ( - StreamingQueryException as CapturedStreamingQueryException, -) +from pyspark.errors.exceptions.connect import StreamingQueryException as CapturedStreamingQueryException __all__ = [ "StreamingQuery", # TODO(SPARK-43032): "StreamingQueryManager" @@ -80,14 +78,16 @@ def _await_termination(self, timeoutMs: Optional[int]) -> Optional[bool]: terminated = False if timeoutMs is None: while not terminated: + # When no timeout is set, query the server every 10ms until query terminates terminated = self._execute_await_termination_cmd() else: reqTimeoutMs = min(timeoutMs, 10) while timeoutMs > 0 and not terminated: + # When timeout is set, query the server every reqTimeoutMs until query terminates or timout start = time.time() terminated = self._execute_await_termination_cmd(reqTimeoutMs) end = time.time() - timeoutMs = (end - start) * 1000 + timeoutMs -= (end - start) * 1000 return terminated def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: @@ -156,13 +156,11 @@ def explain(self, extended: bool = False) -> None: def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True - result = self._execute_streaming_query_cmd(cmd).exception.result - if not result.has_exception: + exception = self._execute_streaming_query_cmd(cmd).exception + if not exception.has_exception: return None else: - msg = result.message.split(": ", 1)[1] # Drop the Java StreamingQueryException type info - stackTrace = "\n\t at ".join(map(lambda x: x.toString(), result.stack_trace)) - return CapturedStreamingQueryException(msg, stackTrace, result.cause) + return CapturedStreamingQueryException(exception.error_message) exception.__doc__ = PySparkStreamingQuery.exception.__doc__ diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index 6b4460bab5216..fee9539e307e7 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -22,14 +22,10 @@ class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): - @unittest.skip("Will be supported with SPARK-42960.") + @unittest.skip("Query manager API will be supported later with SPARK-43032.") def test_stream_await_termination(self): super().test_stream_await_termination() - @unittest.skip("Will be supported with SPARK-42960.") - def test_stream_exception(self): - super().test_stream_exception() - @unittest.skip("Query manager API will be supported later with SPARK-43032.") def test_stream_status_and_progress(self): 
super().test_stream_status_and_progress() diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 838d413a0cc3c..a200742abccf5 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -24,7 +24,7 @@ from pyspark.sql.functions import lit from pyspark.sql.types import StructType, StructField, IntegerType, StringType from pyspark.testing.sqlutils import ReusedSQLTestCase - +from pyspark.errors.exceptions.connect import SparkConnectException class StreamingTestsMixin: def test_streaming_query_functions_basic(self): @@ -285,11 +285,21 @@ def test_stream_exception(self): # This is expected self._assert_exception_tree_contains_msg(e, "ZeroDivisionError") finally: + exception = sq.exception() sq.stop() - self.assertIsInstance(sq.exception(), StreamingQueryException) - self._assert_exception_tree_contains_msg(sq.exception(), "ZeroDivisionError") + self.assertIsInstance(exception, StreamingQueryException) + self._assert_exception_tree_contains_msg(exception, "ZeroDivisionError") def _assert_exception_tree_contains_msg(self, exception, msg): + if isinstance(exception, SparkConnectException): + self._assert_exception_tree_contains_msg_connect(exception, msg) + else: + self._assert_exception_tree_contains_msg_default(exception, msg) + + def _assert_exception_tree_contains_msg_connect(self, exception, msg): + self.assertTrue(msg in exception.message, "Exception tree doesn't contain the expected message: %s" % msg) + + def _assert_exception_tree_contains_msg_default(self, exception, msg): e = exception contains = msg in e.desc while e.cause is not None and not contains: diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 20254dec3d874..5063266f62779 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -429,9 +429,9 @@ class StreamingQueryManager private[sql] ( private def unregisterTerminatedStream(terminatedQuery: StreamingQuery): Unit = { activeQueriesSharedLock.synchronized { // remove from shared state only if the streaming execution also matches - sparkSession.sharedState.activeStreamingQueries.remove( - terminatedQuery.id, terminatedQuery) - activeQueries -= terminatedQuery.id +// sparkSession.sharedState.activeStreamingQueries.remove( +// terminatedQuery.id, terminatedQuery) +// activeQueries -= terminatedQuery.id } } } From 0c2776a7863b0ec2dd7001638a65840568453a89 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Thu, 13 Apr 2023 14:29:06 -0700 Subject: [PATCH 16/31] minor --- .../connect/planner/SparkConnectPlanner.scala | 4 +-- python/pyspark/sql/connect/client.py | 2 +- python/pyspark/sql/connect/streaming/query.py | 1 - .../streaming/test_parity_streaming.py | 6 ++++ .../sql/tests/streaming/test_streaming.py | 34 ------------------- .../sql/streaming/StreamingQueryManager.scala | 6 ++-- 6 files changed, 12 insertions(+), 41 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 2c64ab71a1d0a..b5bf22a4e4034 100644 --- 
a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -21,7 +21,7 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import com.google.common.collect.{Lists, Maps} -import com.google.protobuf.{ByteString, Any => ProtoAny} +import com.google.protobuf.{Any => ProtoAny, ByteString} import io.grpc.stub.StreamObserver import org.apache.spark.{Partition, SparkEnv, TaskContext} @@ -39,7 +39,7 @@ import org.apache.spark.connect.proto.WriteStreamOperationStartResult import org.apache.spark.ml.{functions => MLFunctions} import org.apache.spark.sql.{Column, Dataset, Encoders, SparkSession} import org.apache.spark.sql.avro.{AvroDataToCatalyst, CatalystDataToAvro} -import org.apache.spark.sql.catalyst.{AliasIdentifier, FunctionIdentifier, expressions} +import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier} import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, ParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedDeserializer, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ diff --git a/python/pyspark/sql/connect/client.py b/python/pyspark/sql/connect/client.py index 600d606ba9c73..756b62fb06d86 100644 --- a/python/pyspark/sql/connect/client.py +++ b/python/pyspark/sql/connect/client.py @@ -1107,13 +1107,13 @@ def _handle_rpc_error(self, rpc_error: grpc.RpcError) -> NoReturn: # We have to cast the value here because, a RpcError is a Call as well. # https://grpc.github.io/grpc/python/grpc.html#grpc.UnaryUnaryMultiCallable.__call__ status = rpc_status.from_call(cast(grpc.Call, rpc_error)) - print("wei===1") if status: for d in status.details: if d.Is(error_details_pb2.ErrorInfo.DESCRIPTOR): info = error_details_pb2.ErrorInfo() d.Unpack(info) raise convert_exception(info, status.message) from None + raise SparkConnectGrpcException(status.message) from None else: raise SparkConnectGrpcException(str(rpc_error)) from None diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 7d25045e457cb..79c83d07c665c 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -155,7 +155,6 @@ def explain(self, extended: bool = False) -> None: explain.__doc__ = PySparkStreamingQuery.explain.__doc__ - # TODO (SPARK-42960): Implement and uncomment the doc def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index fee9539e307e7..21f43181552a1 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -26,6 +26,12 @@ class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): def test_stream_await_termination(self): super().test_stream_await_termination() + @unittest.skip( + "Query immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." 
+ ) + def test_stream_exception(self): + super().test_stream_exception() + @unittest.skip("Query manager API will be supported later with SPARK-43032.") def test_stream_status_and_progress(self): super().test_stream_status_and_progress() diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 7a358f66b2569..03a6699ee431c 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -27,40 +27,6 @@ from pyspark.errors.exceptions.connect import SparkConnectException -class StreamingTestsMixin: - def test_streaming_query_functions_basic(self): - df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load() - query = ( - df.writeStream.format("memory") - .queryName("test_streaming_query_functions_basic") - .start() - ) - try: - self.assertEquals(query.name, "test_streaming_query_functions_basic") - self.assertTrue(isinstance(query.id, str)) - self.assertTrue(isinstance(query.runId, str)) - self.assertTrue(query.isActive) - # TODO: Will be uncommented with [SPARK-42960] - # self.assertEqual(query.exception(), None) - # self.assertFalse(query.awaitTermination(1)) - query.processAllAvailable() - recentProgress = query.recentProgress - lastProgress = query.lastProgress - self.assertEqual(lastProgress["name"], query.name) - self.assertEqual(lastProgress["id"], query.id) - self.assertTrue(any(p == lastProgress for p in recentProgress)) - query.explain() - - except Exception as e: - self.fail( - "Streaming query functions sanity check shouldn't throw any error. " - "Error message: " + str(e) - ) - - finally: - query.stop() - - class StreamingTestsMixin: def test_streaming_query_functions_basic(self): df = self.spark.readStream.format("rate").option("rowsPerSecond", 10).load() diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 5063266f62779..20254dec3d874 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -429,9 +429,9 @@ class StreamingQueryManager private[sql] ( private def unregisterTerminatedStream(terminatedQuery: StreamingQuery): Unit = { activeQueriesSharedLock.synchronized { // remove from shared state only if the streaming execution also matches -// sparkSession.sharedState.activeStreamingQueries.remove( -// terminatedQuery.id, terminatedQuery) -// activeQueries -= terminatedQuery.id + sparkSession.sharedState.activeStreamingQueries.remove( + terminatedQuery.id, terminatedQuery) + activeQueries -= terminatedQuery.id } } } From da6fc21314b7e63131d6a661fb1dfcbf307fc4fb Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Fri, 14 Apr 2023 10:01:43 -0700 Subject: [PATCH 17/31] address comments --- .../protobuf/spark/connect/commands.proto | 15 ++-- .../connect/planner/SparkConnectPlanner.scala | 32 ++++----- python/pyspark/sql/connect/plan.py | 7 +- .../pyspark/sql/connect/proto/commands_pb2.py | 42 +++++------ .../sql/connect/proto/commands_pb2.pyi | 69 ++++++++++++++----- python/pyspark/sql/connect/streaming/query.py | 36 ++++------ python/pyspark/sql/streaming/query.py | 2 +- .../sql/tests/streaming/test_streaming.py | 6 +- 8 files changed, 121 insertions(+), 88 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto 
b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index 503dae8153978..a8097bfe3e23e 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -272,7 +272,10 @@ message StreamingQueryCommand { } message AwaitTerminationCommand { - int64 timeout_ms = 1; + oneof timeout { + bool no_timeout = 1; + int64 timeout_ms = 2; + } } } @@ -308,13 +311,15 @@ message StreamingQueryCommandResult { } message ExceptionResult { - // Exception as string - bool has_exception = 1; - optional string error_message = 2; + // Exception message as string + oneof exception_content { + bool no_exception = 1; + string exception_message = 2; + } } message AwaitTerminationResult { - bool terminated = 1; + optional bool terminated = 1; } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index b5bf22a4e4034..d7580114de3c3 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2223,28 +2223,26 @@ class SparkConnectPlanner(val session: SparkSession) { case StreamingQueryCommand.CommandCase.EXCEPTION => val result = query.exception - val exception = result match { + result match { case Some(e) => - StreamingQueryCommandResult.ExceptionResult - .newBuilder() - .setHasException(true) - .setErrorMessage(SparkConnectService.extractErrorMessage(e)) - .build() + respBuilder + .getExceptionBuilder + .setExceptionMessage(SparkConnectService.extractErrorMessage(e)) case None => - StreamingQueryCommandResult.ExceptionResult - .newBuilder() - .setHasException(false) - .build() + respBuilder + .getExceptionBuilder + .setNoException(true) } - respBuilder.setException(exception) case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => - val terminated = query.awaitTermination(command.getAwaitTermination.getTimeoutMs) - val terminatedResult = StreamingQueryCommandResult.AwaitTerminationResult - .newBuilder() - .setTerminated(terminated) - .build() - respBuilder.setAwaitTermination(terminatedResult) + if (command.getAwaitTermination.hasTimeoutMs) { + val terminated = query.awaitTermination(command.getAwaitTermination.getTimeoutMs) + respBuilder + .getAwaitTerminationBuilder + .setTerminated(terminated) + } else { + query.awaitTermination() + } case StreamingQueryCommand.CommandCase.COMMAND_NOT_SET => throw new IllegalArgumentException("Missing command in StreamingQueryCommand") diff --git a/python/pyspark/sql/connect/plan.py b/python/pyspark/sql/connect/plan.py index 9e8770a80da94..06e1e38739892 100644 --- a/python/pyspark/sql/connect/plan.py +++ b/python/pyspark/sql/connect/plan.py @@ -307,7 +307,12 @@ def plan(self, session: "SparkConnectClient") -> proto.Relation: class Read(LogicalPlan): - def __init__(self, table_name: str, options: Optional[Dict[str, str]] = None) -> None: + def __init__( + self, + table_name: str, + options: Optional[Dict[str, str]] = None, + is_streaming: Optional[bool] = None, + ) -> None: super().__init__(None) self.table_name = table_name self.options = options or {} diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index e3656c0b1c0b0..d2ffffd4ce3d1 100644 --- 
a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -36,7 +36,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 
\x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xe4\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 
\x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1a\x38\n\x17\x41waitTerminationCommand\x12\x1d\n\ntimeout_ms\x18\x01 \x01(\x03R\ttimeoutMsB\t\n\x07\x63ommand"\xa1\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1ar\n\x0f\x45xceptionResult\x12#\n\rhas_exception\x18\x01 \x01(\x08R\x0chasException\x12(\n\rerror_message\x18\x02 \x01(\tH\x00R\x0c\x65rrorMessage\x88\x01\x01\x42\x10\n\x0e_error_message\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' + b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 
\x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\x92\x05\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1a\x66\n\x17\x41waitTerminationCommand\x12\x1f\n\nno_timeout\x18\x01 \x01(\x08H\x00R\tnoTimeout\x12\x1f\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMsB\t\n\x07timeoutB\t\n\x07\x63ommand"\xbd\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 
\x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1az\n\x0f\x45xceptionResult\x12#\n\x0cno_exception\x18\x01 \x01(\x08H\x00R\x0bnoException\x12-\n\x11\x65xception_message\x18\x02 \x01(\tH\x00R\x10\x65xceptionMessageB\x13\n\x11\x65xception_content\x1aL\n\x16\x41waitTerminationResult\x12#\n\nterminated\x18\x01 \x01(\x08H\x00R\nterminated\x88\x01\x01\x42\r\n\x0b_terminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' ) @@ -429,27 +429,27 @@ _STREAMINGQUERYINSTANCEID._serialized_start = 3926 _STREAMINGQUERYINSTANCEID._serialized_end = 3991 _STREAMINGQUERYCOMMAND._serialized_start = 3994 - _STREAMINGQUERYCOMMAND._serialized_end = 4606 + _STREAMINGQUERYCOMMAND._serialized_end = 4652 _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 4493 _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 4537 _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 4539 - _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4595 - _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4609 - _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5666 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5192 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5362 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5364 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 5436 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 5438 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 5477 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5479 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5593 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5595 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5651 - _GETRESOURCESCOMMAND._serialized_start = 5668 - _GETRESOURCESCOMMAND._serialized_end = 5689 - _GETRESOURCESCOMMANDRESULT._serialized_start = 5692 - _GETRESOURCESCOMMANDRESULT._serialized_end = 5904 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5808 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5904 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4641 + _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4655 + _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5740 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5238 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5408 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5410 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 5482 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 5484 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 5523 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5525 + 
_STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5647 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5649 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5725 + _GETRESOURCESCOMMAND._serialized_start = 5742 + _GETRESOURCESCOMMAND._serialized_end = 5763 + _GETRESOURCESCOMMANDRESULT._serialized_start = 5766 + _GETRESOURCESCOMMANDRESULT._serialized_end = 5978 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5882 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5978 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index feeaac85d24a7..08200c9d97664 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -879,16 +879,31 @@ class StreamingQueryCommand(google.protobuf.message.Message): class AwaitTerminationCommand(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor + NO_TIMEOUT_FIELD_NUMBER: builtins.int TIMEOUT_MS_FIELD_NUMBER: builtins.int + no_timeout: builtins.bool timeout_ms: builtins.int def __init__( self, *, + no_timeout: builtins.bool = ..., timeout_ms: builtins.int = ..., ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "no_timeout", b"no_timeout", "timeout", b"timeout", "timeout_ms", b"timeout_ms" + ], + ) -> builtins.bool: ... def ClearField( - self, field_name: typing_extensions.Literal["timeout_ms", b"timeout_ms"] + self, + field_name: typing_extensions.Literal[ + "no_timeout", b"no_timeout", "timeout", b"timeout", "timeout_ms", b"timeout_ms" + ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["timeout", b"timeout"] + ) -> typing_extensions.Literal["no_timeout", "timeout_ms"] | None: ... QUERY_ID_FIELD_NUMBER: builtins.int STATUS_FIELD_NUMBER: builtins.int @@ -1074,37 +1089,41 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): class ExceptionResult(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - HAS_EXCEPTION_FIELD_NUMBER: builtins.int - ERROR_MESSAGE_FIELD_NUMBER: builtins.int - has_exception: builtins.bool - """Exception as string""" - error_message: builtins.str + NO_EXCEPTION_FIELD_NUMBER: builtins.int + EXCEPTION_MESSAGE_FIELD_NUMBER: builtins.int + no_exception: builtins.bool + exception_message: builtins.str def __init__( self, *, - has_exception: builtins.bool = ..., - error_message: builtins.str | None = ..., + no_exception: builtins.bool = ..., + exception_message: builtins.str = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ - "_error_message", b"_error_message", "error_message", b"error_message" + "exception_content", + b"exception_content", + "exception_message", + b"exception_message", + "no_exception", + b"no_exception", ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "_error_message", - b"_error_message", - "error_message", - b"error_message", - "has_exception", - b"has_exception", + "exception_content", + b"exception_content", + "exception_message", + b"exception_message", + "no_exception", + b"no_exception", ], ) -> None: ... def WhichOneof( - self, oneof_group: typing_extensions.Literal["_error_message", b"_error_message"] - ) -> typing_extensions.Literal["error_message"] | None: ... 
+ self, oneof_group: typing_extensions.Literal["exception_content", b"exception_content"] + ) -> typing_extensions.Literal["no_exception", "exception_message"] | None: ... class AwaitTerminationResult(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor @@ -1114,11 +1133,23 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): def __init__( self, *, - terminated: builtins.bool = ..., + terminated: builtins.bool | None = ..., ) -> None: ... + def HasField( + self, + field_name: typing_extensions.Literal[ + "_terminated", b"_terminated", "terminated", b"terminated" + ], + ) -> builtins.bool: ... def ClearField( - self, field_name: typing_extensions.Literal["terminated", b"terminated"] + self, + field_name: typing_extensions.Literal[ + "_terminated", b"_terminated", "terminated", b"terminated" + ], ) -> None: ... + def WhichOneof( + self, oneof_group: typing_extensions.Literal["_terminated", b"_terminated"] + ) -> typing_extensions.Literal["terminated"] | None: ... QUERY_ID_FIELD_NUMBER: builtins.int STATUS_FIELD_NUMBER: builtins.int diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 79c83d07c665c..3a833a3295560 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -70,33 +70,23 @@ def isActive(self) -> bool: isActive.__doc__ = PySparkStreamingQuery.isActive.__doc__ - def _execute_await_termination_cmd(self, timeout: int = 10) -> bool: + def _execute_await_termination_cmd(self, timeoutMs: Optional[int] = None) -> Optional[bool]: cmd = pb2.StreamingQueryCommand() - cmd.await_termination.timeout_ms = timeout - terminated = self._execute_streaming_query_cmd(cmd).await_termination.terminated - return terminated - - def _await_termination(self, timeoutMs: Optional[int]) -> Optional[bool]: - terminated = False - if timeoutMs is None: - while not terminated: - # When no timeout is set, query the server every 10ms until query terminates - terminated = self._execute_await_termination_cmd() - else: - reqTimeoutMs = min(timeoutMs, 10) - while timeoutMs > 0 and not terminated: - # When timeout is set, query the server every reqTimeoutMs until query terminates or timout - start = time.time() - terminated = self._execute_await_termination_cmd(reqTimeoutMs) - end = time.time() - timeoutMs -= (end - start) * 1000 + if timeoutMs is not None: + cmd.await_termination.timeout_ms = timeoutMs + terminated = self._execute_streaming_query_cmd(cmd).await_termination.terminated return terminated + else: + cmd.await_termination.no_timeout = True + self._execute_streaming_query_cmd(cmd) def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: if timeout is not None: - if not isinstance(timeout, (int, float)) or timeout < 0: + if not isinstance(timeout, (int, float)) or timeout <= 0: raise ValueError("timeout must be a positive integer or float. 
Got %s" % timeout) - return self._await_termination(int(timeout * 1000)) + return self._execute_await_termination_cmd(int(timeout * 1000)) + else: + return self._execute_await_termination_cmd() awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__ @@ -159,10 +149,10 @@ def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True exception = self._execute_streaming_query_cmd(cmd).exception - if not exception.has_exception: + if exception.no_exception: return None else: - return CapturedStreamingQueryException(exception.error_message) + return CapturedStreamingQueryException(exception.exception_message) exception.__doc__ = PySparkStreamingQuery.exception.__doc__ diff --git a/python/pyspark/sql/streaming/query.py b/python/pyspark/sql/streaming/query.py index b902f0514fce9..aada8265dd2d4 100644 --- a/python/pyspark/sql/streaming/query.py +++ b/python/pyspark/sql/streaming/query.py @@ -196,7 +196,7 @@ def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: >>> sq.stop() """ if timeout is not None: - if not isinstance(timeout, (int, float)) or timeout < 0: + if not isinstance(timeout, (int, float)) or timeout <= 0: raise ValueError("timeout must be a positive integer or float. Got %s" % timeout) return self._jsq.awaitTermination(int(timeout * 1000)) else: diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 03a6699ee431c..2e51c354447f0 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -253,8 +253,12 @@ def test_stream_await_termination(self): duration = time.time() - now self.assertTrue(duration >= 2) self.assertFalse(res) - finally: + q.processAllAvailable() + q.stop() + q.awaitTermination() + self.assertFalse(q.isActive) + finally: q.stop() shutil.rmtree(tmpPath) From 7a377ec2a31411dc7220993c0183f7c77883d78c Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 17 Apr 2023 16:14:36 -0700 Subject: [PATCH 18/31] style --- .../spark/sql/connect/planner/SparkConnectPlanner.scala | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 69a7bb95a8b32..773797ffbae15 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2226,20 +2226,17 @@ class SparkConnectPlanner(val session: SparkSession) { val result = query.exception result match { case Some(e) => - respBuilder - .getExceptionBuilder + respBuilder.getExceptionBuilder .setExceptionMessage(SparkConnectService.extractErrorMessage(e)) case None => - respBuilder - .getExceptionBuilder + respBuilder.getExceptionBuilder .setNoException(true) } case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => if (command.getAwaitTermination.hasTimeoutMs) { val terminated = query.awaitTermination(command.getAwaitTermination.getTimeoutMs) - respBuilder - .getAwaitTerminationBuilder + respBuilder.getAwaitTerminationBuilder .setTerminated(terminated) } else { query.awaitTermination() From be764382dc1ede952fd1aa10f8d9c18e47ad0c20 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Mon, 17 Apr 2023 23:17:37 -0700 Subject: [PATCH 19/31] 
wip --- .../src/main/protobuf/spark/connect/commands.proto | 5 +---- .../sql/connect/planner/SparkConnectPlanner.scala | 10 ++-------- python/pyspark/sql/connect/streaming/query.py | 8 ++++---- .../tests/connect/streaming/test_parity_streaming.py | 10 +++++----- .../spark/sql/streaming/StreamingQueryManager.scala | 4 ++-- 5 files changed, 14 insertions(+), 23 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index a8097bfe3e23e..f45596273b00e 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -312,10 +312,7 @@ message StreamingQueryCommandResult { message ExceptionResult { // Exception message as string - oneof exception_content { - bool no_exception = 1; - string exception_message = 2; - } + optional string exception_message = 1; } message AwaitTerminationResult { diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 773797ffbae15..84a9c5a8818d6 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2224,14 +2224,8 @@ class SparkConnectPlanner(val session: SparkSession) { case StreamingQueryCommand.CommandCase.EXCEPTION => val result = query.exception - result match { - case Some(e) => - respBuilder.getExceptionBuilder - .setExceptionMessage(SparkConnectService.extractErrorMessage(e)) - case None => - respBuilder.getExceptionBuilder - .setNoException(true) - } + result.foreach(e => respBuilder.getExceptionBuilder + .setExceptionMessage(SparkConnectService.extractErrorMessage(e))) case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => if (command.getAwaitTermination.hasTimeoutMs) { diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 3a833a3295560..be209c4ec0d3c 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -148,11 +148,11 @@ def explain(self, extended: bool = False) -> None: def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True - exception = self._execute_streaming_query_cmd(cmd).exception - if exception.no_exception: - return None + response = self._execute_streaming_query_cmd(cmd) + if response.HasField("exception") and response.exception.HasField("exception_message"): + return CapturedStreamingQueryException(response.exception.exception_message) else: - return CapturedStreamingQueryException(exception.exception_message) + return None exception.__doc__ = PySparkStreamingQuery.exception.__doc__ diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index 90caea4dbc9fc..9824a61909150 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -26,11 +26,11 @@ class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): def test_stream_await_termination(self): super().test_stream_await_termination() - @unittest.skip( - "Query 
immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." - ) - def test_stream_exception(self): - super().test_stream_exception() + # @unittest.skip( + # "Query immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." + # ) + # def test_stream_exception(self): + # super().test_stream_exception() @unittest.skip("Query manager API will be supported later with SPARK-43032.") def test_stream_status_and_progress(self): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 20254dec3d874..48b213bc0d3f6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -408,7 +408,7 @@ class StreamingQueryManager private[sql] ( query.streamingQuery.start() } catch { case e: Throwable => - unregisterTerminatedStream(query) + // unregisterTerminatedStream(query) throw e } query @@ -416,7 +416,7 @@ class StreamingQueryManager private[sql] ( /** Notify (by the StreamingQuery) that the query has been terminated */ private[sql] def notifyQueryTermination(terminatedQuery: StreamingQuery): Unit = { - unregisterTerminatedStream(terminatedQuery) + // unregisterTerminatedStream(terminatedQuery) awaitTerminationLock.synchronized { if (lastTerminatedQueryException == null || terminatedQuery.exception.nonEmpty) { lastTerminatedQueryException = terminatedQuery.exception From 7045f9dd0ed97fb3983f264916f154e9393a0795 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 00:05:09 -0700 Subject: [PATCH 20/31] adderss comments, change exception_message as optional --- .../sql/connect/planner/SparkConnectPlanner.scala | 5 +++-- python/pyspark/sql/connect/streaming/query.py | 6 +++--- .../tests/connect/streaming/test_parity_streaming.py | 10 +++++----- .../spark/sql/streaming/StreamingQueryManager.scala | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 39b496e09637c..9718cd886ae77 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2247,8 +2247,9 @@ class SparkConnectPlanner(val session: SparkSession) { case StreamingQueryCommand.CommandCase.EXCEPTION => val result = query.exception - result.foreach(e => respBuilder.getExceptionBuilder - .setExceptionMessage(SparkConnectService.extractErrorMessage(e))) + result.foreach(e => + respBuilder.getExceptionBuilder + .setExceptionMessage(SparkConnectService.extractErrorMessage(e))) case StreamingQueryCommand.CommandCase.AWAIT_TERMINATION => if (command.getAwaitTermination.hasTimeoutMs) { diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index be209c4ec0d3c..53f50410a7477 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -148,9 +148,9 @@ def explain(self, extended: bool = False) -> None: def exception(self) -> Optional[StreamingQueryException]: cmd = pb2.StreamingQueryCommand() cmd.exception = True - response = 
self._execute_streaming_query_cmd(cmd) - if response.HasField("exception") and response.exception.HasField("exception_message"): - return CapturedStreamingQueryException(response.exception.exception_message) + exception = self._execute_streaming_query_cmd(cmd).exception + if exception.HasField("exception_message"): + return CapturedStreamingQueryException(exception.exception_message) else: return None diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index 9824a61909150..90caea4dbc9fc 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -26,11 +26,11 @@ class StreamingParityTests(StreamingTestsMixin, ReusedConnectTestCase): def test_stream_await_termination(self): super().test_stream_await_termination() - # @unittest.skip( - # "Query immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." - # ) - # def test_stream_exception(self): - # super().test_stream_exception() + @unittest.skip( + "Query immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." + ) + def test_stream_exception(self): + super().test_stream_exception() @unittest.skip("Query manager API will be supported later with SPARK-43032.") def test_stream_status_and_progress(self): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala index 48b213bc0d3f6..20254dec3d874 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/StreamingQueryManager.scala @@ -408,7 +408,7 @@ class StreamingQueryManager private[sql] ( query.streamingQuery.start() } catch { case e: Throwable => - // unregisterTerminatedStream(query) + unregisterTerminatedStream(query) throw e } query @@ -416,7 +416,7 @@ class StreamingQueryManager private[sql] ( /** Notify (by the StreamingQuery) that the query has been terminated */ private[sql] def notifyQueryTermination(terminatedQuery: StreamingQuery): Unit = { - // unregisterTerminatedStream(terminatedQuery) + unregisterTerminatedStream(terminatedQuery) awaitTerminationLock.synchronized { if (lastTerminatedQueryException == null || terminatedQuery.exception.nonEmpty) { lastTerminatedQueryException = terminatedQuery.exception From 83413c8044cbf34db67d37d4af55fb2a055a469e Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 10:24:48 -0700 Subject: [PATCH 21/31] lint --- python/pyspark/sql/connect/streaming/query.py | 1 - .../sql/tests/connect/streaming/test_parity_streaming.py | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index 53f50410a7477..f2bda432394c3 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -16,7 +16,6 @@ # import json -import time import sys from typing import TYPE_CHECKING, Any, cast, Dict, List, Optional diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index 90caea4dbc9fc..c0440503b6984 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ 
b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -27,7 +27,8 @@ def test_stream_await_termination(self): super().test_stream_await_termination() @unittest.skip( - "Query immediately quits after throw, allowing access to supported queries will be added in SPARK-42962." + "Query immediately quits after throw, " + + "allowing access to supported queries will be added in SPARK-42962." ) def test_stream_exception(self): super().test_stream_exception() From 8e848c972a7afe009a9b3828ab4470df77a658dd Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 11:07:08 -0700 Subject: [PATCH 22/31] reformat py --- .../sql/tests/connect/streaming/test_parity_streaming.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py index c0440503b6984..9419194a6e358 100644 --- a/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py +++ b/python/pyspark/sql/tests/connect/streaming/test_parity_streaming.py @@ -27,8 +27,8 @@ def test_stream_await_termination(self): super().test_stream_await_termination() @unittest.skip( - "Query immediately quits after throw, " + - "allowing access to supported queries will be added in SPARK-42962." + "Query immediately quits after throw, " + + "allowing access to supported queries will be added in SPARK-42962." ) def test_stream_exception(self): super().test_stream_exception() From ad73db6c3be82d202572ffdfc0da5f132448fde9 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 12:14:51 -0700 Subject: [PATCH 23/31] done --- .../protobuf/spark/connect/commands.proto | 5 +---- .../connect/planner/SparkConnectPlanner.scala | 2 ++ .../connect/service/SparkConnectService.scala | 2 +- python/pyspark/sql/connect/streaming/query.py | 20 ++++++++----------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index f45596273b00e..0cca65600a72b 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -272,10 +272,7 @@ message StreamingQueryCommand { } message AwaitTerminationCommand { - oneof timeout { - bool no_timeout = 1; - int64 timeout_ms = 2; - } + optional int64 timeout_ms = 2; } } diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala index 9718cd886ae77..a20b32298329a 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -2258,6 +2258,8 @@ class SparkConnectPlanner(val session: SparkSession) { .setTerminated(terminated) } else { query.awaitTermination() + respBuilder.getAwaitTerminationBuilder + .setTerminated(true) } case StreamingQueryCommand.CommandCase.COMMAND_NOT_SET => diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala index fa9cee85b96a2..aec7dcb1be0d8 100644 --- 
a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -126,7 +126,7 @@ class SparkConnectService(debug: Boolean) observer.onError( Status.UNKNOWN .withCause(e) - .withDescription(extractErrorMessage(e)) + .withDescription(StringUtils.abbreviate(e.getMessage, 2048)) .asRuntimeException()) } diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py index f2bda432394c3..a2b2e81357efc 100644 --- a/python/pyspark/sql/connect/streaming/query.py +++ b/python/pyspark/sql/connect/streaming/query.py @@ -69,23 +69,19 @@ def isActive(self) -> bool: isActive.__doc__ = PySparkStreamingQuery.isActive.__doc__ - def _execute_await_termination_cmd(self, timeoutMs: Optional[int] = None) -> Optional[bool]: - cmd = pb2.StreamingQueryCommand() - if timeoutMs is not None: - cmd.await_termination.timeout_ms = timeoutMs - terminated = self._execute_streaming_query_cmd(cmd).await_termination.terminated - return terminated - else: - cmd.await_termination.no_timeout = True - self._execute_streaming_query_cmd(cmd) - def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]: + cmd = pb2.StreamingQueryCommand() if timeout is not None: if not isinstance(timeout, (int, float)) or timeout <= 0: raise ValueError("timeout must be a positive integer or float. Got %s" % timeout) - return self._execute_await_termination_cmd(int(timeout * 1000)) + cmd.await_termination.timeout_ms = int(timeout * 1000) + terminated = self._execute_streaming_query_cmd(cmd).await_termination.terminated + return terminated else: - return self._execute_await_termination_cmd() + await_termination_cmd = pb2.StreamingQueryCommand.AwaitTerminationCommand() + cmd.await_termination.CopyFrom(await_termination_cmd) + self._execute_streaming_query_cmd(cmd) + return None awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__ From dd4d54b7cdaa25469ff77a12c3d3ad0eef9ebe78 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 12:48:10 -0700 Subject: [PATCH 24/31] regenerate proto files --- .../pyspark/sql/connect/proto/commands_pb2.py | 42 +++++++++---------- .../sql/connect/proto/commands_pb2.pyi | 36 +++++++--------- 2 files changed, 35 insertions(+), 43 deletions(-) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index d2ffffd4ce3d1..2474c3bc5eb1a 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -36,7 +36,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 
\x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 
\x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\x92\x05\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1a\x66\n\x17\x41waitTerminationCommand\x12\x1f\n\nno_timeout\x18\x01 \x01(\x08H\x00R\tnoTimeout\x12\x1f\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMsB\t\n\x07timeoutB\t\n\x07\x63ommand"\xbd\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 
\x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1az\n\x0f\x45xceptionResult\x12#\n\x0cno_exception\x18\x01 \x01(\x08H\x00R\x0bnoException\x12-\n\x11\x65xception_message\x18\x02 \x01(\tH\x00R\x10\x65xceptionMessageB\x13\n\x11\x65xception_content\x1aL\n\x16\x41waitTerminationResult\x12#\n\nterminated\x18\x01 \x01(\x08H\x00R\nterminated\x88\x01\x01\x42\r\n\x0b_terminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' + b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 
\x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 
\x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\x9c\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1aY\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x42\x14\n\x12_exception_message\x1aL\n\x16\x41waitTerminationResult\x12#\n\nterminated\x18\x01 \x01(\x08H\x00R\nterminated\x88\x01\x01\x42\r\n\x0b_terminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' ) @@ -429,27 +429,27 @@ _STREAMINGQUERYINSTANCEID._serialized_start = 3926 _STREAMINGQUERYINSTANCEID._serialized_end = 3991 _STREAMINGQUERYCOMMAND._serialized_start = 3994 - _STREAMINGQUERYCOMMAND._serialized_end = 4652 + 
_STREAMINGQUERYCOMMAND._serialized_end = 4626 _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_start = 4493 _STREAMINGQUERYCOMMAND_EXPLAINCOMMAND._serialized_end = 4537 _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 4539 - _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4641 - _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4655 - _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5740 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5238 - _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5408 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5410 - _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 5482 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 5484 - _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 5523 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5525 - _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5647 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5649 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5725 - _GETRESOURCESCOMMAND._serialized_start = 5742 - _GETRESOURCESCOMMAND._serialized_end = 5763 - _GETRESOURCESCOMMANDRESULT._serialized_start = 5766 - _GETRESOURCESCOMMANDRESULT._serialized_end = 5978 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5882 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5978 + _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4615 + _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4629 + _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5681 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5212 + _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5382 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5384 + _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_end = 5456 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_start = 5458 + _STREAMINGQUERYCOMMANDRESULT_EXPLAINRESULT._serialized_end = 5497 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5499 + _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5588 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5590 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5666 + _GETRESOURCESCOMMAND._serialized_start = 5683 + _GETRESOURCESCOMMAND._serialized_end = 5704 + _GETRESOURCESCOMMANDRESULT._serialized_start = 5707 + _GETRESOURCESCOMMANDRESULT._serialized_end = 5919 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5823 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5919 # @@protoc_insertion_point(module_scope) diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi index 08200c9d97664..22ca22783e023 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.pyi +++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi @@ -879,31 +879,28 @@ class StreamingQueryCommand(google.protobuf.message.Message): class AwaitTerminationCommand(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - NO_TIMEOUT_FIELD_NUMBER: builtins.int TIMEOUT_MS_FIELD_NUMBER: builtins.int - no_timeout: builtins.bool timeout_ms: builtins.int def __init__( self, *, - no_timeout: builtins.bool = ..., - timeout_ms: builtins.int = ..., + timeout_ms: builtins.int | None = ..., ) -> None: ... 
def HasField( self, field_name: typing_extensions.Literal[ - "no_timeout", b"no_timeout", "timeout", b"timeout", "timeout_ms", b"timeout_ms" + "_timeout_ms", b"_timeout_ms", "timeout_ms", b"timeout_ms" ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "no_timeout", b"no_timeout", "timeout", b"timeout", "timeout_ms", b"timeout_ms" + "_timeout_ms", b"_timeout_ms", "timeout_ms", b"timeout_ms" ], ) -> None: ... def WhichOneof( - self, oneof_group: typing_extensions.Literal["timeout", b"timeout"] - ) -> typing_extensions.Literal["no_timeout", "timeout_ms"] | None: ... + self, oneof_group: typing_extensions.Literal["_timeout_ms", b"_timeout_ms"] + ) -> typing_extensions.Literal["timeout_ms"] | None: ... QUERY_ID_FIELD_NUMBER: builtins.int STATUS_FIELD_NUMBER: builtins.int @@ -1089,41 +1086,36 @@ class StreamingQueryCommandResult(google.protobuf.message.Message): class ExceptionResult(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor - NO_EXCEPTION_FIELD_NUMBER: builtins.int EXCEPTION_MESSAGE_FIELD_NUMBER: builtins.int - no_exception: builtins.bool exception_message: builtins.str + """Exception message as string""" def __init__( self, *, - no_exception: builtins.bool = ..., - exception_message: builtins.str = ..., + exception_message: builtins.str | None = ..., ) -> None: ... def HasField( self, field_name: typing_extensions.Literal[ - "exception_content", - b"exception_content", + "_exception_message", + b"_exception_message", "exception_message", b"exception_message", - "no_exception", - b"no_exception", ], ) -> builtins.bool: ... def ClearField( self, field_name: typing_extensions.Literal[ - "exception_content", - b"exception_content", + "_exception_message", + b"_exception_message", "exception_message", b"exception_message", - "no_exception", - b"no_exception", ], ) -> None: ... def WhichOneof( - self, oneof_group: typing_extensions.Literal["exception_content", b"exception_content"] - ) -> typing_extensions.Literal["no_exception", "exception_message"] | None: ... + self, + oneof_group: typing_extensions.Literal["_exception_message", b"_exception_message"], + ) -> typing_extensions.Literal["exception_message"] | None: ... 
class AwaitTerminationResult(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor From 3b375ccbee20338af25feaa89bb9c82face5e444 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 12:49:20 -0700 Subject: [PATCH 25/31] minor --- python/pyspark/sql/tests/streaming/test_streaming.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pyspark/sql/tests/streaming/test_streaming.py b/python/pyspark/sql/tests/streaming/test_streaming.py index 2e51c354447f0..52fa19a864200 100644 --- a/python/pyspark/sql/tests/streaming/test_streaming.py +++ b/python/pyspark/sql/tests/streaming/test_streaming.py @@ -256,6 +256,7 @@ def test_stream_await_termination(self): q.processAllAvailable() q.stop() + # Sanity check when no parameter is set q.awaitTermination() self.assertFalse(q.isActive) finally: From 34c28e7762215ab783eba6d9757cb35eb815ef3d Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Tue, 18 Apr 2023 17:29:24 -0700 Subject: [PATCH 26/31] minor import import SparkConnectService._ --- .../spark/sql/connect/service/SparkConnectService.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala index aec7dcb1be0d8..86590569aaae8 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -56,8 +56,6 @@ class SparkConnectService(debug: Boolean) extends proto.SparkConnectServiceGrpc.SparkConnectServiceImplBase with Logging { - import SparkConnectService._ - private def allClasses(cl: Class[_]): Seq[Class[_]] = { val classes = ArrayBuffer.empty[Class[_]] if (cl != null && !cl.equals(classOf[java.lang.Object])) { @@ -87,7 +85,7 @@ class SparkConnectService(debug: Boolean) .setDomain("org.apache.spark") .putMetadata("classes", compact(render(allClasses(st.getClass).map(_.getName)))) .build())) - .setMessage(extractErrorMessage(st)) + .setMessage(SparkConnectService.extractErrorMessage(st)) .build() } From c487cd706ea16617379fc8a7fced9a09d5af68c0 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 19 Apr 2023 10:31:27 -0700 Subject: [PATCH 27/31] can you run tests one more time From 0c2133b80faca182385c6de2be0165c5a5e1cd60 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 19 Apr 2023 14:40:17 -0700 Subject: [PATCH 28/31] please pass From da1d3e1aa6dfff427bb448a0aaa9366af5b0f0d6 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 19 Apr 2023 16:53:49 -0700 Subject: [PATCH 29/31] minor --- .../apache/spark/sql/connect/service/SparkConnectService.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala index ae8877943a78f..5ad659f451e75 100644 --- a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -76,7 +76,6 @@ class SparkConnectService(debug: Boolean) } private def buildStatusFromThrowable(st: Throwable, stackTraceEnabled: Boolean): RPCStatus = { - val message = StringUtils.abbreviate(st.getMessage, 2048) val errorInfo = 
ErrorInfo .newBuilder() .setReason(st.getClass.getName) From eb19c2fea380cf6da6daa3525df8d92502add3a6 Mon Sep 17 00:00:00 2001 From: Wei Liu Date: Wed, 19 Apr 2023 17:36:36 -0700 Subject: [PATCH 30/31] remove return None, remove optional tag in awaitTerminationResult --- .../main/protobuf/spark/connect/commands.proto | 2 +- .../pyspark/sql/connect/proto/commands_pb2.py | 18 +++++++++--------- .../pyspark/sql/connect/proto/commands_pb2.pyi | 16 ++-------------- python/pyspark/sql/connect/streaming/query.py | 5 ----- 4 files changed, 12 insertions(+), 29 deletions(-) diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto index 0cca65600a72b..2b648bf0f9a5e 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -313,7 +313,7 @@ message StreamingQueryCommandResult { } message AwaitTerminationResult { - optional bool terminated = 1; + bool terminated = 1; } } diff --git a/python/pyspark/sql/connect/proto/commands_pb2.py b/python/pyspark/sql/connect/proto/commands_pb2.py index 2474c3bc5eb1a..27de95a7aaa36 100644 --- a/python/pyspark/sql/connect/proto/commands_pb2.py +++ b/python/pyspark/sql/connect/proto/commands_pb2.py @@ -36,7 +36,7 @@ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 
\x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c 
\x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 \x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\x9c\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1aY\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x42\x14\n\x12_exception_message\x1aL\n\x16\x41waitTerminationResult\x12#\n\nterminated\x18\x01 \x01(\x08H\x00R\nterminated\x88\x01\x01\x42\r\n\x0b_terminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' + 
b'\n\x1cspark/connect/commands.proto\x12\rspark.connect\x1a\x19google/protobuf/any.proto\x1a\x1aspark/connect/common.proto\x1a\x1fspark/connect/expressions.proto\x1a\x1dspark/connect/relations.proto"\x90\x06\n\x07\x43ommand\x12]\n\x11register_function\x18\x01 \x01(\x0b\x32..spark.connect.CommonInlineUserDefinedFunctionH\x00R\x10registerFunction\x12H\n\x0fwrite_operation\x18\x02 \x01(\x0b\x32\x1d.spark.connect.WriteOperationH\x00R\x0ewriteOperation\x12_\n\x15\x63reate_dataframe_view\x18\x03 \x01(\x0b\x32).spark.connect.CreateDataFrameViewCommandH\x00R\x13\x63reateDataframeView\x12O\n\x12write_operation_v2\x18\x04 \x01(\x0b\x32\x1f.spark.connect.WriteOperationV2H\x00R\x10writeOperationV2\x12<\n\x0bsql_command\x18\x05 \x01(\x0b\x32\x19.spark.connect.SqlCommandH\x00R\nsqlCommand\x12k\n\x1cwrite_stream_operation_start\x18\x06 \x01(\x0b\x32(.spark.connect.WriteStreamOperationStartH\x00R\x19writeStreamOperationStart\x12^\n\x17streaming_query_command\x18\x07 \x01(\x0b\x32$.spark.connect.StreamingQueryCommandH\x00R\x15streamingQueryCommand\x12X\n\x15get_resources_command\x18\x08 \x01(\x0b\x32".spark.connect.GetResourcesCommandH\x00R\x13getResourcesCommand\x12\x35\n\textension\x18\xe7\x07 \x01(\x0b\x32\x14.google.protobuf.AnyH\x00R\textensionB\x0e\n\x0c\x63ommand_type"\xb3\x01\n\nSqlCommand\x12\x10\n\x03sql\x18\x01 \x01(\tR\x03sql\x12\x37\n\x04\x61rgs\x18\x02 \x03(\x0b\x32#.spark.connect.SqlCommand.ArgsEntryR\x04\x61rgs\x1aZ\n\tArgsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x37\n\x05value\x18\x02 \x01(\x0b\x32!.spark.connect.Expression.LiteralR\x05value:\x02\x38\x01"\x96\x01\n\x1a\x43reateDataFrameViewCommand\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name\x12\x1b\n\tis_global\x18\x03 \x01(\x08R\x08isGlobal\x12\x18\n\x07replace\x18\x04 \x01(\x08R\x07replace"\x9b\x08\n\x0eWriteOperation\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1b\n\x06source\x18\x02 \x01(\tH\x01R\x06source\x88\x01\x01\x12\x14\n\x04path\x18\x03 \x01(\tH\x00R\x04path\x12?\n\x05table\x18\x04 \x01(\x0b\x32\'.spark.connect.WriteOperation.SaveTableH\x00R\x05table\x12:\n\x04mode\x18\x05 \x01(\x0e\x32&.spark.connect.WriteOperation.SaveModeR\x04mode\x12*\n\x11sort_column_names\x18\x06 \x03(\tR\x0fsortColumnNames\x12\x31\n\x14partitioning_columns\x18\x07 \x03(\tR\x13partitioningColumns\x12\x43\n\tbucket_by\x18\x08 \x01(\x0b\x32&.spark.connect.WriteOperation.BucketByR\x08\x62ucketBy\x12\x44\n\x07options\x18\t \x03(\x0b\x32*.spark.connect.WriteOperation.OptionsEntryR\x07options\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x82\x02\n\tSaveTable\x12\x1d\n\ntable_name\x18\x01 \x01(\tR\ttableName\x12X\n\x0bsave_method\x18\x02 \x01(\x0e\x32\x37.spark.connect.WriteOperation.SaveTable.TableSaveMethodR\nsaveMethod"|\n\x0fTableSaveMethod\x12!\n\x1dTABLE_SAVE_METHOD_UNSPECIFIED\x10\x00\x12#\n\x1fTABLE_SAVE_METHOD_SAVE_AS_TABLE\x10\x01\x12!\n\x1dTABLE_SAVE_METHOD_INSERT_INTO\x10\x02\x1a[\n\x08\x42ucketBy\x12.\n\x13\x62ucket_column_names\x18\x01 \x03(\tR\x11\x62ucketColumnNames\x12\x1f\n\x0bnum_buckets\x18\x02 \x01(\x05R\nnumBuckets"\x89\x01\n\x08SaveMode\x12\x19\n\x15SAVE_MODE_UNSPECIFIED\x10\x00\x12\x14\n\x10SAVE_MODE_APPEND\x10\x01\x12\x17\n\x13SAVE_MODE_OVERWRITE\x10\x02\x12\x1d\n\x19SAVE_MODE_ERROR_IF_EXISTS\x10\x03\x12\x14\n\x10SAVE_MODE_IGNORE\x10\x04\x42\x0b\n\tsave_typeB\t\n\x07_source"\xad\x06\n\x10WriteOperationV2\x12-\n\x05input\x18\x01 
\x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x1d\n\ntable_name\x18\x02 \x01(\tR\ttableName\x12\x1f\n\x08provider\x18\x03 \x01(\tH\x00R\x08provider\x88\x01\x01\x12L\n\x14partitioning_columns\x18\x04 \x03(\x0b\x32\x19.spark.connect.ExpressionR\x13partitioningColumns\x12\x46\n\x07options\x18\x05 \x03(\x0b\x32,.spark.connect.WriteOperationV2.OptionsEntryR\x07options\x12_\n\x10table_properties\x18\x06 \x03(\x0b\x32\x34.spark.connect.WriteOperationV2.TablePropertiesEntryR\x0ftableProperties\x12\x38\n\x04mode\x18\x07 \x01(\x0e\x32$.spark.connect.WriteOperationV2.ModeR\x04mode\x12J\n\x13overwrite_condition\x18\x08 \x01(\x0b\x32\x19.spark.connect.ExpressionR\x12overwriteCondition\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x1a\x42\n\x14TablePropertiesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01"\x9f\x01\n\x04Mode\x12\x14\n\x10MODE_UNSPECIFIED\x10\x00\x12\x0f\n\x0bMODE_CREATE\x10\x01\x12\x12\n\x0eMODE_OVERWRITE\x10\x02\x12\x1d\n\x19MODE_OVERWRITE_PARTITIONS\x10\x03\x12\x0f\n\x0bMODE_APPEND\x10\x04\x12\x10\n\x0cMODE_REPLACE\x10\x05\x12\x1a\n\x16MODE_CREATE_OR_REPLACE\x10\x06\x42\x0b\n\t_provider"\x82\x05\n\x19WriteStreamOperationStart\x12-\n\x05input\x18\x01 \x01(\x0b\x32\x17.spark.connect.RelationR\x05input\x12\x16\n\x06\x66ormat\x18\x02 \x01(\tR\x06\x66ormat\x12O\n\x07options\x18\x03 \x03(\x0b\x32\x35.spark.connect.WriteStreamOperationStart.OptionsEntryR\x07options\x12:\n\x19partitioning_column_names\x18\x04 \x03(\tR\x17partitioningColumnNames\x12:\n\x18processing_time_interval\x18\x05 \x01(\tH\x00R\x16processingTimeInterval\x12%\n\ravailable_now\x18\x06 \x01(\x08H\x00R\x0c\x61vailableNow\x12\x14\n\x04once\x18\x07 \x01(\x08H\x00R\x04once\x12\x46\n\x1e\x63ontinuous_checkpoint_interval\x18\x08 \x01(\tH\x00R\x1c\x63ontinuousCheckpointInterval\x12\x1f\n\x0boutput_mode\x18\t \x01(\tR\noutputMode\x12\x1d\n\nquery_name\x18\n \x01(\tR\tqueryName\x12\x14\n\x04path\x18\x0b \x01(\tH\x01R\x04path\x12\x1f\n\ntable_name\x18\x0c \x01(\tH\x01R\ttableName\x1a:\n\x0cOptionsEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n\x05value\x18\x02 \x01(\tR\x05value:\x02\x38\x01\x42\t\n\x07triggerB\x12\n\x10sink_destination"y\n\x1fWriteStreamOperationStartResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x12\n\x04name\x18\x02 \x01(\tR\x04name"A\n\x18StreamingQueryInstanceId\x12\x0e\n\x02id\x18\x01 \x01(\tR\x02id\x12\x15\n\x06run_id\x18\x02 \x01(\tR\x05runId"\xf8\x04\n\x15StreamingQueryCommand\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12\x18\n\x06status\x18\x02 \x01(\x08H\x00R\x06status\x12%\n\rlast_progress\x18\x03 \x01(\x08H\x00R\x0clastProgress\x12)\n\x0frecent_progress\x18\x04 \x01(\x08H\x00R\x0erecentProgress\x12\x14\n\x04stop\x18\x05 \x01(\x08H\x00R\x04stop\x12\x34\n\x15process_all_available\x18\x06 \x01(\x08H\x00R\x13processAllAvailable\x12O\n\x07\x65xplain\x18\x07 \x01(\x0b\x32\x33.spark.connect.StreamingQueryCommand.ExplainCommandH\x00R\x07\x65xplain\x12\x1e\n\texception\x18\x08 \x01(\x08H\x00R\texception\x12k\n\x11\x61wait_termination\x18\t \x01(\x0b\x32<.spark.connect.StreamingQueryCommand.AwaitTerminationCommandH\x00R\x10\x61waitTermination\x1a,\n\x0e\x45xplainCommand\x12\x1a\n\x08\x65xtended\x18\x01 \x01(\x08R\x08\x65xtended\x1aL\n\x17\x41waitTerminationCommand\x12"\n\ntimeout_ms\x18\x02 
\x01(\x03H\x00R\ttimeoutMs\x88\x01\x01\x42\r\n\x0b_timeout_msB\t\n\x07\x63ommand"\x88\x08\n\x1bStreamingQueryCommandResult\x12\x42\n\x08query_id\x18\x01 \x01(\x0b\x32\'.spark.connect.StreamingQueryInstanceIdR\x07queryId\x12Q\n\x06status\x18\x02 \x01(\x0b\x32\x37.spark.connect.StreamingQueryCommandResult.StatusResultH\x00R\x06status\x12j\n\x0frecent_progress\x18\x03 \x01(\x0b\x32?.spark.connect.StreamingQueryCommandResult.RecentProgressResultH\x00R\x0erecentProgress\x12T\n\x07\x65xplain\x18\x04 \x01(\x0b\x32\x38.spark.connect.StreamingQueryCommandResult.ExplainResultH\x00R\x07\x65xplain\x12Z\n\texception\x18\x05 \x01(\x0b\x32:.spark.connect.StreamingQueryCommandResult.ExceptionResultH\x00R\texception\x12p\n\x11\x61wait_termination\x18\x06 \x01(\x0b\x32\x41.spark.connect.StreamingQueryCommandResult.AwaitTerminationResultH\x00R\x10\x61waitTermination\x1a\xaa\x01\n\x0cStatusResult\x12%\n\x0estatus_message\x18\x01 \x01(\tR\rstatusMessage\x12*\n\x11is_data_available\x18\x02 \x01(\x08R\x0fisDataAvailable\x12*\n\x11is_trigger_active\x18\x03 \x01(\x08R\x0fisTriggerActive\x12\x1b\n\tis_active\x18\x04 \x01(\x08R\x08isActive\x1aH\n\x14RecentProgressResult\x12\x30\n\x14recent_progress_json\x18\x05 \x03(\tR\x12recentProgressJson\x1a\'\n\rExplainResult\x12\x16\n\x06result\x18\x01 \x01(\tR\x06result\x1aY\n\x0f\x45xceptionResult\x12\x30\n\x11\x65xception_message\x18\x01 \x01(\tH\x00R\x10\x65xceptionMessage\x88\x01\x01\x42\x14\n\x12_exception_message\x1a\x38\n\x16\x41waitTerminationResult\x12\x1e\n\nterminated\x18\x01 \x01(\x08R\nterminatedB\r\n\x0bresult_type"\x15\n\x13GetResourcesCommand"\xd4\x01\n\x19GetResourcesCommandResult\x12U\n\tresources\x18\x01 \x03(\x0b\x32\x37.spark.connect.GetResourcesCommandResult.ResourcesEntryR\tresources\x1a`\n\x0eResourcesEntry\x12\x10\n\x03key\x18\x01 \x01(\tR\x03key\x12\x38\n\x05value\x18\x02 \x01(\x0b\x32".spark.connect.ResourceInformationR\x05value:\x02\x38\x01\x42"\n\x1eorg.apache.spark.connect.protoP\x01\x62\x06proto3' ) @@ -435,7 +435,7 @@ _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_start = 4539 _STREAMINGQUERYCOMMAND_AWAITTERMINATIONCOMMAND._serialized_end = 4615 _STREAMINGQUERYCOMMANDRESULT._serialized_start = 4629 - _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5681 + _STREAMINGQUERYCOMMANDRESULT._serialized_end = 5661 _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_start = 5212 _STREAMINGQUERYCOMMANDRESULT_STATUSRESULT._serialized_end = 5382 _STREAMINGQUERYCOMMANDRESULT_RECENTPROGRESSRESULT._serialized_start = 5384 @@ -445,11 +445,11 @@ _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_start = 5499 _STREAMINGQUERYCOMMANDRESULT_EXCEPTIONRESULT._serialized_end = 5588 _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_start = 5590 - _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5666 - _GETRESOURCESCOMMAND._serialized_start = 5683 - _GETRESOURCESCOMMAND._serialized_end = 5704 - _GETRESOURCESCOMMANDRESULT._serialized_start = 5707 - _GETRESOURCESCOMMANDRESULT._serialized_end = 5919 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5823 - _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5919 + _STREAMINGQUERYCOMMANDRESULT_AWAITTERMINATIONRESULT._serialized_end = 5646 + _GETRESOURCESCOMMAND._serialized_start = 5663 + _GETRESOURCESCOMMAND._serialized_end = 5684 + _GETRESOURCESCOMMANDRESULT._serialized_start = 5687 + _GETRESOURCESCOMMANDRESULT._serialized_end = 5899 + _GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_start = 5803 + 
_GETRESOURCESCOMMANDRESULT_RESOURCESENTRY._serialized_end = 5899
 # @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/commands_pb2.pyi b/python/pyspark/sql/connect/proto/commands_pb2.pyi
index 22ca22783e023..972fe7503a1aa 100644
--- a/python/pyspark/sql/connect/proto/commands_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/commands_pb2.pyi
@@ -1125,23 +1125,11 @@ class StreamingQueryCommandResult(google.protobuf.message.Message):
         def __init__(
             self,
             *,
-            terminated: builtins.bool | None = ...,
+            terminated: builtins.bool = ...,
         ) -> None: ...
-        def HasField(
-            self,
-            field_name: typing_extensions.Literal[
-                "_terminated", b"_terminated", "terminated", b"terminated"
-            ],
-        ) -> builtins.bool: ...
         def ClearField(
-            self,
-            field_name: typing_extensions.Literal[
-                "_terminated", b"_terminated", "terminated", b"terminated"
-            ],
+            self, field_name: typing_extensions.Literal["terminated", b"terminated"]
         ) -> None: ...
-        def WhichOneof(
-            self, oneof_group: typing_extensions.Literal["_terminated", b"_terminated"]
-        ) -> typing_extensions.Literal["terminated"] | None: ...

     QUERY_ID_FIELD_NUMBER: builtins.int
     STATUS_FIELD_NUMBER: builtins.int
diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py
index a2b2e81357efc..3a0d273e7d1b7 100644
--- a/python/pyspark/sql/connect/streaming/query.py
+++ b/python/pyspark/sql/connect/streaming/query.py
@@ -81,7 +81,6 @@ def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]:
             await_termination_cmd = pb2.StreamingQueryCommand.AwaitTerminationCommand()
             cmd.await_termination.CopyFrom(await_termination_cmd)
             self._execute_streaming_query_cmd(cmd)
-            return None

     awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__

@@ -112,8 +111,6 @@ def lastProgress(self) -> Optional[Dict[str, Any]]:
         progress = self._execute_streaming_query_cmd(cmd).recent_progress.recent_progress_json
         if len(progress) > 0:
             return json.loads(progress[-1])
-        else:
-            return None

     lastProgress.__doc__ = PySparkStreamingQuery.lastProgress.__doc__

@@ -146,8 +143,6 @@ def exception(self) -> Optional[StreamingQueryException]:
         exception = self._execute_streaming_query_cmd(cmd).exception
         if exception.HasField("exception_message"):
             return CapturedStreamingQueryException(exception.exception_message)
-        else:
-            return None

     exception.__doc__ = PySparkStreamingQuery.exception.__doc__

From ed67070653176db88eb2bed0cf737860d2006c13 Mon Sep 17 00:00:00 2001
From: Wei Liu
Date: Wed, 19 Apr 2023 23:11:00 -0700
Subject: [PATCH 31/31] add back return None

---
 python/pyspark/sql/connect/streaming/query.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/python/pyspark/sql/connect/streaming/query.py b/python/pyspark/sql/connect/streaming/query.py
index 3a0d273e7d1b7..a2b2e81357efc 100644
--- a/python/pyspark/sql/connect/streaming/query.py
+++ b/python/pyspark/sql/connect/streaming/query.py
@@ -81,6 +81,7 @@ def awaitTermination(self, timeout: Optional[int] = None) -> Optional[bool]:
             await_termination_cmd = pb2.StreamingQueryCommand.AwaitTerminationCommand()
             cmd.await_termination.CopyFrom(await_termination_cmd)
             self._execute_streaming_query_cmd(cmd)
+            return None

     awaitTermination.__doc__ = PySparkStreamingQuery.awaitTermination.__doc__

@@ -111,6 +112,8 @@ def lastProgress(self) -> Optional[Dict[str, Any]]:
         progress = self._execute_streaming_query_cmd(cmd).recent_progress.recent_progress_json
         if len(progress) > 0:
             return json.loads(progress[-1])
+        else:
+            return None

     lastProgress.__doc__ = PySparkStreamingQuery.lastProgress.__doc__

@@ -143,6 +146,8 @@ def exception(self) -> Optional[StreamingQueryException]:
         exception = self._execute_streaming_query_cmd(cmd).exception
         if exception.HasField("exception_message"):
             return CapturedStreamingQueryException(exception.exception_message)
+        else:
+            return None

     exception.__doc__ = PySparkStreamingQuery.exception.__doc__
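
Note (not part of the patch): the explicit `return None` paths restored by PATCH 31/31 are what callers of the `Optional`-typed StreamingQuery methods observe. A minimal caller-side sketch, assuming a streaming query `q` has already been started against a Spark Connect session; `report` and the `batchId` key are illustrative only.

from typing import Any, Dict, Optional

from pyspark.sql.streaming import StreamingQuery


def report(q: StreamingQuery) -> None:
    # lastProgress is Optional: it stays None until the first progress update arrives.
    progress: Optional[Dict[str, Any]] = q.lastProgress
    if progress is None:
        print("no progress reported yet")
    else:
        print("last batch:", progress.get("batchId"))

    # exception() is Optional as well: None while the query is healthy.
    err = q.exception()
    if err is not None:
        print("query failed:", err)

    # awaitTermination() without a timeout blocks and returns None;
    # with a timeout (in seconds) it returns whether the query terminated in time.
    terminated = q.awaitTermination(timeout=5)
    print("terminated within 5s:", terminated)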