From 7178f32e8a88589220612b8b959041f8d38cccf1 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 7 Mar 2023 04:29:59 +0100 Subject: [PATCH 01/23] #83: Create a Spike for error handling * new functions `null_col` and `call_udf` * `ErrorMessage` refactoring * `ErrorHandling` trait designed to serve as the interface for different implementations * Implement error handling by putting the info into column of `ErrorMessage` array * numerous support classes --- .../absa/spark/commons/sql/functions2.scala | 30 +++++++ .../absa/spark/commons/sql/functions2.scala | 30 +++++++ .../absa/spark/commons/sql/functions2.scala | 30 +++++++ .../commons/errorhandling/ErrorHandling.scala | 40 +++++++++ .../commons/errorhandling/ErrorMessage.scala | 16 ++-- .../errorhandling/ErrorMessageSubmit.scala | 28 ++++++ .../implementations/ErrorMessageArray.scala | 40 +++++++++ .../ErrorMessageSubmitOnColumn.scala | 48 +++++++++++ .../ErrorMessageSubmitWithoutColumn.scala | 43 ++++++++++ .../partials/ErrorHandlingCommon.scala | 55 ++++++++++++ .../partials/EvaluateIntoErrorMessage.scala | 34 ++++++++ .../partials/EvaluateViaUdf.scala | 43 ++++++++++ .../errorhandling/types/ColumnOrValue.scala | 85 +++++++++++++++++++ .../errorhandling/types/ErrorWhen.scala | 24 ++++++ .../commons/errorhandling/types/types.scala | 34 ++++++++ .../co/absa/spark/commons/sql/functions.scala | 11 ++- .../ErrorMessageArrayTest.scala | 75 ++++++++++++++++ 17 files changed, 658 insertions(+), 8 deletions(-) create mode 100644 spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala create mode 100644 spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala create mode 100644 spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala create mode 100644 
spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala create mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala diff --git a/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala new file mode 100644 index 00000000..3b72381e --- /dev/null +++ b/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala @@ -0,0 +1,30 @@ +/* + * Copyright 2021 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.spark.commons.sql
+
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions.callUDF
+
+import scala.util.{Success, Try}
+
+// scalastyle:off
+object functions2 {
+// scalastyle:on
+
+  def call_udf(udfName: String, cols: Column*): Column = callUDF(udfName, cols:_*)
+
+}
diff --git a/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala
new file mode 100644
index 00000000..3b72381e
--- /dev/null
+++ b/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2021 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.spark.commons.sql
+
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions.callUDF
+
+import scala.util.{Success, Try}
+
+// scalastyle:off
+object functions2 {
+// scalastyle:on
+
+  def call_udf(udfName: String, cols: Column*): Column = callUDF(udfName, cols:_*)
+
+}
diff --git a/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala
new file mode 100644
index 00000000..3b72381e
--- /dev/null
+++ b/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2021 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.spark.commons.sql
+
+import org.apache.spark.sql.Column
+import org.apache.spark.sql.functions.callUDF
+
+import scala.util.{Success, Try}
+
+// scalastyle:off
+object functions2 {
+// scalastyle:on
+
+  def call_udf(udfName: String, cols: Column*): Column = callUDF(udfName, cols:_*)
+
+}
diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala
new file mode 100644
index 00000000..e244459b
--- /dev/null
+++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2023 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package za.co.absa.spark.commons.errorhandling + +import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import za.co.absa.spark.commons.errorhandling.implementations.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} +import za.co.absa.spark.commons.errorhandling.types._ + +trait ErrorHandling { + def register(sparkToRegisterTo: SparkSession): Unit = {} + + def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: ErrCol, additionalInfo: AdditionalInfo = None): ErrorColumn = { + val toSubmit = errCol + .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) + .getOrElse(ErrorMessageSubmitWithoutColumn(errType, errCode, errMessage, additionalInfo)) + putErrorToColumn(toSubmit) + } + def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn + + def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame + + def putError(dataFrame: DataFrame)(when: Column)(errorMessageSubmit: ErrorMessageSubmit): DataFrame = { + putErrorsWithGrouping(dataFrame)(Seq(ErrorWhen(when, errorMessageSubmit))) + } + def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame +} + diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala index c034e545..3a5bb9ec 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala @@ -19,6 +19,7 @@ package za.co.absa.spark.commons.errorhandling import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.StructType import za.co.absa.spark.commons.errorhandling.ErrorMessage.Mapping +import za.co.absa.spark.commons.errorhandling.types._ /** * Case class to represent an error message @@ -28,18 +29,19 @@ 
import za.co.absa.spark.commons.errorhandling.ErrorMessage.Mapping * @param errMsg - Textual description of the error * @param errCol - The name of the column where the error occurred * @param rawValues - Sequence of raw values (which are the potential culprits of the error) - * @param mappings - Sequence of Mappings i.e Mapping Table Column -> Equivalent Mapped Dataset column + * @param additionInfo - Sequence of Mappings i.e Mapping Table Column -> Equivalent Mapped Dataset column */ case class ErrorMessage( - errType: String, - errCode: String, - errMsg: String, - errCol: String, - rawValues: Seq[String], - mappings: Seq[Mapping] = Seq() + errType: ErrType, + errCode: ErrCode, + errMsg: ErrMsg, + errCol: ErrCol, + rawValues: RawValues, + additionInfo: AdditionalInfo = None ) object ErrorMessage { + //TODO probably not needed case class Mapping( mappingTableColumn: String, mappedDatasetColumn: String diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala new file mode 100644 index 00000000..ce10bb86 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala @@ -0,0 +1,28 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling + +import za.co.absa.spark.commons.errorhandling.types._ + +trait ErrorMessageSubmit { + def errType: ColumnOrValue[ErrType] + def errCode: ColumnOrValue[ErrCode] + def errMsg: ColumnOrValue[ErrMsg] + def errCol: ColumnOrValue[ErrCol] + def rawValues: ColumnOrValue[RawValues] + def additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty +} + diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala new file mode 100644 index 00000000..6504b053 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -0,0 +1,40 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import org.apache.spark.sql.functions.{array, array_except, array_union, col} +import org.apache.spark.sql.{Column, DataFrame} +import za.co.absa.spark.commons.errorhandling.partials.{ErrorHandlingCommon, EvaluateIntoErrorMessage} +import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements +import za.co.absa.spark.commons.sql.functions.null_col + +case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.defaultErrorColumnName) + extends ErrorHandlingCommon + with EvaluateIntoErrorMessage { + + override protected def doTheAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { + def appendToArray(dataFrame: DataFrame, colName: String, colToUnion: Column): DataFrame = { + dataFrame.withColumn(colName, array_union(col(colName), colToUnion)) + } + val aggregatedWithouNulls = array_except(array(errCols: _*), array(null_col)) + dataFrame.withColumnIfDoesNotExist(appendToArray(_, _, aggregatedWithouNulls))(errorColumnName, aggregatedWithouNulls) + } + +} + +object ErrorMessageArray { + final val defaultErrorColumnName = "errCol" +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala new file mode 100644 index 00000000..c47862df --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala @@ -0,0 +1,48 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions.{array, col} +import org.apache.spark.sql.types.StringType +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.types._ + +class ErrorMessageSubmitOnColumn ( + val errType: ColumnOrValue[ErrType], + val errCode: ColumnOrValue[ErrCode], + val errMsg: ColumnOrValue[ErrMsg], + errSourceColName: ErrSourceColName, + override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmit { + val errCol: ColumnOrValue[ErrCol] = ColumnOrValue.withOption(Option(errSourceColName)) + override def rawValues: ColumnOrValue[RawValues] = { + val colExpr: Column = array(col(errSourceColName).cast(StringType)) + ColumnOrValue(colExpr) + } +} + +object ErrorMessageSubmitOnColumn { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errSourceColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { + new ErrorMessageSubmitOnColumn( + ColumnOrValue.withActualValue(errType), + ColumnOrValue.withActualValue(errCode), + ColumnOrValue.withActualValue(errMessage), + errSourceColName, + ColumnOrValue.withOption(additionalInfo) + ) + } +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala new file mode 100644 index 00000000..ca09a0e7 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import org.apache.spark.sql.functions.array +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.types._ + +class ErrorMessageSubmitWithoutColumn( + val errType: ColumnOrValue[ErrType], + val errCode: ColumnOrValue[ErrCode], + val errMsg: ColumnOrValue[ErrMsg], + override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmit { + + val errCol: ColumnOrValue[ErrCol] = ColumnOrValue.asEmpty + val rawValues: ColumnOrValue[RawValues] = ColumnOrValue(array()) +} + +object ErrorMessageSubmitWithoutColumn { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, additionalInfo: AdditionalInfo = None): ErrorMessageSubmitWithoutColumn = { + new ErrorMessageSubmitWithoutColumn( + ColumnOrValue.withActualValue(errType), + ColumnOrValue.withActualValue(errCode), + ColumnOrValue.withActualValue(errMessage), + ColumnOrValue.withOption(additionalInfo) + ) + } +} + diff --git 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala new file mode 100644 index 00000000..f5e88d3b --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala @@ -0,0 +1,55 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.partials + +import org.apache.spark.sql.catalyst.expressions.{CaseWhen, Expression} +import org.apache.spark.sql.{Column, DataFrame} +import za.co.absa.spark.commons.errorhandling.{ErrorHandling, ErrorMessageSubmit} +import za.co.absa.spark.commons.errorhandling.types._ +import org.apache.spark.sql.functions.when + +trait ErrorHandlingCommon extends ErrorHandling { + protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column + + protected def doTheAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame + + def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn = { + ErrorColumn(evaluate(errorMessageSubmit)) + } + + def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame = { + register(dataFrame.sparkSession) + doTheAggregation(dataFrame, errCols.map(_.column): _*) + } + + def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { + val 
errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errCol.getValue) + val errorColumns1 = errorsByColumn.getOrElse(None, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol name + val errorColumns2 = (errorsByColumn - None).values.map(errorWhenSeqToCol).toSeq + doTheAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) + } + + + private def errorWhenToCol(errorWhen: ErrorWhen): Column = { + when(errorWhen.when, evaluate(errorWhen.errorMessageSubmit)) + } + + private def errorWhenSeqToCol(errorsWhen: Seq[ErrorWhen]): Column = { + val branches: Seq[(Expression, Expression)] = errorsWhen.map(errorWhen => (errorWhen.when.expr, evaluate(errorWhen.errorMessageSubmit).expr)) + new Column(CaseWhen(branches)) + } + +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala new file mode 100644 index 00000000..64566487 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala @@ -0,0 +1,34 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.partials + +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions.struct +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit + +trait EvaluateIntoErrorMessage { + protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { + struct( + errorMessageSubmit.errType.column as "errType", + errorMessageSubmit.errCode.column as "errCode", + errorMessageSubmit.errMsg.column as "errMsg", + errorMessageSubmit.errCol.column as "errCol", + errorMessageSubmit.rawValues.column as "rawValues", + errorMessageSubmit.additionInfo.column as "additionInfo" + ) + } + +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala new file mode 100644 index 00000000..30d2570e --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala @@ -0,0 +1,43 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.partials + +import org.apache.spark.sql.{Column, SparkSession} +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.partials.EvaluateViaUdf.ErrorMessageFunction +import za.co.absa.spark.commons.errorhandling.types._ +import za.co.absa.spark.commons.sql.functions2.call_udf + +trait EvaluateViaUdf[T] { + def evaluationUdfName: String + protected def evaluationUdf: ErrorMessageFunction[T] + def register(sparkToRegisterTo: SparkSession): Unit // TODO refactor when #82 has been implemented + + protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { + call_udf(evaluationUdfName, + errorMessageSubmit.errType.column, + errorMessageSubmit.errCode.column, + errorMessageSubmit.errMsg.column, + errorMessageSubmit.errCol.column, + errorMessageSubmit.rawValues.column, + errorMessageSubmit.additionInfo.column + ) + } +} + +object EvaluateViaUdf { + type ErrorMessageFunction[T] = (ErrType, ErrCode, ErrMsg, ErrCol, RawValues, AdditionalInfo) => T //TODO needed? +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala new file mode 100644 index 00000000..6674bf66 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -0,0 +1,85 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.types + +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions.{col, lit} +import za.co.absa.spark.commons.sql.functions.null_col + +import scala.language.higherKinds + +trait ColumnOrValue[T] { + def column: Column + def isColumn: Boolean + def isValue: Boolean + def getColumnName: Option[String] + def getValue: Option[T] +} + +object ColumnOrValue { + type CoV[T] = ColumnOrValue[T] //just a shorthand + val CoV: ColumnOrValue.type = ColumnOrValue + + def apply[T](columnName: String): ColumnOrValue[T] = CoVNamedColumn(columnName) + def apply[T](column: Column): ColumnOrValue[T] = CoVDefinedColumn(column) + def withOption[T](value: Option[T]): ColumnOrValue[Option[T]] = { + value match { + case None => CoVNull() + case Some(x) => CoVOption(x) + } + } + def withActualValue[T](value: T): ColumnOrValue[T] = CoVValue(value) + def asEmpty[T]: ColumnOrValue[T] = CoVNull() + + private final case class CoVNamedColumn[T](columnName: String) extends ColumnOrValue[T] { + val column: Column = col(columnName) + val isColumn: Boolean = true + val isValue: Boolean = false + val getColumnName: Option[String] = Option(columnName) + val getValue: Option[T] = None + } + + private final case class CoVDefinedColumn[T](column: Column) extends ColumnOrValue[T] { + val isColumn: Boolean = true + val isValue: Boolean = false + val getColumnName: Option[ErrType] = None + val getValue: Option[T] = None + } + + private final case class CoVValue[T](value: T) extends ColumnOrValue[T] { + val column: Column = lit(value) + val isColumn: Boolean = false + val isValue: Boolean = true + val getColumnName: Option[String] = None + val getValue: Option[T] = Option(value) + } + + private final case class CoVOption[T](value: T) extends ColumnOrValue[Option[T]] { + val column: Column = lit(value) + val isColumn: Boolean = false 
+ val isValue: Boolean = true + val getColumnName: Option[String] = None + val getValue: Option[Option[T]] = Some(Some(value)) + } + + private final case class CoVNull[T]() extends ColumnOrValue[T] { + val column: Column = null_col + val isColumn: Boolean = false + val isValue: Boolean = true + val getColumnName: Option[String] = None + val getValue: Option[T] = None + } +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala new file mode 100644 index 00000000..f23089a9 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala @@ -0,0 +1,24 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.types + +import org.apache.spark.sql.Column +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit + +case class ErrorWhen ( + when: Column, + errorMessageSubmit: ErrorMessageSubmit + ) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala new file mode 100644 index 00000000..957e78a9 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala @@ -0,0 +1,34 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling + +import org.apache.spark.sql.Column + +package object types { + type ErrSourceColName = String + + type ErrType = String + type ErrCode = Long // was string + type ErrMsg = String + type ErrCol = Option[ErrSourceColName] // wouldn't Seq[String] be better? But perhaps AdditionalInfo will suffice, while usually having one main column + //wouldn't a better name be SourceOfErrCol? 
+ type RawValues = Seq[String] + type AdditionalInfo = Option[String] // actually a JSON + //mapping is missing, should be part of AdditionalInfo, as being very specific + + //This is to ensure some level of type-safety + final case class ErrorColumn(column: Column) extends AnyVal +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index 4508798d..a9b9edea 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -17,7 +17,7 @@ package za.co.absa.spark.commons.sql import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.{col, lit} import za.co.absa.spark.commons.utils.SchemaUtils import scala.util.{Success, Try} @@ -54,4 +54,13 @@ object functions { } } + /** + * Provides a column with NULL value. + * + * @return The column of NULL values + */ + def null_col:Column = { + lit(None.orNull) + } + } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala new file mode 100644 index 00000000..689737d4 --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -0,0 +1,75 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.errorhandling.types.ErrorWhen +import za.co.absa.spark.commons.test.SparkTestBase +import org.apache.spark.sql.functions.{col, length} + + +class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { + import spark.implicits._ + + private val col1Name = "Col1" + private val col2Name = "Col2" + private val srcDf = Seq( + (None, ""), + (Some(1), "a"), + (Some(2), "bb"), + (Some(3), "ccc") + ).toDF(col1Name, col2Name) + + test("Collect columns and aggregate them explicitly") { + val errorMessageArray = ErrorMessageArray() + + val e1 = errorMessageArray.putErrorToColumn("Test error 1", 1, "This is a test error", Some(col1Name)) + val errorSubmitA = ErrorMessageSubmitOnColumn("Test error 2", 2, "This is a test error", col2Name) + val e2 = errorMessageArray.putErrorToColumn(errorSubmitA) + val errorSubmitB = ErrorMessageSubmitWithoutColumn("Test error 3", 3, "This is a test error") + val e3 = errorMessageArray.putErrorToColumn(errorSubmitB) + + val destDf = errorMessageArray.aggregateErrorColumns(srcDf)(e1, e2, e3) + destDf.printSchema() + destDf.show(false) + } + + test("putErrors groups conditions by source column"){ + val errorMessageArray = ErrorMessageArray() + + val destDf = errorMessageArray.putErrorsWithGrouping(srcDf)(Seq( + ErrorWhen(col(col1Name).isNull, ErrorMessageSubmitWithoutColumn("WrongLine", 0, "This line is wrong")), + ErrorWhen(col(col1Name) > 2, 
ErrorMessageSubmitOnColumn("ValueTooBig", 1, "The value of the field is too big", col1Name)), + ErrorWhen(col(col1Name) > 1, ErrorMessageSubmitOnColumn("ValueStillTooBig", 2, "The value of the field is too big", col1Name)), + ErrorWhen(length(col(col2Name)) > 2, ErrorMessageSubmitOnColumn("String too long", 10, "The text in the field is too long", col2Name)) + )) + destDf.printSchema() + destDf.show(false) + } + test("putError and putErrors does not group by together"){ + val errorMessageArray = ErrorMessageArray() + + val midDf = errorMessageArray.putError(srcDf)(col(col1Name) > 1)(ErrorMessageSubmitOnColumn("ValueStillTooBig", 2, "The value of the field is too big", col1Name)) + + val destDf = errorMessageArray.putErrorsWithGrouping(midDf)(Seq( + ErrorWhen(col(col1Name).isNull, ErrorMessageSubmitWithoutColumn("WrongLine", 0, "This line is wrong")), + ErrorWhen(col(col1Name) > 2, ErrorMessageSubmitOnColumn("ValueTooBig", 1, "The value of the field is too big", col1Name)), + ErrorWhen(length(col(col2Name)) > 2, ErrorMessageSubmitOnColumn("String too long", 10, "The text in the field is too long", col2Name)) + )) + destDf.printSchema() + destDf.show(false) + } +} From de99046a9c449688bdbff410b154102272341d76 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 7 Mar 2023 09:28:13 +0100 Subject: [PATCH 02/23] * UT fix * headers fix --- .../spark/commons/errorhandling/ErrorHandling.scala | 3 ++- .../commons/errorhandling/ErrorMessageSubmit.scala | 3 ++- .../implementations/ErrorMessageArray.scala | 3 ++- .../implementations/ErrorMessageSubmitOnColumn.scala | 3 ++- .../ErrorMessageSubmitWithoutColumn.scala | 3 ++- .../errorhandling/partials/ErrorHandlingCommon.scala | 3 ++- .../partials/EvaluateIntoErrorMessage.scala | 3 ++- .../errorhandling/partials/EvaluateViaUdf.scala | 3 ++- .../commons/errorhandling/types/ColumnOrValue.scala | 3 ++- .../spark/commons/errorhandling/types/ErrorWhen.scala | 3 ++- .../spark/commons/errorhandling/types/types.scala | 3 ++- 
.../commons/errorhandling/ErrorMessageTest.scala | 11 +++-------- 12 files changed, 25 insertions(+), 19 deletions(-) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index e244459b..858c8c2d 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala index ce10bb86..f5f2cd3e 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 6504b053..891ef142 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala index c47862df..d03e1340 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala index ca09a0e7..18e18102 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala index f5e88d3b..91841876 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala index 64566487..c5559a35 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala index 30d2570e..6e54851b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index 6674bf66..ccf3fb49 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala index f23089a9..bf506691 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala index 957e78a9..f076788b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala @@ -1,9 +1,10 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at + * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala index a80981c8..d672fb47 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala @@ -16,7 +16,7 @@ package za.co.absa.spark.commons.errorhandling -import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructField, StructType} import org.scalatest.funsuite.AnyFunSuite import za.co.absa.spark.commons.test.SparkTestBase @@ -24,16 +24,11 @@ class ErrorMessageTest extends AnyFunSuite with SparkTestBase { test("errorColSchema returns the expected structure"){ val expected = StructType(Seq( StructField("errType", StringType, nullable = true), - StructField("errCode", StringType, nullable = true), + 
StructField("errCode", LongType, nullable = false), StructField("errMsg", StringType, nullable = true), StructField("errCol", StringType, nullable = true), StructField("rawValues", ArrayType(StringType, containsNull = true), nullable = true), - StructField("mappings", ArrayType( - StructType(Seq( - StructField("mappingTableColumn", StringType, nullable = true), - StructField("mappedDatasetColumn", StringType, nullable = true) - )), containsNull = true), - nullable = true) + StructField("additionInfo", StringType, nullable = true) )) val result = ErrorMessage.errorColSchema From 2db8c95f18ae9334c5779e515c99d2d48a83eb5a Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 14 Mar 2023 10:22:12 +0100 Subject: [PATCH 03/23] * Relatively big overwrite to use `map` instead of errCol and sequence of raw values * `ErrorMessageSubmitJustErrorValue` class created to offer the ability to submit errors without source column but with error value --- .../CallUdfAdapter.scala} | 12 +--- .../CallUdfAdapter.scala} | 14 ++--- .../CallUdfAdapter.scala} | 14 ++--- .../commons/errorhandling/ErrorHandling.scala | 2 +- .../commons/errorhandling/ErrorMessage.scala | 27 ++------- .../errorhandling/ErrorMessageSubmit.scala | 3 +- .../implementations/ErrorMessageArray.scala | 52 +++++++++++++---- .../ErrorMessageSubmitJustErrorValue.scala | 57 +++++++++++++++++++ .../ErrorMessageSubmitOnColumn.scala | 29 ++++------ .../ErrorMessageSubmitOnMoreColumns.scala | 44 ++++++++++++++ .../ErrorMessageSubmitWithoutColumn.scala | 14 +++-- .../partials/ErrorHandlingCommon.scala | 7 ++- .../partials/EvaluateIntoErrorMessage.scala | 26 ++++++--- .../partials/EvaluateViaUdf.scala | 9 ++- .../errorhandling/types/ColumnOrValue.scala | 50 +++++++++------- .../commons/errorhandling/types/types.scala | 9 +-- .../co/absa/spark/commons/sql/functions.scala | 6 ++ .../errorhandling/ErrorMessageTest.scala | 37 ------------ 18 files changed, 247 insertions(+), 165 deletions(-) rename 
spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/{sql/functions2.scala => adapters/CallUdfAdapter.scala} (76%) rename spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/{sql/functions2.scala => adapters/CallUdfAdapter.scala} (71%) rename spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/{sql/functions2.scala => adapters/CallUdfAdapter.scala} (71%) create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala create mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala delete mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala diff --git a/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala similarity index 76% rename from spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala rename to spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala index 3b72381e..e37df65a 100644 --- a/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/sql/functions2.scala +++ b/spark-commons/src/main/scala-spark2.4-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala @@ -14,17 +14,11 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.sql +package za.co.absa.spark.commons.adapters import org.apache.spark.sql.Column import org.apache.spark.sql.functions.callUDF -import scala.util.{Success, Try} - -// scalastyle:off -object functions2 { -// scalastyle:on - - def call_udf(udfName: String, cols: Column*): Column = call_udf(udfName, cols:_*) - +trait CallUdfAdapter { + def call_udf(udfName: String, cols: Column*): Column = callUDF(udfName, cols:_*) } diff --git a/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala similarity index 71% rename from spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala rename to spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala index 3b72381e..aba7cbd0 100644 --- a/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/sql/functions2.scala +++ b/spark-commons/src/main/scala-spark3.2-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala @@ -14,17 +14,11 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.sql +package za.co.absa.spark.commons.adapters import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.callUDF - -import scala.util.{Success, Try} - -// scalastyle:off -object functions2 { -// scalastyle:on - - def call_udf(udfName: String, cols: Column*): Column = call_udf(udfName, cols:_*) +import org.apache.spark.sql.functions.{call_udf => sparkCallUdf} +trait CallUdfAdapter { + def call_udf(udfName: String, cols: Column*): Column = sparkCallUdf(udfName, cols:_*) } diff --git a/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala b/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala similarity index 71% rename from spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala rename to spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala index 3b72381e..aba7cbd0 100644 --- a/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/sql/functions2.scala +++ b/spark-commons/src/main/scala-spark3.3-jvm/za/co/absa/spark/commons/adapters/CallUdfAdapter.scala @@ -14,17 +14,11 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.sql +package za.co.absa.spark.commons.adapters import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.callUDF - -import scala.util.{Success, Try} - -// scalastyle:off -object functions2 { -// scalastyle:on - - def call_udf(udfName: String, cols: Column*): Column = call_udf(udfName, cols:_*) +import org.apache.spark.sql.functions.{call_udf => sparkCallUdf} +trait CallUdfAdapter { + def call_udf(udfName: String, cols: Column*): Column = sparkCallUdf(udfName, cols:_*) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index 858c8c2d..7d379d8b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -23,7 +23,7 @@ import za.co.absa.spark.commons.errorhandling.types._ trait ErrorHandling { def register(sparkToRegisterTo: SparkSession): Unit = {} - def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: ErrCol, additionalInfo: AdditionalInfo = None): ErrorColumn = { + def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { val toSubmit = errCol .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) .getOrElse(ErrorMessageSubmitWithoutColumn(errType, errCode, errMessage, additionalInfo)) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala index 3a5bb9ec..69308851 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessage.scala @@ -16,9 +16,6 @@ package za.co.absa.spark.commons.errorhandling -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.types.StructType -import za.co.absa.spark.commons.errorhandling.ErrorMessage.Mapping import za.co.absa.spark.commons.errorhandling.types._ /** @@ -27,30 +24,14 @@ import za.co.absa.spark.commons.errorhandling.types._ * @param errType - Type or source of the error * @param errCode - Internal error code * @param errMsg - Textual description of the error - * @param errCol - The name of the column where the error occurred - * @param rawValues - Sequence of raw values (which are the potential culprits of the error) - * @param additionInfo - Sequence of Mappings i.e Mapping Table Column -> Equivalent Mapped Dataset column + * @param errColsAndValues - The names of the columns where the error occurred and their raw values (which are the + * potential culprits of the error) + * @param additionInfo - any optional additional information in the form of a JSON string */ case class ErrorMessage( errType: ErrType, errCode: ErrCode, errMsg: ErrMsg, - errCol: ErrCol, - rawValues: RawValues, + errColsAndValues: ErrColsAndValues, additionInfo: AdditionalInfo = None ) - -object ErrorMessage { - //TODO probably not needed - case class Mapping( - mappingTableColumn: String, - mappedDatasetColumn: String - ) - - val errorColumnName = "errCol" - def errorColSchema(implicit spark: SparkSession): StructType = { - import spark.implicits._ - spark.emptyDataset[ErrorMessage].schema - } -} - diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala index f5f2cd3e..6f20e772 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala @@ -22,8 +22,7 @@ trait ErrorMessageSubmit { def errType: ColumnOrValue[ErrType] def errCode: ColumnOrValue[ErrCode] def errMsg: ColumnOrValue[ErrMsg] - def errCol: ColumnOrValue[ErrCol] - def rawValues: ColumnOrValue[RawValues] + def errColsAndValues: ColumnOrValue[ErrColsAndValues] def additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 891ef142..7ca2ccfb 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -16,22 +16,54 @@ package za.co.absa.spark.commons.errorhandling.implementations -import org.apache.spark.sql.functions.{array, array_except, array_union, col} import org.apache.spark.sql.{Column, DataFrame} +import org.apache.spark.sql.functions.{array, array_except, array_union, col, map_from_arrays, map_keys, map_values, struct, when} +import za.co.absa.spark.commons.adapters.TransformAdapter +import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ import za.co.absa.spark.commons.errorhandling.partials.{ErrorHandlingCommon, EvaluateIntoErrorMessage} -import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements import za.co.absa.spark.commons.sql.functions.null_col +import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.defaultErrorColumnName) extends ErrorHandlingCommon - with EvaluateIntoErrorMessage { - - override protected def doTheAggregation(dataFrame: DataFrame, errCols: 
Column*): DataFrame = { - def appendToArray(dataFrame: DataFrame, colName: String, colToUnion: Column): DataFrame = { - dataFrame.withColumn(colName, array_union(col(colName), colToUnion)) - } - val aggregatedWithouNulls = array_except(array(errCols: _*), array(null_col)) - dataFrame.withColumnIfDoesNotExist(appendToArray(_, _, aggregatedWithouNulls))(errorColumnName, aggregatedWithouNulls) + with EvaluateIntoErrorMessage + with TransformAdapter { + + private def decomposeMap(errorMessageColumn: Column): Column = { + when(errorMessageColumn.isNotNull, + struct( + errorMessageColumn.getField(errType) as errType, + errorMessageColumn.getField(errCode) as errCode, + errorMessageColumn.getField(errMsg) as errMsg, + map_keys(errorMessageColumn.getField(errColsAndValues)) as errCols, + map_values(errorMessageColumn.getField(errColsAndValues)) as errValues, + errorMessageColumn.getField(additionInfo) as additionInfo + ) + ) + } + + private def recomposeMap(errorMessageColumn: Column): Column = { + struct( + errorMessageColumn.getField(errType) as errType, + errorMessageColumn.getField(errCode) as errCode, + errorMessageColumn.getField(errMsg) as errMsg, + map_from_arrays(errorMessageColumn.getField(errCols), errorMessageColumn.getField(errValues)) as errColsAndValues , + errorMessageColumn.getField(additionInfo) as additionInfo + ) + } + + private def deMap(arrayCol: Column): Column = transform(arrayCol, decomposeMap) + private def reMap(arrayCol: Column): Column = transform(arrayCol, recomposeMap) + + private def appendToErrCol(dataFrame: DataFrame, errorColName: String, colToUnion: Column): DataFrame = { + dataFrame.withColumn(errorColName, reMap(array_union(deMap(col(errorColName)), colToUnion))) + } + + protected def doTheAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { + val aggregated = array(errCols.map(decomposeMap): _*) //need to decompose the map field, as it's not supported in array functions + val aggregatedWithoutNulls = 
array_except(aggregated, array(null_col)) + val joinToExisting: (DataFrame, String) => DataFrame = appendToErrCol(_, _, aggregatedWithoutNulls) + dataFrame.withColumnIfDoesNotExist(joinToExisting)(errorColumnName, reMap(aggregatedWithoutNulls)) } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala new file mode 100644 index 00000000..49223231 --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala @@ -0,0 +1,57 @@ +/* + * Copyright 2021 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import org.apache.spark.sql.functions.{lit, map} +import org.apache.spark.sql.types.StringType +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.implementations.ErrorMessageSubmitJustErrorValue.noColumnKey +import za.co.absa.spark.commons.errorhandling.types._ + +case class ErrorMessageSubmitJustErrorValue( + errType: ColumnOrValue[ErrType], + errCode: ColumnOrValue[ErrCode], + errMsg: ColumnOrValue[ErrMsg], + errValue: ColumnOrValue[String], + override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmit { + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(map(lit(noColumnKey), errValue.column.cast(StringType))) +} + +object ErrorMessageSubmitJustErrorValue { + val noColumnKey: ErrSourceColName = "" + + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String): ErrorMessageSubmitJustErrorValue = { + new ErrorMessageSubmitJustErrorValue( + ColumnOrValue.withValue(errType), + ColumnOrValue.withValue(errCode), + ColumnOrValue.withValue(errMessage), + ColumnOrValue.withValue(errValue), + ColumnOrValue.asEmpty + ) + } + + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String, additionalInfo: AdditionalInfo): ErrorMessageSubmitJustErrorValue = { + new ErrorMessageSubmitJustErrorValue( + ColumnOrValue.withValue(errType), + ColumnOrValue.withValue(errCode), + ColumnOrValue.withValue(errMessage), + ColumnOrValue.withValue(errValue), + ColumnOrValue.withOption(additionalInfo) + ) + } +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala index d03e1340..a5e70b1f 100644 --- 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala @@ -16,33 +16,24 @@ package za.co.absa.spark.commons.errorhandling.implementations -import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.{array, col} -import org.apache.spark.sql.types.StringType -import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.types._ class ErrorMessageSubmitOnColumn ( - val errType: ColumnOrValue[ErrType], - val errCode: ColumnOrValue[ErrCode], - val errMsg: ColumnOrValue[ErrMsg], - errSourceColName: ErrSourceColName, + errType: ColumnOrValue[ErrType], + errCode: ColumnOrValue[ErrCode], + errMsg: ColumnOrValue[ErrMsg], + errColName: ErrSourceColName, override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty - ) extends ErrorMessageSubmit { - val errCol: ColumnOrValue[ErrCol] = ColumnOrValue.withOption(Option(errSourceColName)) - override def rawValues: ColumnOrValue[RawValues] = { - val colExpr: Column = array(col(errSourceColName).cast(StringType)) - ColumnOrValue(colExpr) - } + ) extends ErrorMessageSubmitOnMoreColumns(errType, errCode, errMsg, Set(errColName), additionInfo) { } object ErrorMessageSubmitOnColumn { - def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errSourceColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { new ErrorMessageSubmitOnColumn( - ColumnOrValue.withActualValue(errType), - ColumnOrValue.withActualValue(errCode), - ColumnOrValue.withActualValue(errMessage), - errSourceColName, + ColumnOrValue.withValue(errType), + ColumnOrValue.withValue(errCode), + 
ColumnOrValue.withValue(errMessage), + errColName, ColumnOrValue.withOption(additionalInfo) ) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala new file mode 100644 index 00000000..aaf0717d --- /dev/null +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala @@ -0,0 +1,44 @@ +/* + * Copyright 2021 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations + +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.types.ColumnOrValue.columnNameToItsStringValue +import za.co.absa.spark.commons.errorhandling.types._ + +class ErrorMessageSubmitOnMoreColumns( + val errType: ColumnOrValue[ErrType], + val errCode: ColumnOrValue[ErrCode], + val errMsg: ColumnOrValue[ErrMsg], + errColNames: Set[ErrSourceColName], + override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmit { + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(errColNames, columnNameToItsStringValue) + +} + +object ErrorMessageSubmitOnMoreColumns { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errColNames: Set[ErrSourceColName], additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnMoreColumns = { + new ErrorMessageSubmitOnMoreColumns( + ColumnOrValue.withValue(errType), + ColumnOrValue.withValue(errCode), + ColumnOrValue.withValue(errMessage), + errColNames, + ColumnOrValue.withOption(additionalInfo) + ) + } +} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala index 18e18102..30ea139c 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala @@ -16,8 +16,9 @@ package za.co.absa.spark.commons.errorhandling.implementations -import org.apache.spark.sql.functions.array +import org.apache.spark.sql.functions.typedLit import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import 
za.co.absa.spark.commons.errorhandling.implementations.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues import za.co.absa.spark.commons.errorhandling.types._ class ErrorMessageSubmitWithoutColumn( @@ -27,16 +28,17 @@ class ErrorMessageSubmitWithoutColumn( override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { - val errCol: ColumnOrValue[ErrCol] = ColumnOrValue.asEmpty - val rawValues: ColumnOrValue[RawValues] = ColumnOrValue(array()) + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(typedLit(emptyErrorColsAndValues)) } object ErrorMessageSubmitWithoutColumn { + private val emptyErrorColsAndValues: ErrColsAndValues = Map.empty + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, additionalInfo: AdditionalInfo = None): ErrorMessageSubmitWithoutColumn = { new ErrorMessageSubmitWithoutColumn( - ColumnOrValue.withActualValue(errType), - ColumnOrValue.withActualValue(errCode), - ColumnOrValue.withActualValue(errMessage), + ColumnOrValue.withValue(errType), + ColumnOrValue.withValue(errCode), + ColumnOrValue.withValue(errMessage), ColumnOrValue.withOption(additionalInfo) ) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala index 91841876..6ceb7d92 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala @@ -37,9 +37,10 @@ trait ErrorHandlingCommon extends ErrorHandling { } def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { - val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errCol.getValue) - val errorColumns1 = errorsByColumn.getOrElse(None, Seq.empty).map(errorWhenToCol) // no grouping 
without ErrCol name - val errorColumns2 = (errorsByColumn - None).values.map(errorWhenSeqToCol).toSeq + val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) + val noColNames = Set.empty[String] + val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names + val errorColumns2 = (errorsByColumn - noColNames).values.map(errorWhenSeqToCol).toSeq doTheAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala index c5559a35..5dd931a5 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala @@ -18,18 +18,30 @@ package za.co.absa.spark.commons.errorhandling.partials import org.apache.spark.sql.Column import org.apache.spark.sql.functions.struct -import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +import za.co.absa.spark.commons.errorhandling.{ErrorMessage, ErrorMessageSubmit} +import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ trait EvaluateIntoErrorMessage { protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { struct( - errorMessageSubmit.errType.column as "errType", - errorMessageSubmit.errCode.column as "errCode", - errorMessageSubmit.errMsg.column as "errMsg", - errorMessageSubmit.errCol.column as "errCol", - errorMessageSubmit.rawValues.column as "rawValues", - errorMessageSubmit.additionInfo.column as "additionInfo" + errorMessageSubmit.errType.column as errType, + errorMessageSubmit.errCode.column as errCode, + errorMessageSubmit.errMsg.column as errMsg, + 
errorMessageSubmit.errColsAndValues.column as errColsAndValues, + errorMessageSubmit.additionInfo.column as additionInfo ) } +} + +object EvaluateIntoErrorMessage { + object FieldNames { + val errType = "errType" + val errCode = "errCode" + val errMsg = "errMsg" + val errColsAndValues = "errColsAndValues" + val additionInfo = "additionInfo" + val errCols = "errCols" + val errValues = "errValues" + } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala index 6e54851b..50ddea38 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala @@ -17,12 +17,12 @@ package za.co.absa.spark.commons.errorhandling.partials import org.apache.spark.sql.{Column, SparkSession} +import za.co.absa.spark.commons.adapters.CallUdfAdapter import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.partials.EvaluateViaUdf.ErrorMessageFunction import za.co.absa.spark.commons.errorhandling.types._ -import za.co.absa.spark.commons.sql.functions2.call_udf -trait EvaluateViaUdf[T] { +trait EvaluateViaUdf[T] extends CallUdfAdapter{ def evaluationUdfName: String protected def evaluationUdf: ErrorMessageFunction[T] def register(sparkToRegisterTo: SparkSession): Unit // TODO refactor when #82 has been implemented @@ -32,13 +32,12 @@ trait EvaluateViaUdf[T] { errorMessageSubmit.errType.column, errorMessageSubmit.errCode.column, errorMessageSubmit.errMsg.column, - errorMessageSubmit.errCol.column, - errorMessageSubmit.rawValues.column, + errorMessageSubmit.errColsAndValues.column, errorMessageSubmit.additionInfo.column ) } } object EvaluateViaUdf { - type ErrorMessageFunction[T] = (ErrType, ErrCode, ErrMsg, ErrCol, RawValues, AdditionalInfo) 
=> T //TODO needed? + type ErrorMessageFunction[T] = (ErrType, ErrCode, ErrMsg, ErrColsAndValues, AdditionalInfo) => T //TODO needed? } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index ccf3fb49..3c821835 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -17,7 +17,8 @@ package za.co.absa.spark.commons.errorhandling.types import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.functions.{array, col, lit, map_from_arrays, typedLit} +import org.apache.spark.sql.types.{DataType, StringType} import za.co.absa.spark.commons.sql.functions.null_col import scala.language.higherKinds @@ -26,7 +27,7 @@ trait ColumnOrValue[T] { def column: Column def isColumn: Boolean def isValue: Boolean - def getColumnName: Option[String] + def columnNames: Set[String] def getValue: Option[T] } @@ -36,27 +37,33 @@ object ColumnOrValue { def apply[T](columnName: String): ColumnOrValue[T] = CoVNamedColumn(columnName) def apply[T](column: Column): ColumnOrValue[T] = CoVDefinedColumn(column) - def withOption[T](value: Option[T]): ColumnOrValue[Option[T]] = { + def apply[T](columnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = CoVMapColumn(columnNames, columnTransformer) //should it be explicit function? 
+ + def withOption(value: Option[String]): ColumnOrValue[Option[String]] = { // could be safely an apply, or done more generally value match { - case None => CoVNull() - case Some(x) => CoVOption(x) + case None => CoVNull(StringType) + case _ => CoVValue(value) } } - def withActualValue[T](value: T): ColumnOrValue[T] = CoVValue(value) - def asEmpty[T]: ColumnOrValue[T] = CoVNull() + def withValue[T](value: T): ColumnOrValue[T] = CoVValue(value) + def asEmpty: ColumnOrValue[Option[String]] = CoVNull(StringType) + + def columnNameToItsStringValue(colName: String): Column = { + col(colName).cast(StringType) + } private final case class CoVNamedColumn[T](columnName: String) extends ColumnOrValue[T] { val column: Column = col(columnName) val isColumn: Boolean = true val isValue: Boolean = false - val getColumnName: Option[String] = Option(columnName) + val columnNames: Set[String] = Set(columnName) val getValue: Option[T] = None } private final case class CoVDefinedColumn[T](column: Column) extends ColumnOrValue[T] { val isColumn: Boolean = true val isValue: Boolean = false - val getColumnName: Option[ErrType] = None + val columnNames: Set[String] = Set.empty val getValue: Option[T] = None } @@ -64,23 +71,28 @@ object ColumnOrValue { val column: Column = lit(value) val isColumn: Boolean = false val isValue: Boolean = true - val getColumnName: Option[String] = None + val columnNames: Set[String] = Set.empty val getValue: Option[T] = Option(value) } - private final case class CoVOption[T](value: T) extends ColumnOrValue[Option[T]] { - val column: Column = lit(value) - val isColumn: Boolean = false - val isValue: Boolean = true - val getColumnName: Option[String] = None - val getValue: Option[Option[T]] = Some(Some(value)) + private final case class CoVMapColumn[T](columnNames: Set[String], columnTransformer: ColumnTransformer) extends ColumnOrValue[Map[String, T]] { + val isColumn: Boolean = true + val isValue: Boolean = false + val getValue: Option[Map[String, T]] = None 
+ val column: Column = { + val (mapKeys, mapValues) = columnNames.foldLeft(Seq.empty[Column], Seq.empty[Column]) {case ((accKeys, accVals), colName) => + (typedLit(colName) +: accKeys , columnTransformer(colName) +: accVals) + } + map_from_arrays(array(mapKeys: _*), array(mapValues: _*)) + } } - private final case class CoVNull[T]() extends ColumnOrValue[T] { - val column: Column = null_col + private final case class CoVNull[T](dataType: DataType) extends ColumnOrValue[T] { + val column: Column = null_col(dataType) + val isColumn: Boolean = false val isValue: Boolean = true - val getColumnName: Option[String] = None + val columnNames: Set[String] = Set.empty val getValue: Option[T] = None } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala index f076788b..892af1f4 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala @@ -20,16 +20,17 @@ import org.apache.spark.sql.Column package object types { type ErrSourceColName = String + type JsonString = String // TODO make it more "type safe" type ErrType = String type ErrCode = Long // was string type ErrMsg = String - type ErrCol = Option[ErrSourceColName] // wouldn't Seq[String] be better? But perhaps AdditionalInfo will suffice, while usually having one main column - //wouldn't a better name be SourceOfErrCol? 
- type RawValues = Seq[String] - type AdditionalInfo = Option[String] // actually a JSON + type ErrColsAndValues = Map[String, String] + type AdditionalInfo = Option[JsonString] // actually a JSON //mapping is missing, should be part of AdditionalInfo, as being very specific + type ColumnTransformer = String => Column + //This is to ensure some level of type-safety final case class ErrorColumn(column: Column) extends AnyVal } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index a9b9edea..70d64332 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -18,6 +18,7 @@ package za.co.absa.spark.commons.sql import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{col, lit} +import org.apache.spark.sql.types.DataType import za.co.absa.spark.commons.utils.SchemaUtils import scala.util.{Success, Try} @@ -63,4 +64,9 @@ object functions { lit(None.orNull) } + def null_col(dataType: DataType):Column = { + lit(None.orNull).cast(dataType) + } + + } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala deleted file mode 100644 index d672fb47..00000000 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageTest.scala +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2021 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package za.co.absa.spark.commons.errorhandling - -import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructField, StructType} -import org.scalatest.funsuite.AnyFunSuite -import za.co.absa.spark.commons.test.SparkTestBase - -class ErrorMessageTest extends AnyFunSuite with SparkTestBase { - test("errorColSchema returns the expected structure"){ - val expected = StructType(Seq( - StructField("errType", StringType, nullable = true), - StructField("errCode", LongType, nullable = false), - StructField("errMsg", StringType, nullable = true), - StructField("errCol", StringType, nullable = true), - StructField("rawValues", ArrayType(StringType, containsNull = true), nullable = true), - StructField("additionInfo", StringType, nullable = true) - )) - - val result = ErrorMessage.errorColSchema - assert(result == expected) - } -} From 51b16f42c3ca19f20e2830fa45384c0225a7f94c Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 14 Mar 2023 10:33:50 +0100 Subject: [PATCH 04/23] * Forgotten `register` function call --- .../commons/errorhandling/partials/ErrorHandlingCommon.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala index 6ceb7d92..4881c42d 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala @@ -37,6 +37,7 @@ trait ErrorHandlingCommon extends ErrorHandling { } def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { + register(dataFrame.sparkSession) val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) val noColNames = Set.empty[String] val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names From 4783fca437f81c7524c39ca57a3155909e101042 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 14 Mar 2023 13:31:24 +0100 Subject: [PATCH 05/23] * line ends improved --- .../implementations/ErrorMessageSubmitOnColumn.scala | 1 + .../implementations/ErrorMessageSubmitWithoutColumn.scala | 1 - .../src/main/scala/za/co/absa/spark/commons/sql/functions.scala | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala index a5e70b1f..684ec6bb 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala @@ -28,6 +28,7 @@ class ErrorMessageSubmitOnColumn ( } object ErrorMessageSubmitOnColumn { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { new ErrorMessageSubmitOnColumn( ColumnOrValue.withValue(errType), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala index 30ea139c..34690590 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala @@ -43,4 +43,3 @@ object ErrorMessageSubmitWithoutColumn { ) } } - diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index 70d64332..bde25f9b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -68,5 +68,4 @@ object functions { lit(None.orNull).cast(dataType) } - } From c8c4aa076a7325744470964cc99edf3c02b79260 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Thu, 16 Mar 2023 15:31:54 +0100 Subject: [PATCH 06/23] * ErrorMessageSubmits moved to submits sub-package * `ErrorMessageSubmitWithoutColumn` changed from `case class` to `class` to allow inheritance * some PR comments addressed --- .../commons/errorhandling/ErrorHandling.scala | 4 +-- .../implementations/ErrorMessageArray.scala | 2 +- .../ErrorMessageSubmitJustErrorValue.scala | 30 +++++++------------ .../ErrorMessageSubmitOnColumn.scala | 2 +- .../ErrorMessageSubmitOnMoreColumns.scala | 2 +- .../ErrorMessageSubmitWithoutColumn.scala | 4 +-- .../partials/ErrorHandlingCommon.scala | 8 ++--- .../partials/EvaluateIntoErrorMessage.scala | 2 +- .../partials/EvaluateViaUdf.scala | 2 +- .../ErrorMessageArrayTest.scala | 1 + 10 files changed, 22 insertions(+), 35 deletions(-) rename spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/{ => submits}/ErrorMessageSubmitJustErrorValue.scala (53%) rename 
spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/{ => submits}/ErrorMessageSubmitOnColumn.scala (95%) rename spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/{ => submits}/ErrorMessageSubmitOnMoreColumns.scala (96%) rename spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/{ => submits}/ErrorMessageSubmitWithoutColumn.scala (90%) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index 7d379d8b..660ec47e 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -17,12 +17,10 @@ package za.co.absa.spark.commons.errorhandling import org.apache.spark.sql.{Column, DataFrame, SparkSession} -import za.co.absa.spark.commons.errorhandling.implementations.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} +import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} import za.co.absa.spark.commons.errorhandling.types._ trait ErrorHandling { - def register(sparkToRegisterTo: SparkSession): Unit = {} - def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { val toSubmit = errCol .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 7ca2ccfb..7be77f9c 100644 --- 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -59,7 +59,7 @@ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.default dataFrame.withColumn(errorColName, reMap(array_union(deMap(col(errorColName)), colToUnion))) } - protected def doTheAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { + protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { val aggregated = array(errCols.map(decomposeMap): _*) //need to decompose the map field, as it's not supported in array functions val aggregatedWithoutNulls = array_except(aggregated, array(null_col)) val joinToExisting: (DataFrame, String) => DataFrame = appendToErrCol(_, _, aggregatedWithoutNulls) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala similarity index 53% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala index 49223231..1a5bc12c 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitJustErrorValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala @@ -14,38 +14,28 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.errorhandling.implementations +package za.co.absa.spark.commons.errorhandling.implementations.submits import org.apache.spark.sql.functions.{lit, map} import org.apache.spark.sql.types.StringType import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.implementations.ErrorMessageSubmitJustErrorValue.noColumnKey +import za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitJustErrorValue.noColumnKey import za.co.absa.spark.commons.errorhandling.types._ -case class ErrorMessageSubmitJustErrorValue( - errType: ColumnOrValue[ErrType], - errCode: ColumnOrValue[ErrCode], - errMsg: ColumnOrValue[ErrMsg], - errValue: ColumnOrValue[String], - override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty - ) extends ErrorMessageSubmit { +class ErrorMessageSubmitJustErrorValue( + val errType: ColumnOrValue[ErrType], + val errCode: ColumnOrValue[ErrCode], + val errMsg: ColumnOrValue[ErrMsg], + errValue: ColumnOrValue[String], + override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmit { val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(map(lit(noColumnKey), errValue.column.cast(StringType))) } object ErrorMessageSubmitJustErrorValue { val noColumnKey: ErrSourceColName = "" - def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String): ErrorMessageSubmitJustErrorValue = { - new ErrorMessageSubmitJustErrorValue( - ColumnOrValue.withValue(errType), - ColumnOrValue.withValue(errCode), - ColumnOrValue.withValue(errMessage), - ColumnOrValue.withValue(errValue), - ColumnOrValue.asEmpty - ) - } - - def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String, additionalInfo: AdditionalInfo): ErrorMessageSubmitJustErrorValue = { + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String, additionalInfo: 
AdditionalInfo = None): ErrorMessageSubmitJustErrorValue = { new ErrorMessageSubmitJustErrorValue( ColumnOrValue.withValue(errType), ColumnOrValue.withValue(errCode), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala similarity index 95% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala index 684ec6bb..4d7ebe6e 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.errorhandling.implementations +package za.co.absa.spark.commons.errorhandling.implementations.submits import za.co.absa.spark.commons.errorhandling.types._ diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala similarity index 96% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala index aaf0717d..456be13d 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitOnMoreColumns.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.errorhandling.implementations +package za.co.absa.spark.commons.errorhandling.implementations.submits import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.types.ColumnOrValue.columnNameToItsStringValue diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala similarity index 90% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala index 34690590..a5ae73c8 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -package za.co.absa.spark.commons.errorhandling.implementations +package za.co.absa.spark.commons.errorhandling.implementations.submits import org.apache.spark.sql.functions.typedLit import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.implementations.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues +import za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues import za.co.absa.spark.commons.errorhandling.types._ class ErrorMessageSubmitWithoutColumn( diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala index 4881c42d..4768df61 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala @@ -25,24 +25,22 @@ import org.apache.spark.sql.functions.when trait ErrorHandlingCommon extends ErrorHandling { protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column - protected def doTheAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame + protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn = { ErrorColumn(evaluate(errorMessageSubmit)) } def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame = { - register(dataFrame.sparkSession) - doTheAggregation(dataFrame, errCols.map(_.column): _*) + doTheColumnsAggregation(dataFrame, errCols.map(_.column): _*) } def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { - register(dataFrame.sparkSession) val errorsByColumn = 
errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) val noColNames = Set.empty[String] val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names val errorColumns2 = (errorsByColumn - noColNames).values.map(errorWhenSeqToCol).toSeq - doTheAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) + doTheColumnsAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala index 5dd931a5..834d010e 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala @@ -18,7 +18,7 @@ package za.co.absa.spark.commons.errorhandling.partials import org.apache.spark.sql.Column import org.apache.spark.sql.functions.struct -import za.co.absa.spark.commons.errorhandling.{ErrorMessage, ErrorMessageSubmit} +import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ trait EvaluateIntoErrorMessage { diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala index 50ddea38..86761928 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala @@ -25,7 +25,7 @@ import za.co.absa.spark.commons.errorhandling.types._ trait EvaluateViaUdf[T] extends CallUdfAdapter{ def evaluationUdfName: String protected def 
evaluationUdf: ErrorMessageFunction[T] - def register(sparkToRegisterTo: SparkSession): Unit // TODO refactor when #82 has been implemented + def register(sparkToRegisterTo: SparkSession): Boolean // TODO refactor when #82 has been implemented protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { call_udf(evaluationUdfName, diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala index 689737d4..aca79f56 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -19,6 +19,7 @@ import org.scalatest.funsuite.AnyFunSuite import za.co.absa.spark.commons.errorhandling.types.ErrorWhen import za.co.absa.spark.commons.test.SparkTestBase import org.apache.spark.sql.functions.{col, length} +import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { From 64dc95ec1a02c0bfb2baf591765028e054cf1a27 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Mon, 20 Mar 2023 08:43:45 +0100 Subject: [PATCH 07/23] * Added UTs for `ColumnOrValue` * Fixed few minor things discovered by the UTs --- .../ErrorMessageSubmitOnMoreColumns.scala | 4 +- .../errorhandling/types/ColumnOrValue.scala | 21 ++-- .../types/ColumnOrValueForm.scala | 37 +++++++ .../types/ColumnOrValueTest.scala | 97 +++++++++++++++++++ 4 files changed, 150 insertions(+), 9 deletions(-) create mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala create mode 100644 
spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala index 456be13d..3ea8fdef 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala @@ -17,7 +17,6 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.types.ColumnOrValue.columnNameToItsStringValue import za.co.absa.spark.commons.errorhandling.types._ class ErrorMessageSubmitOnMoreColumns( @@ -27,8 +26,7 @@ class ErrorMessageSubmitOnMoreColumns( errColNames: Set[ErrSourceColName], override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { - val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(errColNames, columnNameToItsStringValue) - + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue.asMapOfStringColumns(errColNames) } object ErrorMessageSubmitOnMoreColumns { diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index 3c821835..f19871a7 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -37,20 +37,20 @@ object ColumnOrValue { def apply[T](columnName: String): 
ColumnOrValue[T] = CoVNamedColumn(columnName) def apply[T](column: Column): ColumnOrValue[T] = CoVDefinedColumn(column) - def apply[T](columnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = CoVMapColumn(columnNames, columnTransformer) //should it be explicit function? + def apply[T](mapColumnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = CoVMapColumn(mapColumnNames, columnTransformer) //should it be explicit function? def withOption(value: Option[String]): ColumnOrValue[Option[String]] = { // could be safely an apply, or done more generally value match { case None => CoVNull(StringType) - case _ => CoVValue(value) + case Some(x) => CoVOption(x) } } def withValue[T](value: T): ColumnOrValue[T] = CoVValue(value) + def asEmpty: ColumnOrValue[Option[String]] = CoVNull(StringType) + def asMapOfStringColumns(mapColumnNames: Set[String]): ColumnOrValue[Map[String, String]] = CoVMapColumn(mapColumnNames, columnNameToItsStringValue) - def columnNameToItsStringValue(colName: String): Column = { - col(colName).cast(StringType) - } + def columnNameToItsStringValue(colName: String): Column = col(colName).cast(StringType) private final case class CoVNamedColumn[T](columnName: String) extends ColumnOrValue[T] { val column: Column = col(columnName) @@ -80,13 +80,22 @@ object ColumnOrValue { val isValue: Boolean = false val getValue: Option[Map[String, T]] = None val column: Column = { - val (mapKeys, mapValues) = columnNames.foldLeft(Seq.empty[Column], Seq.empty[Column]) {case ((accKeys, accVals), colName) => + val (mapKeys, mapValues) = columnNames.foldRight(Seq.empty[Column], Seq.empty[Column]) {case (colName, (accKeys, accVals)) => (typedLit(colName) +: accKeys , columnTransformer(colName) +: accVals) } map_from_arrays(array(mapKeys: _*), array(mapValues: _*)) } } + private final case class CoVOption[T](value: T) extends ColumnOrValue[Option[T]] { + val column: Column = lit(value) + + val 
isColumn: Boolean = false + val isValue: Boolean = true + val columnNames: Set[String] = Set.empty + val getValue: Option[Option[T]] = Some(Some(value)) + } + private final case class CoVNull[T](dataType: DataType) extends ColumnOrValue[T] { val column: Column = null_col(dataType) diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala new file mode 100644 index 00000000..3cd1f094 --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala @@ -0,0 +1,37 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.types + +import org.apache.spark.sql.Column +import org.scalatest.Assertions + +case class ColumnOrValueForm[T] ( + column: Column, + isColumn: Boolean, + isValue: Boolean, + columnNames: Set[String], + value: Option[T] + ) extends Assertions { + def assertTo(columnOrValue: ColumnOrValue[T]): Unit ={ + assert(column == columnOrValue.column) + assert(isColumn == columnOrValue.isColumn) + assert(isValue == columnOrValue.isValue) + assert(columnNames == columnOrValue.columnNames) + assert(value == columnOrValue.getValue) + } + +} diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala new file mode 100644 index 00000000..bf9359f4 --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala @@ -0,0 +1,97 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.types + +import net.bytebuddy.dynamic.scaffold.MethodGraph.Empty +import org.apache.spark.sql.Column +import org.apache.spark.sql.functions.{array, col, current_date, lit, map_from_arrays} +import org.apache.spark.sql.types.StringType +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.sql.functions.null_col + +class ColumnOrValueTest extends AnyFunSuite { + test("Creation of column based on its name"){ + val colName = "my_column" + val expected = ColumnOrValueForm(col(colName), isColumn = true, isValue = false, Set(colName), None) + val result = ColumnOrValue(colName) + expected assertTo result + } + + test("Creation of column based on its definition") { + val myColumn = current_date + val expected = ColumnOrValueForm(myColumn, isColumn = true, isValue = false, Set(), None) + val result = ColumnOrValue(myColumn) + expected assertTo result + } + + test("Creation of map from given column names") { + val colNames = Set("Col1", "Col2", "Col3") + val colTransformer: String => Column = col + val expectedColumn = map_from_arrays( + array( + lit("Col1"), lit("Col2"), lit("Col3") + ), array( + col("Col1"), col("Col2"), col("Col3") + )) + val expected = ColumnOrValueForm[Map[String, Any]](expectedColumn, isColumn = true, isValue = false, colNames, None) + val result = ColumnOrValue[Any](colNames, colTransformer) + expected assertTo(result) + } + + test("Creating ColumnOrValue from a defined Option") { + val value = "Foo" + val expected = ColumnOrValueForm(lit(value), isColumn = false, isValue = true, Set(), Option(Option(value))) + val result = ColumnOrValue.withOption(Option(value)) + expected assertTo result + } + + test("Creating ColumnOrValue from an empty Option") { + val expected = ColumnOrValueForm[Option[String]](null_col(StringType), isColumn = false, isValue = true, Set(), None) + val result = ColumnOrValue.withOption(None) + expected assertTo result + } + + test("Creating ColumnOrValue 
from a given value") { + val value = 42 + val expected = ColumnOrValueForm(lit(value), isColumn = false, isValue = true, Set(), Some(value)) + val result = ColumnOrValue.withValue(value) + expected assertTo result + } + + test("Creating ColumnOrValue as an undefined (empty) value") { + + val myColumn = null_col(StringType) + val expected = ColumnOrValueForm[Option[String]](myColumn, isColumn = false, isValue = true, Set(), None) + val result = ColumnOrValue.asEmpty + expected assertTo result + } + + test("Creating ColumnOrValue as a map of string columns") { + val colNames = Set("Col1", "Col2", "Col3") + val expectedColumn = map_from_arrays( + array( + lit("Col1"), lit("Col2"), lit("Col3") + ), array( + col("Col1").cast(StringType), col("Col2").cast(StringType), col("Col3").cast(StringType) + )) + val expected = ColumnOrValueForm[Map[String, String]](expectedColumn, isColumn = true, isValue = false, colNames, None) + val result = ColumnOrValue.asMapOfStringColumns(colNames) + expected assertTo(result) + } + + +} From e29098df12ed8424c332456d8740d383a7345fc4 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Mon, 24 Apr 2023 13:59:37 +0200 Subject: [PATCH 08/23] * changed `ErrorMessageArrayTest` to actual test suite --- .../ErrorMessageArrayTest.scala | 82 +++++++++++++++++-- 1 file changed, 73 insertions(+), 9 deletions(-) diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala index aca79f56..364be9b0 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -15,16 +15,20 @@ package za.co.absa.spark.commons.errorhandling.implementations +import org.apache.spark.sql.DataFrame import 
org.scalatest.funsuite.AnyFunSuite import za.co.absa.spark.commons.errorhandling.types.ErrorWhen import za.co.absa.spark.commons.test.SparkTestBase import org.apache.spark.sql.functions.{col, length} +import za.co.absa.spark.commons.errorhandling.ErrorMessage import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { import spark.implicits._ + private val nullString = Option.empty[String].orNull + private val col1Name = "Col1" private val col2Name = "Col2" private val srcDf = Seq( @@ -34,7 +38,36 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { (Some(3), "ccc") ).toDF(col1Name, col2Name) + private type ResultDfRecordType = (Option[Integer], String, List[ErrorMessage]) + + private def resultDfToResult(resultDf: DataFrame): List[ResultDfRecordType] = { + resultDf.as[ResultDfRecordType].collect().sortBy(_._1).toList + } + test("Collect columns and aggregate them explicitly") { + val expected: List[ResultDfRecordType] = List( + (None, "", List( + ErrorMessage("Test error 1", 1, "This is a test error", Map("Col1" -> nullString)), + ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "")), + ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) + )), + (Some(1), "a", List( + ErrorMessage("Test error 1", 1, "This is a test error", Map("Col1" -> "1")), + ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "a")), + ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) + )), + (Some(2), "bb", List( + ErrorMessage("Test error 1", 1, "This is a test error", Map("Col1" -> "2")), + ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "bb")), + ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) + )), + (Some(3), "ccc", List( + ErrorMessage("Test error 1", 1, "This is a test error", Map("Col1" -> "3")), + ErrorMessage("Test 
error 2", 2, "This is a test error", Map("Col2" -> "ccc")), + ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) + )) + ) + val errorMessageArray = ErrorMessageArray() val e1 = errorMessageArray.putErrorToColumn("Test error 1", 1, "This is a test error", Some(col1Name)) @@ -43,34 +76,65 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { val errorSubmitB = ErrorMessageSubmitWithoutColumn("Test error 3", 3, "This is a test error") val e3 = errorMessageArray.putErrorToColumn(errorSubmitB) - val destDf = errorMessageArray.aggregateErrorColumns(srcDf)(e1, e2, e3) - destDf.printSchema() - destDf.show(false) + val resultDf = errorMessageArray.aggregateErrorColumns(srcDf)(e1, e2, e3) + val result = resultDfToResult(resultDf) + + assert(result == expected) } test("putErrors groups conditions by source column"){ val errorMessageArray = ErrorMessageArray() + val expected: List[ResultDfRecordType] = List( + (None, "", List( + ErrorMessage("WrongLine", 0, "This line is wrong", Map.empty) + )), + (Some(1), "a", List.empty), + (Some(2), "bb", List( + ErrorMessage("ValueStillTooBig", 2, "The value of the field is too big", Map("Col1" -> "2")) + )), + (Some(3), "ccc", List( + ErrorMessage("ValueTooBig", 1, "The value of the field is too big", Map("Col1" -> "3")), + ErrorMessage("String too long", 10, "The text in the field is too long", Map("Col2" -> "ccc")) + )) + ) - val destDf = errorMessageArray.putErrorsWithGrouping(srcDf)(Seq( + val resultDf = errorMessageArray.putErrorsWithGrouping(srcDf)(Seq( ErrorWhen(col(col1Name).isNull, ErrorMessageSubmitWithoutColumn("WrongLine", 0, "This line is wrong")), ErrorWhen(col(col1Name) > 2, ErrorMessageSubmitOnColumn("ValueTooBig", 1, "The value of the field is too big", col1Name)), ErrorWhen(col(col1Name) > 1, ErrorMessageSubmitOnColumn("ValueStillTooBig", 2, "The value of the field is too big", col1Name)), ErrorWhen(length(col(col2Name)) > 2, ErrorMessageSubmitOnColumn("String too long", 10, "The text 
in the field is too long", col2Name)) )) - destDf.printSchema() - destDf.show(false) + val result = resultDfToResult(resultDf) + + assert(result == expected) } + test("putError and putErrors does not group by together"){ val errorMessageArray = ErrorMessageArray() + val expected: List[ResultDfRecordType] = List( + (None, "", List( + ErrorMessage("WrongLine", 0, "This line is wrong", Map.empty) + )), + (Some(1), "a", List.empty), + (Some(2), "bb", List( + ErrorMessage("ValueStillTooBig", 2, "The value of the field is too big", Map("Col1" -> "2")) + )), + (Some(3), "ccc", List( + ErrorMessage("ValueStillTooBig", 2, "The value of the field is too big", Map("Col1" -> "3")), + ErrorMessage("ValueTooBig", 1, "The value of the field is too big", Map("Col1" -> "3")), + ErrorMessage("String too long", 10, "The text in the field is too long", Map("Col2" -> "ccc")) + )) + ) val midDf = errorMessageArray.putError(srcDf)(col(col1Name) > 1)(ErrorMessageSubmitOnColumn("ValueStillTooBig", 2, "The value of the field is too big", col1Name)) - val destDf = errorMessageArray.putErrorsWithGrouping(midDf)(Seq( + val resultDf = errorMessageArray.putErrorsWithGrouping(midDf)(Seq( ErrorWhen(col(col1Name).isNull, ErrorMessageSubmitWithoutColumn("WrongLine", 0, "This line is wrong")), ErrorWhen(col(col1Name) > 2, ErrorMessageSubmitOnColumn("ValueTooBig", 1, "The value of the field is too big", col1Name)), ErrorWhen(length(col(col2Name)) > 2, ErrorMessageSubmitOnColumn("String too long", 10, "The text in the field is too long", col2Name)) )) - destDf.printSchema() - destDf.show(false) + val result = resultDfToResult(resultDf) + + assert(result == expected) } } From 8fe1b27443f063171978b3498e29ed307ac82797 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 25 Apr 2023 10:48:31 +0200 Subject: [PATCH 09/23] * `ErrorHandling` - put abstract methods first --- .../spark/commons/errorhandling/ErrorHandling.scala | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index 660ec47e..9a485407 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -21,12 +21,6 @@ import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMess import za.co.absa.spark.commons.errorhandling.types._ trait ErrorHandling { - def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { - val toSubmit = errCol - .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) - .getOrElse(ErrorMessageSubmitWithoutColumn(errType, errCode, errMessage, additionalInfo)) - putErrorToColumn(toSubmit) - } def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame @@ -35,5 +29,12 @@ trait ErrorHandling { putErrorsWithGrouping(dataFrame)(Seq(ErrorWhen(when, errorMessageSubmit))) } def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame + + def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { + val toSubmit = errCol + .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) + .getOrElse(ErrorMessageSubmitWithoutColumn(errType, errCode, errMessage, additionalInfo)) + putErrorToColumn(toSubmit) + } } From 9543c947fe5f3d25c3e795c5921e8af59ca6dbbe Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 25 Apr 2023 19:19:16 +0200 Subject: [PATCH 10/23] * addressed PR comments --- 
.../partials/EvaluateViaUdf.scala | 15 +++++++------ .../errorhandling/types/ColumnOrValue.scala | 16 +++----------- .../commons/errorhandling/types/types.scala | 3 +-- .../types/ColumnOrValueForm.scala | 4 ---- .../types/ColumnOrValueTest.scala | 21 +++++++++---------- 5 files changed, 21 insertions(+), 38 deletions(-) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala index 86761928..65c0af8a 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala @@ -17,24 +17,23 @@ package za.co.absa.spark.commons.errorhandling.partials import org.apache.spark.sql.{Column, SparkSession} +import za.co.absa.spark.commons.OncePerSparkSession import za.co.absa.spark.commons.adapters.CallUdfAdapter import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.partials.EvaluateViaUdf.ErrorMessageFunction import za.co.absa.spark.commons.errorhandling.types._ -trait EvaluateViaUdf[T] extends CallUdfAdapter{ +trait EvaluateViaUdf[T] extends OncePerSparkSession with CallUdfAdapter { def evaluationUdfName: String protected def evaluationUdf: ErrorMessageFunction[T] - def register(sparkToRegisterTo: SparkSession): Boolean // TODO refactor when #82 has been implemented protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { call_udf(evaluationUdfName, - errorMessageSubmit.errType.column, - errorMessageSubmit.errCode.column, - errorMessageSubmit.errMsg.column, - errorMessageSubmit.errColsAndValues.column, - errorMessageSubmit.additionInfo.column - ) + errorMessageSubmit.errType.column, + errorMessageSubmit.errCode.column, + errorMessageSubmit.errMsg.column, + errorMessageSubmit.errColsAndValues.column, + 
errorMessageSubmit.additionInfo.column) } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index f19871a7..28451d6f 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -25,8 +25,6 @@ import scala.language.higherKinds trait ColumnOrValue[T] { def column: Column - def isColumn: Boolean - def isValue: Boolean def columnNames: Set[String] def getValue: Option[T] } @@ -37,7 +35,9 @@ object ColumnOrValue { def apply[T](columnName: String): ColumnOrValue[T] = CoVNamedColumn(columnName) def apply[T](column: Column): ColumnOrValue[T] = CoVDefinedColumn(column) - def apply[T](mapColumnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = CoVMapColumn(mapColumnNames, columnTransformer) //should it be explicit function? 
+ def apply[T](mapColumnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = { + CoVMapColumn(mapColumnNames, columnTransformer) + } def withOption(value: Option[String]): ColumnOrValue[Option[String]] = { // could be safely an apply, or done more generally value match { @@ -54,30 +54,22 @@ object ColumnOrValue { private final case class CoVNamedColumn[T](columnName: String) extends ColumnOrValue[T] { val column: Column = col(columnName) - val isColumn: Boolean = true - val isValue: Boolean = false val columnNames: Set[String] = Set(columnName) val getValue: Option[T] = None } private final case class CoVDefinedColumn[T](column: Column) extends ColumnOrValue[T] { - val isColumn: Boolean = true - val isValue: Boolean = false val columnNames: Set[String] = Set.empty val getValue: Option[T] = None } private final case class CoVValue[T](value: T) extends ColumnOrValue[T] { val column: Column = lit(value) - val isColumn: Boolean = false - val isValue: Boolean = true val columnNames: Set[String] = Set.empty val getValue: Option[T] = Option(value) } private final case class CoVMapColumn[T](columnNames: Set[String], columnTransformer: ColumnTransformer) extends ColumnOrValue[Map[String, T]] { - val isColumn: Boolean = true - val isValue: Boolean = false val getValue: Option[Map[String, T]] = None val column: Column = { val (mapKeys, mapValues) = columnNames.foldRight(Seq.empty[Column], Seq.empty[Column]) {case (colName, (accKeys, accVals)) => @@ -90,8 +82,6 @@ object ColumnOrValue { private final case class CoVOption[T](value: T) extends ColumnOrValue[Option[T]] { val column: Column = lit(value) - val isColumn: Boolean = false - val isValue: Boolean = true val columnNames: Set[String] = Set.empty val getValue: Option[Option[T]] = Some(Some(value)) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala 
index 892af1f4..46c59716 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala @@ -23,11 +23,10 @@ package object types { type JsonString = String // TODO make it more "type safe" type ErrType = String - type ErrCode = Long // was string + type ErrCode = Long type ErrMsg = String type ErrColsAndValues = Map[String, String] type AdditionalInfo = Option[JsonString] // actually a JSON - //mapping is missing, should be part of AdditionalInfo, as being very specific type ColumnTransformer = String => Column diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala index 3cd1f094..1dd042f3 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala @@ -21,15 +21,11 @@ import org.scalatest.Assertions case class ColumnOrValueForm[T] ( column: Column, - isColumn: Boolean, - isValue: Boolean, columnNames: Set[String], value: Option[T] ) extends Assertions { def assertTo(columnOrValue: ColumnOrValue[T]): Unit ={ assert(column == columnOrValue.column) - assert(isColumn == columnOrValue.isColumn) - assert(isValue == columnOrValue.isValue) assert(columnNames == columnOrValue.columnNames) assert(value == columnOrValue.getValue) } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala index bf9359f4..4658cd4a 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala +++ 
b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala @@ -16,7 +16,6 @@ package za.co.absa.spark.commons.errorhandling.types -import net.bytebuddy.dynamic.scaffold.MethodGraph.Empty import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{array, col, current_date, lit, map_from_arrays} import org.apache.spark.sql.types.StringType @@ -26,14 +25,14 @@ import za.co.absa.spark.commons.sql.functions.null_col class ColumnOrValueTest extends AnyFunSuite { test("Creation of column based on its name"){ val colName = "my_column" - val expected = ColumnOrValueForm(col(colName), isColumn = true, isValue = false, Set(colName), None) + val expected = ColumnOrValueForm(col(colName), Set(colName), None) val result = ColumnOrValue(colName) expected assertTo result } test("Creation of column based on its definition") { val myColumn = current_date - val expected = ColumnOrValueForm(myColumn, isColumn = true, isValue = false, Set(), None) + val expected = ColumnOrValueForm(myColumn, Set(), None) val result = ColumnOrValue(myColumn) expected assertTo result } @@ -47,27 +46,27 @@ class ColumnOrValueTest extends AnyFunSuite { ), array( col("Col1"), col("Col2"), col("Col3") )) - val expected = ColumnOrValueForm[Map[String, Any]](expectedColumn, isColumn = true, isValue = false, colNames, None) + val expected = ColumnOrValueForm[Map[String, Any]](expectedColumn, colNames, None) val result = ColumnOrValue[Any](colNames, colTransformer) - expected assertTo(result) + expected assertTo result } test("Creating ColumnOrValue from a defined Option") { val value = "Foo" - val expected = ColumnOrValueForm(lit(value), isColumn = false, isValue = true, Set(), Option(Option(value))) + val expected = ColumnOrValueForm(lit(value), Set(), Option(Option(value))) val result = ColumnOrValue.withOption(Option(value)) expected assertTo result } test("Creating ColumnOrValue from an empty Option") { - val expected = 
ColumnOrValueForm[Option[String]](null_col(StringType), isColumn = false, isValue = true, Set(), None) + val expected = ColumnOrValueForm[Option[String]](null_col(StringType), Set(), None) val result = ColumnOrValue.withOption(None) expected assertTo result } test("Creating ColumnOrValue from a given value") { val value = 42 - val expected = ColumnOrValueForm(lit(value), isColumn = false, isValue = true, Set(), Some(value)) + val expected = ColumnOrValueForm(lit(value), Set(), Some(value)) val result = ColumnOrValue.withValue(value) expected assertTo result } @@ -75,7 +74,7 @@ class ColumnOrValueTest extends AnyFunSuite { test("Creating ColumnOrValue as an undefined (empty) value") { val myColumn = null_col(StringType) - val expected = ColumnOrValueForm[Option[String]](myColumn, isColumn = false, isValue = true, Set(), None) + val expected = ColumnOrValueForm[Option[String]](myColumn, Set(), None) val result = ColumnOrValue.asEmpty expected assertTo result } @@ -88,9 +87,9 @@ class ColumnOrValueTest extends AnyFunSuite { ), array( col("Col1").cast(StringType), col("Col2").cast(StringType), col("Col3").cast(StringType) )) - val expected = ColumnOrValueForm[Map[String, String]](expectedColumn, isColumn = true, isValue = false, colNames, None) + val expected = ColumnOrValueForm[Map[String, String]](expectedColumn, colNames, None) val result = ColumnOrValue.asMapOfStringColumns(colNames) - expected assertTo(result) + expected assertTo result } From 8822c8a0fdef78dabbac254ad64a185620ccac2c Mon Sep 17 00:00:00 2001 From: David Benedeki <14905969+benedeki@users.noreply.github.com> Date: Wed, 26 Apr 2023 20:19:36 +0200 Subject: [PATCH 11/23] Update spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala Co-authored-by: Ladislav Sulak --- .../errorhandling/implementations/ErrorMessageArray.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 7be77f9c..caef20c3 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -47,7 +47,7 @@ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.default errorMessageColumn.getField(errType) as errType, errorMessageColumn.getField(errCode) as errCode, errorMessageColumn.getField(errMsg) as errMsg, - map_from_arrays(errorMessageColumn.getField(errCols), errorMessageColumn.getField(errValues)) as errColsAndValues , + map_from_arrays(errorMessageColumn.getField(errCols), errorMessageColumn.getField(errValues)) as errColsAndValues, errorMessageColumn.getField(additionInfo) as additionInfo ) } From d9a2129ffa079f39a73bca0c0af039c784c86c96 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Wed, 26 Apr 2023 20:30:34 +0200 Subject: [PATCH 12/23] * Further PR comments addressed --- .../submits/ErrorMessageSubmitJustErrorValue.scala | 2 +- .../submits/ErrorMessageSubmitOnMoreColumns.scala | 6 +++++- .../spark/commons/errorhandling/types/ColumnOrValue.scala | 2 -- .../co/absa/spark/commons/errorhandling/types/types.scala | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala index 1a5bc12c..b0aeb6b6 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala @@ -26,7 +26,7 @@ class ErrorMessageSubmitJustErrorValue( val errType: ColumnOrValue[ErrType], val errCode: ColumnOrValue[ErrCode], val errMsg: ColumnOrValue[ErrMsg], - errValue: ColumnOrValue[String], + errValue: ColumnOrValue[ErrValue], override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(map(lit(noColumnKey), errValue.column.cast(StringType))) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala index 3ea8fdef..d3ac19d6 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala @@ -30,7 +30,11 @@ class ErrorMessageSubmitOnMoreColumns( } object ErrorMessageSubmitOnMoreColumns { - def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errColNames: Set[ErrSourceColName], additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnMoreColumns = { + def apply(errType: ErrType, + errCode: ErrCode, + errMessage: ErrMsg, + errColNames: Set[ErrSourceColName], + additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnMoreColumns = { new ErrorMessageSubmitOnMoreColumns( ColumnOrValue.withValue(errType), ColumnOrValue.withValue(errCode), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index 28451d6f..aea10f5b 100644 
--- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -89,8 +89,6 @@ object ColumnOrValue { private final case class CoVNull[T](dataType: DataType) extends ColumnOrValue[T] { val column: Column = null_col(dataType) - val isColumn: Boolean = false - val isValue: Boolean = true val columnNames: Set[String] = Set.empty val getValue: Option[T] = None } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala index 46c59716..d02c8542 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/types.scala @@ -25,7 +25,8 @@ package object types { type ErrType = String type ErrCode = Long type ErrMsg = String - type ErrColsAndValues = Map[String, String] + type ErrValue = String + type ErrColsAndValues = Map[ErrSourceColName, ErrValue] type AdditionalInfo = Option[JsonString] // actually a JSON type ColumnTransformer = String => Column From 5b317d89e1a428733a725d465f9002b3de85a8ed Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Mon, 1 May 2023 00:17:06 +0200 Subject: [PATCH 13/23] * more UTs --- .../ErrorMessageSubmitWithoutColumn.scala | 5 +- .../ErrorMessageArrayTest.scala | 61 ++++++++++++++- ...ErrorMessageSubmitJustErrorValueTest.scala | 70 +++++++++++++++++ .../ErrorMessageSubmitOnColumnTest.scala | 68 ++++++++++++++++ .../ErrorMessageSubmitOnMoreColumnsTest.scala | 78 +++++++++++++++++++ .../ErrorMessageSubmitWithoutColumnTest.scala | 68 ++++++++++++++++ 6 files changed, 347 insertions(+), 3 deletions(-) create mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala create mode 
100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala create mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala create mode 100644 spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala index a5ae73c8..6a732710 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala @@ -16,6 +16,7 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits +import org.apache.spark.sql.Column import org.apache.spark.sql.functions.typedLit import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues @@ -28,12 +29,14 @@ class ErrorMessageSubmitWithoutColumn( override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { - val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(typedLit(emptyErrorColsAndValues)) + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(ErrorMessageSubmitWithoutColumn.emptyErrColsAndValues) } object ErrorMessageSubmitWithoutColumn { private val emptyErrorColsAndValues: ErrColsAndValues = Map.empty + val emptyErrColsAndValues: Column = typedLit(emptyErrorColsAndValues) + def apply(errType: ErrType, 
errCode: ErrCode, errMessage: ErrMsg, additionalInfo: AdditionalInfo = None): ErrorMessageSubmitWithoutColumn = { new ErrorMessageSubmitWithoutColumn( ColumnOrValue.withValue(errType), diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala index 364be9b0..546db2ab 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -21,7 +21,8 @@ import za.co.absa.spark.commons.errorhandling.types.ErrorWhen import za.co.absa.spark.commons.test.SparkTestBase import org.apache.spark.sql.functions.{col, length} import za.co.absa.spark.commons.errorhandling.ErrorMessage -import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} +import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitJustErrorValue, ErrorMessageSubmitOnColumn, ErrorMessageSubmitOnMoreColumns, ErrorMessageSubmitWithoutColumn} +import za.co.absa.spark.commons.errorhandling.types.ColumnOrValue.CoV class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { @@ -35,7 +36,8 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { (None, ""), (Some(1), "a"), (Some(2), "bb"), - (Some(3), "ccc") + (Some(3), "ccc"), + (Some(0), "X") ).toDF(col1Name, col2Name) private type ResultDfRecordType = (Option[Integer], String, List[ErrorMessage]) @@ -51,6 +53,11 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "")), ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) )), + (Some(0), "X", List( + ErrorMessage("Test error 1", 1, "This 
is a test error", Map("Col1" -> "0")), + ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "X")), + ErrorMessage("Test error 3", 3, "This is a test error", Map.empty) + )), (Some(1), "a", List( ErrorMessage("Test error 1", 1, "This is a test error", Map("Col1" -> "1")), ErrorMessage("Test error 2", 2, "This is a test error", Map("Col2" -> "a")), @@ -88,6 +95,7 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { (None, "", List( ErrorMessage("WrongLine", 0, "This line is wrong", Map.empty) )), + (Some(0), "X", List.empty), (Some(1), "a", List.empty), (Some(2), "bb", List( ErrorMessage("ValueStillTooBig", 2, "The value of the field is too big", Map("Col1" -> "2")) @@ -115,6 +123,7 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { (None, "", List( ErrorMessage("WrongLine", 0, "This line is wrong", Map.empty) )), + (Some(0), "X", List.empty), (Some(1), "a", List.empty), (Some(2), "bb", List( ErrorMessage("ValueStillTooBig", 2, "The value of the field is too big", Map("Col1" -> "2")) @@ -137,4 +146,52 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { assert(result == expected) } + + test("Various error submits combined") { + val errorMessageArray = ErrorMessageArray("MyErrCol") + + + case class NullError(errColName: String) extends ErrorMessageSubmitOnColumn( + CoV.withValue("Null Error"), + CoV.withValue(1L), + CoV.withValue("Field should not be null"), + errColName) + case class CorrelationError(lengthColName: String, textColumnName: String) extends ErrorMessageSubmitOnMoreColumns( + CoV.withValue("Columns not correlated"), + CoV.withValue(2), + CoV.withValue(s"Column '$textColumnName' doesn't have a length mentioned in column '$lengthColName'"), + Set(lengthColName, textColumnName) + ) + + val expected: List[ResultDfRecordType] = List( + (None, "", List( + ErrorMessage("Null Error", 1, "Field should not be null", Map(col1Name -> nullString)) + )), + (Some(0), "X", List( + 
ErrorMessage( + "Columns not correlated", + 2, + s"Column '$col2Name' doesn't have a length mentioned in column '$col1Name'", + Map(col1Name -> "0", col2Name -> "X")) + )), + (Some(1), "a", List.empty), + (Some(2), "bb", List( + ErrorMessage("ID is protected", 2, "The ID is too big", Map("" -> "2")) + )), + (Some(3), "ccc", List( + ErrorMessage("Ugly row", 3, "I don't like this row", Map.empty) + )) + ) + + val resultDf = errorMessageArray.putErrorsWithGrouping(srcDf)(Seq( + ErrorWhen(col(col1Name).isNull, NullError(col1Name)), + ErrorWhen(col(col1Name) =!= length(col(col2Name)), CorrelationError(col1Name, col2Name)), + ErrorWhen(col(col1Name) === 2, ErrorMessageSubmitJustErrorValue("ID is protected", 2, "The ID is too big", "2")), + ErrorWhen(length(col(col2Name)) > 2, ErrorMessageSubmitWithoutColumn("Ugly row", 3, "I don't like this row")) + )) + val result = resultDfToResult(resultDf) + + assert(result == expected) + + } } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala new file mode 100644 index 00000000..37c7f2d1 --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala @@ -0,0 +1,70 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.spark.commons.errorhandling.implementations.submits + +import org.apache.spark.sql.functions.{array, lit, map} +import org.apache.spark.sql.types.StringType +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.errorhandling.types.{AdditionalInfo, ColumnOrValueForm, ErrColsAndValues} +import za.co.absa.spark.commons.sql.functions.null_col + +class ErrorMessageSubmitJustErrorValueTest extends AnyFunSuite { + test("Apply function properly hands over data without additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val errValue = "This was wrong" + + val result = ErrorMessageSubmitJustErrorValue(errType, errCode, errMsg, errValue) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val column = map(lit(""), lit(errValue).cast(StringType)) + val expectedErrValuesCol = ColumnOrValueForm[ErrColsAndValues](column, Set.empty, None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](null_col(StringType), Set.empty, None) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedErrValuesCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } + + test("Apply function properly hands over data with additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val errValue = "This was wrong" + val additionalInfo = "{}" + + val result = ErrorMessageSubmitJustErrorValue(errType, errCode, errMsg, errValue, Some(additionalInfo)) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, 
Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val column = map(lit(""), lit(errValue).cast(StringType)) + val expectedErrValuesCol = ColumnOrValueForm[ErrColsAndValues](column, Set.empty, None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](lit(additionalInfo), Set.empty, Some(Some(additionalInfo))) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedErrValuesCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } +} diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala new file mode 100644 index 00000000..432986ce --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala @@ -0,0 +1,68 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations.submits + +import org.apache.spark.sql.functions.{array, col, lit, map_from_arrays} +import org.apache.spark.sql.types.StringType +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.errorhandling.types.{AdditionalInfo, ColumnOrValueForm, ErrColsAndValues} +import za.co.absa.spark.commons.sql.functions.null_col + +class ErrorMessageSubmitOnColumnTest extends AnyFunSuite { + test("Apply function properly hands over data without additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val colName = "foo" + + val result = ErrorMessageSubmitOnColumn(errType, errCode, errMsg, colName) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val expectedCol = ColumnOrValueForm[ErrColsAndValues](map_from_arrays(array(lit(colName)), array(col(colName).cast(StringType))), Set(colName), None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](null_col(StringType), Set.empty, None) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } + + test("Apply function properly hands over data with additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val colName = "foo" + val additionalInfo = "{}" + + val result = ErrorMessageSubmitOnColumn(errType, errCode, errMsg, colName, Some(additionalInfo)) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), 
Set.empty, Some(errMsg)) + val expectedCol = ColumnOrValueForm[ErrColsAndValues](map_from_arrays(array(lit(colName)), array(col(colName).cast(StringType))), Set(colName), None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](lit(additionalInfo), Set.empty, Some(Some(additionalInfo))) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } +} diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala new file mode 100644 index 00000000..656a78d2 --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala @@ -0,0 +1,78 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations.submits + +import org.apache.spark.sql.functions.{array, col, lit, map_from_arrays} +import org.apache.spark.sql.types.StringType +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.errorhandling.types.{AdditionalInfo, ColumnOrValueForm, ErrColsAndValues} +import za.co.absa.spark.commons.sql.functions.null_col + +class ErrorMessageSubmitOnMoreColumnsTest extends AnyFunSuite { + test("Apply function properly hands over data without additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val colName1 = "foo" + val colName2 = "bar" + + val result = ErrorMessageSubmitOnMoreColumns(errType, errCode, errMsg, Set(colName1, colName2)) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val column = map_from_arrays( + array(lit(colName1), lit(colName2)), + array(col(colName1).cast(StringType), col(colName2).cast(StringType)) + ) + val expectedCol = ColumnOrValueForm[ErrColsAndValues](column, Set(colName1, colName2), None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](null_col(StringType), Set.empty, None) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } + + test("Apply function properly hands over data with additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val colName1 = "foo" + val colName2 = "bar" + val additionalInfo = "{}" + + val result = ErrorMessageSubmitOnMoreColumns(errType, errCode, errMsg, Set(colName1, colName2), Some(additionalInfo)) + + val expectedErrType = 
ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val column = map_from_arrays( + array(lit(colName1), lit(colName2)), + array(col(colName1).cast(StringType), col(colName2).cast(StringType)) + ) + val expectedCol = ColumnOrValueForm[ErrColsAndValues](column, Set(colName1, colName2), None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](lit(additionalInfo), Set.empty, Some(Some(additionalInfo))) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } +} diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala new file mode 100644 index 00000000..9ea0a8fe --- /dev/null +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala @@ -0,0 +1,68 @@ +/* + * Copyright 2023 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.spark.commons.errorhandling.implementations.submits + +import org.apache.spark.sql.functions.{lit, map, typedLit} +import org.apache.spark.sql.types.StringType +import org.scalatest.funsuite.AnyFunSuite +import za.co.absa.spark.commons.errorhandling.types.{AdditionalInfo, ColumnOrValueForm, ErrColsAndValues} +import za.co.absa.spark.commons.sql.functions.null_col + +class ErrorMessageSubmitWithoutColumnTest extends AnyFunSuite { + test("Apply function properly hands over data without additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + + val result = ErrorMessageSubmitWithoutColumn(errType, errCode, errMsg) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val expectedErrValuesCol = ColumnOrValueForm[ErrColsAndValues](ErrorMessageSubmitWithoutColumn.emptyErrColsAndValues, Set.empty, None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](null_col(StringType), Set.empty, None) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + result.errColsAndValues.column.expr + expectedErrValuesCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } + + test("Apply function properly hands over data with additional info") { + val errType = "Test error" + val errCode = 201L + val errMsg = "This is a test error" + val additionalInfo = "{}" + val columnValue: ErrColsAndValues = Map.empty + + val result = ErrorMessageSubmitWithoutColumn(errType, errCode, errMsg, Some(additionalInfo)) + + val expectedErrType = ColumnOrValueForm(lit(errType), Set.empty, Some(errType)) + val expectedErrCode = ColumnOrValueForm(lit(errCode), Set.empty, Some(errCode)) + val expectedErrMsg = 
ColumnOrValueForm(lit(errMsg), Set.empty, Some(errMsg)) + val expectedErrValuesCol = ColumnOrValueForm[ErrColsAndValues](ErrorMessageSubmitWithoutColumn.emptyErrColsAndValues, Set.empty, None) + val expectedAdditionalInfo = ColumnOrValueForm[AdditionalInfo](lit(additionalInfo), Set.empty, Some(Some(additionalInfo))) + + expectedErrType assertTo result.errType + expectedErrCode assertTo result.errCode + expectedErrMsg assertTo result.errMsg + expectedErrValuesCol assertTo result.errColsAndValues + expectedAdditionalInfo assertTo result.additionInfo + } +} From 96c230831754cfcb82299a59f4a242031d8e89dd Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Mon, 1 May 2023 00:26:39 +0200 Subject: [PATCH 14/23] * Added Jacoco exclusion for adapters --- build.sbt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/build.sbt b/build.sbt index a903f879..6311507c 100644 --- a/build.sbt +++ b/build.sbt @@ -42,6 +42,8 @@ lazy val commonJacocoReportSettings: JacocoReportSettings = JacocoReportSettings ) lazy val commonJacocoExcludes: Seq[String] = Seq( + "za.co.absa.spark.commons.adapters.CallUdfAdapter", + "za.co.absa.spark.commons.adapters.TransformAdapter" // "za.co.absa.spark.commons.utils.JsonUtils*", // class and related objects // "za.co.absa.spark.commons.utils.ExplodeTools" // class only ) From 30f50c35c95dbda69ada616ed2f5b9daba160b36 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Wed, 3 May 2023 01:00:22 +0200 Subject: [PATCH 15/23] #101: ErrorHandling documentation and fields renames * merged `ErrorHandlingCommon` into `ErrorHandling` * renamed errCol / errCols => errSrcColName * renamed putErrorToColumn => createErrorAsColumn * renamed aggregateErrorColumns => applyErrorColumnsToDataFrame * renamed `EvaluateIntoErrorMessage` to `TransformIntoErrorMessage` * renamed `EvaluateViaUdf` to `TransformViaUdf` * some smaller parameters renamed * numerous ScalaDoc entries --- .../spark/commons/OncePerSparkSession.scala | 2 +- .../commons/errorhandling/ErrorHandling.scala 
| 119 ++++++++++++++++-- .../errorhandling/ErrorMessageSubmit.scala | 9 +- .../implementations/ErrorMessageArray.scala | 15 +-- .../ErrorMessageSubmitJustErrorValue.scala | 27 +++- .../submits/ErrorMessageSubmitOnColumn.scala | 34 ++++- .../ErrorMessageSubmitOnMoreColumns.scala | 34 ++++- .../ErrorMessageSubmitWithoutColumn.scala | 24 +++- .../partials/ErrorHandlingCommon.scala | 56 --------- ....scala => TransformIntoErrorMessage.scala} | 20 +-- ...uateViaUdf.scala => TransformViaUdf.scala} | 18 +-- .../errorhandling/types/ColumnOrValue.scala | 84 ++++++++++++- .../errorhandling/types/ErrorWhen.scala | 8 ++ .../co/absa/spark/commons/sql/functions.scala | 2 +- .../spark/commons/utils/ExplodeTools.scala | 3 +- .../ErrorMessageArrayTest.scala | 10 +- ...ErrorMessageSubmitJustErrorValueTest.scala | 8 +- .../ErrorMessageSubmitOnColumnTest.scala | 8 +- .../ErrorMessageSubmitOnMoreColumnsTest.scala | 8 +- .../ErrorMessageSubmitWithoutColumnTest.scala | 8 +- 20 files changed, 368 insertions(+), 129 deletions(-) delete mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala rename spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/{EvaluateIntoErrorMessage.scala => TransformIntoErrorMessage.scala} (61%) rename spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/{EvaluateViaUdf.scala => TransformViaUdf.scala} (69%) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/OncePerSparkSession.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/OncePerSparkSession.scala index dcec715d..dc1f9e4d 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/OncePerSparkSession.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/OncePerSparkSession.scala @@ -25,7 +25,7 @@ import java.util.concurrent.ConcurrentHashMap * instantiated classes thus not running the method [[register]] again on them. 
* * Usage: extend this abstract class and implement the method [[register]]. On initialization the - * [[register]] method gets called by the [[za.co.absa.spark.commons.OncePerSparkSession$.registerMe]] method if the class + spark session + * [[register]] method gets called by the [[za.co.absa.spark.commons.OncePerSparkSession$.registerMe OncePerSparkSession.registerMe()]] method if the class + spark session * combination is unique. If it is not unique [[register]] will not get called again. * This way we ensure only single registration per spark session. * diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index 9a485407..e04c598d 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -16,25 +16,130 @@ package za.co.absa.spark.commons.errorhandling -import org.apache.spark.sql.{Column, DataFrame, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{CaseWhen, Expression} +import org.apache.spark.sql.functions.when +import org.apache.spark.sql.{Column, DataFrame} import za.co.absa.spark.commons.errorhandling.implementations.submits.{ErrorMessageSubmitOnColumn, ErrorMessageSubmitWithoutColumn} import za.co.absa.spark.commons.errorhandling.types._ +/** + * The basic class of error handling component. Every library that wants to use the component during Spark data + * processing should utilize this trait and its methods. The methods serve to record the errors and attach them to the + * `DataFrame`. The trait should be an input parameter for such library, perhaps as an implicit. + * On the other side the end application provides concrete `ErrorHandling` implementation, that does the actual error + * handling by the application desire. 
+ * For ease of use and as examples, a few general implementations are provided in the implementations sub-folder. + */ trait ErrorHandling { - def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn + /** + * First of the few methods that need to be coded in the trait implementation + * The purpose of this method is to convert the error specification into a `Column` expression + * @param errorMessageSubmit - the error specification + * @return - the error specification transformed into a column expression + * @group Error Handling + * @since 0.6.0 + */ + protected def transformErrorSubmitToColumn(errorMessageSubmit: ErrorMessageSubmit): Column - def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame + /** + * Applies the provided columns to the incoming DataFrame. Usually they might be aggregated in some way and attached + * to the DataFrame, but any other operations are imaginable. Unless really bent, the incoming columns are those + * produced by [[transformErrorSubmitToColumn]]. + * The idea here is that the error column contains information of the error that occurred on the row or is empty (NULL) + * otherwise. + * In each implementation calling the function to each column separately or in any grouping of columns should produce + * the same result (with the exception of order of errors in the aggregation). + * @param dataFrame - the data frame to apply the error columns to + * @param errCols - the list of error columns to apply + * @return - data frame with the error columns applied (aggregated and attached or done otherwise) + */ + protected def doApplyErrorColumnsToDataFrame(dataFrame: DataFrame, errCols: Column*): DataFrame + /** + * The idea of this function is: "Put the error specified to the provided dataframe if the condition is true on the row."
+ * The error is transformed to a column using the [[transformErrorSubmitToColumn]] method and applied to the data frame + * if the "when" condition is true using the [[doApplyErrorColumnsToDataFrame]] method. + * @param dataFrame - the data frame to operate on + * @param when - the condition that defines the error occurred on the row + * @param errorMessageSubmit - the detected error specification + * @return - the original data frame with the error detection applied + * @group Error Handling + * @since 0.6.0 + */ def putError(dataFrame: DataFrame)(when: Column)(errorMessageSubmit: ErrorMessageSubmit): DataFrame = { putErrorsWithGrouping(dataFrame)(Seq(ErrorWhen(when, errorMessageSubmit))) } - def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame - def putErrorToColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errCol: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { - val toSubmit = errCol + /** + * Same as [[putError]], but allows a series of pairs condition-error to be specified at once. + * It should be noted, that once an error has been identified for a field on the row, no more conditions bound to that + * field are evaluated. 
+ * @param dataFrame - the data frame to operate on + * @param errorsWhen - the list of condition-error pairs, the conditions are grouped by the field of the error submissions + * @return - the original data frame with the error detection applied + * @group Error Handling + * @since 0.6.0 + */ + def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { + def errorWhenToCol(errorWhen: ErrorWhen): Column = { + when(errorWhen.when, transformErrorSubmitToColumn(errorWhen.errorMessageSubmit)) + } + def errorWhenSeqToCol(errorsWhen: Seq[ErrorWhen]): Column = { + val branches: Seq[(Expression, Expression)] = errorsWhen.map(errorWhen => (errorWhen.when.expr, transformErrorSubmitToColumn(errorWhen.errorMessageSubmit).expr)) + new Column(CaseWhen(branches)) + } + + val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) + val noColNames = Set.empty[String] + val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names + val errorColumns2 = (errorsByColumn - noColNames).values.map(errorWhenSeqToCol).toSeq + doApplyErrorColumnsToDataFrame(dataFrame, errorColumns1 ++ errorColumns2: _*) + } + + /** + * Transforms error information into a column expression. For cases when simple column expression condition used in + * [[putError]] is not suitable for whatever reason. + * The returned [[types.ErrorColumn]] should then be used in [[applyErrorColumnsToDataFrame]].
+ * @param errorMessageSubmit - the error specification + * @return - [[types.ErrorColumn]] expression containing the error specification + * @group Error Handling + * @since 0.6.0 + */ + def createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn = { + ErrorColumn(transformErrorSubmitToColumn(errorMessageSubmit)) + } + + //TODO Fix ScalaDoc cross-module links #48 - createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit) + /** + * Same as above createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit), only providing the error specification + * in decomposed state, not in the [[ErrorMessageSubmit]] trait form + * @param errType - word description of the type of the error + * @param errCode - number designation of the type of the error + * @param errMessage - human friendly description of the error + * @param errSourceColName - the name of the column the error happened at + * @param additionalInfo - any optional additional info in JSON format + * @return - [[types.ErrorColumn]] expression containing the error specification + * @group Error Handling + * @since 0.6.0 + */ + def createErrorAsColumn(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errSourceColName: Option[ErrSourceColName], additionalInfo: AdditionalInfo = None): ErrorColumn = { + val toSubmit = errSourceColName .map(errSourceColName => ErrorMessageSubmitOnColumn(errType, errCode, errMessage, errSourceColName, additionalInfo)) .getOrElse(ErrorMessageSubmitWithoutColumn(errType, errCode, errMessage, additionalInfo)) - putErrorToColumn(toSubmit) + createErrorAsColumn(toSubmit) + } + + /** + * Applies the earlier collected [[types.ErrorColumn ErrorColumns]] to the provided DataFrame.
+ * See [[doApplyErrorColumnsToDataFrame]] for detailed functional explanation + * @param dataFrame - the data frame to operate on + * @param errCols - a list of [[types.ErrorColumn]] returned by previous calls of `createErrorAsColumn` + * @return - the original data frame with the error detection applied + * @group Error Handling + * @since 0.6.0 + */ + def applyErrorColumnsToDataFrame(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame = { + doApplyErrorColumnsToDataFrame(dataFrame, errCols.map(_.column): _*) } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala index 6f20e772..77f4ae42 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorMessageSubmit.scala @@ -18,11 +18,16 @@ package za.co.absa.spark.commons.errorhandling import za.co.absa.spark.commons.errorhandling.types._ +/** + * Trait collecting error definition in a format usable during Spark data processing + * @group Error Handling + * @since 0.6.0 + */ trait ErrorMessageSubmit { def errType: ColumnOrValue[ErrType] def errCode: ColumnOrValue[ErrCode] - def errMsg: ColumnOrValue[ErrMsg] + def errMessage: ColumnOrValue[ErrMsg] def errColsAndValues: ColumnOrValue[ErrColsAndValues] - def additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + def additionalInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index caef20c3..56dbd9fb 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -19,14 +19,15 @@ package za.co.absa.spark.commons.errorhandling.implementations import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.functions.{array, array_except, array_union, col, map_from_arrays, map_keys, map_values, struct, when} import za.co.absa.spark.commons.adapters.TransformAdapter -import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ -import za.co.absa.spark.commons.errorhandling.partials.{ErrorHandlingCommon, EvaluateIntoErrorMessage} +import za.co.absa.spark.commons.errorhandling.ErrorHandling +import za.co.absa.spark.commons.errorhandling.partials.TransformIntoErrorMessage.FieldNames._ +import za.co.absa.spark.commons.errorhandling.partials.TransformIntoErrorMessage import za.co.absa.spark.commons.sql.functions.null_col import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.defaultErrorColumnName) - extends ErrorHandlingCommon - with EvaluateIntoErrorMessage + extends ErrorHandling + with TransformIntoErrorMessage with TransformAdapter { private def decomposeMap(errorMessageColumn: Column): Column = { @@ -35,7 +36,7 @@ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.default errorMessageColumn.getField(errType) as errType, errorMessageColumn.getField(errCode) as errCode, errorMessageColumn.getField(errMsg) as errMsg, - map_keys(errorMessageColumn.getField(errColsAndValues)) as errCols, + map_keys(errorMessageColumn.getField(errColsAndValues)) as errSourceCols, map_values(errorMessageColumn.getField(errColsAndValues)) as errValues, errorMessageColumn.getField(additionInfo) as additionInfo ) @@ -47,7 +48,7 @@ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.default errorMessageColumn.getField(errType) as errType, 
errorMessageColumn.getField(errCode) as errCode, errorMessageColumn.getField(errMsg) as errMsg, - map_from_arrays(errorMessageColumn.getField(errCols), errorMessageColumn.getField(errValues)) as errColsAndValues, + map_from_arrays(errorMessageColumn.getField(errSourceCols), errorMessageColumn.getField(errValues)) as errColsAndValues, errorMessageColumn.getField(additionInfo) as additionInfo ) } @@ -59,7 +60,7 @@ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.default dataFrame.withColumn(errorColName, reMap(array_union(deMap(col(errorColName)), colToUnion))) } - protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { + protected def doApplyErrorColumnsToDataFrame(dataFrame: DataFrame, errCols: Column*): DataFrame = { val aggregated = array(errCols.map(decomposeMap): _*) //need to decompose the map field, as it's not supported in array functions val aggregatedWithoutNulls = array_except(aggregated, array(null_col)) val joinToExisting: (DataFrame, String) => DataFrame = appendToErrCol(_, _, aggregatedWithoutNulls) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala index b0aeb6b6..dc252a7b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValue.scala @@ -22,12 +22,24 @@ import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitJustErrorValue.noColumnKey import za.co.absa.spark.commons.errorhandling.types._ + +/** + * [[za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit 
ErrorMessageSubmit]] subclass to represent an error not + * bound to a particular column but still having a value that caused the error. + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errValue - the value that caused the error + * @param additionalInfo - optional additional info in form of JSON + * @group Error Handling + * @since 0.6.0 + */ class ErrorMessageSubmitJustErrorValue( val errType: ColumnOrValue[ErrType], val errCode: ColumnOrValue[ErrCode], - val errMsg: ColumnOrValue[ErrMsg], + val errMessage: ColumnOrValue[ErrMsg], errValue: ColumnOrValue[ErrValue], - override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + override val additionalInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(map(lit(noColumnKey), errValue.column.cast(StringType))) } @@ -35,6 +47,17 @@ class ErrorMessageSubmitJustErrorValue( object ErrorMessageSubmitJustErrorValue { val noColumnKey: ErrSourceColName = "" + /** + * Convenient apply function + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errValue - the value that caused the error + * @param additionalInfo - optional additional info in form of JSON + * @return - instance of [[ErrorMessageSubmitJustErrorValue]] + * @group Error Handling + * @since 0.6.0 + */ def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errValue: String, additionalInfo: AdditionalInfo = None): ErrorMessageSubmitJustErrorValue = { new ErrorMessageSubmitJustErrorValue( ColumnOrValue.withValue(errType), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala index 4d7ebe6e..5e73c4d7 
100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumn.scala @@ -18,23 +18,45 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits import za.co.absa.spark.commons.errorhandling.types._ +/** + * [[za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit ErrorMessageSubmit]] subclass to represent an error bound to exactly one + * column. + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errSourceColName - the name of the column the error was detected on + * @param additionalInfo - optional additional info in form of JSON + * @group Error Handling + * @since 0.6.0 + */ class ErrorMessageSubmitOnColumn ( errType: ColumnOrValue[ErrType], errCode: ColumnOrValue[ErrCode], - errMsg: ColumnOrValue[ErrMsg], - errColName: ErrSourceColName, - override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty - ) extends ErrorMessageSubmitOnMoreColumns(errType, errCode, errMsg, Set(errColName), additionInfo) { + errMessage: ColumnOrValue[ErrMsg], + errSourceColName: ErrSourceColName, + override val additionalInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + ) extends ErrorMessageSubmitOnMoreColumns(errType, errCode, errMessage, Set(errSourceColName), additionalInfo) { } object ErrorMessageSubmitOnColumn { - def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { + /** + * Convenient apply function + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errSourceColName - the name of the column the error was detected on + * @param additionalInfo - optional additional info in form of JSON + * @return - 
instance of [[ErrorMessageSubmitOnColumn]] + * @group Error Handling + * @since 0.6.0 + */ + def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, errSourceColName: ErrSourceColName, additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnColumn = { new ErrorMessageSubmitOnColumn( ColumnOrValue.withValue(errType), ColumnOrValue.withValue(errCode), ColumnOrValue.withValue(errMessage), - errColName, + errSourceColName, ColumnOrValue.withOption(additionalInfo) ) } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala index d3ac19d6..3fcaf563 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala @@ -19,27 +19,49 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.types._ +/** + * [[za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit ErrorMessageSubmit]] subclass to represent an error bound + * to multiple columns. 
+ * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errSourceColNames - the name of the columns the error was detected on + * @param additionalInfo - optional additional info in form of JSON + * @group Error Handling + * @since 0.6.0 + */ class ErrorMessageSubmitOnMoreColumns( val errType: ColumnOrValue[ErrType], val errCode: ColumnOrValue[ErrCode], - val errMsg: ColumnOrValue[ErrMsg], - errColNames: Set[ErrSourceColName], - override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + val errMessage: ColumnOrValue[ErrMsg], + errSourceColNames: Set[ErrSourceColName], + override val additionalInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { - val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue.asMapOfStringColumns(errColNames) + val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue.asMapOfStringColumns(errSourceColNames) } object ErrorMessageSubmitOnMoreColumns { + /** + * Convenient apply function + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param errSourceColNames - the name of the columns the error was detected on + * @param additionalInfo - optional additional info in form of JSON + * @return - instance of [[ErrorMessageSubmitOnMoreColumns]] + * @group Error Handling + * @since 0.6.0 + */ def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, - errColNames: Set[ErrSourceColName], + errSourceColNames: Set[ErrSourceColName], additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnMoreColumns = { new ErrorMessageSubmitOnMoreColumns( ColumnOrValue.withValue(errType), ColumnOrValue.withValue(errCode), ColumnOrValue.withValue(errMessage), - errColNames, + errSourceColNames, ColumnOrValue.withOption(additionalInfo) ) } diff --git 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala index 6a732710..9aeb3462 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala @@ -22,11 +22,21 @@ import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit import za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues import za.co.absa.spark.commons.errorhandling.types._ +/** + * [[za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit ErrorMessageSubmit]] subclass to represent an error not + * bound to any particular column. + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * @param additionalInfo - optional additional info in form of JSON + * @group Error Handling + * @since 0.6.0 + */ class ErrorMessageSubmitWithoutColumn( val errType: ColumnOrValue[ErrType], val errCode: ColumnOrValue[ErrCode], - val errMsg: ColumnOrValue[ErrMsg], - override val additionInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty + val errMessage: ColumnOrValue[ErrMsg], + override val additionalInfo: ColumnOrValue[AdditionalInfo] = ColumnOrValue.asEmpty ) extends ErrorMessageSubmit { val errColsAndValues: ColumnOrValue[ErrColsAndValues] = ColumnOrValue(ErrorMessageSubmitWithoutColumn.emptyErrColsAndValues) @@ -37,6 +47,16 @@ object ErrorMessageSubmitWithoutColumn { val emptyErrColsAndValues: Column = typedLit(emptyErrorColsAndValues) + /** + * Convenient apply function + * @param errType - error type + * @param errCode - error code + * @param errMessage - error message + * 
@param additionalInfo - optional additional info in form of JSON + * @return - instance of [[ErrorMessageSubmitWithoutColumn]] + * @group Error Handling + * @since 0.6.0 + */ def apply(errType: ErrType, errCode: ErrCode, errMessage: ErrMsg, additionalInfo: AdditionalInfo = None): ErrorMessageSubmitWithoutColumn = { new ErrorMessageSubmitWithoutColumn( ColumnOrValue.withValue(errType), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala deleted file mode 100644 index 4768df61..00000000 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2021 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package za.co.absa.spark.commons.errorhandling.partials - -import org.apache.spark.sql.catalyst.expressions.{CaseWhen, Expression} -import org.apache.spark.sql.{Column, DataFrame} -import za.co.absa.spark.commons.errorhandling.{ErrorHandling, ErrorMessageSubmit} -import za.co.absa.spark.commons.errorhandling.types._ -import org.apache.spark.sql.functions.when - -trait ErrorHandlingCommon extends ErrorHandling { - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column - - protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame - - def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn = { - ErrorColumn(evaluate(errorMessageSubmit)) - } - - def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame = { - doTheColumnsAggregation(dataFrame, errCols.map(_.column): _*) - } - - def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { - val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) - val noColNames = Set.empty[String] - val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names - val errorColumns2 = (errorsByColumn - noColNames).values.map(errorWhenSeqToCol).toSeq - doTheColumnsAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) - } - - - private def errorWhenToCol(errorWhen: ErrorWhen): Column = { - when(errorWhen.when, evaluate(errorWhen.errorMessageSubmit)) - } - - private def errorWhenSeqToCol(errorsWhen: Seq[ErrorWhen]): Column = { - val branches: Seq[(Expression, Expression)] = errorsWhen.map(errorWhen => (errorWhen.when.expr, evaluate(errorWhen.errorMessageSubmit).expr)) - new Column(CaseWhen(branches)) - } - -} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformIntoErrorMessage.scala similarity index 61% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformIntoErrorMessage.scala index 834d010e..a428d88c 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformIntoErrorMessage.scala @@ -19,28 +19,34 @@ package za.co.absa.spark.commons.errorhandling.partials import org.apache.spark.sql.Column import org.apache.spark.sql.functions.struct import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ +import za.co.absa.spark.commons.errorhandling.partials.TransformIntoErrorMessage.FieldNames._ -trait EvaluateIntoErrorMessage { - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { +/** + * Trait offers a presumably very common implementation of [[za.co.absa.spark.commons.errorhandling.ErrorHandling.transformErrorSubmitToColumn ErrorHandling.transformErrorSubmitToColumn()]], + * where the error is transformed into the struct of [[za.co.absa.spark.commons.errorhandling.ErrorMessage ErrorMessage]]. 
+ * @group Error Handling + * @since 0.6.0 + */ +trait TransformIntoErrorMessage { + protected def transformErrorSubmitToColumn(errorMessageSubmit: ErrorMessageSubmit): Column = { struct( errorMessageSubmit.errType.column as errType, errorMessageSubmit.errCode.column as errCode, - errorMessageSubmit.errMsg.column as errMsg, + errorMessageSubmit.errMessage.column as errMsg, errorMessageSubmit.errColsAndValues.column as errColsAndValues, - errorMessageSubmit.additionInfo.column as additionInfo + errorMessageSubmit.additionalInfo.column as additionInfo ) } } -object EvaluateIntoErrorMessage { +object TransformIntoErrorMessage { object FieldNames { val errType = "errType" val errCode = "errCode" val errMsg = "errMsg" val errColsAndValues = "errColsAndValues" val additionInfo = "additionInfo" - val errCols = "errCols" + val errSourceCols = "errSourceCols" val errValues = "errValues" } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformViaUdf.scala similarity index 69% rename from spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala rename to spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformViaUdf.scala index 65c0af8a..f84b1c12 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/TransformViaUdf.scala @@ -20,23 +20,23 @@ import org.apache.spark.sql.{Column, SparkSession} import za.co.absa.spark.commons.OncePerSparkSession import za.co.absa.spark.commons.adapters.CallUdfAdapter import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.partials.EvaluateViaUdf.ErrorMessageFunction +import 
za.co.absa.spark.commons.errorhandling.partials.TransformViaUdf.ErrorMessageFunction import za.co.absa.spark.commons.errorhandling.types._ -trait EvaluateViaUdf[T] extends OncePerSparkSession with CallUdfAdapter { - def evaluationUdfName: String - protected def evaluationUdf: ErrorMessageFunction[T] +trait TransformViaUdf[T] extends OncePerSparkSession with CallUdfAdapter { + def transformationUdfName: String + protected def transformationUdf: ErrorMessageFunction[T] - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { - call_udf(evaluationUdfName, + protected def transformErrorSubmitToColumn(errorMessageSubmit: ErrorMessageSubmit): Column = { + call_udf(transformationUdfName, errorMessageSubmit.errType.column, errorMessageSubmit.errCode.column, - errorMessageSubmit.errMsg.column, + errorMessageSubmit.errMessage.column, errorMessageSubmit.errColsAndValues.column, - errorMessageSubmit.additionInfo.column) + errorMessageSubmit.additionalInfo.column) } } -object EvaluateViaUdf { +object TransformViaUdf { type ErrorMessageFunction[T] = (ErrType, ErrCode, ErrMsg, ErrColsAndValues, AdditionalInfo) => T //TODO needed? } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala index aea10f5b..1a16ef46 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValue.scala @@ -23,34 +23,116 @@ import za.co.absa.spark.commons.sql.functions.null_col import scala.language.higherKinds +/** + * Class to unify a representation of a [[za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit ErrorMessageSubmit]] segments. + * It can be build from `column`,column name or a set of column names, a constant value and others. 
+ * The class then provides the ability to express each option as a Spark column used in other [[za.co.absa.spark.commons.errorhandling.ErrorHandling ErrorHandling]] + * classes and methods. + * @tparam T - The type of the value and the Scala equivalent of the column DataType + * @group Error Handling + * @since 0.6.0 + */ trait ColumnOrValue[T] { + /** + * @return `column` expression representing the input + * @group Error Handling + * @since 0.6.0 + */ def column: Column + + /** + * @return the name or names if columns are directly referenced. + * @group Error Handling + * @since 0.6.0 + */ def columnNames: Set[String] + + /** + * @return the constant value if entity was built from one, otherwise `None` + * @group Error Handling + * @since 0.6.0 + */ def getValue: Option[T] } object ColumnOrValue { + /** + * Just a shorthand alias of [[ColumnOrValue]], for less typing + * @tparam T - The type of the value and the Scala equivalent of the column DataType + * @group Error Handling + * @since 0.6.0 + */ type CoV[T] = ColumnOrValue[T] //just a shorthand val CoV: ColumnOrValue.type = ColumnOrValue + /** + * Referencing exactly one column, by its name + * @param columnName - the column name + * @tparam T - The Scala type equivalent to the column `DataType` + * @group Error Handling + * @since 0.6.0 + */ def apply[T](columnName: String): ColumnOrValue[T] = CoVNamedColumn(columnName) + + /** + * Referencing a column by its expression + * @param column - the column expression + * @tparam T - The Scala type equivalent to the column `DataType` + * @group Error Handling + * @since 0.6.0 + */ def apply[T](column: Column): ColumnOrValue[T] = CoVDefinedColumn(column) + + /** + * Referencing a column which is a map of column names and their values transformed by the transformer + * @param mapColumnNames - the column names in the map + * @param columnTransformer - function to transform the column values with + * @tparam T - The Scala type equivalent to the column `DataType` + *
@group Error Handling + * @since 0.6.0 + */ def apply[T](mapColumnNames: Set[String], columnTransformer: ColumnTransformer): ColumnOrValue[Map[String, T]] = { CoVMapColumn(mapColumnNames, columnTransformer) } + /** + * Representing an optional string value - String or NULL + * @param value - the value to represent in the constant column or NULL if None + * @group Error Handling + * @since 0.6.0 + */ def withOption(value: Option[String]): ColumnOrValue[Option[String]] = { // could be safely an apply, or done more generally value match { case None => CoVNull(StringType) case Some(x) => CoVOption(x) } } + + /** + * Referencing a constant value + * @param value - the constant value for the column to represent + * @tparam T - The Scala type equivalent to the column `DataType` + * @group Error Handling + * @since 0.6.0 + */ def withValue[T](value: T): ColumnOrValue[T] = CoVValue(value) + /** + * @return - column of NULL values as StringType + * @group Error Handling + * @since 0.6.0 + */ def asEmpty: ColumnOrValue[Option[String]] = CoVNull(StringType) + + /** + * Referencing a column which is a map of column names and their values cast to string + * @param mapColumnNames - the column names in the map + * @group Error Handling + * @since 0.6.0 + */ def asMapOfStringColumns(mapColumnNames: Set[String]): ColumnOrValue[Map[String, String]] = CoVMapColumn(mapColumnNames, columnNameToItsStringValue) - def columnNameToItsStringValue(colName: String): Column = col(colName).cast(StringType) + private def columnNameToItsStringValue(colName: String): Column = col(colName).cast(StringType) private final case class CoVNamedColumn[T](columnName: String) extends ColumnOrValue[T] { val column: Column = col(columnName) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala index bf506691..b6a9e0c6 100644 --- 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala @@ -19,6 +19,14 @@ package za.co.absa.spark.commons.errorhandling.types import org.apache.spark.sql.Column import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit +/** + * A case class that puts together an error specification and the condition to identify it. + * The primary usage is in [[za.co.absa.spark.commons.errorhandling.ErrorHandling.putErrorsWithGrouping ErrorHandling.putErrorsWithGrouping()]] + * @param when - boolean column expression that should evaluate to true on and only on the error detection + * @param errorMessageSubmit - the error specification + * @group Error Handling + * @since 0.6.0 + */ case class ErrorWhen ( when: Column, errorMessageSubmit: ErrorMessageSubmit diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index bde25f9b..c736fdb2 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -65,7 +65,7 @@ object functions { } def null_col(dataType: DataType):Column = { - lit(None.orNull).cast(dataType) + null_col.cast(dataType) } } diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/utils/ExplodeTools.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/utils/ExplodeTools.scala index 1783d0bb..4b5ad5a3 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/utils/ExplodeTools.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/utils/ExplodeTools.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{ArrayType, StructType} import org.apache.spark.sql.{Column, DataFrame} import 
za.co.absa.spark.commons.implicits.StructTypeImplicits.StructTypeEnhancements +import za.co.absa.spark.commons.sql.functions.null_col import za.co.absa.spark.commons.utils.explode.{Explosion, ExplosionContext} import za.co.absa.spark.hats.Extensions.DataFrameExtension @@ -343,7 +344,7 @@ object ExplodeTools { private def addSuperTransientField(inputDf: DataFrame, arrayColPathName: String): (DataFrame, String) = { val colName = inputDf.schema.getClosestUniqueName(superTransientColumnName) val nestedColName = (SchemaUtils.splitPath(arrayColPathName).dropRight(1) :+ colName).mkString(".") - val df = inputDf.nestedWithColumn(nestedColName, lit(null)) + val df = inputDf.nestedWithColumn(nestedColName, null_col) (df, nestedColName) } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala index 546db2ab..89d66e57 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -77,13 +77,13 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { val errorMessageArray = ErrorMessageArray() - val e1 = errorMessageArray.putErrorToColumn("Test error 1", 1, "This is a test error", Some(col1Name)) + val e1 = errorMessageArray.createErrorAsColumn("Test error 1", 1, "This is a test error", Some(col1Name)) val errorSubmitA = ErrorMessageSubmitOnColumn("Test error 2", 2, "This is a test error", col2Name) - val e2 = errorMessageArray.putErrorToColumn(errorSubmitA) + val e2 = errorMessageArray.createErrorAsColumn(errorSubmitA) val errorSubmitB = ErrorMessageSubmitWithoutColumn("Test error 3", 3, "This is a test error") - val e3 = errorMessageArray.putErrorToColumn(errorSubmitB) + val e3 = 
errorMessageArray.createErrorAsColumn(errorSubmitB) - val resultDf = errorMessageArray.aggregateErrorColumns(srcDf)(e1, e2, e3) + val resultDf = errorMessageArray.applyErrorColumnsToDataFrame(srcDf)(e1, e2, e3) val result = resultDfToResult(resultDf) assert(result == expected) @@ -150,7 +150,6 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { test("Various error submits combined") { val errorMessageArray = ErrorMessageArray("MyErrCol") - case class NullError(errColName: String) extends ErrorMessageSubmitOnColumn( CoV.withValue("Null Error"), CoV.withValue(1L), @@ -192,6 +191,7 @@ class ErrorMessageArrayTest extends AnyFunSuite with SparkTestBase { val result = resultDfToResult(resultDf) assert(result == expected) + assert(resultDf.columns.contains("MyErrCol")) } } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala index 37c7f2d1..4bf16e80 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala @@ -40,9 +40,9 @@ class ErrorMessageSubmitJustErrorValueTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedErrValuesCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } test("Apply function properly hands over data with additional info") { @@ -63,8 +63,8 @@ class ErrorMessageSubmitJustErrorValueTest extends AnyFunSuite { expectedErrType assertTo 
result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedErrValuesCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala index 432986ce..57234470 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala @@ -39,9 +39,9 @@ class ErrorMessageSubmitOnColumnTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } test("Apply function properly hands over data with additional info") { @@ -61,8 +61,8 @@ class ErrorMessageSubmitOnColumnTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala 
b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala index 656a78d2..6ea93254 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala @@ -44,9 +44,9 @@ class ErrorMessageSubmitOnMoreColumnsTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } test("Apply function properly hands over data with additional info") { @@ -71,8 +71,8 @@ class ErrorMessageSubmitOnMoreColumnsTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala index 9ea0a8fe..af45551e 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala @@ -38,10 +38,10 @@ class ErrorMessageSubmitWithoutColumnTest extends 
AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage result.errColsAndValues.column.expr expectedErrValuesCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } test("Apply function properly hands over data with additional info") { @@ -61,8 +61,8 @@ class ErrorMessageSubmitWithoutColumnTest extends AnyFunSuite { expectedErrType assertTo result.errType expectedErrCode assertTo result.errCode - expectedErrMsg assertTo result.errMsg + expectedErrMsg assertTo result.errMessage expectedErrValuesCol assertTo result.errColsAndValues - expectedAdditionalInfo assertTo result.additionInfo + expectedAdditionalInfo assertTo result.additionalInfo } } From 38b54b4e06361ac3efcbb65bb8eb0598bee8084e Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Wed, 3 May 2023 14:29:14 +0200 Subject: [PATCH 16/23] * Fixing license year --- .github/workflows/jacoco_check.yml | 2 +- .github/workflows/release.yml | 2 +- project/JacocoSetup.scala | 2 +- .../errorhandling/implementations/ErrorMessageArrayTest.scala | 2 +- .../submits/ErrorMessageSubmitJustErrorValueTest.scala | 2 +- .../submits/ErrorMessageSubmitOnColumnTest.scala | 2 +- .../submits/ErrorMessageSubmitOnMoreColumnsTest.scala | 2 +- .../submits/ErrorMessageSubmitWithoutColumnTest.scala | 2 +- .../spark/commons/errorhandling/types/ColumnOrValueForm.scala | 2 +- .../spark/commons/errorhandling/types/ColumnOrValueTest.scala | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/jacoco_check.yml b/.github/workflows/jacoco_check.yml index fe02ccaf..8f15a54a 100644 --- a/.github/workflows/jacoco_check.yml +++ b/.github/workflows/jacoco_check.yml @@ -1,5 +1,5 @@ # -# Copyright 2023 ABSA Group Limited +# Copyright 2021 ABSA Group Limited # # Licensed under the Apache License, Version 2.0 
(the "License"); # you may not use this file except in compliance with the License. diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e15fbddb..2f418b92 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,5 +1,5 @@ # -# Copyright 2022 ABSA Group Limited +# Copyright 2021 ABSA Group Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/project/JacocoSetup.scala b/project/JacocoSetup.scala index 59c90034..8d4f77ab 100644 --- a/project/JacocoSetup.scala +++ b/project/JacocoSetup.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala index 89d66e57..4251c79c 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArrayTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala index 4bf16e80..a35338bf 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitJustErrorValueTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala index 57234470..47072491 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnColumnTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala index 6ea93254..262b23f2 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumnsTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala index af45551e..34e0d70d 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala index 1dd042f3..1d5288e1 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueForm.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala index 4658cd4a..bc7a8d18 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/types/ColumnOrValueTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2023 ABSA Group Limited + * Copyright 2021 ABSA Group Limited * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 712931bc915c34ce495832f7ac44250e0ee33f4a Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Thu, 4 May 2023 19:09:47 +0200 Subject: [PATCH 17/23] * Fix after merge with develop * typo fix * enhanced information about partials --- build.sbt | 5 -- .../commons/errorhandling/ErrorHandling.scala | 3 +- .../partials/ErrorHandlingCommon.scala | 56 ------------------- .../partials/EvaluateIntoErrorMessage.scala | 47 ---------------- .../partials/EvaluateViaUdf.scala | 42 -------------- 5 files changed, 2 insertions(+), 151 deletions(-) delete mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala delete mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala delete mode 100644 spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala diff --git a/build.sbt b/build.sbt index 9defac05..800936eb 100644 --- a/build.sbt +++ b/build.sbt @@ -36,10 +36,6 @@ lazy val commonSettings = Seq( Test / parallelExecution := false ) -lazy val commonJacocoReportSettings: JacocoReportSettings = JacocoReportSettings( - formats = Seq(JacocoReportFormats.HTML, JacocoReportFormats.XML) -) - lazy val commonJacocoExcludes: Seq[String] = Seq( "za.co.absa.spark.commons.adapters.CallUdfAdapter", "za.co.absa.spark.commons.adapters.TransformAdapter" @@ -47,7 +43,6 @@ lazy val commonJacocoExcludes: Seq[String] = Seq( // "za.co.absa.spark.commons.utils.ExplodeTools" // class only ) - lazy val parent = (project in file(".")) .aggregate(sparkCommons.projectRefs ++ sparkCommonsTest.projectRefs: _*) .settings( diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index e04c598d..8cff6d14 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -28,7 +28,8 @@ import za.co.absa.spark.commons.errorhandling.types._ * `DataFrame`. The trait should be an input parameter for such library, perhaps as an implicit. * On the other side the end application provides concrete `ErrorHandling` implementation, that does the actual error * handling by the application desire. - * For easy to use ana as examples, a few general implementations are provided in the implementations sub-folder. + * For easy to use and as examples, a few general implementations are provided in the implementations sub-folder. + * Also for common, repeated implementations the folder `partials` offer some traits. */ trait ErrorHandling { /** diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala deleted file mode 100644 index 4768df61..00000000 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/ErrorHandlingCommon.scala +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright 2021 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package za.co.absa.spark.commons.errorhandling.partials - -import org.apache.spark.sql.catalyst.expressions.{CaseWhen, Expression} -import org.apache.spark.sql.{Column, DataFrame} -import za.co.absa.spark.commons.errorhandling.{ErrorHandling, ErrorMessageSubmit} -import za.co.absa.spark.commons.errorhandling.types._ -import org.apache.spark.sql.functions.when - -trait ErrorHandlingCommon extends ErrorHandling { - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column - - protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame - - def putErrorToColumn(errorMessageSubmit: ErrorMessageSubmit): ErrorColumn = { - ErrorColumn(evaluate(errorMessageSubmit)) - } - - def aggregateErrorColumns(dataFrame: DataFrame)(errCols: ErrorColumn*): DataFrame = { - doTheColumnsAggregation(dataFrame, errCols.map(_.column): _*) - } - - def putErrorsWithGrouping(dataFrame: DataFrame)(errorsWhen: Seq[ErrorWhen]): DataFrame = { - val errorsByColumn = errorsWhen.groupBy(_.errorMessageSubmit.errColsAndValues.columnNames) - val noColNames = Set.empty[String] - val errorColumns1 = errorsByColumn.getOrElse(noColNames, Seq.empty).map(errorWhenToCol) // no grouping without ErrCol names - val errorColumns2 = (errorsByColumn - noColNames).values.map(errorWhenSeqToCol).toSeq - doTheColumnsAggregation(dataFrame, errorColumns1 ++ errorColumns2: _*) - } - - - private def errorWhenToCol(errorWhen: ErrorWhen): Column = { - when(errorWhen.when, evaluate(errorWhen.errorMessageSubmit)) - } - - private def errorWhenSeqToCol(errorsWhen: Seq[ErrorWhen]): Column = { - val branches: Seq[(Expression, Expression)] = errorsWhen.map(errorWhen => (errorWhen.when.expr, evaluate(errorWhen.errorMessageSubmit).expr)) - new Column(CaseWhen(branches)) - } - -} diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala 
b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala deleted file mode 100644 index 834d010e..00000000 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateIntoErrorMessage.scala +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Copyright 2021 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package za.co.absa.spark.commons.errorhandling.partials - -import org.apache.spark.sql.Column -import org.apache.spark.sql.functions.struct -import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.partials.EvaluateIntoErrorMessage.FieldNames._ - -trait EvaluateIntoErrorMessage { - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { - struct( - errorMessageSubmit.errType.column as errType, - errorMessageSubmit.errCode.column as errCode, - errorMessageSubmit.errMsg.column as errMsg, - errorMessageSubmit.errColsAndValues.column as errColsAndValues, - errorMessageSubmit.additionInfo.column as additionInfo - ) - } -} - -object EvaluateIntoErrorMessage { - object FieldNames { - val errType = "errType" - val errCode = "errCode" - val errMsg = "errMsg" - val errColsAndValues = "errColsAndValues" - val additionInfo = "additionInfo" - val errCols = "errCols" - val errValues = "errValues" - } - -} diff --git 
a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala deleted file mode 100644 index 65c0af8a..00000000 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/partials/EvaluateViaUdf.scala +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright 2021 ABSA Group Limited - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package za.co.absa.spark.commons.errorhandling.partials - -import org.apache.spark.sql.{Column, SparkSession} -import za.co.absa.spark.commons.OncePerSparkSession -import za.co.absa.spark.commons.adapters.CallUdfAdapter -import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.partials.EvaluateViaUdf.ErrorMessageFunction -import za.co.absa.spark.commons.errorhandling.types._ - -trait EvaluateViaUdf[T] extends OncePerSparkSession with CallUdfAdapter { - def evaluationUdfName: String - protected def evaluationUdf: ErrorMessageFunction[T] - - protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { - call_udf(evaluationUdfName, - errorMessageSubmit.errType.column, - errorMessageSubmit.errCode.column, - errorMessageSubmit.errMsg.column, - errorMessageSubmit.errColsAndValues.column, - errorMessageSubmit.additionInfo.column) - } -} - -object EvaluateViaUdf { - type ErrorMessageFunction[T] = (ErrType, ErrCode, ErrMsg, 
ErrColsAndValues, AdditionalInfo) => T //TODO needed? -} From b99d06aee6fd3679b7dba9bdbc352ced13f927d6 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Sat, 6 May 2023 00:53:48 +0200 Subject: [PATCH 18/23] * fixed and added cross-links * documentation to `ErrorMessageArray` * `null_col` function description enhanced --- project/plugins.sbt | 1 + .../commons/errorhandling/ErrorHandling.scala | 31 +++++++++---------- .../implementations/ErrorMessageArray.scala | 6 ++++ .../ErrorMessageSubmitOnMoreColumns.scala | 1 - .../ErrorMessageSubmitWithoutColumn.scala | 1 - .../errorhandling/types/ErrorWhen.scala | 2 +- .../co/absa/spark/commons/sql/functions.scala | 8 +++-- 7 files changed, 29 insertions(+), 21 deletions(-) diff --git a/project/plugins.sbt b/project/plugins.sbt index 80552c6b..875c03ca 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -37,3 +37,4 @@ addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons addSbtPlugin("org.ow2.asm" % "asm-tree" % ow2Version from ow2Url("asm-tree")) addSbtPlugin("za.co.absa.sbt" % "sbt-jacoco" % "3.4.1-absa.3" from "https://github.com/AbsaOSS/sbt-jacoco/releases/download/3.4.1-absa.3/sbt-jacoco-3.4.1-absa.3.jar") +addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2") \ No newline at end of file diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala index 8cff6d14..e124f0d7 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/ErrorHandling.scala @@ -25,8 +25,8 @@ import za.co.absa.spark.commons.errorhandling.types._ /** * The basic class of error handling component. Every library that wants to use the component during Spark data * processing should utilize this trait and its methods. 
The methods serve to record the errors and attach them to the - * `DataFrame`. The trait should be an input parameter for such library, perhaps as an implicit. - * On the other side the end application provides concrete `ErrorHandling` implementation, that does the actual error + * [[org.apache.spark.sql.DataFrame spark.DataFrame]]. The trait should be an input parameter for such library, perhaps as an implicit. + * On the other side the end application provides concrete [[ErrorHandling]] implementation, that does the actual error * handling by the application desire. * For easy to use and as examples, a few general implementations are provided in the implementations sub-folder. * Also for common, repeated implementations the folder `partials` offer some traits. @@ -34,7 +34,7 @@ import za.co.absa.spark.commons.errorhandling.types._ trait ErrorHandling { /** * First of the few methods that needs to be coded in the trait implementation - * The purpose of this method is to convert the error specification into a `Column` expression + * The purpose of this method is to convert the error specification into a [[org.apache.spark.sql.Column spark.Column]] expression * @param errorMessageSubmit - the error specification * @return - the error specification transformed into a column expression * @group Error Handling @@ -43,14 +43,14 @@ trait ErrorHandling { protected def transformErrorSubmitToColumn(errorMessageSubmit: ErrorMessageSubmit): Column /** - * Applies the provided columns to the incoming DataFrame. Usually they might be aggregated in some way and attached - * to the DataFrame, but any other operations are imaginable. Unless really bent, the incoming columns are those + * Applies the provided columns to the incoming [[org.apache.spark.sql.DataFrame spark.DataFrame]]. Usually they might be aggregated in some way and attached + * to the [[org.apache.spark.sql.DataFrame spark.DataFrame]], but any other operations are imaginable. 
Unless really bent, the incoming columns are those * produced by [[transformErrorSubmitToColumn]]. * The idea here is that the error column contains information of the error that occurred on the row or is empty (NULL) * otherwise. * In each implementation calling the function to each column separately or in any grouping of columns should produce * the same result (with the exception of order of errors in the aggregation). - * @param dataFrame - the data frame to apply the error columns to + * @param dataFrame - the [[org.apache.spark.sql.DataFrame spark.DataFrame]] to apply the error columns to * @param errCols - the list of error columns to apply * @return - data frame with the error columns applied (aggregated and attached or done otherwise) */ @@ -60,10 +60,10 @@ trait ErrorHandling { * The idea of this function is: "Put the error specified to the provided dataframe if the condition is true on the row." * The error is transformed to a column using the [[transformErrorSubmitToColumn]] method and applied to the data frame * if the "when" condition is true using the [[doApplyErrorColumnsToDataFrame]] method. - * @param dataFrame - the data frame to operate on + * @param dataFrame - the [[org.apache.spark.sql.DataFrame spark.DataFrame]] to operate on * @param when - the condition that defines the error occurred on the row * @param errorMessageSubmit - the detected error specification - * @return - the original data frame with the error detection applied + * @return - the original [[org.apache.spark.sql.DataFrame spark.DataFrame]] with the error detection applied * @group Error Handling * @since 0.6.0 */ @@ -75,7 +75,7 @@ trait ErrorHandling { * Same as [[putError]], but allows a series of pairs condition-error to be specified at once. * It should be noted, that once an error has been identified for a field on the row, no more conditions bound to that * field are evaluated. 
- * @param dataFrame - the data frame to operate on + * @param dataFrame - the [[org.apache.spark.sql.DataFrame spark.DataFrame]] to operate on * @param errorsWhen - the list of condition-error pairs, the condition are grouped by the field of the error submissions * @return - the original data frame with the error detection applied * @group Error Handling @@ -110,10 +110,9 @@ trait ErrorHandling { ErrorColumn(transformErrorSubmitToColumn(errorMessageSubmit)) } - //TODO Fix ScalaDoc cross-module links #48 - createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit) /** - * Same as as above createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit), only providing the error specification - * in decomposed state, not in the [[ErrorMessageSubmit]] trait form + * Same as the other [[ErrorHandling!.createErrorAsColumn(errorMessageSubmit:za\.co\.absa\.spark\.commons\.errorhandling\.ErrorMessageSubmit)* createErrorAsColumn(errorMessageSubmit: ErrorMessageSubmit)]], only providing the error specification + * in decomposed state, not in the [[ErrorMessageSubmit]] trait form. * @param errType - word description of the type of the error * @param errCode - number designation of the type of the error * @param errMessage - human friendly description of the error @@ -131,10 +130,10 @@ trait ErrorHandling { } /** - * Applies the earlier collected [[types.ErrorColumn ErrorColumns]] to the provided DataFrame. - * See [[doApplyErrorColumnsToDataFrame]] for detailed functional explanation - * @param dataFrame - the data frame to operate on - * @param errCols - a list of [[types.ErrorColumn]] returned by previous calls of `createErrorAsColumn` + * Applies the earlier collected [[types.ErrorColumn ErrorColumns]] to the provided [[org.apache.spark.sql.DataFrame spark.DataFrame]]. + * See [[doApplyErrorColumnsToDataFrame]] for detailed functional explanation. 
+ * @param dataFrame - the [[org.apache.spark.sql.DataFrame spark.DataFrame]] to operate on + * @param errCols - a list of [[types.ErrorColumn]] returned by previous calls of [[ErrorHandling!.createErrorAsColumn(errorMessageSubmit:za\.co\.absa\.spark\.commons\.errorhandling\.ErrorMessageSubmit)* createErrorAsColumn]] * @return - the original data frame with the error detection applied * @group Error Handling * @since 0.6.0 diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 56dbd9fb..644adbe3 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -25,6 +25,12 @@ import za.co.absa.spark.commons.errorhandling.partials.TransformIntoErrorMessage import za.co.absa.spark.commons.sql.functions.null_col import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements +/** + * An implementation of [[ErrorHandling]] the collects errors into columns of struct based on [[za.co.absa.spark.commons.errorhandling.ErrorMessage ErrorMessage]] case class. + * Upon applying the non-NULL columns are aggregated into an array column which is attached to the [[org.apache.spark.sql.DataFrame spark.DataFrame]]. + * In case the column already exists in the DataFrame, the columns are appended to the column. 
+ * @param errorColumnName - the name of the array column aggregating all the errors + */ case class ErrorMessageArray(errorColumnName: String = ErrorMessageArray.defaultErrorColumnName) extends ErrorHandling with TransformIntoErrorMessage diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala index c943f80b..3fcaf563 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitOnMoreColumns.scala @@ -56,7 +56,6 @@ object ErrorMessageSubmitOnMoreColumns { errCode: ErrCode, errMessage: ErrMsg, errSourceColNames: Set[ErrSourceColName], - additionalInfo: AdditionalInfo= None): ErrorMessageSubmitOnMoreColumns = { new ErrorMessageSubmitOnMoreColumns( ColumnOrValue.withValue(errType), diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala index 9aeb3462..a1b74793 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumn.scala @@ -19,7 +19,6 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits import org.apache.spark.sql.Column import org.apache.spark.sql.functions.typedLit import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import 
za.co.absa.spark.commons.errorhandling.implementations.submits.ErrorMessageSubmitWithoutColumn.emptyErrorColsAndValues import za.co.absa.spark.commons.errorhandling.types._ /** diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala index b6a9e0c6..64722ed6 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/types/ErrorWhen.scala @@ -20,7 +20,7 @@ import org.apache.spark.sql.Column import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit /** - * A case class the puts together an error specification and the condition to identify it. + * A case class that puts together an error specification and the condition to identify it. * The primary usage is in [[za.co.absa.spark.commons.errorhandling.ErrorHandling.putErrorsWithGrouping ErrorHandling.putErrorsWithGrouping()]] * @param when - boolean column expression that should evaluate to true on and only on the error detection * @param errorMessageSubmit - the error specification diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index c736fdb2..5cc077a4 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -57,13 +57,17 @@ object functions { /** * Provides a column with NULL value. 
- * - * @return The column of NULL values + * @return - column of NULL values */ def null_col:Column = { lit(None.orNull) } + /** + * Provides a column with NULL values, but the actual type is per specification + * @param dataType - the actual data type of the column that will contain NULLs + * @return - column of NULL values + */ def null_col(dataType: DataType):Column = { null_col.cast(dataType) } From 6e24db0363e98efd7e82d81203832ec308e4a763 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Mon, 8 May 2023 23:56:42 +0200 Subject: [PATCH 19/23] * Fixed after merge and conflicts resolution --- .../ErrorHandlingFilterRowsWithErrors.scala | 9 ++++----- .../ErrorHandlingFilterRowsWithErrorsTest.scala | 8 ++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrors.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrors.scala index 12e9c1e1..6a0b377b 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrors.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrors.scala @@ -18,20 +18,19 @@ package za.co.absa.spark.commons.errorhandling.implementations import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.functions.{coalesce, lit} -import za.co.absa.spark.commons.errorhandling.ErrorMessageSubmit -import za.co.absa.spark.commons.errorhandling.partials.ErrorHandlingCommon +import za.co.absa.spark.commons.errorhandling.{ErrorHandling, ErrorMessageSubmit} /** * Class implement the functionality of filtering rows with columns. 
*/ -object ErrorHandlingFilterRowsWithErrors extends ErrorHandlingCommon { +object ErrorHandlingFilterRowsWithErrors extends ErrorHandling { /** * Creates a column with the error description, in this particular case actually only signals with a boolean flag there was an error in the row. * @param errorMessageSubmit - the description of the error * @return - A column with boolean value indicating there was an error on the row. */ - override protected def evaluate(errorMessageSubmit: ErrorMessageSubmit): Column = { + override protected def transformErrorSubmitToColumn(errorMessageSubmit: ErrorMessageSubmit): Column = { lit(true) } @@ -41,7 +40,7 @@ object ErrorHandlingFilterRowsWithErrors extends ErrorHandlingCommon { * @param errCols - the error columns to signal if the row should be filtered or not * @return - returns the dataframe without rows with errors */ - override protected def doTheColumnsAggregation(dataFrame: DataFrame, errCols: Column*): DataFrame = { + override protected def doApplyErrorColumnsToDataFrame(dataFrame: DataFrame, errCols: Column*): DataFrame = { val columns: Seq[Column] = errCols :+ lit(false) dataFrame.filter(!coalesce(columns: _*)) } diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrorsTest.scala b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrorsTest.scala index 2ebc90c2..786f16a8 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrorsTest.scala +++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorHandlingFilterRowsWithErrorsTest.scala @@ -41,13 +41,13 @@ class ErrorHandlingFilterRowsWithErrorsTest extends AnyFunSuite with SparkTestBa test("aggregateErrorColumns should return an empty list after error aggregation") { val expectedResults: List[ResultDfRecordType] = List() - val e1 
= ErrorHandlingFilterRowsWithErrors.putErrorToColumn("Test error 1", 1, "This is a test error", Some(col1Name)) + val e1 = ErrorHandlingFilterRowsWithErrors.createErrorAsColumn("Test error 1", 1, "This is a test error", Some(col1Name)) val errorSubmitA = ErrorMessageSubmitOnColumn("Test error 2", 2, "This is a test error", col2Name) - val e2 = ErrorHandlingFilterRowsWithErrors.putErrorToColumn(errorSubmitA) + val e2 = ErrorHandlingFilterRowsWithErrors.createErrorAsColumn(errorSubmitA) val errorSubmitB = ErrorMessageSubmitWithoutColumn("Test error 3", 3, "This is a test error") - val e3 = ErrorHandlingFilterRowsWithErrors.putErrorToColumn(errorSubmitB) + val e3 = ErrorHandlingFilterRowsWithErrors.createErrorAsColumn(errorSubmitB) - val resultsDF = ErrorHandlingFilterRowsWithErrors.aggregateErrorColumns(srcDf)(e1, e2, e3) + val resultsDF = ErrorHandlingFilterRowsWithErrors.applyErrorColumnsToDataFrame(srcDf)(e1, e2, e3) val results = resultDfToResult(resultsDF) assert(results.length == expectedResults.length) From 68fc0aa5255607b4836b03234c8cd1c5d4ec5bfc Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 9 May 2023 00:02:33 +0200 Subject: [PATCH 20/23] * PR comments addressed --- project/plugins.sbt | 2 +- .../errorhandling/implementations/ErrorMessageArray.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/project/plugins.sbt b/project/plugins.sbt index 875c03ca..6568c072 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -37,4 +37,4 @@ addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons addSbtPlugin("org.ow2.asm" % "asm-tree" % ow2Version from ow2Url("asm-tree")) addSbtPlugin("za.co.absa.sbt" % "sbt-jacoco" % "3.4.1-absa.3" from "https://github.com/AbsaOSS/sbt-jacoco/releases/download/3.4.1-absa.3/sbt-jacoco-3.4.1-absa.3.jar") -addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings" % "3.0.2") \ No newline at end of file +addSbtPlugin("com.thoughtworks.sbt-api-mappings" % 
"sbt-api-mappings" % "3.0.2") diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala index 644adbe3..bd313ab8 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/errorhandling/implementations/ErrorMessageArray.scala @@ -26,7 +26,7 @@ import za.co.absa.spark.commons.sql.functions.null_col import za.co.absa.spark.commons.implicits.DataFrameImplicits.DataFrameEnhancements /** - * An implementation of [[ErrorHandling]] the collects errors into columns of struct based on [[za.co.absa.spark.commons.errorhandling.ErrorMessage ErrorMessage]] case class. + * An implementation of [[ErrorHandling]] that collects errors into columns of struct based on [[za.co.absa.spark.commons.errorhandling.ErrorMessage ErrorMessage]] case class. * Upon applying the non-NULL columns are aggregated into an array column which is attached to the [[org.apache.spark.sql.DataFrame spark.DataFrame]]. * In case the column already exists in the DataFrame, the columns are appended to the column. * @param errorColumnName - the name of the array column aggregating all the errors From 963787cb56270fafc74071c9d41270a1a79e231c Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Wed, 10 May 2023 00:11:31 +0200 Subject: [PATCH 21/23] * README.md enhanced --- README.md | 17 +++++++++++++++++ .../co/absa/spark/commons/sql/functions.scala | 4 ++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d3dda351..88d988cb 100644 --- a/README.md +++ b/README.md @@ -411,7 +411,24 @@ path even of nested fields. It also evaluates arrays and maps where the array in def col_of_path(fullColName: String): Column ``` +2. Provides a column of NULL values. 
+ + ```scala + def null_col: Column + ``` + + +3. Provides a column of NULL values, but the actual type is per specification + + ```scala + def null_col(dataType: DataType): Column + ``` +## Error Handling + +A `trait` and a set of supporting classes and other traits to enable error channeling between libraries and +application during Spark data processing. + ## Spark Commons Test ### Usage: diff --git a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala index 5cc077a4..47542036 100644 --- a/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala +++ b/spark-commons/src/main/scala/za/co/absa/spark/commons/sql/functions.scala @@ -56,7 +56,7 @@ object functions { } /** - * Provides a column with NULL value. + * Provides a column of NULL values. * @return - column of NULL values */ def null_col:Column = { @@ -64,7 +64,7 @@ } /** - * Provides a column with NULL values, but the actual type is per specification + * Provides a column of NULL values, but the actual type is per specification * @param dataType - the actual data type of the column that will contain NULLs * @return - column of NULL values */ From 656dfc5fe9ae85dc23e9f633519cc3d132c467ea Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Thu, 11 May 2023 23:01:02 +0200 Subject: [PATCH 22/23] * Removed unused code --- .../submits/ErrorMessageSubmitWithoutColumnTest.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala index 19cb1cc5..e1433cff 100644 --- a/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala
+++ b/spark-commons/src/test/scala/za/co/absa/spark/commons/errorhandling/implementations/submits/ErrorMessageSubmitWithoutColumnTest.scala @@ -16,7 +16,7 @@ package za.co.absa.spark.commons.errorhandling.implementations.submits -import org.apache.spark.sql.functions.{lit, map, typedLit} +import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types.StringType import org.scalatest.funsuite.AnyFunSuite import za.co.absa.spark.commons.errorhandling.types.{AdditionalInfo, ColumnOrValueForm, ErrColsAndValues} @@ -50,7 +50,6 @@ class ErrorMessageSubmitWithoutColumnTest extends AnyFunSuite { val errCode = 201L val errMsg = "This is a test error" val additionalInfo = "{}" - val columnValue: ErrColsAndValues = Map.empty val result = ErrorMessageSubmitWithoutColumn(errType, errCode, errMsg, Some(additionalInfo)) From 49855db6f29058f024af6f3b15745a56c1453234 Mon Sep 17 00:00:00 2001 From: David Benedeki Date: Tue, 20 Jun 2023 13:27:10 +0200 Subject: [PATCH 23/23] * changed "dead code" to documentation string --- build.sbt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sbt b/build.sbt index 800936eb..45ca617b 100644 --- a/build.sbt +++ b/build.sbt @@ -36,11 +36,13 @@ lazy val commonSettings = Seq( Test / parallelExecution := false ) +/** + * add "za.co.absa.spark.commons.utils.ExplodeTools" to filter a class + * or "za.co.absa.spark.commons.utils.JsonUtils*" to filter the class and all related objects + */ lazy val commonJacocoExcludes: Seq[String] = Seq( "za.co.absa.spark.commons.adapters.CallUdfAdapter", "za.co.absa.spark.commons.adapters.TransformAdapter" - // "za.co.absa.spark.commons.utils.JsonUtils*", // class and related objects - // "za.co.absa.spark.commons.utils.ExplodeTools" // class only ) lazy val parent = (project in file("."))