Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPARK-2407: Added internal implementation of SQL SUBSTR() #1359

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ package org.apache.spark.sql.catalyst.expressions

import java.util.regex.Pattern

import org.apache.spark.sql.catalyst.types.DataType
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BooleanType
import scala.collection.IndexedSeqOptimized


import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType}

trait StringRegexExpression {
self: BinaryExpression =>
Expand Down Expand Up @@ -205,3 +207,72 @@ case class EndsWith(left: Expression, right: Expression)
extends BinaryExpression with StringComparison {
def compare(l: String, r: String) = l.endsWith(r)
}

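/**
* A function that takes a substring of its first argument, starting at a given position and
* spanning at most a given length. Positions are one-based; zero also refers to the first
* element, and negative positions count back from the end.
* Defined for String and Binary types.
*/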
case class Substring(str: Expression, pos: Expression, len: Expression) extends Expression {

type EvaluatedType = Any

/**
 * This expression evaluates to null only when one of its inputs evaluates to null, so it is
 * nullable exactly when at least one of its children is nullable.
 */
def nullable: Boolean = str.nullable || pos.nullable || len.nullable
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nullable could be str.nullable || pos.nullable || len.nullable ?

/**
 * Result type of the substring: the input's own type when the input is binary, and
 * StringType for every other input type.
 *
 * Throws UnresolvedException while this expression is not yet resolved.
 */
def dataType: DataType = {
  if (resolved) {
    val inputType = str.dataType
    if (inputType == BinaryType) inputType else StringType
  } else {
    throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved")
  }
}

/** The set of everything referenced by any of the child expressions. */
def references = children.flatMap(child => child.references).toSet

/** Child expressions in argument order: input value, start position, length. */
override def children = List(str, pos, len)

/**
 * Slices a sequence-like value using SUBSTR-style start and length arguments.
 *
 * @param str the value to slice; it must be viewable as an IndexedSeqOptimized through `ev`
 * @param startPos start position: positive values are one-based, negative values count back
 *                 from the end of the sequence, and zero refers to the first element
 * @param sliceLen number of elements to take; Integer.MAX_VALUE is used directly as the end
 *                 index instead of being added to the start index
 * @return the result of `str.slice(start, end)` for the computed zero-based bounds
 */
@inline
def slice[T, C <: Any](str: C, startPos: Int, sliceLen: Int)
(implicit ev: (C=>IndexedSeqOptimized[T,_])): Any = {
val len = str.length
// Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
// negative indices for start positions. If a start index i is greater than 0, it
// refers to element i-1 in the sequence. If a start index i is less than 0, it refers
// to the -ith element before the end of the sequence. If a start index i is 0, it
// refers to the first element.

val start = startPos match {
case pos if pos > 0 => pos - 1
case neg if neg < 0 => len + neg
case _ => 0
}

val end = sliceLen match {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The MySQL doc mentions that "If len is less than 1, the result is the empty string."

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the behavior of IndexedSeqOptimized[A,B].slice (and thus this patch) as well as of Hive, too.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, missed this before, sorry.

// Integer.MAX_VALUE is kept as the end index unchanged rather than being offset by start.
case max if max == Integer.MAX_VALUE => max
// NOTE(review): start + x is plain Int addition; a very large x other than
// Integer.MAX_VALUE could wrap around to a negative end index — confirm whether callers
// can pass such lengths.
case x => start + x
}

str.slice(start, end)
}

/**
 * Evaluates all three children against `input` and returns the requested slice.
 *
 * Returns null when the input value, the position, or the length evaluates to null.
 * Byte arrays are sliced as arrays; any other value is sliced through its string form.
 * Position and length are cast to Int.
 */
override def eval(input: Row): Any = {
  val strValue = str.eval(input)
  val posValue = pos.eval(input)
  val lenValue = len.eval(input)

  val allPresent = strValue != null && posValue != null && lenValue != null

  if (allPresent) {
    val start = posValue.asInstanceOf[Int]
    val length = lenValue.asInstanceOf[Int]

    strValue match {
      case bytes: Array[Byte] => slice(bytes, start, length)
      case text => slice(text.toString, start, length)
    }
  } else {
    null
  }
}

/**
 * Renders this expression in SQL form.
 *
 * When the length is the literal Integer.MAX_VALUE the length argument is omitted, giving
 * the two-argument form SUBSTR(str, pos); otherwise all three arguments are shown.
 */
override def toString = len match {
  // `len` is an Expression, so the literal has to be unwrapped before comparing it with an
  // Int; comparing the expression itself against Integer.MAX_VALUE is never true.
  case Literal(value, _) if value == Integer.MAX_VALUE => s"SUBSTR($str, $pos)"
  case _ => s"SUBSTR($str, $pos, $len)"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -466,5 +466,54 @@ class ExpressionEvaluationSuite extends FunSuite {
checkEvaluation(c1 === c2, false, row)
checkEvaluation(c1 !== c2, true, row)
}

// Exercises Substring evaluation over a string column: zero-based and one-based starts,
// lengths below/at/above the input length, negative starts, null arguments, and the
// two-argument form (encoded as a length of Integer.MAX_VALUE).
test("Substring") {
  val row = new GenericRow(Array[Any]("example", "example".toArray.map(_.toByte)))

  val s = 'a.string.at(0)

  // substring from zero position with less-than-full length
  checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(2, IntegerType)), "ex", row)
  checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(2, IntegerType)), "ex", row)

  // substring from zero position with full length
  checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(7, IntegerType)), "example", row)
  checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(7, IntegerType)), "example", row)

  // substring from zero position with greater-than-full length
  checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(100, IntegerType)), "example", row)
  checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(100, IntegerType)), "example", row)

  // substring from nonzero position with less-than-full length
  checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(2, IntegerType)), "xa", row)

  // substring from nonzero position with full length
  checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(6, IntegerType)), "xample", row)

  // substring from nonzero position with greater-than-full length
  checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(100, IntegerType)), "xample", row)

  // substring from negative position (counted back from the end of the string)
  checkEvaluation(Substring(s, Literal(-3, IntegerType), Literal(2, IntegerType)), "pl", row)

  // zero-length substring (within string bounds)
  checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(0, IntegerType)), "", row)

  // empty result when the start position is beyond string bounds
  checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(4, IntegerType)), "", row)

  // substring(null, _, _) -> null
  checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(4, IntegerType)), null, new GenericRow(Array[Any](null)))

  // substring(_, null, _) -> null
  checkEvaluation(Substring(s, Literal(null, IntegerType), Literal(4, IntegerType)), null, row)

  // substring(_, _, null) -> null
  checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(null, IntegerType)), null, row)

  // 2-arg substring from zero position
  checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "example", row)
  checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "example", row)

  // 2-arg substring from nonzero position
  checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "xample", row)

  // 2-arg substring from negative position
  checkEvaluation(Substring(s, Literal(-3, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "ple", row)
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,7 @@ private[hive] object HiveQl {
// Case-insensitive regexes; each one matches its keyword regardless of letter case.
val BETWEEN = "(?i)BETWEEN".r
val WHEN = "(?i)WHEN".r
val CASE = "(?i)CASE".r
// Matches both SUBSTR and SUBSTRING. The optional suffix is a non-capturing group, so the
// regex has no capture groups and can be used as the zero-argument extractor SUBSTR().
val SUBSTR = "(?i)SUBSTR(?:ING)?".r

protected def nodeToExpr(node: Node): Expression = node match {
/* Attribute References */
Expand Down Expand Up @@ -987,6 +988,10 @@ private[hive] object HiveQl {

/* Other functions */
case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand
case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) =>
Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType))
case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) =>
Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length))

/* UDFs - Must be last otherwise will preempt built in functions */
case Token("TOK_FUNCTION", Token(name, Nil) :: args) =>
Expand Down