Skip to content

Commit

Permalink
Added internal implementation of SQL SUBSTR()
Browse files Browse the repository at this point in the history
This replaces the Hive UDF for SUBSTR(ING) with an implementation in Catalyst
and adds tests to verify correct operation.

Squashes 8969d038 and e6419b4e.
  • Loading branch information
willb committed Jul 14, 2014
1 parent 9fe693b commit 4f3bfdb
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,11 @@ package org.apache.spark.sql.catalyst.expressions

import java.util.regex.Pattern

import scala.collection.IndexedSeqOptimized

import org.apache.spark.sql.catalyst.types.DataType
import org.apache.spark.sql.catalyst.types.StringType
import org.apache.spark.sql.catalyst.types.BinaryType
import org.apache.spark.sql.catalyst.types.BooleanType

trait StringRegexExpression {
Expand Down Expand Up @@ -205,3 +208,64 @@ case class EndsWith(left: Expression, right: Expression)
extends BinaryExpression with StringComparison {
def compare(l: String, r: String) = l.endsWith(r)
}

/**
 * A function that takes a substring of its first argument starting at a given position.
 * Defined for String and Binary types.
 *
 * Evaluates to null when any of `str`, `pos` or `len` evaluates to null. Otherwise `pos` and
 * `len` are read as `Int`s; a byte-array input is sliced as bytes and any other input is
 * sliced via its `toString` representation.
 *
 * @param str expression producing the value to take a substring of
 * @param pos expression producing the start position (see [[slice]] for how it is interpreted)
 * @param len expression producing the maximum number of elements to keep; `Integer.MAX_VALUE`
 *            means "no limit"
 */
case class Substring(str: Expression, pos: Expression, len: Expression) extends Expression {

  type EvaluatedType = Any

  // Reported as nullable unconditionally: eval returns null whenever any argument is null.
  def nullable: Boolean = true

  // Binary input yields a binary result; every other input is treated as a string.
  def dataType: DataType = {
    if (str.dataType == BinaryType) str.dataType else StringType
  }

  def references = children.flatMap(_.references).toSet

  override def children = str :: pos :: len :: Nil

  /**
   * Slices an indexed sequence using SQL-style SUBSTR arguments.
   *
   * @param str      the sequence to slice (anything viewable as an `IndexedSeqOptimized`)
   * @param startPos start position as described in the comment below
   * @param sliceLen maximum number of elements to take; `Integer.MAX_VALUE` means
   *                 "everything up to the end of the sequence"
   * @return the result of `str.slice(start, end)` for the computed bounds
   */
  def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = {
    val len = str.length
    // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and
    // negative indices for start positions. If a start index i is greater than 0, it
    // refers to element i-1 in the sequence. If a start index i is less than 0, it refers
    // to the -ith element before the end of the sequence. If a start index i is 0, it
    // refers to the first element.

    val start = startPos match {
      case pos if pos > 0 => pos - 1
      case neg if neg < 0 => len + neg
      case _ => 0
    }

    val end =
      if (sliceLen == Integer.MAX_VALUE) {
        // Sentinel for "no length limit": keep everything from `start` onwards.
        Integer.MAX_VALUE
      } else {
        // Add in Long arithmetic and clamp back into Int range. A plain Int `start + sliceLen`
        // wraps around for large lengths (e.g. start = 2, sliceLen = Integer.MAX_VALUE - 1),
        // producing a negative end index and therefore an empty slice.
        val unclamped = start.toLong + sliceLen.toLong
        math.max(Integer.MIN_VALUE.toLong, math.min(Integer.MAX_VALUE.toLong, unclamped)).toInt
      }

    // Bounds outside the sequence are left for `slice` itself to handle.
    str.slice(start, end)
  }

  override def eval(input: Row): Any = {
    val string = str.eval(input)

    val po = pos.eval(input)
    val ln = len.eval(input)

    // Null in, null out: a null in any of the three arguments makes the result null.
    if ((string == null) || (po == null) || (ln == null)) {
      null
    } else {
      val start = po.asInstanceOf[Int]
      val length = ln.asInstanceOf[Int]

      string match {
        // Byte arrays are sliced as bytes.
        case ba: Array[Byte] => slice(ba, start, length)
        // Anything else is sliced through its string representation.
        case other => slice(other.toString, start, length)
      }
    }
  }

  override def toString = s"SUBSTR($str, $pos, $len)"
}
Original file line number Diff line number Diff line change
Expand Up @@ -466,5 +466,54 @@ class ExpressionEvaluationSuite extends FunSuite {
checkEvaluation(c1 === c2, false, row)
checkEvaluation(c1 !== c2, true, row)
}

test("Substring") {
  // Column 0 holds the string under test; column 1 holds the same text as raw bytes.
  val exampleRow = new GenericRow(Array[Any]("example", "example".toArray.map(_.toByte)))
  val input = 'a.string.at(0)

  // Wraps a (possibly null) value as an integer-typed literal.
  def int(value: Any) = Literal(value, IntegerType)

  // Evaluates SUBSTR(input, start, length) against `in` and checks the outcome.
  def expectSubstr(start: Any, length: Any, expected: Any, in: GenericRow = exampleRow) =
    checkEvaluation(Substring(input, int(start), int(length)), expected, in)

  // Explicit-length cases. Start positions 0 and 1 both denote the first character.
  Seq(
    // from the first position with less-than-full length
    (0, 2, "ex"),
    (1, 2, "ex"),
    // from the first position with full length
    (0, 7, "example"),
    (1, 7, "example"),
    // from the first position with greater-than-full length
    (0, 100, "example"),
    (1, 100, "example"),
    // from a later position with less-than-full, full and greater-than-full length
    (2, 2, "xa"),
    (2, 6, "xample"),
    (2, 100, "xample"),
    // zero-length result, within string bounds
    (0, 0, ""),
    // zero-length result, start beyond string bounds
    (100, 4, "")
  ).foreach { case (start, length, expected) => expectSubstr(start, length, expected) }

  // A null in any of the three arguments yields null.
  expectSubstr(100, 4, null, new GenericRow(Array[Any](null)))
  expectSubstr(null, 4, null)
  expectSubstr(100, null, null)

  // Two-argument form: the length is passed as Integer.MAX_VALUE.
  Seq(
    (0, "example"),
    (1, "example"),
    (2, "xample")
  ).foreach { case (start, expected) => expectSubstr(start, Integer.MAX_VALUE, expected) }
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,7 @@ private[hive] object HiveQl {
// Case-insensitive matchers for keyword and function-name tokens.
val BETWEEN = "(?i)BETWEEN".r
val WHEN = "(?i)WHEN".r
val CASE = "(?i)CASE".r
// Matches SUBSTR and SUBSTRING. Regex extractors must match the whole token, so the previous
// pattern, which required a literal "I_" prefix, never matched a plain SUBSTR(ING) call.
// The prefix is kept as optional so that I_SUBSTR / I_SUBSTRING continue to be recognised.
val SUBSTR = "(?i)(?:I_)?SUBSTR(?:ING)?".r

protected def nodeToExpr(node: Node): Expression = node match {
/* Attribute References */
Expand Down Expand Up @@ -987,6 +988,10 @@ private[hive] object HiveQl {

/* Other functions */
case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand
case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) =>
Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType))
case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) =>
Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length))

/* UDFs - Must be last otherwise will preempt built in functions */
case Token("TOK_FUNCTION", Token(name, Nil) :: args) =>
Expand Down

0 comments on commit 4f3bfdb

Please sign in to comment.