From 4f3bfdb9034bb911b7565611b2cb58f05250eda6 Mon Sep 17 00:00:00 2001 From: William Benton Date: Tue, 8 Jul 2014 13:06:15 -0500 Subject: [PATCH 1/4] Added internal implementation of SQL SUBSTR() This replaces the Hive UDF for SUBSTR(ING) with an implementation in Catalyst and adds tests to verify correct operation. Squashes 8969d038 and e6419b4e. --- .../expressions/stringOperations.scala | 64 +++++++++++++++++++ .../ExpressionEvaluationSuite.scala | 49 ++++++++++++++ .../org/apache/spark/sql/hive/HiveQl.scala | 5 ++ 3 files changed, 118 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index b3850533c3736..0fd9cdb578634 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -19,8 +19,11 @@ package org.apache.spark.sql.catalyst.expressions import java.util.regex.Pattern +import scala.collection.IndexedSeqOptimized + import org.apache.spark.sql.catalyst.types.DataType import org.apache.spark.sql.catalyst.types.StringType +import org.apache.spark.sql.catalyst.types.BinaryType import org.apache.spark.sql.catalyst.types.BooleanType trait StringRegexExpression { @@ -205,3 +208,64 @@ case class EndsWith(left: Expression, right: Expression) extends BinaryExpression with StringComparison { def compare(l: String, r: String) = l.endsWith(r) } + +/** + * A function that takes a substring of its first argument starting at a given position. + * Defined for String and Binary types. + */ +case class Substring(str: Expression, pos: Expression, len: Expression) extends Expression { + + type EvaluatedType = Any + + def nullable: Boolean = true + def dataType: DataType = { + if (str.dataType == BinaryType) str.dataType else StringType + } + + def references = children.flatMap(_.references).toSet + + override def children = str :: pos :: len :: Nil + + def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = { + val len = str.length + // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and + // negative indices for start positions. If a start index i is greater than 0, it + // refers to element i-1 in the sequence. If a start index i is less than 0, it refers + // to the -ith element before the end of the sequence. If a start index i is 0, it + // refers to the first element. 
+ + val start = startPos match { + case pos if pos > 0 => pos - 1 + case neg if neg < 0 => len + neg + case _ => 0 + } + + val end = sliceLen match { + case max if max == Integer.MAX_VALUE => max + case x => start + x + } + + str.slice(start, end) + } + + override def eval(input: Row): Any = { + val string = str.eval(input) + + val po = pos.eval(input) + val ln = len.eval(input) + + if ((string == null) || (po == null) || (ln == null)) { + null + } else { + val start = po.asInstanceOf[Int] + val length = ln.asInstanceOf[Int] + + string match { + case ba: Array[Byte] => slice(ba, start, length) + case other => slice(other.toString, start, length) + } + } + } + + override def toString = s"SUBSTR($str, $pos, $len)" +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala index 84d72814778ba..f1d7aedcc2d2d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvaluationSuite.scala @@ -466,5 +466,54 @@ class ExpressionEvaluationSuite extends FunSuite { checkEvaluation(c1 === c2, false, row) checkEvaluation(c1 !== c2, true, row) } + + test("Substring") { + val row = new GenericRow(Array[Any]("example", "example".toArray.map(_.toByte))) + + val s = 'a.string.at(0) + + // substring from zero position with less-than-full length + checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(2, IntegerType)), "ex", row) + checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(2, IntegerType)), "ex", row) + + // substring from zero position with full length + checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(7, IntegerType)), "example", row) + checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(7, IntegerType)), "example", row) + + // substring from zero position with greater-than-full length + checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(100, IntegerType)), "example", row) + checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(100, IntegerType)), "example", row) + + // substring from nonzero position with less-than-full length + checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(2, IntegerType)), "xa", row) + + // substring from nonzero position with full length + checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(6, IntegerType)), "xample", row) + + // substring from nonzero position with greater-than-full length + checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(100, IntegerType)), "xample", row) + + // zero-length substring (within string bounds) + checkEvaluation(Substring(s, Literal(0, IntegerType), Literal(0, IntegerType)), "", row) + + // zero-length substring (beyond string bounds) + checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(4, IntegerType)), "", row) + + // substring(null, _, _) -> null + checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(4, IntegerType)), null, new GenericRow(Array[Any](null))) + + // substring(_, null, _) -> null + checkEvaluation(Substring(s, Literal(null, IntegerType), Literal(4, IntegerType)), null, row) + + // substring(_, _, null) -> null + checkEvaluation(Substring(s, Literal(100, IntegerType), Literal(null, IntegerType)), null, row) + + // 2-arg substring from zero position + checkEvaluation(Substring(s, Literal(0, IntegerType), 
Literal(Integer.MAX_VALUE, IntegerType)), "example", row) + checkEvaluation(Substring(s, Literal(1, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "example", row) + + // 2-arg substring from nonzero position + checkEvaluation(Substring(s, Literal(2, IntegerType), Literal(Integer.MAX_VALUE, IntegerType)), "xample", row) + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index b70104dd5be5a..3f243fee7b140 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -860,6 +860,7 @@ private[hive] object HiveQl { val BETWEEN = "(?i)BETWEEN".r val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r + val SUBSTR = "(?i)I_SUBSTR(?:ING)?".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ @@ -987,6 +988,10 @@ private[hive] object HiveQl { /* Other functions */ case Token("TOK_FUNCTION", Token(RAND(), Nil) :: Nil) => Rand + case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: Nil) => + Substring(nodeToExpr(string), nodeToExpr(pos), Literal(Integer.MAX_VALUE, IntegerType)) + case Token("TOK_FUNCTION", Token(SUBSTR(), Nil) :: string :: pos :: length :: Nil) => + Substring(nodeToExpr(string), nodeToExpr(pos), nodeToExpr(length)) /* UDFs - Must be last otherwise will preempt built in functions */ case Token("TOK_FUNCTION", Token(name, Nil) :: args) => From ec35c8016c63dccc7793d1cedb3b11e1c4dc1214 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 10 Jul 2014 14:34:52 -0500 Subject: [PATCH 2/4] Adds fixes from review: * orders imports in stringOperations.scala * Substring.dataType throws exception if children are unresolved * inlines Substring.slice (~11.5% performance improvement on microbenchmark runs) * adds a special `toString` case for two-argument SUBSTR expressions * removes spurious I_ prefix to SUBSTR(ING) in HiveQL.scala Thanks to @concretevitamin for prompt and useful feedback! 
--- .../catalyst/expressions/stringOperations.scala | 16 +++++++++++----- .../scala/org/apache/spark/sql/hive/HiveQl.scala | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 0fd9cdb578634..03fd7843f20e3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -21,10 +21,9 @@ import java.util.regex.Pattern import scala.collection.IndexedSeqOptimized -import org.apache.spark.sql.catalyst.types.DataType -import org.apache.spark.sql.catalyst.types.StringType -import org.apache.spark.sql.catalyst.types.BinaryType -import org.apache.spark.sql.catalyst.types.BooleanType + +import org.apache.spark.sql.catalyst.analysis.UnresolvedException +import org.apache.spark.sql.catalyst.types.{BinaryType, BooleanType, DataType, StringType} trait StringRegexExpression { self: BinaryExpression => @@ -219,6 +218,9 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends def nullable: Boolean = true def dataType: DataType = { + if (!resolved) { + throw new UnresolvedException(this, s"Cannot resolve since $children are not resolved") + } if (str.dataType == BinaryType) str.dataType else StringType } @@ -226,6 +228,7 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends override def children = str :: pos :: len :: Nil + @inline def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = { val len = str.length // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and @@ -267,5 +270,8 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends } } - override def toString = s"SUBSTR($str, $pos, $len)" + override def toString = len match { + case max if max == Integer.MAX_VALUE => s"SUBSTR($str, $pos)" + case _ => s"SUBSTR($str, $pos, $len)" + } } diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala index 3f243fee7b140..622e0fb9c36e0 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveQl.scala @@ -860,7 +860,7 @@ private[hive] object HiveQl { val BETWEEN = "(?i)BETWEEN".r val WHEN = "(?i)WHEN".r val CASE = "(?i)CASE".r - val SUBSTR = "(?i)I_SUBSTR(?:ING)?".r + val SUBSTR = "(?i)SUBSTR(?:ING)?".r protected def nodeToExpr(node: Node): Expression = node match { /* Attribute References */ From a30a0371d3f311a797918a4820dff9b3e4e00419 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 10 Jul 2014 19:57:05 -0500 Subject: [PATCH 3/4] replace view bounds with implicit parameters --- .../spark/sql/catalyst/expressions/stringOperations.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index 03fd7843f20e3..da5c8b345fd67 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -229,7 +229,7 @@ case class 
Substring(str: Expression, pos: Expression, len: Expression) extends override def children = str :: pos :: len :: Nil @inline - def slice[T, C <% IndexedSeqOptimized[T,_]](str: C, startPos: Int, sliceLen: Int): Any = { + def slice[T, C <: Any](str: C, startPos: Int, sliceLen: Int)(implicit ev: (C=>IndexedSeqOptimized[T,_])): Any = { val len = str.length // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and // negative indices for start positions. If a start index i is greater than 0, it From ccedc47d4240c3f7df886ecde6588e645af55715 Mon Sep 17 00:00:00 2001 From: William Benton Date: Thu, 10 Jul 2014 20:35:47 -0500 Subject: [PATCH 4/4] Fixed too-long line. --- .../spark/sql/catalyst/expressions/stringOperations.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala index da5c8b345fd67..4bd7bf5a0cd8c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala @@ -229,7 +229,8 @@ case class Substring(str: Expression, pos: Expression, len: Expression) extends override def children = str :: pos :: len :: Nil @inline - def slice[T, C <: Any](str: C, startPos: Int, sliceLen: Int)(implicit ev: (C=>IndexedSeqOptimized[T,_])): Any = { + def slice[T, C <: Any](str: C, startPos: Int, sliceLen: Int) + (implicit ev: (C=>IndexedSeqOptimized[T,_])): Any = { val len = str.length // Hive and SQL use one-based indexing for SUBSTR arguments but also accept zero and // negative indices for start positions. If a start index i is greater than 0, it
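
The following standalone sketch is not part of the patch series; it only illustrates the start/length mapping that the patches' Substring.slice applies, as described in its comments: positive start positions are one-based, zero behaves like one, negative positions count back from the end, and Integer.MAX_VALUE stands in for "to the end of the string" in the two-argument SUBSTR form. The object and method names below are hypothetical.

    // Hypothetical standalone sketch of the SUBSTR position/length semantics
    // implemented by Substring.slice in the patches above (shown on a plain String).
    object SubstrSemantics {
      def substr(s: String, startPos: Int, sliceLen: Int = Integer.MAX_VALUE): String = {
        val len = s.length
        // Positive start positions are one-based; negative positions count back
        // from the end; zero behaves like one and refers to the first element.
        val start = startPos match {
          case pos if pos > 0 => pos - 1
          case neg if neg < 0 => len + neg
          case _ => 0
        }
        // Integer.MAX_VALUE marks the two-argument form, meaning "to the end of the string".
        val end = if (sliceLen == Integer.MAX_VALUE) sliceLen else start + sliceLen
        s.slice(start, end)
      }

      def main(args: Array[String]): Unit = {
        println(substr("example", 0, 2))  // ex      (position 0 behaves like 1)
        println(substr("example", 2, 2))  // xa
        println(substr("example", -3))    // ple     (negative start counts from the end)
        println(substr("example", 2))     // xample  (two-argument form)
      }
    }

These cases mirror the expectations exercised by the new tests in ExpressionEvaluationSuite.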