[SPARK-48584][SQL] Perf improvement for unescapePathName

### What changes were proposed in this pull request? This PR improves perf for unescapePathName with algorithms briefly described as: - If a path contains no '%' or contains '%' at `position > path.length-2`, we return the original identity instead of creating a new StringBuilder to append char by char - Otherwise, we loop with 2 indices, `plaintextStartIdx` which starts from 0 and then points to the next char after resolving `%xx`, and `plaintextEndIdx` which points to the next `'%'`. `plaintextStartIdx` moves to `plaintextEndIdx + 3` if `%xx` is valid, or moves to `plaintextEndIdx + 1` if `%xx` is invalid. - Instead of using Integer.parseInt with error capture, we identify the high and low characters manually. ### Why are the changes needed? performance improvement for hotspots ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? - new tests in ExternalCatalogUtilsSuite - Benchmark results (9-11x faster) ### Was this patch authored or co-authored using generative AI tooling? no Closes #46938 from yaooqinn/SPARK-48584. Authored-by: Kent Yao <[email protected]> Signed-off-by: Kent Yao <[email protected]>
apache · Jun 12, 2024 · da81d8e · da81d8e
1 parent 8870efc
commit da81d8e
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 27 deletions.
diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 Escape Tests:                             Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Legacy                                             7128           7146           8          0.1        7127.9       1.0X
-New                                                 790            795           5          1.3         789.7       9.0X
+Legacy                                             6996           7009           9          0.1        6996.5       1.0X
+New                                                 771            776           3          1.3         770.7       9.1X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy                                             5127           5137           6          0.2        5127.3       1.0X
+New                                                 579            583           4          1.7         579.3       8.9X
 
 
diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt
@@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
 AMD EPYC 7763 64-Core Processor
 Escape Tests:                             Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
 ------------------------------------------------------------------------------------------------------------------------
-Legacy                                             6719           6726           6          0.1        6719.3       1.0X
-New                                                 735            744          21          1.4         735.3       9.1X
+Legacy                                             6966           6978          12          0.1        6965.9       1.0X
+New                                                 725            730           4          1.4         725.4       9.6X
+
+
+================================================================================================
+Unescape
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure
+AMD EPYC 7763 64-Core Processor
+Unescape Tests:                           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Legacy                                             6665           6677          11          0.2        6664.6       1.0X
+New                                                 602            606           2          1.7         602.1      11.1X
 
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala
@@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell
 import org.apache.spark.sql.catalyst.analysis.Resolver
 import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BasePredicate, BoundReference, Expression, Predicate}
+import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits
 import org.apache.spark.sql.catalyst.util.CharVarcharUtils
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.internal.SQLConf
@@ -97,31 +98,40 @@ object ExternalCatalogUtils {
   }
 
   def unescapePathName(path: String): String = {
-    val sb = new StringBuilder
-    var i = 0
-
-    while (i < path.length) {
-      val c = path.charAt(i)
-      if (c == '%' && i + 2 < path.length) {
-        val code: Int = try {
-          Integer.parseInt(path.substring(i + 1, i + 3), 16)
-        } catch {
-          case _: Exception => -1
-        }
-        if (code >= 0) {
-          sb.append(code.asInstanceOf[Char])
-          i += 3
+    if (path == null || path.isEmpty) {
+      return path
+    }
+    var plaintextEndIdx = path.indexOf('%')
+    val length = path.length
+    if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > length) {
+      // fast path, no %xx encoding found then return the string identity
+      path
+    } else {
+      val sb = new java.lang.StringBuilder(length)
+      var plaintextStartIdx = 0
+      while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) {
+        if (plaintextEndIdx > plaintextStartIdx) sb.append(path, plaintextStartIdx, plaintextEndIdx)
+        val high = path.charAt(plaintextEndIdx + 1)
+        if ((high >>> 8) == 0 && unhexDigits(high) != -1) {
+          val low = path.charAt(plaintextEndIdx + 2)
+          if ((low >>> 8) == 0 && unhexDigits(low) != -1) {
+            sb.append((unhexDigits(high) << 4 | unhexDigits(low)).asInstanceOf[Char])
+            plaintextStartIdx = plaintextEndIdx + 3
+          } else {
+            sb.append('%')
+            plaintextStartIdx = plaintextEndIdx + 1
+          }
         } else {
-          sb.append(c)
-          i += 1
+          sb.append('%')
+          plaintextStartIdx = plaintextEndIdx + 1
         }
-      } else {
-        sb.append(c)
-        i += 1
+        plaintextEndIdx = path.indexOf('%', plaintextStartIdx)
+      }
+      if (plaintextStartIdx < length) {
+        sb.append(path, plaintextStartIdx, length)
       }
+      sb.toString
     }
-
-    sb.toString()
   }
 
   def generatePartitionPath(

diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala
@@ -20,7 +20,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
 import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils
 
 /**
- * Benchmark for path escaping
+ * Benchmark for path escaping/unescaping
  * To run this benchmark:
  * {{{
  *   1. without sbt:
@@ -53,6 +53,30 @@ object EscapePathBenchmark extends BenchmarkBase {
       }
       benchmark.run()
     }
+
+    runBenchmark("Unescape") {
+      val benchmark = new Benchmark("Unescape Tests", N, 10, output = output)
+      val paths = Seq(
+        "https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551",
+        "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
+        "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551",
+        "https://issues.apache.org/jira/browse%2FSPARK-48551",
+        "https://issues.apache.org/jira/browse%2SPARK-48551",
+        "https://issues.apache.org/jira/browse/SPARK-48551")
+      benchmark.addCase("Legacy") { _ =>
+        (1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy))
+      }
+
+      benchmark.addCase("New") { _ =>
+        (1 to N).foreach(_ => {
+          paths.foreach(ExternalCatalogUtils.unescapePathName)
+        })
+      }
+      benchmark.run()
+    }
   }
 
   /**
@@ -71,4 +95,30 @@ object EscapePathBenchmark extends BenchmarkBase {
 
     builder.toString()
   }
+
+  def unescapePathNameLegacy(path: String): String = {
+    val sb = new StringBuilder
+    var i = 0
+    while (i < path.length) {
+      val c = path.charAt(i)
+      if (c == '%' && i + 2 < path.length) {
+        val code: Int = try {
+          Integer.parseInt(path.substring(i + 1, i + 3), 16)
+        } catch {
+          case _: Exception => -1
+        }
+        if (code >= 0) {
+          sb.append(code.asInstanceOf[Char])
+          i += 3
+        } else {
+          sb.append(c)
+          i += 1
+        }
+      } else {
+        sb.append(c)
+        i += 1
+      }
+    }
+    sb.toString()
+  }
 }
diff --git a/...lyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala b/...lyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.catalyst.catalog
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName
+import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName, unescapePathName}
 
 class ExternalCatalogUtilsSuite extends SparkFunSuite {
 
@@ -39,4 +39,28 @@ class ExternalCatalogUtilsSuite extends SparkFunSuite {
     assert(escapePathName("a,b") === "a,b")
     assert(escapePathName("a/b") === "a%2Fb")
   }
+
+  test("SPARK-48551: unescapePathName") {
+    ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c =>
+      // Check parity with old conversion technique:
+      assert(unescapePathName("%" + f"$c%02X") === c.toString,
+        s"wrong unescaping for $c")
+    }
+    assert(unescapePathName(null) === null)
+    assert(unescapePathName("") === "")
+    assert(unescapePathName(" ") === " ")
+    assert(unescapePathName("%0A") === "\n")
+    assert(unescapePathName("a b") === "a b")
+    assert(unescapePathName("a%3Ab") === "a:b")
+    assert(unescapePathName("%3Aab") === ":ab")
+    assert(unescapePathName("ab%3A") === "ab:")
+    assert(unescapePathName("a%25b") === "a%b")
+    assert(unescapePathName("a,b") === "a,b")
+    assert(unescapePathName("a%2Fb") === "a/b")
+    assert(unescapePathName("a%2") === "a%2")
+    assert(unescapePathName("a%F ") === "a%F ")
+    // scalastyle:off nonascii
+    assert(unescapePathName("a\u00FF") === "a\u00FF")
+    // scalastyle:on nonascii
+  }
 }