[SPARK-30645][SPARKR][TESTS][WINDOWS] Move Unicode test data to external file

### What changes were proposed in this pull request?

Reference data for the "collect() support Unicode characters" test has been moved to an external file, to make the test OS- and locale-independent.

### Why are the changes needed?

As-is, the embedded test data is not properly encoded on Windows:

```
library(SparkR)
SparkR::sparkR.session()
Sys.info()
#           sysname           release           version
#         "Windows"      "Server x64"     "build 17763"
#          nodename           machine             login
# "WIN-5BLT6Q610KH"          "x86-64"   "Administrator"
#              user    effective_user
#   "Administrator"   "Administrator"

Sys.getlocale()

# [1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"

lines <- c("{\"name\":\"안녕하세요\"}",
           "{\"name\":\"您好\", \"age\":30}",
           "{\"name\":\"こんにちは\", \"age\":19}",
           "{\"name\":\"Xin chào\"}")

jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
writeLines(lines, jsonPath)

system(paste0("cat ", jsonPath))
# {"name":"<U+C548><U+B155><U+D558><U+C138><U+C694>"}
# {"name":"<U+60A8><U+597D>", "age":30}
# {"name":"<U+3053><U+3093><U+306B><U+3061><U+306F>", "age":19}
# {"name":"Xin chào"}
# [1] 0

df <- read.df(jsonPath, "json")

printSchema(df)
# root
#  |-- _corrupt_record: string (nullable = true)
#  |-- age: long (nullable = true)
#  |-- name: string (nullable = true)

head(df)
#              _corrupt_record age                                     name
# 1                       <NA>  NA <U+C548><U+B155><U+D558><U+C138><U+C694>
# 2                       <NA>  30                         <U+60A8><U+597D>
# 3                       <NA>  19 <U+3053><U+3093><U+306B><U+3061><U+306F>
# 4 {"name":"Xin ch<U+FFFD>o"}  NA                                     <NA>
```
This can be reproduced outside the tests (Windows Server 2019, English locale) and causes failures when `testthat` is updated to 2.x (#27359). Somehow the problem is not picked up when the test is executed with `testthat` 1.0.2.
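For reference, the core of the change is to keep the non-ASCII strings in a data file and read them back with an explicit encoding, so the Windows native code page (cp1252 above) never gets a chance to mangle them. A minimal sketch, mirroring the test change below:

```r
# Minimal sketch of the approach: read the reference data from an external
# UTF-8 file instead of embedding the literals in the .R source.
jsonPath <- file.path(
  Sys.getenv("SPARK_HOME"),
  "R", "pkg", "tests", "fulltests", "data",
  "test_utils_utf.json"
)

# readLines() with an explicit encoding keeps the strings marked as UTF-8,
# regardless of Sys.getlocale().
lines <- readLines(jsonPath, encoding = "UTF-8")
Encoding(lines)
# Expect "UTF-8" for the lines containing non-ASCII characters
```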

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Running the modified test; manual testing.

### Note

An alternative seems to be to use bytes, but it hasn't been properly tested.

```
test_that("collect() support Unicode characters", {

  lines <- markUtf8(c(
    '{"name": "안녕하세요"}',
    '{"name": "您好", "age": 30}',
    '{"name": "こんにちは", "age": 19}',
    '{"name": "Xin ch\xc3\xa0o"}'
  ))

  jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
  writeLines(lines, jsonPath, useBytes = TRUE)

  expected <- regmatches(lines, regexec('(?<="name": ").*?(?=")', lines, perl = TRUE))

  df <- read.df(jsonPath, "json")
  rdf <- collect(df)
  expect_true(is.data.frame(rdf))

  rdf$name <- markUtf8(rdf$name)
  expect_equal(rdf$name[1], expected[[1]])
  expect_equal(rdf$name[2], expected[[2]])
  expect_equal(rdf$name[3], expected[[3]])
  expect_equal(rdf$name[4], expected[[4]])

  df1 <- createDataFrame(rdf)
  expect_equal(
    collect(
      where(df1, df1$name == expected[[2]])
    )$name,
    expected[[2]]
  )
})
```
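For context, `markUtf8` is a small SparkR test helper; to the best of my knowledge it only declares the strings' encoding rather than transcoding them (roughly as sketched below — the actual definition lives in the SparkR sources), which is why pairing it with `writeLines(..., useBytes = TRUE)` keeps the raw UTF-8 bytes intact:

```r
# Rough, assumed equivalent of SparkR's markUtf8 helper: flag the strings as
# UTF-8 without re-encoding the underlying bytes.
markUtf8 <- function(s) {
  Encoding(s) <- "UTF-8"
  s
}
```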

Closes #27362 from zero323/SPARK-30645.

Authored-by: zero323 <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
(cherry picked from commit 40b1f4d)
Signed-off-by: HyukjinKwon <[email protected]>
Showing 2 changed files with 22 additions and 11 deletions.
4 changes: 4 additions & 0 deletions R/pkg/tests/fulltests/data/test_utils_utf.json
@@ -0,0 +1,4 @@
+{"name": "안녕하세요"}
+{"name": "您好", "age": 30}
+{"name": "こんにちは", "age": 19}
+{"name": "Xin chào"}
29 changes: 18 additions & 11 deletions R/pkg/tests/fulltests/test_sparkSQL.R
@@ -875,24 +875,31 @@ test_that("collect() and take() on a DataFrame return the same number of rows an
 })
 
 test_that("collect() support Unicode characters", {
-  lines <- c("{\"name\":\"안녕하세요\"}",
-             "{\"name\":\"您好\", \"age\":30}",
-             "{\"name\":\"こんにちは\", \"age\":19}",
-             "{\"name\":\"Xin chào\"}")
+  jsonPath <- file.path(
+    Sys.getenv("SPARK_HOME"),
+    "R", "pkg", "tests", "fulltests", "data",
+    "test_utils_utf.json"
+  )
+
+  lines <- readLines(jsonPath, encoding = "UTF-8")
 
-  jsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp")
-  writeLines(lines, jsonPath)
+  expected <- regmatches(lines, gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE))
 
   df <- read.df(jsonPath, "json")
   rdf <- collect(df)
   expect_true(is.data.frame(rdf))
-  expect_equal(rdf$name[1], markUtf8("안녕하세요"))
-  expect_equal(rdf$name[2], markUtf8("您好"))
-  expect_equal(rdf$name[3], markUtf8("こんにちは"))
-  expect_equal(rdf$name[4], markUtf8("Xin chào"))
+  expect_equal(rdf$name[1], expected[[1]])
+  expect_equal(rdf$name[2], expected[[2]])
+  expect_equal(rdf$name[3], expected[[3]])
+  expect_equal(rdf$name[4], expected[[4]])
 
   df1 <- createDataFrame(rdf)
-  expect_equal(collect(where(df1, df1$name == markUtf8("您好")))$name, markUtf8("您好"))
+  expect_equal(
+    collect(
+      where(df1, df1$name == expected[[2]])
+    )$name,
+    expected[[2]]
+  )
 })
 
 test_that("multiple pipeline transformations result in an RDD with the correct values", {
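As a side note on the new test: the expected names are recovered from the raw JSON lines with a lookbehind/lookahead regex rather than hard-coded literals, so the assertions no longer depend on how the .R source file itself is encoded. A standalone sketch of that extraction (sample input is illustrative):

```r
# Illustrative input; the real test reads test_utils_utf.json instead.
lines <- c(
  '{"name": "您好", "age": 30}',
  '{"name": "Xin chào"}'
)

# Capture everything between '"name": "' and the next '"'.
expected <- regmatches(
  lines,
  gregexpr('(?<="name": ").*?(?=")', lines, perl = TRUE)
)

expected[[1]]
# [1] "您好"
expected[[2]]
# [1] "Xin chào"
```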
