From e2828f8c2159a1c7a7dc8c2fb531f08999615f77 Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Tue, 5 Feb 2019 14:31:39 -0600 Subject: [PATCH 1/6] Mark UTF-8 encoding in C++ instead of R --- DESCRIPTION | 1 - R/RcppExports.R | 24 ++++++++------- R/decode_uri.R | 15 ---------- man/encodeURI.Rd | 2 +- src/RcppExports-legacy.cpp | 4 +-- src/RcppExports.cpp | 22 +++++++------- src/httpuv.cpp | 59 ++++++++++++++++++++++--------------- tests/testthat/test-utils.R | 50 ++++++++++++++++++++++--------- 8 files changed, 99 insertions(+), 78 deletions(-) delete mode 100644 R/decode_uri.R diff --git a/DESCRIPTION b/DESCRIPTION index 74cb4d0f..dbfbdcb2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -35,7 +35,6 @@ Remotes: r-lib/later Collate: 'RcppExports.R' - 'decode_uri.R' 'httpuv.R' 'server.R' 'static_paths.R' diff --git a/R/RcppExports.R b/R/RcppExports.R index e2652543..b99ff75e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -46,7 +46,7 @@ base64encode <- function(x) { } #' URI encoding/decoding -#' +#' #' Encodes/decodes strings using URI encoding/decoding in the same way that web #' browsers do. The precise behaviors of these functions can be found at #' developer.mozilla.org: @@ -54,22 +54,22 @@ base64encode <- function(x) { #' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent}{encodeURIComponent}, #' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI}{decodeURI}, #' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent}{decodeURIComponent} -#' +#' #' Intended as a faster replacement for \code{\link[utils]{URLencode}} and #' \code{\link[utils]{URLdecode}}. 
-#' +#' #' encodeURI differs from encodeURIComponent in that the former will not encode #' reserved characters: \code{;,/?:@@&=+$} -#' +#' #' decodeURI differs from decodeURIComponent in that it will refuse to decode #' encoded sequences that decode to a reserved character. (If in doubt, use #' decodeURIComponent.) -#' +#' #' The only way these functions differ from web browsers is in the encoding of #' non-ASCII characters. All non-ASCII characters will be escaped byte-by-byte. #' If conformant non-ASCII behavior is important, ensure that your input vector #' is UTF-8 encoded before calling encodeURI or encodeURIComponent. -#' +#' #' @param value Character vector to be encoded or decoded. #' @return Encoded or decoded character vector of the same length as the #' input value. \code{decodeURI} and \code{decodeURIComponent} will return @@ -86,12 +86,16 @@ encodeURIComponent <- function(value) { .Call('_httpuv_encodeURIComponent', PACKAGE = 'httpuv', value) } -decodeURI_ <- function(value) { - .Call('_httpuv_decodeURI_', PACKAGE = 'httpuv', value) +#' @rdname encodeURI +#' @export +decodeURI <- function(value) { + .Call('_httpuv_decodeURI', PACKAGE = 'httpuv', value) } -decodeURIComponent_ <- function(value) { - .Call('_httpuv_decodeURIComponent_', PACKAGE = 'httpuv', value) +#' @rdname encodeURI +#' @export +decodeURIComponent <- function(value) { + .Call('_httpuv_decodeURIComponent', PACKAGE = 'httpuv', value) } #' Check whether an address is IPv4 or IPv6 diff --git a/R/decode_uri.R b/R/decode_uri.R deleted file mode 100644 index 2d54187f..00000000 --- a/R/decode_uri.R +++ /dev/null @@ -1,15 +0,0 @@ -#' @rdname encodeURI -#' @export -decodeURI <- function(value) { - res <- decodeURI_(value) - Encoding(res) <- "UTF-8" - res -} - -#' @rdname encodeURI -#' @export -decodeURIComponent <- function(value) { - res <- decodeURIComponent_(value) - Encoding(res) <- "UTF-8" - res -} diff --git a/man/encodeURI.Rd b/man/encodeURI.Rd index 0db7a48a..3ef7821c 100644 --- 
a/man/encodeURI.Rd +++ b/man/encodeURI.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RcppExports.R, R/decode_uri.R +% Please edit documentation in R/RcppExports.R \name{encodeURI} \alias{encodeURI} \alias{encodeURIComponent} diff --git a/src/RcppExports-legacy.cpp b/src/RcppExports-legacy.cpp index 1bba5f8e..e0a33505 100644 --- a/src/RcppExports-legacy.cpp +++ b/src/RcppExports-legacy.cpp @@ -24,13 +24,13 @@ using namespace Rcpp; -std::vector decodeURIComponent_(std::vector value); +Rcpp::CharacterVector decodeURIComponent(std::vector value); RcppExport SEXP httpuv_decodeURIComponent(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); - rcpp_result_gen = Rcpp::wrap(decodeURIComponent_(value)); + rcpp_result_gen = Rcpp::wrap(decodeURIComponent(value)); return rcpp_result_gen; END_RCPP } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index dd25affd..8d3dfded 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -170,25 +170,25 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// decodeURI_ -std::vector decodeURI_(std::vector value); -RcppExport SEXP _httpuv_decodeURI_(SEXP valueSEXP) { +// decodeURI +Rcpp::CharacterVector decodeURI(std::vector value); +RcppExport SEXP _httpuv_decodeURI(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); - rcpp_result_gen = Rcpp::wrap(decodeURI_(value)); + rcpp_result_gen = Rcpp::wrap(decodeURI(value)); return rcpp_result_gen; END_RCPP } -// decodeURIComponent_ -std::vector decodeURIComponent_(std::vector value); -RcppExport SEXP _httpuv_decodeURIComponent_(SEXP valueSEXP) { +// decodeURIComponent +Rcpp::CharacterVector decodeURIComponent(std::vector value); +RcppExport SEXP _httpuv_decodeURIComponent(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject 
rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); - rcpp_result_gen = Rcpp::wrap(decodeURIComponent_(value)); + rcpp_result_gen = Rcpp::wrap(decodeURIComponent(value)); return rcpp_result_gen; END_RCPP } @@ -251,13 +251,13 @@ static const R_CallMethodDef CallEntries[] = { {"_httpuv_base64encode", (DL_FUNC) &_httpuv_base64encode, 1}, {"_httpuv_encodeURI", (DL_FUNC) &_httpuv_encodeURI, 1}, {"_httpuv_encodeURIComponent", (DL_FUNC) &_httpuv_encodeURIComponent, 1}, - {"_httpuv_decodeURI_", (DL_FUNC) &_httpuv_decodeURI_, 1}, - {"_httpuv_decodeURIComponent_", (DL_FUNC) &_httpuv_decodeURIComponent_, 1}, + {"_httpuv_decodeURI", (DL_FUNC) &_httpuv_decodeURI, 1}, + {"_httpuv_decodeURIComponent", (DL_FUNC) &_httpuv_decodeURIComponent, 1}, {"_httpuv_ipFamily", (DL_FUNC) &_httpuv_ipFamily, 1}, {"_httpuv_invokeCppCallback", (DL_FUNC) &_httpuv_invokeCppCallback, 2}, {"_httpuv_getRNGState", (DL_FUNC) &_httpuv_getRNGState, 0}, {"_httpuv_wsconn_address", (DL_FUNC) &_httpuv_wsconn_address, 1}, - {"httpuv_decodeURIComponent", (DL_FUNC) &httpuv_decodeURIComponent, 1}, + {"httpuv_decodeURIComponent", (DL_FUNC) &httpuv_decodeURIComponent, 1}, {NULL, NULL, 0} }; diff --git a/src/httpuv.cpp b/src/httpuv.cpp index c223f6a1..8e77350a 100644 --- a/src/httpuv.cpp +++ b/src/httpuv.cpp @@ -490,7 +490,7 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { for (std::string::const_iterator it = value.begin(); it != value.end(); it++) { - + if (!needsEscape(*it, encodeReserved)) { os << *it; } else { @@ -501,7 +501,7 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { } //' URI encoding/decoding -//' +//' //' Encodes/decodes strings using URI encoding/decoding in the same way that web //' browsers do. 
The precise behaviors of these functions can be found at //' developer.mozilla.org: @@ -509,22 +509,22 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { //' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent}{encodeURIComponent}, //' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURI}{decodeURI}, //' \href{https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/decodeURIComponent}{decodeURIComponent} -//' +//' //' Intended as a faster replacement for \code{\link[utils]{URLencode}} and //' \code{\link[utils]{URLdecode}}. -//' +//' //' encodeURI differs from encodeURIComponent in that the former will not encode //' reserved characters: \code{;,/?:@@&=+$} -//' +//' //' decodeURI differs from decodeURIComponent in that it will refuse to decode //' encoded sequences that decode to a reserved character. (If in doubt, use //' decodeURIComponent.) -//' +//' //' The only way these functions differ from web browsers is in the encoding of //' non-ASCII characters. All non-ASCII characters will be escaped byte-by-byte. //' If conformant non-ASCII behavior is important, ensure that your input vector //' is UTF-8 encoded before calling encodeURI or encodeURIComponent. -//' +//' //' @param value Character vector to be encoded or decoded. //' @return Encoded or decoded character vector of the same length as the //' input value. 
\code{decodeURI} and \code{decodeURIComponent} will return @@ -539,7 +539,7 @@ std::vector encodeURI(std::vector value) { *it = doEncodeURI(*it, false); } - + return value; } @@ -553,7 +553,7 @@ std::vector encodeURIComponent(std::vector value) { *it = doEncodeURI(*it, true); } - + return value; } @@ -584,14 +584,14 @@ std::string doDecodeURI(std::string value, bool component) { for (std::string::const_iterator it = value.begin(); it != value.end(); it++) { - + // If there aren't enough characters left for this to be a // valid escape code, just use the character and move on if (it > value.end() - 3) { os << *it; continue; } - + if (*it == '%') { char hi = *(++it); char lo = *(++it); @@ -612,32 +612,43 @@ std::string doDecodeURI(std::string value, bool component) { os << *it; } } - + return os.str(); } + +//' @rdname encodeURI +//' @export // [[Rcpp::export]] -std::vector decodeURI_(std::vector value) { +Rcpp::CharacterVector decodeURI(std::vector value) { + Rcpp::CharacterVector out(value.size()); + int i = 0; for (std::vector::iterator it = value.begin(); it != value.end(); - it++) { - - *it = doDecodeURI(*it, false); + it++, i++) + { + const char* s = doDecodeURI(*it, false).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); } - - return value; + + return out; } +//' @rdname encodeURI +//' @export // [[Rcpp::export]] -std::vector decodeURIComponent_(std::vector value) { +Rcpp::CharacterVector decodeURIComponent(std::vector value) { + Rcpp::CharacterVector out(value.size()); + int i = 0; for (std::vector::iterator it = value.begin(); it != value.end(); - it++) { - - *it = doDecodeURI(*it, true); + it++, i++) + { + const char* s = doDecodeURI(*it, true).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); } - - return value; + + return out; } //' Check whether an address is IPv4 or IPv6 diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 47876d5b..326467f9 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -4,30 +4,52 @@ 
test_that("encodeURI and encodeURIComponent", { # "abc \ue5 \u4e2d" is identical to "abc å 中" when the system's encoding is # UTF-8. However, the former is always encoded as UTF-8, while the latter will # be encoded using the system's native encoding. - utf8_str <- "abc \ue5 \u4e2d" + utf8_str <- "abc \ue5 \u4e2d" + utf8_str_encoded <- "abc%20%C3%A5%20%E4%B8%AD" + reserved_str <- ",/?:@" + reserved_str_encoded <- "%2C%2F%3F%3A%40" + expect_true(Encoding(utf8_str) == "UTF-8") - expect_identical(encodeURI(utf8_str), "abc%20%C3%A5%20%E4%B8%AD") - expect_identical(encodeURIComponent(utf8_str), "abc%20%C3%A5%20%E4%B8%AD") - expect_identical(decodeURI("abc%20%C3%A5%20%E4%B8%AD"), utf8_str) - expect_identical(decodeURIComponent("abc%20%C3%A5%20%E4%B8%AD"), utf8_str) - expect_true(Encoding(decodeURI("abc%20%C3%A5%20%E4%B8%AD")) == "UTF-8") - expect_true(Encoding(decodeURIComponent("abc%20%C3%A5%20%E4%B8%AD")) == "UTF-8") + expect_identical(encodeURI(utf8_str), utf8_str_encoded) + expect_identical(encodeURIComponent(utf8_str), utf8_str_encoded) + expect_identical(decodeURI(utf8_str_encoded), utf8_str) + expect_identical(decodeURIComponent(utf8_str_encoded), utf8_str) + expect_true(Encoding(decodeURI(utf8_str_encoded)) == "UTF-8") + expect_true(Encoding(decodeURIComponent(utf8_str_encoded)) == "UTF-8") # Behavior with reserved characters differs between encodeURI and # encodeURIComponent. - expect_identical(encodeURI(",/?:@"), ",/?:@") - expect_identical(encodeURIComponent(",/?:@"), "%2C%2F%3F%3A%40") - expect_identical(decodeURI("%2C%2F%3F%3A%40"), "%2C%2F%3F%3A%40") - expect_identical(decodeURIComponent("%2C%2F%3F%3A%40"), ",/?:@") + expect_identical(encodeURI(reserved_str), reserved_str) + expect_identical(encodeURIComponent(reserved_str), reserved_str_encoded) + expect_identical(decodeURI(reserved_str_encoded), reserved_str_encoded) + expect_identical(decodeURIComponent(reserved_str_encoded), reserved_str) # Decoding characters that aren't encoded should have no effect. 
- expect_identical(decodeURI(utf8_str), utf8_str) + expect_identical(decodeURI(utf8_str), utf8_str) expect_identical(decodeURIComponent(utf8_str), utf8_str) expect_true(Encoding(decodeURI(utf8_str)) == "UTF-8") expect_true(Encoding(decodeURIComponent(utf8_str)) == "UTF-8") - expect_identical(decodeURI(",/?:@"), ",/?:@") - expect_identical(decodeURIComponent(",/?:@"), ",/?:@") + expect_identical(decodeURI(reserved_str), reserved_str) + expect_identical(decodeURIComponent(reserved_str), reserved_str) + + # Vector input + expect_identical( + encodeURI(c(reserved_str, utf8_str)), + c(reserved_str, utf8_str_encoded) + ) + expect_identical( + encodeURIComponent(c(reserved_str, utf8_str)), + c(reserved_str_encoded, utf8_str_encoded) + ) + expect_identical( + decodeURI(c(reserved_str_encoded, utf8_str_encoded)), + c(reserved_str_encoded, utf8_str) + ) + expect_identical( + decodeURIComponent(c(reserved_str_encoded, utf8_str_encoded)), + c(reserved_str, utf8_str) + ) }) From b2b5e9aaf61f3293506742ee7c5fd7747f6f0e6b Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Tue, 5 Feb 2019 14:49:48 -0600 Subject: [PATCH 2/6] Fix NA handling for encode/decode URI functions --- src/RcppExports-legacy.cpp | 4 +-- src/RcppExports.cpp | 16 ++++----- src/httpuv.cpp | 68 +++++++++++++++++++++---------------- tests/testthat/test-utils.R | 6 ++++ 4 files changed, 54 insertions(+), 40 deletions(-) diff --git a/src/RcppExports-legacy.cpp b/src/RcppExports-legacy.cpp index e0a33505..92d5cda6 100644 --- a/src/RcppExports-legacy.cpp +++ b/src/RcppExports-legacy.cpp @@ -24,12 +24,12 @@ using namespace Rcpp; -Rcpp::CharacterVector decodeURIComponent(std::vector value); +Rcpp::CharacterVector decodeURIComponent(Rcpp::CharacterVector value); RcppExport SEXP httpuv_decodeURIComponent(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); + Rcpp::CharacterVector value(valueSEXP); 
rcpp_result_gen = Rcpp::wrap(decodeURIComponent(value)); return rcpp_result_gen; END_RCPP diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 8d3dfded..ab56eca5 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -149,45 +149,45 @@ BEGIN_RCPP END_RCPP } // encodeURI -std::vector encodeURI(std::vector value); +Rcpp::CharacterVector encodeURI(Rcpp::CharacterVector value); RcppExport SEXP _httpuv_encodeURI(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type value(valueSEXP); rcpp_result_gen = Rcpp::wrap(encodeURI(value)); return rcpp_result_gen; END_RCPP } // encodeURIComponent -std::vector encodeURIComponent(std::vector value); +Rcpp::CharacterVector encodeURIComponent(Rcpp::CharacterVector value); RcppExport SEXP _httpuv_encodeURIComponent(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type value(valueSEXP); rcpp_result_gen = Rcpp::wrap(encodeURIComponent(value)); return rcpp_result_gen; END_RCPP } // decodeURI -Rcpp::CharacterVector decodeURI(std::vector value); +Rcpp::CharacterVector decodeURI(Rcpp::CharacterVector value); RcppExport SEXP _httpuv_decodeURI(SEXP valueSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type value(valueSEXP); rcpp_result_gen = Rcpp::wrap(decodeURI(value)); return rcpp_result_gen; END_RCPP } // decodeURIComponent -Rcpp::CharacterVector decodeURIComponent(std::vector value); +Rcpp::CharacterVector decodeURIComponent(Rcpp::CharacterVector value); RcppExport SEXP _httpuv_decodeURIComponent(SEXP valueSEXP) { 
BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< std::vector >::type value(valueSEXP); + Rcpp::traits::input_parameter< Rcpp::CharacterVector >::type value(valueSEXP); rcpp_result_gen = Rcpp::wrap(decodeURIComponent(value)); return rcpp_result_gen; END_RCPP diff --git a/src/httpuv.cpp b/src/httpuv.cpp index 8e77350a..41070868 100644 --- a/src/httpuv.cpp +++ b/src/httpuv.cpp @@ -532,29 +532,35 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { //' //' @export // [[Rcpp::export]] -std::vector encodeURI(std::vector value) { - for (std::vector::iterator it = value.begin(); - it != value.end(); - it++) { +Rcpp::CharacterVector encodeURI(Rcpp::CharacterVector value) { + Rcpp::CharacterVector out(value.size()); - *it = doEncodeURI(*it, false); + for (int i = 0; i < value.size(); i++) { + if (value[i] == NA_STRING) { + out[i] = NA_STRING; + } else { + const char* s = doEncodeURI(Rcpp::as(value[i]), false).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); + } } - - return value; + return out; } //' @rdname encodeURI //' @export // [[Rcpp::export]] -std::vector encodeURIComponent(std::vector value) { - for (std::vector::iterator it = value.begin(); - it != value.end(); - it++) { +Rcpp::CharacterVector encodeURIComponent(Rcpp::CharacterVector value) { + Rcpp::CharacterVector out(value.size()); - *it = doEncodeURI(*it, true); + for (int i = 0; i < value.size(); i++) { + if (value[i] == NA_STRING) { + out[i] = NA_STRING; + } else { + const char* s = doEncodeURI(Rcpp::as(value[i]), true).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); + } } - - return value; + return out; } int hexToInt(char c) { @@ -620,15 +626,16 @@ std::string doDecodeURI(std::string value, bool component) { //' @rdname encodeURI //' @export // [[Rcpp::export]] -Rcpp::CharacterVector decodeURI(std::vector value) { +Rcpp::CharacterVector decodeURI(Rcpp::CharacterVector value) { Rcpp::CharacterVector out(value.size()); - int i = 0; - for 
(std::vector::iterator it = value.begin(); - it != value.end(); - it++, i++) - { - const char* s = doDecodeURI(*it, false).c_str(); - out[i] = Rf_mkCharCE(s, CE_UTF8); + + for (int i = 0; i < value.size(); i++) { + if (value[i] == NA_STRING) { + out[i] = NA_STRING; + } else { + const char* s = doDecodeURI(Rcpp::as(value[i]), false).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); + } } return out; @@ -637,15 +644,16 @@ Rcpp::CharacterVector decodeURI(std::vector value) { //' @rdname encodeURI //' @export // [[Rcpp::export]] -Rcpp::CharacterVector decodeURIComponent(std::vector value) { +Rcpp::CharacterVector decodeURIComponent(Rcpp::CharacterVector value) { Rcpp::CharacterVector out(value.size()); - int i = 0; - for (std::vector::iterator it = value.begin(); - it != value.end(); - it++, i++) - { - const char* s = doDecodeURI(*it, true).c_str(); - out[i] = Rf_mkCharCE(s, CE_UTF8); + + for (int i = 0; i < value.size(); i++) { + if (value[i] == NA_STRING) { + out[i] = NA_STRING; + } else { + const char* s = doDecodeURI(Rcpp::as(value[i]), true).c_str(); + out[i] = Rf_mkCharCE(s, CE_UTF8); + } } return out; diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 326467f9..c387d4af 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -50,6 +50,12 @@ test_that("encodeURI and encodeURIComponent", { decodeURIComponent(c(reserved_str_encoded, utf8_str_encoded)), c(reserved_str, utf8_str) ) + + # NA handling + expect_identical(encodeURI(NA_character_), NA_character_) + expect_identical(encodeURIComponent(NA_character_), NA_character_) + expect_identical(decodeURI(NA_character_), NA_character_) + expect_identical(decodeURIComponent(NA_character_), NA_character_) }) From 9e409e55f7498abc5129127892c188615e48e403 Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Wed, 6 Feb 2019 10:51:32 -0600 Subject: [PATCH 3/6] Pre-populate vectors with NA --- src/httpuv.cpp | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 
deletions(-) diff --git a/src/httpuv.cpp b/src/httpuv.cpp index 41070868..2c007345 100644 --- a/src/httpuv.cpp +++ b/src/httpuv.cpp @@ -533,12 +533,10 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { //' @export // [[Rcpp::export]] Rcpp::CharacterVector encodeURI(Rcpp::CharacterVector value) { - Rcpp::CharacterVector out(value.size()); + Rcpp::CharacterVector out(value.size(), NA_STRING); for (int i = 0; i < value.size(); i++) { - if (value[i] == NA_STRING) { - out[i] = NA_STRING; - } else { + if (value[i] != NA_STRING) { const char* s = doEncodeURI(Rcpp::as(value[i]), false).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } @@ -550,12 +548,10 @@ Rcpp::CharacterVector encodeURI(Rcpp::CharacterVector value) { //' @export // [[Rcpp::export]] Rcpp::CharacterVector encodeURIComponent(Rcpp::CharacterVector value) { - Rcpp::CharacterVector out(value.size()); + Rcpp::CharacterVector out(value.size(), NA_STRING); for (int i = 0; i < value.size(); i++) { - if (value[i] == NA_STRING) { - out[i] = NA_STRING; - } else { + if (value[i] != NA_STRING) { const char* s = doEncodeURI(Rcpp::as(value[i]), true).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } @@ -627,12 +623,10 @@ std::string doDecodeURI(std::string value, bool component) { //' @export // [[Rcpp::export]] Rcpp::CharacterVector decodeURI(Rcpp::CharacterVector value) { - Rcpp::CharacterVector out(value.size()); + Rcpp::CharacterVector out(value.size(), NA_STRING); for (int i = 0; i < value.size(); i++) { - if (value[i] == NA_STRING) { - out[i] = NA_STRING; - } else { + if (value[i] != NA_STRING) { const char* s = doDecodeURI(Rcpp::as(value[i]), false).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } @@ -645,12 +639,10 @@ Rcpp::CharacterVector decodeURI(Rcpp::CharacterVector value) { //' @export // [[Rcpp::export]] Rcpp::CharacterVector decodeURIComponent(Rcpp::CharacterVector value) { - Rcpp::CharacterVector out(value.size()); + Rcpp::CharacterVector out(value.size(), NA_STRING); for (int i = 0; i < value.size(); 
i++) { - if (value[i] == NA_STRING) { - out[i] = NA_STRING; - } else { + if (value[i] != NA_STRING) { const char* s = doDecodeURI(Rcpp::as(value[i]), true).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } From 4e9f8ec45373439b7cedf3d99c29364e0f564544 Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Wed, 6 Feb 2019 11:56:33 -0600 Subject: [PATCH 4/6] Convert to UTF-8 before URL-encoding --- R/RcppExports.R | 6 ++---- man/encodeURI.Rd | 6 ++---- src/httpuv.cpp | 12 ++++++------ tests/testthat/test-utils.R | 13 +++++++++++++ 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index b99ff75e..26a69a39 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -65,10 +65,8 @@ base64encode <- function(x) { #' encoded sequences that decode to a reserved character. (If in doubt, use #' decodeURIComponent.) #' -#' The only way these functions differ from web browsers is in the encoding of -#' non-ASCII characters. All non-ASCII characters will be escaped byte-by-byte. -#' If conformant non-ASCII behavior is important, ensure that your input vector -#' is UTF-8 encoded before calling encodeURI or encodeURIComponent. +#' For \code{encodeURI} and \code{encodeURIComponent}, input strings will be +#' converted to UTF-8 before URL-encoding. #' #' @param value Character vector to be encoded or decoded. #' @return Encoded or decoded character vector of the same length as the diff --git a/man/encodeURI.Rd b/man/encodeURI.Rd index 3ef7821c..6cd8b27d 100644 --- a/man/encodeURI.Rd +++ b/man/encodeURI.Rd @@ -43,8 +43,6 @@ decodeURI differs from decodeURIComponent in that it will refuse to decode encoded sequences that decode to a reserved character. (If in doubt, use decodeURIComponent.) -The only way these functions differ from web browsers is in the encoding of -non-ASCII characters. All non-ASCII characters will be escaped byte-by-byte. 
-If conformant non-ASCII behavior is important, ensure that your input vector -is UTF-8 encoded before calling encodeURI or encodeURIComponent. +For \code{encodeURI} and \code{encodeURIComponent}, input strings will be +converted to UTF-8 before URL-encoding. } diff --git a/src/httpuv.cpp b/src/httpuv.cpp index 2c007345..fb0762d0 100644 --- a/src/httpuv.cpp +++ b/src/httpuv.cpp @@ -520,10 +520,8 @@ std::string doEncodeURI(std::string value, bool encodeReserved) { //' encoded sequences that decode to a reserved character. (If in doubt, use //' decodeURIComponent.) //' -//' The only way these functions differ from web browsers is in the encoding of -//' non-ASCII characters. All non-ASCII characters will be escaped byte-by-byte. -//' If conformant non-ASCII behavior is important, ensure that your input vector -//' is UTF-8 encoded before calling encodeURI or encodeURIComponent. +//' For \code{encodeURI} and \code{encodeURIComponent}, input strings will be +//' converted to UTF-8 before URL-encoding. //' //' @param value Character vector to be encoded or decoded. 
//' @return Encoded or decoded character vector of the same length as the @@ -537,7 +535,8 @@ Rcpp::CharacterVector encodeURI(Rcpp::CharacterVector value) { for (int i = 0; i < value.size(); i++) { if (value[i] != NA_STRING) { - const char* s = doEncodeURI(Rcpp::as(value[i]), false).c_str(); + const char* s = Rf_translateCharUTF8(value[i]); + s = doEncodeURI(s, false).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } } @@ -552,7 +551,8 @@ Rcpp::CharacterVector encodeURIComponent(Rcpp::CharacterVector value) { for (int i = 0; i < value.size(); i++) { if (value[i] != NA_STRING) { - const char* s = doEncodeURI(Rcpp::as(value[i]), true).c_str(); + const char* s = Rf_translateCharUTF8(value[i]); + s = doEncodeURI(s, true).c_str(); out[i] = Rf_mkCharCE(s, CE_UTF8); } } diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index c387d4af..16722cf2 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -56,6 +56,19 @@ test_that("encodeURI and encodeURIComponent", { expect_identical(encodeURIComponent(NA_character_), NA_character_) expect_identical(decodeURI(NA_character_), NA_character_) expect_identical(decodeURIComponent(NA_character_), NA_character_) + + # Strings that are not UTF-8 encoded should be automatically converted to + # UTF-8 before URL-encoding. + # + # "å", in UTF-8. The previous string, with Chinese characters, can't be + # converted to latin1. 
+ utf8_str <- "\ue5" + latin1_str <- iconv(utf8_str, "UTF-8", "latin1") + + expect_identical(encodeURI(utf8_str), "%C3%A5") + expect_identical(encodeURI(latin1_str), "%C3%A5") + expect_identical(encodeURIComponent(utf8_str), "%C3%A5") + expect_identical(encodeURIComponent(latin1_str), "%C3%A5") }) From 41cce877b28d6c22754563ab1e6777ec44cb3089 Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Wed, 6 Feb 2019 11:57:34 -0600 Subject: [PATCH 5/6] Update NEWS --- NEWS.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index e103a444..cbc9f0da 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,11 @@ httpuv 1.4.5.9002 * Fixed [#168](https://github.com/rstudio/httpuv/issues/168): A SIGPIPE signal on the httpuv background thread could cause the process to quit. This can happen in some instances when the server is under heavy load. ([#169](https://github.com/rstudio/httpuv/pull/169)) -* Fixed [#122](https://github.com/rstudio/httpuv/issues/122): `decodeURI()` and `decodeURIComponent()` previously returned strings encoded with the system's native encoding; they now return UTF-8 encoded strings. ([#185](https://github.com/rstudio/httpuv/pull/185)) +* Fixed [#122](https://github.com/rstudio/httpuv/issues/122): `decodeURI()` and `decodeURIComponent()` previously returned strings encoded with the system's native encoding; they now return UTF-8 encoded strings. ([#185](https://github.com/rstudio/httpuv/pull/185), [#192](https://github.com/rstudio/httpuv/pull/192)) + +* `encodeURI()` and `encodeURIComponent()` now convert their inputs to UTF-8 before URL-encoding. ([#192](https://github.com/rstudio/httpuv/pull/192)) + +* `encodeURI()`, `encodeURIComponent()`, `decodeURI()`, and `decodeURIComponent()` now handle `NA`s correctly. ([#192](https://github.com/rstudio/httpuv/pull/192)) * `service()` now executes a single `later` callback, rather than all eligible callbacks.
This gives callers more opportunities to perform their own housekeeping when multiple expensive callbacks queue up. ([#176](https://github.com/rstudio/httpuv/pull/176)) From 6a72dd934f8e8e8238d1e16b233a76bbf718c192 Mon Sep 17 00:00:00 2001 From: Winston Chang Date: Fri, 8 Feb 2019 13:12:00 -0600 Subject: [PATCH 6/6] Bump version to 1.4.5.9003 --- DESCRIPTION | 2 +- NEWS.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dbfbdcb2..fa1cac72 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: httpuv Type: Package Title: HTTP and WebSocket Server Library -Version: 1.4.5.9002 +Version: 1.4.5.9003 Author: Joe Cheng, Hector Corrada Bravo [ctb], Jeroen Ooms [ctb], Winston Chang [ctb] Copyright: RStudio, Inc.; Joyent, Inc.; Nginx Inc.; Igor Sysoev; Niels Provos; diff --git a/NEWS.md b/NEWS.md index cbc9f0da..5f14c392 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -httpuv 1.4.5.9002 +httpuv 1.4.5.9003 ============ * Added support for serving static files from the background I/O thread. Files can now be served from the filesystem without involving the main R thread, which means that these operations won't block or be blocked by code that runs in the main R thread. ([#177](https://github.com/rstudio/httpuv/pull/177))