r-lib · jimhester · May 19, 2016 · Oct 27, 2015 · Oct 27, 2015 · May 17, 2016
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # xml2 0.1.2.9000
 
+* `xml_read()` gains a `options` argument to control all available parsing
+  options, including `HUGE` to turn off limits for parsing very large
+  documents and now drops blank text nodes by default, mimicking default
+  behavior of XML package. (@jimhester, #49, #62, #85, #88)
+
 * `xml_write()` expands the path on filenames, so directories can be specified
   with '~/' (@jimhester, #86, #80)
 

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,12 +5,12 @@ read_connection_ <- function(con, chunk_size = 64 * 1024L) {
     .Call('xml2_read_connection_', PACKAGE = 'xml2', con, chunk_size)
 }
 
-doc_parse_file <- function(path, encoding = "", as_html = FALSE) {
-    .Call('xml2_doc_parse_file', PACKAGE = 'xml2', path, encoding, as_html)
+doc_parse_file <- function(path, encoding = "", as_html = FALSE, options = 0L) {
+    .Call('xml2_doc_parse_file', PACKAGE = 'xml2', path, encoding, as_html, options)
 }
 
-doc_parse_raw <- function(x, encoding, base_url = "", as_html = FALSE) {
-    .Call('xml2_doc_parse_raw', PACKAGE = 'xml2', x, encoding, base_url, as_html)
+doc_parse_raw <- function(x, encoding, base_url = "", as_html = FALSE, options = 0L) {
+    .Call('xml2_doc_parse_raw', PACKAGE = 'xml2', x, encoding, base_url, as_html, options)
 }
 
 doc_format <- function(x) {

diff --git a/R/xml_parse.R b/R/xml_parse.R
@@ -22,6 +22,33 @@
 #'    iteration. Defaults to 64kb.
 #' @param verbose When reading from a slow connection, this prints some
 #'    output on every iteration so you know its working.
+#' @param options Set parsing options for the libxml2 parser. These are
+#' specified as a character vector of options to set. Available values are
+#' \describe{
+#'   \item{RECOVER}{recover on errors}
+#'   \item{NOENT}{substitute entities}
+#'   \item{DTDLOAD}{load the external subset}
+#'   \item{DTDATTR}{default DTD attributes}
+#'   \item{DTDVALID}{validate with the DTD}
+#'   \item{NOERROR}{suppress error reports}
+#'   \item{NOWARNING}{suppress warning reports}
+#'   \item{PEDANTIC}{pedantic error reporting}
+#'   \item{NOBLANKS}{remove blank nodes}
+#'   \item{SAX1}{use the SAX1 interface internally}
+#'   \item{XINCLUDE}{Implement XInclude substitition}
+#'   \item{NONET}{Forbid network access}
+#'   \item{NODICT}{Do not reuse the context dictionary}
+#'   \item{NSCLEAN}{remove redundant namespaces declarations}
+#'   \item{NOCDATA}{merge CDATA as text nodes}
+#'   \item{NOXINCNODE}{do not generate XINCLUDE START/END nodes}
+#'   \item{COMPACT}{compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)}
+#'   \item{OLD10}{parse using XML-1.0 before update 5}
+#'   \item{NOBASEFIX}{do not fixup XINCLUDE xml:base uris}
+#'   \item{HUGE}{relax any hardcoded limit from the parser}
+#'   \item{OLDSAX}{parse using SAX2 interface before 2.7.0}
+#'   \item{IGNORE_ENC}{ignore internal document encoding hint}
+#'   \item{BIG_LINES}{Store big lines numbers in text PSVI field}
+#' }
 #' @return An XML document. HTML is normalised to valid XML - this may not
 #'   be exactly the same transformation performed by the browser, but it's
 #'   a reasonable approximation.
@@ -38,28 +65,32 @@
 #' # From a url
 #' cd <- read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml")
 #' me <- read_html("http://had.co.nz")
-read_xml <- function(x, encoding = "", ..., as_html = FALSE) {
+read_xml <- function(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") {
   UseMethod("read_xml")
 }
 
 #' @export
 #' @rdname read_xml
-read_html <- function(x, encoding = "", ...) {
-  suppressWarnings(read_xml(x, encoding, ..., as_html = TRUE))
+read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
+  suppressWarnings(read_xml(x, encoding, ..., as_html = TRUE, options = options))
 }
 
 #' @export
 #' @rdname read_xml
-read_xml.character <- function(x, encoding = "", ..., as_html = FALSE) {
+read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
+                               options = "NOBLANKS") {
+
+  options <- parse_options(options)
   if (grepl("<|>", x)) {
-    read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html)
+    read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html, options = options)
   } else {
     con <- path_to_connection(x)
     if (inherits(con, "connection")) {
       read_xml.connection(con, encoding = encoding, ..., as_html = as_html,
-        base_url = x)
+        base_url = x, options = options)
     } else {
-      doc <- doc_parse_file(con, encoding = encoding, as_html = as_html)
+      doc <- doc_parse_file(con, encoding = encoding, as_html = as_html,
+        options = options)
       xml_document(doc)
     }
   }
@@ -68,21 +99,66 @@ read_xml.character <- function(x, encoding = "", ..., as_html = FALSE) {
 #' @export
 #' @rdname read_xml
 read_xml.raw <- function(x, encoding = "", base_url = "", ...,
-                         as_html = FALSE) {
-  doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url, as_html = as_html)
+                         as_html = FALSE, options = "NOBLANKS") {
+  options <- parse_options(options)
+
+  doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url,
+    as_html = as_html, options = options)
   xml_document(doc)
 }
 
 #' @export
 #' @rdname read_xml
 read_xml.connection <- function(x, encoding = "", n = 64 * 1024,
                                 verbose = FALSE, ..., base_url = "",
-                                as_html = FALSE) {
+                                as_html = FALSE, options = "NOBLANKS") {
   if (!isOpen(x)) {
     open(x, "rb")
     on.exit(close(x))
   }
 
   raw <- read_connection_(x, n)
-  read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html = as_html)
+  read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html =
+    as_html, options = options)
+}
+
+`%<<%` <- function(a, n) bitwShiftL(a, n)
+
+# http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
+parser_options <- c(
+  "RECOVER" = 1 %<<% 0,
+  "NOENT" = 1 %<<% 1,
+  "DTDLOAD" = 1 %<<% 2,
+  "DTDATTR" = 1 %<<% 3,
+  "DTDVALID" = 1 %<<% 4,
+  "NOERROR" = 1 %<<% 5,
+  "NOWARNING" = 1 %<<% 6,
+  "PEDANTIC" = 1 %<<% 7,
+  "NOBLANKS" = 1 %<<% 8,
+  "SAX1" = 1 %<<% 9,
+  "XINCLUDE" = 1 %<<% 10,
+  "NONET" = 1 %<<% 11,
+  "NODICT" = 1 %<<% 12,
+  "NSCLEAN" = 1 %<<% 13,
+  "NOCDATA" = 1 %<<% 14,
+  "NOXINCNODE" = 1 %<<% 15,
+  "COMPACT" = 1 %<<% 16,
+  "OLD10" = 1 %<<% 17,
+  "NOBASEFIX" = 1 %<<% 18,
+  "HUGE" = 1 %<<% 19,
+  "OLDSAX" = 1 %<<% 20,
+  "OLDSAX" = 1 %<<% 20,
+  "IGNORE_ENC" = 1 %<<% 21,
+  "BIG_LINES" = 1 %<<% 22)
+
+parse_options <- function(options) {
+  if (is.numeric(options)) {
+     return(options)
+  }
+  mtch <- pmatch(options, names(parser_options))
+  if (any(is.na(mtch))) {
+    stop("`options` ", options[is.na(mtch)][1L], " is not a valid option", call. = FALSE)
+  }
+
+  sum(parser_options[mtch])
 }
diff --git a/man/read_xml.Rd b/man/read_xml.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -19,29 +19,31 @@ BEGIN_RCPP
 END_RCPP
 }
 // doc_parse_file
-XPtrDoc doc_parse_file(std::string path, std::string encoding, bool as_html);
-RcppExport SEXP xml2_doc_parse_file(SEXP pathSEXP, SEXP encodingSEXP, SEXP as_htmlSEXP) {
+XPtrDoc doc_parse_file(std::string path, std::string encoding, bool as_html, int options);
+RcppExport SEXP xml2_doc_parse_file(SEXP pathSEXP, SEXP encodingSEXP, SEXP as_htmlSEXP, SEXP optionsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
     Rcpp::traits::input_parameter< std::string >::type encoding(encodingSEXP);
     Rcpp::traits::input_parameter< bool >::type as_html(as_htmlSEXP);
-    __result = Rcpp::wrap(doc_parse_file(path, encoding, as_html));
+    Rcpp::traits::input_parameter< int >::type options(optionsSEXP);
+    __result = Rcpp::wrap(doc_parse_file(path, encoding, as_html, options));
     return __result;
 END_RCPP
 }
 // doc_parse_raw
-XPtrDoc doc_parse_raw(RawVector x, std::string encoding, std::string base_url, bool as_html);
-RcppExport SEXP xml2_doc_parse_raw(SEXP xSEXP, SEXP encodingSEXP, SEXP base_urlSEXP, SEXP as_htmlSEXP) {
+XPtrDoc doc_parse_raw(RawVector x, std::string encoding, std::string base_url, bool as_html, int options);
+RcppExport SEXP xml2_doc_parse_raw(SEXP xSEXP, SEXP encodingSEXP, SEXP base_urlSEXP, SEXP as_htmlSEXP, SEXP optionsSEXP) {
 BEGIN_RCPP
     Rcpp::RObject __result;
     Rcpp::RNGScope __rngScope;
     Rcpp::traits::input_parameter< RawVector >::type x(xSEXP);
     Rcpp::traits::input_parameter< std::string >::type encoding(encodingSEXP);
     Rcpp::traits::input_parameter< std::string >::type base_url(base_urlSEXP);
     Rcpp::traits::input_parameter< bool >::type as_html(as_htmlSEXP);
-    __result = Rcpp::wrap(doc_parse_raw(x, encoding, base_url, as_html));
+    Rcpp::traits::input_parameter< int >::type options(optionsSEXP);
+    __result = Rcpp::wrap(doc_parse_raw(x, encoding, base_url, as_html, options));
     return __result;
 END_RCPP
 }

diff --git a/src/xml2_doc.cpp b/src/xml2_doc.cpp
@@ -9,19 +9,20 @@ using namespace Rcpp;
 // [[Rcpp::export]]
 XPtrDoc doc_parse_file(std::string path,
                             std::string encoding = "",
-                            bool as_html = false) {
+                            bool as_html = false,
+                            int options = 0) {
   xmlDoc* pDoc;
   if (as_html) {
     pDoc = htmlReadFile(
       path.c_str(),
       encoding == "" ? NULL : encoding.c_str(),
-      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
+      options
     );
   } else {
     pDoc = xmlReadFile(
       path.c_str(),
       encoding == "" ? NULL : encoding.c_str(),
-      0
+      options
     );
   }
 
@@ -34,23 +35,24 @@ XPtrDoc doc_parse_file(std::string path,
 // [[Rcpp::export]]
 XPtrDoc doc_parse_raw(RawVector x, std::string encoding,
                       std::string base_url = "",
-                      bool as_html = false) {
+                      bool as_html = false,
+                      int options = 0) {
   xmlDoc* pDoc;
   if (as_html) {
     pDoc = htmlReadMemory(
       (const char *) RAW(x),
       Rf_length(x),
       base_url == "" ? NULL : base_url.c_str(),
       encoding == "" ? NULL : encoding.c_str(),
-      HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
+      options
     );
   } else {
     pDoc = xmlReadMemory(
       (const char *) RAW(x),
       Rf_length(x),
       base_url == "" ? NULL : base_url.c_str(),
       encoding == "" ? NULL : encoding.c_str(),
-      0
+      options
     );
   }
 

diff --git a/tests/testthat/cd_catalog.xml.bz2 b/tests/testthat/cd_catalog.xml.bz2
diff --git a/tests/testthat/test-read-xml.R b/tests/testthat/test-read-xml.R
@@ -4,3 +4,25 @@ test_that("read_html correctly parses malformed document", {
   lego <- read_html("lego.html.bz2")
   expect_equal(length(xml_find_all(lego, ".//p")), 39)
 })
+
+test_that("parse_options errors when given an invalid option", {
+  expect_error(parse_options("INVALID"),
+    "`options` INVALID is not a valid option")
+
+  expect_error(read_html("lego.html.bz2", options = "INVALID"),
+    "`options` INVALID is not a valid option")
+})
+
+test_that("read_html properly passes parser arguments", {
+
+  skip_if_not(libxml2_version() >= "2.9.2")
+
+  blanks <- read_html("cd_catalog.xml.bz2", options = c("RECOVER", "NOERROR"))
+  expect_equal(as_list(blanks)$body$catalog$cd[[1]],
+    "\r\n    ")
+
+  no_blanks <- read_html("cd_catalog.xml.bz2", options = c("RECOVER", "NOERROR", "NOBLANKS"))
+
+  expect_equal(as_list(no_blanks)$body$catalog$cd[[1]],
+    list("Empire Burlesque"))
+})
diff --git a/tests/testthat/test-xml_find.R b/tests/testthat/test-xml_find.R
@@ -72,7 +72,7 @@ test_that("xml_find_chr errors with non character results", {
   expect_error(xml_find_chr(x, "1+1"), "result of type:.*numeric.*, not character")
 })
 
-test_that("xml_find_num returns a numeric result", {
+test_that("xml_find_chr returns a character result", {
   x <- read_xml("<x><y>1</y><y/></x>")
   expect_equal(xml_find_chr(x, "string(5)"), "5")