Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

libxml2 parsing options #88

Merged
merged 8 commits into from
May 19, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# xml2 0.1.2.9000

* `xml_read()` gains a `options` argument to control all available parsing
options, including `HUGE` to turn off limits for parsing very large
documents and now drops blank text nodes by default, mimicking default
behavior of XML package. (@jimhester, #49, #62, #85, #88)

* `xml_write()` expands the path on filenames, so directories can be specified
with '~/' (@jimhester, #86, #80)

Expand Down
8 changes: 4 additions & 4 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ read_connection_ <- function(con, chunk_size = 64 * 1024L) {
.Call('xml2_read_connection_', PACKAGE = 'xml2', con, chunk_size)
}

doc_parse_file <- function(path, encoding = "", as_html = FALSE) {
.Call('xml2_doc_parse_file', PACKAGE = 'xml2', path, encoding, as_html)
doc_parse_file <- function(path, encoding = "", as_html = FALSE, options = 0L) {
.Call('xml2_doc_parse_file', PACKAGE = 'xml2', path, encoding, as_html, options)
}

doc_parse_raw <- function(x, encoding, base_url = "", as_html = FALSE) {
.Call('xml2_doc_parse_raw', PACKAGE = 'xml2', x, encoding, base_url, as_html)
doc_parse_raw <- function(x, encoding, base_url = "", as_html = FALSE, options = 0L) {
.Call('xml2_doc_parse_raw', PACKAGE = 'xml2', x, encoding, base_url, as_html, options)
}

doc_format <- function(x) {
Expand Down
98 changes: 87 additions & 11 deletions R/xml_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,33 @@
#' iteration. Defaults to 64kb.
#' @param verbose When reading from a slow connection, this prints some
#' output on every iteration so you know its working.
#' @param options Set parsing options for the libxml2 parser. These are
#' specified as a character vector of options to set. Available values are
#' \describe{
#' \item{RECOVER}{recover on errors}
#' \item{NOENT}{substitute entities}
#' \item{DTDLOAD}{load the external subset}
#' \item{DTDATTR}{default DTD attributes}
#' \item{DTDVALID}{validate with the DTD}
#' \item{NOERROR}{suppress error reports}
#' \item{NOWARNING}{suppress warning reports}
#' \item{PEDANTIC}{pedantic error reporting}
#' \item{NOBLANKS}{remove blank nodes}
#' \item{SAX1}{use the SAX1 interface internally}
#' \item{XINCLUDE}{Implement XInclude substitition}
#' \item{NONET}{Forbid network access}
#' \item{NODICT}{Do not reuse the context dictionary}
#' \item{NSCLEAN}{remove redundant namespaces declarations}
#' \item{NOCDATA}{merge CDATA as text nodes}
#' \item{NOXINCNODE}{do not generate XINCLUDE START/END nodes}
#' \item{COMPACT}{compact small text nodes; no modification of the tree allowed afterwards (will possibly crash if you try to modify the tree)}
#' \item{OLD10}{parse using XML-1.0 before update 5}
#' \item{NOBASEFIX}{do not fixup XINCLUDE xml:base uris}
#' \item{HUGE}{relax any hardcoded limit from the parser}
#' \item{OLDSAX}{parse using SAX2 interface before 2.7.0}
#' \item{IGNORE_ENC}{ignore internal document encoding hint}
#' \item{BIG_LINES}{Store big lines numbers in text PSVI field}
#' }
#' @return An XML document. HTML is normalised to valid XML - this may not
#' be exactly the same transformation performed by the browser, but it's
#' a reasonable approximation.
Expand All @@ -38,28 +65,32 @@
#' # From a url
#' cd <- read_xml("http://www.xmlfiles.com/examples/cd_catalog.xml")
#' me <- read_html("http://had.co.nz")
read_xml <- function(x, encoding = "", ..., as_html = FALSE) {
read_xml <- function(x, encoding = "", ..., as_html = FALSE, options = "NOBLANKS") {
UseMethod("read_xml")
}

#' @export
#' @rdname read_xml
read_html <- function(x, encoding = "", ...) {
suppressWarnings(read_xml(x, encoding, ..., as_html = TRUE))
read_html <- function(x, encoding = "", ..., options = c("RECOVER", "NOERROR", "NOBLANKS")) {
suppressWarnings(read_xml(x, encoding, ..., as_html = TRUE, options = options))
}

#' @export
#' @rdname read_xml
read_xml.character <- function(x, encoding = "", ..., as_html = FALSE) {
read_xml.character <- function(x, encoding = "", ..., as_html = FALSE,
options = "NOBLANKS") {

options <- parse_options(options)
if (grepl("<|>", x)) {
read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html)
read_xml.raw(charToRaw(enc2utf8(x)), "UTF-8", ..., as_html = as_html, options = options)
} else {
con <- path_to_connection(x)
if (inherits(con, "connection")) {
read_xml.connection(con, encoding = encoding, ..., as_html = as_html,
base_url = x)
base_url = x, options = options)
} else {
doc <- doc_parse_file(con, encoding = encoding, as_html = as_html)
doc <- doc_parse_file(con, encoding = encoding, as_html = as_html,
options = options)
xml_document(doc)
}
}
Expand All @@ -68,21 +99,66 @@ read_xml.character <- function(x, encoding = "", ..., as_html = FALSE) {
#' @export
#' @rdname read_xml
read_xml.raw <- function(x, encoding = "", base_url = "", ...,
as_html = FALSE) {
doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url, as_html = as_html)
as_html = FALSE, options = "NOBLANKS") {
options <- parse_options(options)

doc <- doc_parse_raw(x, encoding = encoding, base_url = base_url,
as_html = as_html, options = options)
xml_document(doc)
}

#' @export
#' @rdname read_xml
read_xml.connection <- function(x, encoding = "", n = 64 * 1024,
verbose = FALSE, ..., base_url = "",
as_html = FALSE) {
as_html = FALSE, options = "NOBLANKS") {
if (!isOpen(x)) {
open(x, "rb")
on.exit(close(x))
}

raw <- read_connection_(x, n)
read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html = as_html)
read_xml.raw(raw, encoding = encoding, base_url = base_url, as_html =
as_html, options = options)
}

`%<<%` <- function(a, n) bitwShiftL(a, n)

# http://xmlsoft.org/html/libxml-parser.html#xmlParserOption
parser_options <- c(
"RECOVER" = 1 %<<% 0,
"NOENT" = 1 %<<% 1,
"DTDLOAD" = 1 %<<% 2,
"DTDATTR" = 1 %<<% 3,
"DTDVALID" = 1 %<<% 4,
"NOERROR" = 1 %<<% 5,
"NOWARNING" = 1 %<<% 6,
"PEDANTIC" = 1 %<<% 7,
"NOBLANKS" = 1 %<<% 8,
"SAX1" = 1 %<<% 9,
"XINCLUDE" = 1 %<<% 10,
"NONET" = 1 %<<% 11,
"NODICT" = 1 %<<% 12,
"NSCLEAN" = 1 %<<% 13,
"NOCDATA" = 1 %<<% 14,
"NOXINCNODE" = 1 %<<% 15,
"COMPACT" = 1 %<<% 16,
"OLD10" = 1 %<<% 17,
"NOBASEFIX" = 1 %<<% 18,
"HUGE" = 1 %<<% 19,
"OLDSAX" = 1 %<<% 20,
"OLDSAX" = 1 %<<% 20,
"IGNORE_ENC" = 1 %<<% 21,
"BIG_LINES" = 1 %<<% 22)

parse_options <- function(options) {
if (is.numeric(options)) {
return(options)
}
mtch <- pmatch(options, names(parser_options))
if (any(is.na(mtch))) {
stop("`options` ", options[is.na(mtch)][1L], " is not a valid option", call. = FALSE)
}

sum(parser_options[mtch])
}
41 changes: 36 additions & 5 deletions man/read_xml.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 8 additions & 6 deletions src/RcppExports.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,29 +19,31 @@ BEGIN_RCPP
END_RCPP
}
// doc_parse_file
XPtrDoc doc_parse_file(std::string path, std::string encoding, bool as_html);
RcppExport SEXP xml2_doc_parse_file(SEXP pathSEXP, SEXP encodingSEXP, SEXP as_htmlSEXP) {
XPtrDoc doc_parse_file(std::string path, std::string encoding, bool as_html, int options);
RcppExport SEXP xml2_doc_parse_file(SEXP pathSEXP, SEXP encodingSEXP, SEXP as_htmlSEXP, SEXP optionsSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< std::string >::type path(pathSEXP);
Rcpp::traits::input_parameter< std::string >::type encoding(encodingSEXP);
Rcpp::traits::input_parameter< bool >::type as_html(as_htmlSEXP);
__result = Rcpp::wrap(doc_parse_file(path, encoding, as_html));
Rcpp::traits::input_parameter< int >::type options(optionsSEXP);
__result = Rcpp::wrap(doc_parse_file(path, encoding, as_html, options));
return __result;
END_RCPP
}
// doc_parse_raw
XPtrDoc doc_parse_raw(RawVector x, std::string encoding, std::string base_url, bool as_html);
RcppExport SEXP xml2_doc_parse_raw(SEXP xSEXP, SEXP encodingSEXP, SEXP base_urlSEXP, SEXP as_htmlSEXP) {
XPtrDoc doc_parse_raw(RawVector x, std::string encoding, std::string base_url, bool as_html, int options);
RcppExport SEXP xml2_doc_parse_raw(SEXP xSEXP, SEXP encodingSEXP, SEXP base_urlSEXP, SEXP as_htmlSEXP, SEXP optionsSEXP) {
BEGIN_RCPP
Rcpp::RObject __result;
Rcpp::RNGScope __rngScope;
Rcpp::traits::input_parameter< RawVector >::type x(xSEXP);
Rcpp::traits::input_parameter< std::string >::type encoding(encodingSEXP);
Rcpp::traits::input_parameter< std::string >::type base_url(base_urlSEXP);
Rcpp::traits::input_parameter< bool >::type as_html(as_htmlSEXP);
__result = Rcpp::wrap(doc_parse_raw(x, encoding, base_url, as_html));
Rcpp::traits::input_parameter< int >::type options(optionsSEXP);
__result = Rcpp::wrap(doc_parse_raw(x, encoding, base_url, as_html, options));
return __result;
END_RCPP
}
Expand Down
14 changes: 8 additions & 6 deletions src/xml2_doc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,20 @@ using namespace Rcpp;
// [[Rcpp::export]]
XPtrDoc doc_parse_file(std::string path,
std::string encoding = "",
bool as_html = false) {
bool as_html = false,
int options = 0) {
xmlDoc* pDoc;
if (as_html) {
pDoc = htmlReadFile(
path.c_str(),
encoding == "" ? NULL : encoding.c_str(),
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
options
);
} else {
pDoc = xmlReadFile(
path.c_str(),
encoding == "" ? NULL : encoding.c_str(),
0
options
);
}

Expand All @@ -34,23 +35,24 @@ XPtrDoc doc_parse_file(std::string path,
// [[Rcpp::export]]
XPtrDoc doc_parse_raw(RawVector x, std::string encoding,
std::string base_url = "",
bool as_html = false) {
bool as_html = false,
int options = 0) {
xmlDoc* pDoc;
if (as_html) {
pDoc = htmlReadMemory(
(const char *) RAW(x),
Rf_length(x),
base_url == "" ? NULL : base_url.c_str(),
encoding == "" ? NULL : encoding.c_str(),
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR
options
);
} else {
pDoc = xmlReadMemory(
(const char *) RAW(x),
Rf_length(x),
base_url == "" ? NULL : base_url.c_str(),
encoding == "" ? NULL : encoding.c_str(),
0
options
);
}

Expand Down
Binary file added tests/testthat/cd_catalog.xml.bz2
Binary file not shown.
22 changes: 22 additions & 0 deletions tests/testthat/test-read-xml.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,25 @@ test_that("read_html correctly parses malformed document", {
lego <- read_html("lego.html.bz2")
expect_equal(length(xml_find_all(lego, ".//p")), 39)
})

test_that("parse_options errors when given an invalid option", {
expect_error(parse_options("INVALID"),
"`options` INVALID is not a valid option")

expect_error(read_html("lego.html.bz2", options = "INVALID"),
"`options` INVALID is not a valid option")
})

test_that("read_html properly passes parser arguments", {

skip_if_not(libxml2_version() >= "2.9.2")

blanks <- read_html("cd_catalog.xml.bz2", options = c("RECOVER", "NOERROR"))
expect_equal(as_list(blanks)$body$catalog$cd[[1]],
"\r\n ")

no_blanks <- read_html("cd_catalog.xml.bz2", options = c("RECOVER", "NOERROR", "NOBLANKS"))

expect_equal(as_list(no_blanks)$body$catalog$cd[[1]],
list("Empire Burlesque"))
})
2 changes: 1 addition & 1 deletion tests/testthat/test-xml_find.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ test_that("xml_find_chr errors with non character results", {
expect_error(xml_find_chr(x, "1+1"), "result of type:.*numeric.*, not character")
})

test_that("xml_find_num returns a numeric result", {
test_that("xml_find_chr returns a character result", {
x <- read_xml("<x><y>1</y><y/></x>")
expect_equal(xml_find_chr(x, "string(5)"), "5")

Expand Down