Skip to content

Commit

Permalink
Support files with mixed line endings in vroom and vroom_lines
Browse files Browse the repository at this point in the history
  • Loading branch information
jimhester committed May 17, 2021
1 parent 649282f commit 2b94f88
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 32 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# vroom (development version)

* `vroom()` and `vroom_lines()` now handle files with mixed windows and POSIX line endings (https://github.com/tidyverse/readr/issues/1210)

* `vroom()` now outputs a tibble with the expected number of columns and types based on `col_types` and `col_names` even if the file is empty (#297).

* `vroom_write(path=)` has been deprecated, in favor of `file`, to match readr.
Expand Down
12 changes: 9 additions & 3 deletions src/delimited_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,9 +101,6 @@ delimited_index::delimited_index(
size_t guessed_rows =
one_row_size > 0 ? (file_size - first_nl) / one_row_size * 1.1 : 0;

// Check for windows newlines
windows_newlines_ = first_nl > 0 && mmap_[first_nl - 1] == '\r';

std::unique_ptr<multi_progress> pb = nullptr;

if (progress_) {
Expand Down Expand Up @@ -351,6 +348,15 @@ delimited_index::get_trimmed_val(size_t i, bool is_first, bool is_last) const {
const char* begin = mmap_.data() + begin_p;
const char* end = mmap_.data() + end_p;

// Check for windows newlines if the last column */
if (is_last) {
if (begin < end) {
if (*(end - 1) == '\r') {
--end;
}
}
}

if (trim_ws_) {
trim_whitespace(begin, end);
}
Expand Down
27 changes: 7 additions & 20 deletions src/delimited_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ class delimited_index : public index,
bool trim_ws_;
bool escape_double_;
bool escape_backslash_;
bool windows_newlines_;
size_t skip_;
const char* comment_;
size_t rows_;
Expand Down Expand Up @@ -264,7 +263,7 @@ class delimited_index : public index,
case QUOTED_FIELD:
return QUOTED_FIELD;
case QUOTED_END:
throw std::runtime_error("invalid 5");
return QUOTED_END;
}
throw std::runtime_error("should never happen");
}
Expand All @@ -285,7 +284,7 @@ class delimited_index : public index,
errors->add_parse_error(pos, cols);
// Add additional columns if there are too few
while (cols < num_cols - 1) {
destination.push_back(pos - windows_newlines_);
destination.push_back(pos);
++cols;
}
}
Expand Down Expand Up @@ -323,8 +322,8 @@ class delimited_index : public index,
const size_t update_size) {

// If there are no quotes quote will be '\0', so will just work
std::array<char, 7> query = {delim[0], '\n', '\r', '\\', '\0', '\0', '\0'};
auto query_i = 4;
std::array<char, 7> query = {delim[0], '\n', '\\', '\0', '\0', '\0'};
auto query_i = 3;
if (quote != '\0') {
query[query_i++] = quote;
}
Expand All @@ -342,13 +341,6 @@ class delimited_index : public index,
size_t pos = start;
size_t lines_read = 0;

// When reading from connections it is possible that a buffer ends on '\r',
// but has not yet skipped over the next '\n', so we check for this and do
// so here.
if (windows_newlines_ && state == RECORD_START && buf[pos] == '\n') {
++pos;
}

while (pos < end && lines_read < n_max) {
auto c = buf[pos];

Expand Down Expand Up @@ -379,6 +371,7 @@ class delimited_index : public index,
}

if (state == RECORD_START) {
// REprintf("RS: %i\n", pos);
destination.push_back(pos + file_offset);
}

Expand All @@ -388,10 +381,7 @@ class delimited_index : public index,
++cols;
}

else if (
(windows_newlines_ && c == '\r') ||
(!windows_newlines_ && c == '\n')) {
// c == '\n') {
else if (c == '\n') {
if (state ==
QUOTED_FIELD) { // This will work as long as num_threads = 1
if (num_threads != 1) {
Expand Down Expand Up @@ -425,16 +415,13 @@ class delimited_index : public index,
last_tick = pos;
}
}
if ((windows_newlines_ && c == '\r')) {
++pos;
}
}

else if (c == quote) {
state = quoted_state(state);
}

else if (c != '\r') {
else {
state = other_state(state);
++pos;
size_t buf_offset = strcspn(buf + pos, query.data());
Expand Down
9 changes: 1 addition & 8 deletions src/delimited_index_connection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,6 @@ delimited_index_connection::delimited_index_connection(
}
}

// Check for windows newlines
windows_newlines_ = first_nl > 0 && buf[i][first_nl - 1] == '\r';

std::unique_ptr<RProgress::RProgress> pb = nullptr;
if (progress_) {
pb = std::unique_ptr<RProgress::RProgress>(
Expand Down Expand Up @@ -251,11 +248,7 @@ delimited_index_connection::delimited_index_connection(
idx_[0].push_back(file_size);
++columns_;
} else {
if (windows_newlines_) {
idx_[1].push_back(file_size + 1);
} else {
idx_[1].push_back(file_size);
}
idx_[1].push_back(file_size);
}
}

Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-vroom.R
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ test_that("mismatched column names throw a classed warning", {
})

test_that("empty files still generate the correct column width and types", {
out <- vroom(I(""), col_names = c("foo", "bar"))
out <- vroom(I(""), col_names = c("foo", "bar"), col_types = list())
expect_equal(nrow(out), 0)
expect_equal(ncol(out), 2)
expect_equal(names(out), c("foo", "bar"))
Expand Down
4 changes: 4 additions & 0 deletions tests/testthat/test-vroom_lines.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,7 @@ test_that("vroom_lines uses na argument", {
expect_equal(vroom_lines(I("abc\n123"), na = "123", progress = FALSE), c("abc", NA_character_))
expect_equal(vroom_lines(I("abc\n123"), na = c("abc", "123"), progress = FALSE), c(NA_character_, NA_character_))
})

test_that("vroom_lines works with files with mixed line endings", {
expect_equal(vroom_lines(I("foo\r\n\nbar\n\r\nbaz\r\n")), c("foo", "", "bar", "", "baz"))
})

0 comments on commit 2b94f88

Please sign in to comment.