From 9a122a8be2272851a794cf8a56a97244eb90e7a6 Mon Sep 17 00:00:00 2001 From: Stuart Marshall Date: Sun, 20 Aug 2023 16:30:42 -0700 Subject: [PATCH 1/3] Handle header line expansion for all parser modes --- papaparse.js | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/papaparse.js b/papaparse.js index 3ce39621..ebbfbc60 100755 --- a/papaparse.js +++ b/papaparse.js @@ -1463,7 +1463,7 @@ License: MIT // Establish starting state cursor = 0; - var data = [], errors = [], row = [], lastCursor = 0; + var data = [], errors = [], row = [], lastCursor = 0, inputExpansion = 0; if (!input) return returnable(); @@ -1508,6 +1508,10 @@ License: MIT if (duplicateHeaders) { var editedInput = input.split(newline); editedInput[0] = Array.from(headerMap).join(delim); + // If we expanded the input due to duplicate headers then reduce cursor + // by the amount we expanded the input. + // This is needed for keeping leftover aggregate in parseChunk. + inputExpansion = editedInput[0].length - firstLine.length; input = editedInput.join(newline); } } @@ -1517,12 +1521,7 @@ License: MIT for (var i = 0; i < rows.length; i++) { row = rows[i]; - // use firstline as row length may be changed due to duplicated headers - if (i === 0 && firstLine !== undefined) { - cursor += firstLine.length; - }else{ - cursor += row.length; - } + cursor += row.length; if (i !== rows.length - 1) cursor += newline.length; else if (ignoreLastRow) @@ -1724,7 +1723,7 @@ License: MIT function pushRow(row) { data.push(row); - lastCursor = cursor; + lastCursor = cursor - inputExpansion; } /** From 0136a53c0375208f4472f7455ce7f5472714c536 Mon Sep 17 00:00:00 2001 From: Stuart Marshall Date: Sun, 20 Aug 2023 23:36:12 -0700 Subject: [PATCH 2/3] Add test for chunked parsing with duplicate header --- tests/node-tests.js | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/node-tests.js b/tests/node-tests.js index cad8058e..b4bbd500 100644 --- a/tests/node-tests.js +++ b/tests/node-tests.js @@ -164,6 +164,32 @@ describe('PapaParse', function() { }); }); + it('Checks cursor when file is large and has duplicate headers', function(done) { + this.timeout(30000); + var stepped = 0; + var startsWithEtiamOrLorem = true; + Papa.parse(fs.createReadStream(__dirname + '/verylong-sample.csv'), { + header: true, + transformHeader: function(headerName) { + return headerName === 'meaning of life' ? 'placeholder' : headerName; + }, + step: function(results, parser) { + stepped++; + if (results) + { + if (stepped > 1) { + const startsWithEtiam = results.data && results.data.placeholder && results.data.placeholder.startsWith("Etiam"); + const startsWithLorem = results.data && results.data.placeholder && results.data.placeholder.startsWith("Lorem"); + startsWithEtiamOrLorem = startsWithEtiamOrLorem && (startsWithEtiam || startsWithLorem); + } + } + }, + complete: function() { + assert(startsWithEtiamOrLorem); + done(); + } + }); + }); it('piped streaming CSV should be correctly parsed when header is true', function(done) { var data = []; From ad773e8ecdd42c552138da31547d82bbad086347 Mon Sep 17 00:00:00 2001 From: Stuart Marshall Date: Mon, 21 Aug 2023 15:00:18 -0700 Subject: [PATCH 3/3] Add test for trailing quote with renamed headers. Adjust code comment. --- papaparse.js | 16 +++++++++++----- tests/node-tests.js | 14 ++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/papaparse.js b/papaparse.js index ebbfbc60..45a094ac 100755 --- a/papaparse.js +++ b/papaparse.js @@ -1508,11 +1508,17 @@ License: MIT if (duplicateHeaders) { var editedInput = input.split(newline); editedInput[0] = Array.from(headerMap).join(delim); - // If we expanded the input due to duplicate headers then reduce cursor - // by the amount we expanded the input. - // This is needed for keeping leftover aggregate in parseChunk. + // If we change the size of the input due to duplicate headers + // or header renaming from transformHeader, then we need to + // record the difference so that we can adjust the cursor accordingly + // in `meta.cursor` value of the `parse` result. + // This is because the consumers of this method (e.g. ChunkStreamer) + // use the resulting `cursor` value to know how much of the input was + // consumed by the parser and are not aware of the parser implementation + // details for handling duplicate headers. inputExpansion = editedInput[0].length - firstLine.length; input = editedInput.join(newline); + inputLen = input.length; } } if (fastMode || (fastMode !== false && input.indexOf(quoteChar) === -1)) @@ -1723,7 +1729,7 @@ License: MIT function pushRow(row) { data.push(row); - lastCursor = cursor - inputExpansion; + lastCursor = cursor; } /** @@ -1784,7 +1790,7 @@ License: MIT linebreak: newline, aborted: aborted, truncated: !!stopped, - cursor: lastCursor + (baseIndex || 0), + cursor: lastCursor + (baseIndex || 0) - inputExpansion, renamedHeaders: renamedHeaders } }; diff --git a/tests/node-tests.js b/tests/node-tests.js index b4bbd500..9fcbfe72 100644 --- a/tests/node-tests.js +++ b/tests/node-tests.js @@ -191,6 +191,20 @@ describe('PapaParse', function() { }); }); + it('Handles quote at EOF when headers are modified', function(done) { + var data = []; + Papa.parse('field1,field1,field3\na,b,c\nd,e,"f"', { + header: true, + step: function(results) { + data.push(results.data); + }, + complete: function() { + assert.deepEqual(data, [{ field1: 'a', field1_1: 'b', field3: 'c' },{ field1: 'd', field1_1: 'e', field3: 'f' }]); + done(); + } + }); + }); + it('piped streaming CSV should be correctly parsed when header is true', function(done) { var data = []; var readStream = fs.createReadStream(__dirname + '/sample-header.csv', 'utf8');