-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Better handling of duplicate headers when parsing chunks #1017
base: master
Are you sure you want to change the base?
Changes from all commits
9a122a8
0136a53
ad773e8
b24698c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1463,7 +1463,7 @@ License: MIT | |
|
||
// Establish starting state | ||
cursor = 0; | ||
var data = [], errors = [], row = [], lastCursor = 0; | ||
var data = [], errors = [], row = [], lastCursor = 0, inputExpansion = 0; | ||
|
||
if (!input) | ||
return returnable(); | ||
|
@@ -1508,7 +1508,17 @@ License: MIT | |
if (duplicateHeaders) { | ||
var editedInput = input.split(newline); | ||
editedInput[0] = Array.from(headerMap).join(delim); | ||
// If we change the size of the input due to duplicate headers | ||
// or header renaming from transformHeader, then we need to | ||
// record the difference so that we can adjust the cursor accordingly | ||
// in `meta.cursor` value of the `parse` result. | ||
// This is because the consumers of this method (e.g. ChunkStreamer) | ||
// use the resulting `cursor` value to know how much of the input was | ||
// consumed by the parser and are not aware of the parser implementation | ||
// details for handling duplicate headers. | ||
inputExpansion = editedInput[0].length - firstLine.length; | ||
input = editedInput.join(newline); | ||
inputLen = input.length; | ||
} | ||
} | ||
if (fastMode || (fastMode !== false && input.indexOf(quoteChar) === -1)) | ||
|
@@ -1517,12 +1527,7 @@ License: MIT | |
for (var i = 0; i < rows.length; i++) | ||
{ | ||
row = rows[i]; | ||
// use firstline as row length may be changed due to duplicated headers | ||
if (i === 0 && firstLine !== undefined) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As firstline is no longer used, probably it can be removed as variable |
||
cursor += firstLine.length; | ||
}else{ | ||
cursor += row.length; | ||
} | ||
cursor += row.length; | ||
if (i !== rows.length - 1) | ||
cursor += newline.length; | ||
else if (ignoreLastRow) | ||
|
@@ -1785,7 +1790,7 @@ License: MIT | |
linebreak: newline, | ||
aborted: aborted, | ||
truncated: !!stopped, | ||
cursor: lastCursor + (baseIndex || 0), | ||
cursor: lastCursor + (baseIndex || 0) - inputExpansion, | ||
renamedHeaders: renamedHeaders | ||
} | ||
}; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -164,6 +164,46 @@ describe('PapaParse', function() { | |
}); | ||
}); | ||
|
||
it('Checks cursor when file is large and has duplicate headers', function(done) { | ||
this.timeout(30000); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is the timeout required? I think it should be removed. |
||
var stepped = 0; | ||
var startsWithEtiamOrLorem = true; | ||
Papa.parse(fs.createReadStream(__dirname + '/verylong-sample.csv'), { | ||
header: true, | ||
transformHeader: function(headerName) { | ||
return headerName === 'meaning of life' ? 'placeholder' : headerName; | ||
}, | ||
step: function(results, parser) { | ||
stepped++; | ||
if (results) | ||
{ | ||
if (stepped > 1) { | ||
const startsWithEtiam = results.data && results.data.placeholder && results.data.placeholder.startsWith("Etiam"); | ||
const startsWithLorem = results.data && results.data.placeholder && results.data.placeholder.startsWith("Lorem"); | ||
startsWithEtiamOrLorem = startsWithEtiamOrLorem && (startsWithEtiam || startsWithLorem); | ||
} | ||
} | ||
}, | ||
complete: function() { | ||
assert(startsWithEtiamOrLorem); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure which is the objective of such test. |
||
done(); | ||
} | ||
}); | ||
}); | ||
|
||
it('Handles quote at EOF when headers are modified', function(done) { | ||
var data = []; | ||
Papa.parse('field1,field1,field3\na,b,c\nd,e,"f"', { | ||
header: true, | ||
step: function(results) { | ||
data.push(results.data); | ||
}, | ||
complete: function() { | ||
assert.deepEqual(data, [{ field1: 'a', field1_1: 'b', field3: 'c' },{ field1: 'd', field1_1: 'e', field3: 'f' }]); | ||
done(); | ||
} | ||
}); | ||
}); | ||
|
||
it('piped streaming CSV should be correctly parsed when header is true', function(done) { | ||
var data = []; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we make the comment smaller?
What about:
store the number of removed chars to correctly report meta.cursor
?