Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect behavior of CDM CSV parser #14

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions src/main/resources/logback.xml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@
<ssl />
</appender>

<root level="DEBUG">
<appender-ref ref="JSON_TCP" />
<root level="INFO">
<appender-ref ref="STDOUT" />
</root>
<root level="INFO">
<appender-ref ref="JSON_TCP" />
</root>
</configuration>
3 changes: 2 additions & 1 deletion src/main/scala/models/cdm/CdmParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package models.cdm

import models.{ArcaneSchema, DataCell, DataRow, MergeKeyField}

import java.util.regex.Matcher
import scala.language.implicitConversions
import scala.util.matching.Regex

Expand Down Expand Up @@ -84,7 +85,7 @@ object CSVParser:

def replaceQuotedNewlines(csvLine: String): String = {
val regex = new Regex("\"[^\"]*(?:\"\"[^\"]*)*\"")
regex.replaceSomeIn(csvLine, m => Some(m.matched.replace("\n", ""))).replace("\r", "")
regex.replaceSomeIn(csvLine, m => Some(Matcher.quoteReplacement(m.matched.replace("\n", "")))).replace("\r", "")
}


Expand Down
17 changes: 14 additions & 3 deletions src/test/scala/models/CdmParserTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package models

import models.cdm.CSVParser

import com.sneaksanddata.arcane.framework.models.cdm.CSVParser.replaceQuotedNewlines
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.must.Matchers
import org.scalatest.matchers.should.Matchers.should
Expand All @@ -18,7 +19,9 @@ class CdmParserTests extends AnyFlatSpec with Matchers {
("\"q\",,\"1321\"", Seq(Some("q"), None, Some("1321"))),
("\"q\",,\"13,21\"", Seq(Some("q"), None, Some("13,21"))),
("123,,\", abc def\"", Seq(Some("123"), None, Some(", abc def"))),
("5637144576,\"NFO\",,0,", Seq(Some("5637144576"), Some("NFO"), None, Some("0"), None))
("5637144576,\"NFO\",,0,", Seq(Some("5637144576"), Some("NFO"), None, Some("0"), None)),
("5637144576,\"$NFO\",,0,", Seq(Some("5637144576"), Some("$NFO"), None, Some("0"), None))
("000000d3-0000-0000-0000-005001000000,\"12/4/2024 4:44:32 PM\",\"12/4/2024 4:44:32 PM\",0,0,0,0,0,0,\"USA\",,,,,0,\"USA\",,0,,0,0,0,5637144576,0,,0,,,0,,\"2022-08-09T06:35:15.0000000Z\",\"2023-07-24T18:08:53.0000000Z\",,0,,,0,,0,0,,\"1900-01-01T00:00:00.0000000Z\",,,0,,\"2023-07-24T18:08:54.0000000Z\",\"RRAO\",0,\"2023-02-17T01:41:04.0000000Z\",\"?\",0,\"dat\",1071872623,5637144576,0,5637144576,211,0,\"2023-02-17T01:41:04.0000000+00:00\",\"2023-07-24T18:08:54.0000000Z\",", Seq())
)

private val invalidCsvlines = Table(
Expand All @@ -28,15 +31,23 @@ class CdmParserTests extends AnyFlatSpec with Matchers {

it should "handle valid quoted CSV lines correctly" in {
forAll (validCsvLines) { (line, result) =>
CSVParser.parseCsvLine(line = line, headerCount = result.size) should equal(result)
val parseResult = CSVParser.parseCsvLine(line = line, headerCount = 62)
parseResult should equal(result)
}
}

it should "handle invalid quoted CSV lines correctly" in {
forAll (invalidCsvlines) { (line, result) =>
intercept[IllegalStateException] {
CSVParser.parseCsvLine(line, headerCount = result.size)
}
}
}

it should "replace quoted newlines correctly " in {
forAll (validCsvLines) { (line, result) =>
replaceQuotedNewlines(line) should equal(line)
}
}

}
Loading