Skip to content

Commit

Permalink
fix: several issues with character set handling (#299)
Browse files Browse the repository at this point in the history
- Fix several character sets not working, e.g. "ISO_IR 13" and "ISO_IR 166"
- Fix warning being shown when the "ISO_IR 6" character set is used
- Add tests for reading encoded string data in several different character sets
- Throw exception on an unsupported character set or multiple character sets rather than logging and continuing on (this can be overridden by setting ignoreErrors)
  • Loading branch information
richard-viney authored Aug 1, 2022
1 parent e983a14 commit 0b88213
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 10 deletions.
60 changes: 50 additions & 10 deletions src/DicomMessage.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,40 @@ const EXPLICIT_BIG_ENDIAN = "1.2.840.10008.1.2.2";
// Two-letter DICOM value representation (VR) codes.
// NOTE(review): presumably the VRs whose value is always kept as a single
// entry rather than being split into multiple values — confirm against the
// code that consumes this list, which is outside this view.
const singleVRs = ["SQ", "OF", "OW", "OB", "UN", "LT"];

/**
 * Maps a normalised SpecificCharacterSet value to the encoding label that is
 * handed to `TextDecoder`.
 *
 * Keys are the character set term after lower-casing and replacing every
 * underscore or space with a hyphen (e.g. "ISO_IR 192" becomes "iso-ir-192").
 * A term that has no key in this table is treated as unsupported by the reader.
 *
 * The empty key covers an empty character set value. It must appear exactly
 * once: the earlier `"": "latin1"` entry has been dropped because it
 * duplicated the key and, lacking a trailing comma, made the literal a
 * syntax error.
 */
const encodingMapping = {
    // Single-byte character sets without code extensions
    "": "iso-8859-1",
    "iso-ir-6": "iso-8859-1",
    "iso-ir-13": "shift-jis",
    "iso-ir-100": "latin1",
    "iso-ir-101": "iso-8859-2",
    "iso-ir-109": "iso-8859-3",
    "iso-ir-110": "iso-8859-4",
    "iso-ir-126": "iso-ir-126",
    "iso-ir-127": "iso-ir-127",
    "iso-ir-138": "iso-ir-138",
    "iso-ir-144": "iso-ir-144",
    "iso-ir-148": "iso-ir-148",
    "iso-ir-166": "tis-620",

    // "ISO 2022" prefixed terms
    "iso-2022-ir-6": "iso-8859-1",
    "iso-2022-ir-13": "shift-jis",
    "iso-2022-ir-87": "iso-2022-jp",
    "iso-2022-ir-100": "latin1",
    "iso-2022-ir-101": "iso-8859-2",
    "iso-2022-ir-109": "iso-8859-3",
    "iso-2022-ir-110": "iso-8859-4",
    "iso-2022-ir-126": "iso-ir-126",
    "iso-2022-ir-127": "iso-ir-127",
    "iso-2022-ir-138": "iso-ir-138",
    "iso-2022-ir-144": "iso-ir-144",
    "iso-2022-ir-148": "iso-ir-148",
    "iso-2022-ir-149": "euc-kr",
    "iso-2022-ir-159": "iso-2022-jp",
    "iso-2022-ir-166": "tis-620",
    "iso-2022-ir-58": "iso-ir-58",

    // Unicode and Chinese character sets
    "iso-ir-192": "utf-8",
    gb18030: "gb18030",
    "iso-2022-gbk": "gbk",
    "iso-2022-58": "gb2312",
    gbk: "gbk"
};

const encapsulatedSyntaxes = [
Expand Down Expand Up @@ -90,18 +122,26 @@ class DicomMessage {
coding = coding.replace(/[_ ]/g, "-").toLowerCase();
if (coding in encodingMapping) {
coding = encodingMapping[coding];
}
try {
bufferStream.setDecoder(new TextDecoder(coding));
} catch (error) {
console.warn(error);
} else if (ignoreErrors) {
console.warn(
`Unsupported character set: ${coding}, using default character set`
);
} else {
throw Error(`Unsupported character set: ${coding}`);
}
}
if (readInfo.values.length > 1) {
console.warn(
"multiple encodings not supported, using first encoding!",
readInfo.values
);
if (ignoreErrors) {
console.warn(
"Using multiple character sets is not supported, proceeding with just the first character set",
readInfo.values
);
} else {
throw Error(
`Using multiple character sets is not supported: ${readInfo.values}`
);
}
}
readInfo.values = ["ISO_IR 192"]; // change SpecificCharacterSet to UTF-8
}
Expand Down Expand Up @@ -156,7 +196,7 @@ class DicomMessage {
stream.reset();
stream.increment(128);
if (stream.readAsciiString(4) !== "DICM") {
throw new Error("Invalid a dicom file");
throw new Error("Invalid DICOM file, expected header is missing");
}
var el = DicomMessage._readTag(stream, useSyntax),
metaLength = el.values[0];
Expand Down
98 changes: 98 additions & 0 deletions test/data.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const {
ReadBufferStream
} = dcmjs.data;

// Transfer syntax UIDs used by the tests in this file. The implicit variant is
// passed to DicomMessage._read when reading back hand-built tag data.
const IMPLICIT_LITTLE_ENDIAN = "1.2.840.10008.1.2";
const EXPLICIT_LITTLE_ENDIAN = "1.2.840.10008.1.2.1";

const fileMetaInformationVersionArray = new Uint8Array(2);
Expand Down Expand Up @@ -620,3 +621,100 @@ it("Writes encapsulated OB data which has an odd length with a padding byte in i
0x00000000 // SequenceDelimiterTag value (always zero)
]);
});

// Exercises decoding of a long string value under different
// SpecificCharacterSet values, including the error paths for unsupported and
// multiple character sets.
describe("With a SpecificCharacterSet tag", () => {
    it("Reads a long string in the '' character set", () => {
        expect(readEncodedLongString("", [0x68, 0x69])).toEqual("hi");
    });

    it("Reads a long string in the ISO_IR 6 (default) character set", () => {
        expect(readEncodedLongString("ISO_IR 6", [0x68, 0x69])).toEqual("hi");
    });

    it("Reads a long string in the ISO_IR 13 (shift-jis) character set", () => {
        expect(readEncodedLongString("ISO_IR 13", [0x83, 0x8b])).toEqual("ル");
    });

    it("Reads a long string in the ISO_IR 166 (tis-620) character set", () => {
        expect(readEncodedLongString("ISO_IR 166", [0xb9, 0xf7])).toEqual("น๗");
    });

    it("Reads a long string in the ISO_IR 192 (utf-8) character set", () => {
        expect(readEncodedLongString("ISO_IR 192", [0xed, 0x95, 0x9c])).toEqual(
            "한"
        );
    });

    it("Throws an exception on an unsupported character set", () => {
        expect(() => readEncodedLongString("nope", [])).toThrow(
            new Error("Unsupported character set: nope")
        );
    });

    it("Doesn't throw an exception on an unsupported character set when ignoring errors", () => {
        expect(
            readEncodedLongString("nope", [0x68, 0x69], { ignoreErrors: true })
        ).toEqual("hi");
    });

    it("Throws an exception on multiple character sets", () => {
        expect(() =>
            readEncodedLongString("ISO_IR 13\\ISO_IR 166", [])
        ).toThrow(
            /Using multiple character sets is not supported: ISO_IR 13,ISO_IR 166/
        );
    });

    it("Doesn't throw an exception on multiple character sets when ignoring errors", () => {
        expect(
            readEncodedLongString("ISO_IR 13\\ISO_IR 166", [0x68, 0x69], {
                ignoreErrors: true
            })
        ).toEqual("hi");
    });

    /**
     * Builds a minimal implicit little endian data set holding a
     * SpecificCharacterSet tag followed by an InstitutionName tag, reads it
     * back through DicomMessage._read, and returns the decoded
     * InstitutionName value.
     *
     * The arguments are not modified: padding is applied to local copies.
     *
     * @param {string} specificCharacterSet - Value to store in the
     *     SpecificCharacterSet tag; multiple terms are separated by "\\".
     * @param {number[]} encodedBytes - Raw bytes of the InstitutionName value
     *     in the given character set.
     * @param {object} [readOptions] - Options forwarded to DicomMessage._read.
     * @returns {string} The first value read for tag "00080080".
     * @throws {Error} Whatever DicomMessage._read throws, e.g. for an
     *     unsupported character set when ignoreErrors is false.
     */
    function readEncodedLongString(
        specificCharacterSet,
        encodedBytes,
        readOptions = { ignoreErrors: false }
    ) {
        // Pad to even lengths with spaces if needed, working on copies so the
        // caller's string and array are left untouched
        const characterSet =
            specificCharacterSet.length & 1
                ? `${specificCharacterSet} `
                : specificCharacterSet;
        const valueBytes =
            encodedBytes.length & 1
                ? [...encodedBytes, 0x20]
                : [...encodedBytes];

        // Manually construct the binary representation for the following two tags:
        // - Tag #1: SpecificCharacterSet specifying the character set
        // - Tag #2: InstitutionName which is a long string tag that will have its value
        //           set to the encoded bytes
        // Each tag is a 4 byte tag number plus a 4 byte length, hence the 16.
        const stream = new WriteBufferStream(
            16 + characterSet.length + valueBytes.length
        );
        stream.isLittleEndian = true;

        // Write SpecificCharacterSet tag
        stream.writeUint32(0x00050008);
        stream.writeUint32(characterSet.length);
        stream.writeAsciiString(characterSet);

        // Write InstitutionName tag
        stream.writeUint32(0x00800008);
        stream.writeUint32(valueBytes.length);
        for (const valueByte of valueBytes) {
            stream.writeUint8(valueByte);
        }

        // Read the stream back to get the value of the InstitutionName tag
        const readResult = DicomMessage._read(
            new ReadBufferStream(stream.buffer),
            IMPLICIT_LITTLE_ENDIAN,
            readOptions
        );

        // Return the resulting UTF-8 string value for InstitutionName
        return readResult["00080080"].Value[0];
    }
});

0 comments on commit 0b88213

Please sign in to comment.