fix: several issues with character set handling (#299)

- Fix several character sets not working, e.g. "ISO_IR 13" and "ISO_IR 166"
- Fix warning being shown when the "ISO_IR 6" character set is used
- Add tests for reading encoded string data in several different character sets
- Throw an exception on an unsupported character set or on multiple character sets rather than logging a warning and continuing (this can be overridden by setting ignoreErrors)
dcmjs-org · Aug 1, 2022 · 0b88213 · 0b88213
1 parent e983a14
commit 0b88213
Show file tree

Hide file tree

Showing 2 changed files with 148 additions and 10 deletions.
diff --git a/src/DicomMessage.js b/src/DicomMessage.js
@@ -10,8 +10,40 @@ const EXPLICIT_BIG_ENDIAN = "1.2.840.10008.1.2.2";
 const singleVRs = ["SQ", "OF", "OW", "OB", "UN", "LT"];
 
 const encodingMapping = {
+    "": "iso-8859-1",
+    "iso-ir-6": "iso-8859-1",
+    "iso-ir-13": "shift-jis",
+    "iso-ir-100": "latin1",
+    "iso-ir-101": "iso-8859-2",
+    "iso-ir-109": "iso-8859-3",
+    "iso-ir-110": "iso-8859-4",
+    "iso-ir-126": "iso-ir-126",
+    "iso-ir-127": "iso-ir-127",
+    "iso-ir-138": "iso-ir-138",
+    "iso-ir-144": "iso-ir-144",
+    "iso-ir-148": "iso-ir-148",
+    "iso-ir-166": "tis-620",
+    "iso-2022-ir-6": "iso-8859-1",
+    "iso-2022-ir-13": "shift-jis",
+    "iso-2022-ir-87": "iso-2022-jp",
+    "iso-2022-ir-100": "latin1",
+    "iso-2022-ir-101": "iso-8859-2",
+    "iso-2022-ir-109": "iso-8859-3",
+    "iso-2022-ir-110": "iso-8859-4",
+    "iso-2022-ir-126": "iso-ir-126",
+    "iso-2022-ir-127": "iso-ir-127",
+    "iso-2022-ir-138": "iso-ir-138",
+    "iso-2022-ir-144": "iso-ir-144",
+    "iso-2022-ir-148": "iso-ir-148",
+    "iso-2022-ir-149": "euc-kr",
+    "iso-2022-ir-159": "iso-2022-jp",
+    "iso-2022-ir-166": "tis-620",
+    "iso-2022-ir-58": "iso-ir-58",
     "iso-ir-192": "utf-8",
-    "": "latin1"
+    gb18030: "gb18030",
+    "iso-2022-gbk": "gbk",
+    "iso-2022-58": "gb2312",
+    gbk: "gbk"
 };
 
 const encapsulatedSyntaxes = [
@@ -90,18 +122,26 @@ class DicomMessage {
                         coding = coding.replace(/[_ ]/g, "-").toLowerCase();
                         if (coding in encodingMapping) {
                             coding = encodingMapping[coding];
-                        }
-                        try {
                             bufferStream.setDecoder(new TextDecoder(coding));
-                        } catch (error) {
-                            console.warn(error);
+                        } else if (ignoreErrors) {
+                            console.warn(
+                                `Unsupported character set: ${coding}, using default character set`
+                            );
+                        } else {
+                            throw Error(`Unsupported character set: ${coding}`);
                         }
                     }
                     if (readInfo.values.length > 1) {
-                        console.warn(
-                            "multiple encodings not supported, using first encoding!",
-                            readInfo.values
-                        );
+                        if (ignoreErrors) {
+                            console.warn(
+                                "Using multiple character sets is not supported, proceeding with just the first character set",
+                                readInfo.values
+                            );
+                        } else {
+                            throw Error(
+                                `Using multiple character sets is not supported: ${readInfo.values}`
+                            );
+                        }
                     }
                     readInfo.values = ["ISO_IR 192"]; // change SpecificCharacterSet to UTF-8
                 }
@@ -156,7 +196,7 @@ class DicomMessage {
         stream.reset();
         stream.increment(128);
         if (stream.readAsciiString(4) !== "DICM") {
-            throw new Error("Invalid a dicom file");
+            throw new Error("Invalid DICOM file, expected header is missing");
         }
         var el = DicomMessage._readTag(stream, useSyntax),
             metaLength = el.values[0];

diff --git a/test/data.test.js b/test/data.test.js
@@ -22,6 +22,7 @@ const {
     ReadBufferStream
 } = dcmjs.data;
 
+const IMPLICIT_LITTLE_ENDIAN = "1.2.840.10008.1.2";
 const EXPLICIT_LITTLE_ENDIAN = "1.2.840.10008.1.2.1";
 
 const fileMetaInformationVersionArray = new Uint8Array(2);
@@ -620,3 +621,100 @@ it("Writes encapsulated OB data which has an odd length with a padding byte in i
         0x00000000 // SequenceDelimiterTag value (always zero)
     ]);
 });
+
+describe("With a SpecificCharacterSet tag", () => {
+    it("Reads a long string in the '' character set", async () => {
+        expect(readEncodedLongString("", [0x68, 0x69])).toEqual("hi");
+    });
+
+    it("Reads a long string in the ISO_IR 6 (default) character set", async () => {
+        expect(readEncodedLongString("ISO_IR 6", [0x68, 0x69])).toEqual("hi");
+    });
+
+    it("Reads a long string in the ISO_IR 13 (shift-jis) character set", async () => {
+        expect(readEncodedLongString("ISO_IR 13", [0x83, 0x8b])).toEqual("ル");
+    });
+
+    it("Reads a long string in the ISO_IR 166 (tis-620) character set", async () => {
+        expect(readEncodedLongString("ISO_IR 166", [0xb9, 0xf7])).toEqual("น๗");
+    });
+
+    it("Reads a long string in the ISO_IR 192 (utf-8) character set", async () => {
+        expect(readEncodedLongString("ISO_IR 192", [0xed, 0x95, 0x9c])).toEqual(
+            "한"
+        );
+    });
+
+    it("Throws an exception on an unsupported character set", async () => {
+        expect(() => readEncodedLongString("nope", [])).toThrow(
+            new Error("Unsupported character set: nope")
+        );
+    });
+
+    it("Doesn't throw an exception on an unsupported character set when ignoring errors", async () => {
+        expect(
+            readEncodedLongString("nope", [0x68, 0x69], { ignoreErrors: true })
+        ).toEqual("hi");
+    });
+
+    it("Throws an exception on multiple character sets", async () => {
+        expect(() =>
+            readEncodedLongString("ISO_IR 13\\ISO_IR 166", [])
+        ).toThrow(
+            /Using multiple character sets is not supported: ISO_IR 13,ISO_IR 166/
+        );
+    });
+
+    it("Doesn't throw an exception on multiple character sets when ignoring errors", async () => {
+        expect(
+            readEncodedLongString("ISO_IR 13\\ISO_IR 166", [0x68, 0x69], {
+                ignoreErrors: true
+            })
+        ).toEqual("hi");
+    });
+
+    function readEncodedLongString(
+        specificCharacterSet,
+        encodedBytes,
+        readOptions = { ignoreErrors: false }
+    ) {
+        // Pad to even lengths with spaces if needed
+        if (specificCharacterSet.length & 1) {
+            specificCharacterSet += " ";
+        }
+        if (encodedBytes.length & 1) {
+            encodedBytes.push(0x20);
+        }
+
+        // Manually construct the binary representation for the following two tags:
+        // - Tag #1: SpecificCharacterSet specifying the character set
+        // - Tag #2: InstitutionName which is a long string tag that will have its value
+        //           set to the encoded bytes
+        const stream = new WriteBufferStream(
+            16 + specificCharacterSet.length + encodedBytes.length
+        );
+        stream.isLittleEndian = true;
+
+        // Write SpecificCharacterSet tag
+        stream.writeUint32(0x00050008);
+        stream.writeUint32(specificCharacterSet.length);
+        stream.writeAsciiString(specificCharacterSet);
+
+        // Write InstitutionName tag
+        stream.writeUint32(0x00800008);
+        stream.writeUint32(encodedBytes.length);
+        for (const encodedByte of encodedBytes) {
+            stream.writeUint8(encodedByte);
+        }
+
+        // Read the stream back to get the value of the InstitutionName tag
+        const readResult = DicomMessage._read(
+            new ReadBufferStream(stream.buffer),
+            IMPLICIT_LITTLE_ENDIAN,
+            readOptions
+        );
+
+        // Return the resulting UTF-8 string value for InstitutionName
+        return readResult["00080080"].Value[0];
+    }
+});