diff --git a/encodings/dbcs-codec.js b/encodings/dbcs-codec.js index 366809e..30de511 100644 --- a/encodings/dbcs-codec.js +++ b/encodings/dbcs-codec.js @@ -41,7 +41,7 @@ function DBCSCodec(codecOptions, iconv) { this.decodeTables = []; this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node. - // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. + // Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here. this.decodeTableSeq = []; // Actual mapping tables consist of chunks. Use them to fill up decode tables. @@ -50,7 +50,7 @@ function DBCSCodec(codecOptions, iconv) { this.defaultCharUnicode = iconv.defaultCharUnicode; - + // Encode tables: Unicode -> DBCS. // `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance. @@ -59,7 +59,7 @@ function DBCSCodec(codecOptions, iconv) { // == UNASSIGNED -> no conversion found. Output a default char. // <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence. this.encodeTable = []; - + // `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of // objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key // means end of sequence (needed when one sequence is a strict subsequence of another). @@ -77,7 +77,7 @@ function DBCSCodec(codecOptions, iconv) { for (var j = val.from; j <= val.to; j++) skipEncodeChars[j] = true; } - + // Use decode trie to recursively fill out encode tables. 
this._fillEncodeTable(0, 0, skipEncodeChars); @@ -114,7 +114,7 @@ function DBCSCodec(codecOptions, iconv) { thirdByteNode[i] = NODE_START - fourthByteNodeIdx; for (var i = 0x30; i <= 0x39; i++) fourthByteNode[i] = GB18030_CODE - } + } } DBCSCodec.prototype.encoder = DBCSEncoder; @@ -179,7 +179,7 @@ DBCSCodec.prototype._addDecodeChunk = function(chunk) { else writeTable[curAddr++] = code; // Basic char } - } + } else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character. var charCode = writeTable[curAddr - 1] + 1; for (var l = 0; l < part; l++) @@ -210,7 +210,7 @@ DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) { } DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) { - + // Get the root of character tree according to first character of the sequence. var uCode = seq[0]; var bucket = this._getEncodeBucket(uCode); @@ -263,6 +263,20 @@ DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars } } +var savedEuroEncodeTable = null +function getEuroEncodeTable (encodeTable) { + if (savedEuroEncodeTable !== null) { + return savedEuroEncodeTable + } + savedEuroEncodeTable = [].concat(encodeTable) + var charCode = '€'.charCodeAt(0) + var columnPos = charCode >> 8 + var column = savedEuroEncodeTable[columnPos] + column = [].concat(column) + column[charCode & 0xFF] = 0xa2e3 + savedEuroEncodeTable[columnPos] = column + return savedEuroEncodeTable +} // == Encoder ================================================================== @@ -271,16 +285,19 @@ function DBCSEncoder(options, codec) { // Encoder state this.leadSurrogate = -1; this.seqObj = undefined; - + // Static data this.encodeTable = codec.encodeTable; this.encodeTableSeq = codec.encodeTableSeq; this.defaultCharSingleByte = codec.defCharSB; this.gb18030 = codec.gb18030; + if (this.gb18030 || (options && options.encodeEuro)) { + this.encodeTable = getEuroEncodeTable(this.encodeTable) + } } DBCSEncoder.prototype.write = 
function(str) { - var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)), + var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)), leadSurrogate = this.leadSurrogate, seqObj = this.seqObj, nextChar = -1, i = 0, j = 0; @@ -293,7 +310,7 @@ DBCSEncoder.prototype.write = function(str) { } else { var uCode = nextChar; - nextChar = -1; + nextChar = -1; } // 1. Handle surrogates. @@ -315,7 +332,7 @@ DBCSEncoder.prototype.write = function(str) { // Incomplete surrogate pair - only trail surrogate found. uCode = UNASSIGNED; } - + } } else if (leadSurrogate !== -1) { @@ -356,7 +373,7 @@ DBCSEncoder.prototype.write = function(str) { var subtable = this.encodeTable[uCode >> 8]; if (subtable !== undefined) dbcsCode = subtable[uCode & 0xFF]; - + if (dbcsCode <= SEQ_START) { // Sequence start seqObj = this.encodeTableSeq[SEQ_START-dbcsCode]; continue; @@ -379,7 +396,7 @@ DBCSEncoder.prototype.write = function(str) { // 3. Write dbcsCode character. if (dbcsCode === UNASSIGNED) dbcsCode = this.defaultCharSingleByte; - + if (dbcsCode < 0x100) { newBuf[j++] = dbcsCode; } @@ -426,7 +443,7 @@ DBCSEncoder.prototype.end = function() { newBuf[j++] = this.defaultCharSingleByte; this.leadSurrogate = -1; } - + return newBuf.slice(0, j); } @@ -450,21 +467,21 @@ function DBCSDecoder(options, codec) { DBCSDecoder.prototype.write = function(buf) { var newBuf = new Buffer(buf.length*2), - nodeIdx = this.nodeIdx, + nodeIdx = this.nodeIdx, prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length, seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence. uCode; if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later. prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]); - + for (var i = 0, j = 0; i < buf.length; i++) { var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset]; // Lookup in current trie node. 
var uCode = this.decodeTables[nodeIdx][curByte]; - if (uCode >= 0) { + if (uCode >= 0) { // Normal character, just use it. } else if (uCode === UNASSIGNED) { // Unknown char. @@ -496,7 +513,7 @@ DBCSDecoder.prototype.write = function(buf) { throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte); // Write the character to buffer, handling higher planes using surrogate pair. - if (uCode > 0xFFFF) { + if (uCode > 0xFFFF) { uCode -= 0x10000; var uCodeLead = 0xD800 + Math.floor(uCode / 0x400); newBuf[j++] = uCodeLead & 0xFF; @@ -551,4 +568,3 @@ function findIdx(table, val) { } return l; } - diff --git a/encodings/dbcs-data.js b/encodings/dbcs-data.js index 2bf7415..c3dc05d 100644 --- a/encodings/dbcs-data.js +++ b/encodings/dbcs-data.js @@ -3,13 +3,21 @@ // Description of supported double byte encodings and aliases. // Tables are not require()-d until they are needed to speed up library load. // require()-s are direct to support Browserify. +var savedGbkTable +var gbkTableGetter = function() { + if (savedGbkTable) { + return savedGbkTable + } + savedGbkTable = require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) + return savedGbkTable +} module.exports = { - + // == Japanese/ShiftJIS ==================================================== // All japanese encodings are based on JIS X set of standards: // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF. - // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. + // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. // Has several variations in 1978, 1983, 1990 and 1997. // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead. // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233. @@ -27,7 +35,7 @@ module.exports = { // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94). 
// * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon. // Used as-is in ISO2022 family. - // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, + // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, // 0201-1976 Roman, 0208-1978, 0208-1983. // * ISO2022-JP-1: Adds esc seq for 0212-1990. // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7. @@ -77,7 +85,9 @@ module.exports = { 'isoir58': 'gbk', // Microsoft's CP936 is a subset and approximation of GBK. - // TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined) + // Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined) + // Decoding: Euro (0x80) is decoded by all of CP936/GBK/GB18030. + // Encoding: Euro is written as 0xA2E3 instead of 0x80 when the codec is GB18030 or the encoder option `encodeEuro` is true. 'windows936': 'cp936', '936': 'cp936', 'cp936': { @@ -88,20 +98,19 @@ module.exports = { // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other. 'gbk': { type: '_dbcs', - table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, + table: gbkTableGetter, }, 'xgbk': 'gbk', // GB18030 is an algorithmic extension of GBK. 'gb18030': { type: '_dbcs', - table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, + table: gbkTableGetter, gb18030: function() { return require('./tables/gb18030-ranges.json') }, }, 'chinese': 'gb18030', - // TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936) // http://icu-project.org/docs/papers/gb18030.html // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0 @@ -133,7 +142,7 @@ module.exports = { // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public.
http://me.abelcheung.org/articles/research/what-is-cp951/ // * Big5-2003 (Taiwan standard) almost superset of cp950. // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers. - // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. + // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years. // Plus, it has 4 combining sequences. // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299 @@ -144,7 +153,7 @@ module.exports = { // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s. // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt - // + // // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong. 
diff --git a/test/gbk-test.js b/test/gbk-test.js index 00f5c79..802d461 100644 --- a/test/gbk-test.js +++ b/test/gbk-test.js @@ -6,12 +6,12 @@ var testString = "中国abc",//unicode contains GBK-code and ascii testStringGBKBuffer = new Buffer([0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63]); describe("GBK tests", function() { - it("GBK correctly encoded/decoded", function() { + it("GBK correctly encoded/decoded", function() { assert.strictEqual(iconv.encode(testString, "GBK").toString('binary'), testStringGBKBuffer.toString('binary')); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString); }); - it("GB2312 correctly encoded/decoded", function() { + it("GB2312 correctly encoded/decoded", function() { assert.strictEqual(iconv.encode(testString, "GB2312").toString('binary'), testStringGBKBuffer.toString('binary')); assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString); }); @@ -90,5 +90,56 @@ describe("GBK tests", function() { assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar)); } }); - }); + +describe('testing the cp936/gbk euro dollor symbol', function () { + it('test cp936 decode Euro dollor symbol', function () { + // Convert from an encoded buffer to js string. 
+ var str = '' + str = iconv.decode(new Buffer([0x80]), 'gb2312') + assert.equal(str, '€') + + str = iconv.decode(new Buffer([0x80]), 'cp936') + assert.equal(str, '€') + + str = iconv.decode(new Buffer([0x80]), 'gbk') + assert.equal(str, '€') + + str = iconv.decode(new Buffer([0x80]), 'gb18030') + assert.equal(str, '€') + + // Decode a2e3 + str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gb2312') + assert.equal(str.charCodeAt(0), 0xfffD) + + str = iconv.decode(new Buffer([0xa2, 0xe3]), 'cp936') + assert.equal(str.charCodeAt(0), 0xfffD) + + str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gbk') + assert.equal(str, '€') + + str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gb18030') + assert.equal(str, '€') + }) + + it('test cp936 encode Euro dollor symbol', function () { + var buffer = new Buffer([]) + + buffer = iconv.encode('€', 'gb2312') + assert.equal(buffer.toString('hex'), '80') + + buffer = iconv.encode('€', 'cp936') + assert.equal(buffer.toString('hex'), '80') + + // encodeEuro default is false + buffer = iconv.encode('€', 'gbk', {encodeEuro: false}) + assert.equal(buffer.toString('hex'), '80') + + buffer = iconv.encode('€', 'gbk', {encodeEuro: true}) + assert.equal(buffer.toString('hex'), 'a2e3') + + // https://en.wikipedia.org/wiki/GB_18030 + buffer = iconv.encode('€', 'gb18030') + assert.equal(buffer.toString('hex'), 'a2e3') + }) +})