Skip to content

Commit

Permalink
Fix GBK/GB18030 handling of Euro character. Fixes #114, #115
Browse files Browse the repository at this point in the history
  • Loading branch information
ashtuchkin committed Nov 21, 2016
1 parent 7b0c37f commit c884431
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 8 deletions.
17 changes: 9 additions & 8 deletions encodings/dbcs-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ module.exports = {
//
// Overall, it seems that it's a mess :( http://www8.plala.or.jp/tkubota1/unicode-symbols-map2.html


'shiftjis': {
type: '_dbcs',
table: function() { return require('./tables/shiftjis.json') },
Expand Down Expand Up @@ -66,8 +65,10 @@ module.exports = {
// TODO: IBM CCSID 942 = CP932, but F0-F9 custom chars and other char changes.
// TODO: IBM CCSID 943 = Shift_JIS = CP932 with original Shift_JIS lower 128 chars.


// == Chinese/GBK ==========================================================
// http://en.wikipedia.org/wiki/GBK
// We mostly implement W3C recommendation: https://www.w3.org/TR/encoding/#gbk-encoder

// Oldest GB2312 (1981, ~7600 chars) is a subset of CP936
'gb2312': 'cp936',
Expand All @@ -76,10 +77,8 @@ module.exports = {
'csgb2312': 'cp936',
'csiso58gb231280': 'cp936',
'euccn': 'cp936',
'isoir58': 'gbk',

// Microsoft's CP936 is a subset and approximation of GBK.
// TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
'windows936': 'cp936',
'ms936': 'cp936',
'936': 'cp936',
Expand All @@ -94,20 +93,23 @@ module.exports = {
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
},
'xgbk': 'gbk',
'isoir58': 'gbk',

// GB18030 is an algorithmic extension of GBK.
// Main source: https://www.w3.org/TR/encoding/#gbk-encoder
// http://icu-project.org/docs/papers/gb18030.html
// http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
// http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
'gb18030': {
type: '_dbcs',
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
gb18030: function() { return require('./tables/gb18030-ranges.json') },
encodeSkipVals: [0x80],
encodeAdd: {'€': 0xA2E3},
},

'chinese': 'gb18030',

// TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936)
// http://icu-project.org/docs/papers/gb18030.html
// http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
// http://www.khngai.com/chinese/charmap/tblgbk.php?page=0

// == Korean ===============================================================
// EUC-KR, KS_C_5601 and KS X 1001 are exactly the same.
Expand Down Expand Up @@ -171,5 +173,4 @@ module.exports = {
'cnbig5': 'big5hkscs',
'csbig5': 'big5hkscs',
'xxbig5': 'big5hkscs',

};
18 changes: 18 additions & 0 deletions test/gbk-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,24 @@ describe("GBK tests", function() {
assert.strictEqual(iconv.decode(gbkChars, "GBK"), chars)
});

it("GBK and GB18030 correctly decodes and encodes Euro character", function() {
// The Euro character (U+20AC) has two encodings in the GBK family: the single byte 0x80
// and the two-byte sequence 0xA2 0xE3.
// According to W3C's technical recommendation (https://www.w3.org/TR/encoding/#gbk-encoder),
// both the GBK and GB18030 decoders should accept both encodings.
var gbkEuroEncoding1 = new Buffer([0x80]),
gbkEuroEncoding2 = new Buffer([0xA2, 0xE3]),
strEuro = "€";

// Decoding: either byte sequence must yield the Euro character under both encoding names.
assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GBK"), strEuro);
assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GBK"), strEuro);
assert.strictEqual(iconv.decode(gbkEuroEncoding1, "GB18030"), strEuro);
assert.strictEqual(iconv.decode(gbkEuroEncoding2, "GB18030"), strEuro);

// But when encoding, GBK should produce 0x80, whereas GB18030 should produce 0xA2 0xE3.
// The buffers are compared through their hex string representations.
assert.strictEqual(iconv.encode(strEuro, "GBK").toString('hex'), gbkEuroEncoding1.toString('hex'));
assert.strictEqual(iconv.encode(strEuro, "GB18030").toString('hex'), gbkEuroEncoding2.toString('hex'));
});

it("GB18030 findIdx works correctly", function() {
function findIdxAlternative(table, val) {
for (var i = 0; i < table.length; i++)
Expand Down

0 comments on commit c884431

Please sign in to comment.