Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for Euro character encode properly in GBK/GB18030. #115

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 35 additions & 19 deletions encodings/dbcs-codec.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ function DBCSCodec(codecOptions, iconv) {
this.decodeTables = [];
this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

// Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
// Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
this.decodeTableSeq = [];

// Actual mapping tables consist of chunks. Use them to fill up decode tables.
Expand All @@ -50,7 +50,7 @@ function DBCSCodec(codecOptions, iconv) {

this.defaultCharUnicode = iconv.defaultCharUnicode;


// Encode tables: Unicode -> DBCS.

// `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
Expand All @@ -59,7 +59,7 @@ function DBCSCodec(codecOptions, iconv) {
// == UNASSIGNED -> no conversion found. Output a default char.
// <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence.
this.encodeTable = [];

// `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
// objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
// means end of sequence (needed when one sequence is a strict subsequence of another).
Expand All @@ -77,7 +77,7 @@ function DBCSCodec(codecOptions, iconv) {
for (var j = val.from; j <= val.to; j++)
skipEncodeChars[j] = true;
}

// Use decode trie to recursively fill out encode tables.
this._fillEncodeTable(0, 0, skipEncodeChars);

Expand Down Expand Up @@ -114,7 +114,7 @@ function DBCSCodec(codecOptions, iconv) {
thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
for (var i = 0x30; i <= 0x39; i++)
fourthByteNode[i] = GB18030_CODE
}
}
}

DBCSCodec.prototype.encoder = DBCSEncoder;
Expand Down Expand Up @@ -179,7 +179,7 @@ DBCSCodec.prototype._addDecodeChunk = function(chunk) {
else
writeTable[curAddr++] = code; // Basic char
}
}
}
else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
var charCode = writeTable[curAddr - 1] + 1;
for (var l = 0; l < part; l++)
Expand Down Expand Up @@ -210,7 +210,7 @@ DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
}

DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

// Get the root of character tree according to first character of the sequence.
var uCode = seq[0];
var bucket = this._getEncodeBucket(uCode);
Expand Down Expand Up @@ -263,6 +263,20 @@ DBCSCodec.prototype._fillEncodeTable = function(nodeIdx, prefix, skipEncodeChars
}
}

// Cache of Euro-enabled encode tables, one entry per source table: {source: Array, table: Array}.
// A single shared slot is not sufficient: every codec instance builds its own `encodeTable`, so the
// first table processed would otherwise be handed back to encoders of every other codec as well.
var euroEncodeTableCache = [];

/**
 * Returns a variant of `encodeTable` in which the Euro sign (U+20AC) is encoded as 0xA2E3.
 *
 * The source table is never modified: the top-level array and the one affected row are copied,
 * all other rows are shared with the source. Results are memoised per source table, so calling
 * this repeatedly with the same table returns the same array.
 *
 * @param {Array} encodeTable  Two-level table: encodeTable[uCode >> 8][uCode & 0xFF] -> dbcs code.
 * @returns {Array} Table with the same layout, with the Euro sign remapped.
 */
function getEuroEncodeTable(encodeTable) {
    for (var i = 0; i < euroEncodeTableCache.length; i++) {
        if (euroEncodeTableCache[i].source === encodeTable)
            return euroEncodeTableCache[i].table;
    }

    // Numeric code point instead of a '€' literal, so the result does not depend on how this
    // source file happens to be decoded.
    var EURO_UCODE = 0x20AC;
    var EURO_DBCS_CODE = 0xA2E3;
    var high = EURO_UCODE >> 8,
        low = EURO_UCODE & 0xFF;

    var table = encodeTable.slice(0);
    var row = table[high];
    // NOTE(review): if the source table has no row for this high byte, the new row contains only
    // the Euro entry and every other slot stays undefined - confirm the encoder tolerates that.
    row = (row === undefined) ? [] : row.slice(0);
    row[low] = EURO_DBCS_CODE;
    table[high] = row;

    euroEncodeTableCache.push({source: encodeTable, table: table});
    return table;
}


// == Encoder ==================================================================
Expand All @@ -271,16 +285,19 @@ function DBCSEncoder(options, codec) {
// Encoder state
this.leadSurrogate = -1;
this.seqObj = undefined;

// Static data
this.encodeTable = codec.encodeTable;
this.encodeTableSeq = codec.encodeTableSeq;
this.defaultCharSingleByte = codec.defCharSB;
this.gb18030 = codec.gb18030;
if (this.gb18030 || (options && options.encodeEuro)) {
this.encodeTable = getEuroEncodeTable(this.encodeTable)
}
}

DBCSEncoder.prototype.write = function(str) {
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)),
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)),
leadSurrogate = this.leadSurrogate,
seqObj = this.seqObj, nextChar = -1,
i = 0, j = 0;
Expand All @@ -293,7 +310,7 @@ DBCSEncoder.prototype.write = function(str) {
}
else {
var uCode = nextChar;
nextChar = -1;
nextChar = -1;
}

// 1. Handle surrogates.
Expand All @@ -315,7 +332,7 @@ DBCSEncoder.prototype.write = function(str) {
// Incomplete surrogate pair - only trail surrogate found.
uCode = UNASSIGNED;
}

}
}
else if (leadSurrogate !== -1) {
Expand Down Expand Up @@ -356,7 +373,7 @@ DBCSEncoder.prototype.write = function(str) {
var subtable = this.encodeTable[uCode >> 8];
if (subtable !== undefined)
dbcsCode = subtable[uCode & 0xFF];

if (dbcsCode <= SEQ_START) { // Sequence start
seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
continue;
Expand All @@ -379,7 +396,7 @@ DBCSEncoder.prototype.write = function(str) {
// 3. Write dbcsCode character.
if (dbcsCode === UNASSIGNED)
dbcsCode = this.defaultCharSingleByte;

if (dbcsCode < 0x100) {
newBuf[j++] = dbcsCode;
}
Expand Down Expand Up @@ -426,7 +443,7 @@ DBCSEncoder.prototype.end = function() {
newBuf[j++] = this.defaultCharSingleByte;
this.leadSurrogate = -1;
}

return newBuf.slice(0, j);
}

Expand All @@ -450,21 +467,21 @@ function DBCSDecoder(options, codec) {

DBCSDecoder.prototype.write = function(buf) {
var newBuf = new Buffer(buf.length*2),
nodeIdx = this.nodeIdx,
nodeIdx = this.nodeIdx,
prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length,
seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence.
uCode;

if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

for (var i = 0, j = 0; i < buf.length; i++) {
var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

// Lookup in current trie node.
var uCode = this.decodeTables[nodeIdx][curByte];

if (uCode >= 0) {
if (uCode >= 0) {
// Normal character, just use it.
}
else if (uCode === UNASSIGNED) { // Unknown char.
Expand Down Expand Up @@ -496,7 +513,7 @@ DBCSDecoder.prototype.write = function(buf) {
throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

// Write the character to buffer, handling higher planes using surrogate pair.
if (uCode > 0xFFFF) {
if (uCode > 0xFFFF) {
uCode -= 0x10000;
var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
newBuf[j++] = uCodeLead & 0xFF;
Expand Down Expand Up @@ -551,4 +568,3 @@ function findIdx(table, val) {
}
return l;
}

27 changes: 18 additions & 9 deletions encodings/dbcs-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,21 @@
// Description of supported double byte encodings and aliases.
// Tables are not require()-d until they are needed to speed up library load.
// require()-s are direct to support Browserify.
var savedGbkTable
var gbkTableGetter = function() {
if (savedGbkTable) {
return savedGbkTable
}
savedGbkTable = require('./tables/cp936.json').concat(require('./tables/gbk-added.json'))
return savedGbkTable
}

module.exports = {

// == Japanese/ShiftJIS ====================================================
// All japanese encodings are based on JIS X set of standards:
// JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
// JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
// JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
// Has several variations in 1978, 1983, 1990 and 1997.
// JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
// JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
Expand All @@ -27,7 +35,7 @@ module.exports = {
// 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
// * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
// Used as-is in ISO2022 family.
// * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
// * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
// 0201-1976 Roman, 0208-1978, 0208-1983.
// * ISO2022-JP-1: Adds esc seq for 0212-1990.
// * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
Expand Down Expand Up @@ -77,7 +85,9 @@ module.exports = {
'isoir58': 'gbk',

// Microsoft's CP936 is a subset and approximation of GBK.
// TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
// Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
// The Euro byte (0x80) can be decoded in any of CP936/GBK/GB18030.
// When encoding, however, the Euro sign is written as 0xA2E3 instead of 0x80 if the codec is GB18030
// or the `encodeEuro: true` option is passed to the encoder.
'windows936': 'cp936',
'936': 'cp936',
'cp936': {
Expand All @@ -88,20 +98,19 @@ module.exports = {
// GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
'gbk': {
type: '_dbcs',
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
table: gbkTableGetter,
},
'xgbk': 'gbk',

// GB18030 is an algorithmic extension of GBK.
'gb18030': {
type: '_dbcs',
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
table: gbkTableGetter,
gb18030: function() { return require('./tables/gb18030-ranges.json') },
},

'chinese': 'gb18030',

// TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936)
// http://icu-project.org/docs/papers/gb18030.html
// http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
// http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
Expand Down Expand Up @@ -133,7 +142,7 @@ module.exports = {
// * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
// * Big5-2003 (Taiwan standard) almost superset of cp950.
// * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
// * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
// * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
// many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
// Plus, it has 4 combining sequences.
// Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
Expand All @@ -144,7 +153,7 @@ module.exports = {
// In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
// Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
// http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
//
//
// Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
// Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.

Expand Down
57 changes: 54 additions & 3 deletions test/gbk-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@ var testString = "中国abc",//unicode contains GBK-code and ascii
testStringGBKBuffer = new Buffer([0xd6,0xd0,0xb9,0xfa,0x61,0x62,0x63]);

describe("GBK tests", function() {
it("GBK correctly encoded/decoded", function() {
it("GBK correctly encoded/decoded", function() {
assert.strictEqual(iconv.encode(testString, "GBK").toString('binary'), testStringGBKBuffer.toString('binary'));
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GBK"), testString);
});

it("GB2312 correctly encoded/decoded", function() {
it("GB2312 correctly encoded/decoded", function() {
assert.strictEqual(iconv.encode(testString, "GB2312").toString('binary'), testStringGBKBuffer.toString('binary'));
assert.strictEqual(iconv.decode(testStringGBKBuffer, "GB2312"), testString);
});
Expand Down Expand Up @@ -90,5 +90,56 @@ describe("GBK tests", function() {
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
}
});

});

// Euro sign handling in the CP936 / GBK / GB18030 family, as asserted below:
//  * byte 0x80 decodes to the Euro sign under every one of these names;
//  * bytes 0xA2 0xE3 decode to the Euro sign only under 'gbk' and 'gb18030';
//  * encoding produces 0xA2E3 for 'gb18030' or when `encodeEuro: true` is given, and 0x80 otherwise.
// Strict assertions are used to match the rest of this test file.
describe('testing the cp936/gbk euro dollor symbol', function () {
    it('test cp936 decode Euro dollor symbol', function () {
        var str;

        // Single byte 0x80 is the Euro sign in all four encodings.
        str = iconv.decode(new Buffer([0x80]), 'gb2312');
        assert.strictEqual(str, '€');

        str = iconv.decode(new Buffer([0x80]), 'cp936');
        assert.strictEqual(str, '€');

        str = iconv.decode(new Buffer([0x80]), 'gbk');
        assert.strictEqual(str, '€');

        str = iconv.decode(new Buffer([0x80]), 'gb18030');
        assert.strictEqual(str, '€');

        // 0xA2E3 is unmapped in gb2312/cp936 (decodes to U+FFFD) ...
        str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gb2312');
        assert.strictEqual(str.charCodeAt(0), 0xfffD);

        str = iconv.decode(new Buffer([0xa2, 0xe3]), 'cp936');
        assert.strictEqual(str.charCodeAt(0), 0xfffD);

        // ... but is the Euro sign in gbk/gb18030.
        str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gbk');
        assert.strictEqual(str, '€');

        str = iconv.decode(new Buffer([0xa2, 0xe3]), 'gb18030');
        assert.strictEqual(str, '€');
    });

    it('test cp936 encode Euro dollor symbol', function () {
        var buffer;

        buffer = iconv.encode('€', 'gb2312');
        assert.strictEqual(buffer.toString('hex'), '80');

        buffer = iconv.encode('€', 'cp936');
        assert.strictEqual(buffer.toString('hex'), '80');

        // encodeEuro defaults to false: omitting it must behave like passing `false` explicitly.
        buffer = iconv.encode('€', 'gbk');
        assert.strictEqual(buffer.toString('hex'), '80');

        buffer = iconv.encode('€', 'gbk', {encodeEuro: false});
        assert.strictEqual(buffer.toString('hex'), '80');

        buffer = iconv.encode('€', 'gbk', {encodeEuro: true});
        assert.strictEqual(buffer.toString('hex'), 'a2e3');

        // GB18030 always uses the two-byte form.
        // https://en.wikipedia.org/wiki/GB_18030
        buffer = iconv.encode('€', 'gb18030');
        assert.strictEqual(buffer.toString('hex'), 'a2e3');
    });
});