Skip to content

Commit

Permalink
Support for Euro character encode properly in GBK/GB18030.
Browse files Browse the repository at this point in the history
GB18030 already supported.
    // TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936)
  • Loading branch information
lygstate committed Jan 16, 2016
1 parent 69a25dc commit 0fe89e5
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 28 deletions.
41 changes: 22 additions & 19 deletions encodings/dbcs-codec.js
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ function DBCSCodec(codecOptions, iconv) {
this.decodeTables = [];
this.decodeTables[0] = UNASSIGNED_NODE.slice(0); // Create root node.

// Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
// Sometimes a MBCS char corresponds to a sequence of unicode chars. We store them as arrays of integers here.
this.decodeTableSeq = [];

// Actual mapping tables consist of chunks. Use them to fill up decode tables.
Expand All @@ -50,7 +50,7 @@ function DBCSCodec(codecOptions, iconv) {

this.defaultCharUnicode = iconv.defaultCharUnicode;


// Encode tables: Unicode -> DBCS.

// `encodeTable` is array mapping from unicode char to encoded char. All its values are integers for performance.
Expand All @@ -59,7 +59,7 @@ function DBCSCodec(codecOptions, iconv) {
// == UNASSIGNED -> no conversion found. Output a default char.
// <= SEQ_START -> it's an index in encodeTableSeq, see below. The character starts a sequence.
this.encodeTable = [];

// `encodeTableSeq` is used when a sequence of unicode characters is encoded as a single code. We use a tree of
// objects where keys correspond to characters in sequence and leafs are the encoded dbcs values. A special DEF_CHAR key
// means end of sequence (needed when one sequence is a strict subsequence of another).
Expand All @@ -77,7 +77,7 @@ function DBCSCodec(codecOptions, iconv) {
for (var j = val.from; j <= val.to; j++)
skipEncodeChars[j] = true;
}

// Use decode trie to recursively fill out encode tables.
this._fillEncodeTable(0, 0, skipEncodeChars);

Expand Down Expand Up @@ -114,7 +114,11 @@ function DBCSCodec(codecOptions, iconv) {
thirdByteNode[i] = NODE_START - fourthByteNodeIdx;
for (var i = 0x30; i <= 0x39; i++)
fourthByteNode[i] = GB18030_CODE
}
}
if (this.gb18030 || codecOptions.encodeEuro) {
var charCode = '€'.charCodeAt(0)
this.encodeTable[charCode >> 8][charCode & 0xFF] = 0xa2e3
}
}

DBCSCodec.prototype.encoder = DBCSEncoder;
Expand Down Expand Up @@ -179,7 +183,7 @@ DBCSCodec.prototype._addDecodeChunk = function(chunk) {
else
writeTable[curAddr++] = code; // Basic char
}
}
}
else if (typeof part === "number") { // Integer, meaning increasing sequence starting with prev character.
var charCode = writeTable[curAddr - 1] + 1;
for (var l = 0; l < part; l++)
Expand Down Expand Up @@ -210,7 +214,7 @@ DBCSCodec.prototype._setEncodeChar = function(uCode, dbcsCode) {
}

DBCSCodec.prototype._setEncodeSequence = function(seq, dbcsCode) {

// Get the root of character tree according to first character of the sequence.
var uCode = seq[0];
var bucket = this._getEncodeBucket(uCode);
Expand Down Expand Up @@ -271,7 +275,7 @@ function DBCSEncoder(options, codec) {
// Encoder state
this.leadSurrogate = -1;
this.seqObj = undefined;

// Static data
this.encodeTable = codec.encodeTable;
this.encodeTableSeq = codec.encodeTableSeq;
Expand All @@ -280,7 +284,7 @@ function DBCSEncoder(options, codec) {
}

DBCSEncoder.prototype.write = function(str) {
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)),
var newBuf = new Buffer(str.length * (this.gb18030 ? 4 : 3)),
leadSurrogate = this.leadSurrogate,
seqObj = this.seqObj, nextChar = -1,
i = 0, j = 0;
Expand All @@ -293,7 +297,7 @@ DBCSEncoder.prototype.write = function(str) {
}
else {
var uCode = nextChar;
nextChar = -1;
nextChar = -1;
}

// 1. Handle surrogates.
Expand All @@ -315,7 +319,7 @@ DBCSEncoder.prototype.write = function(str) {
// Incomplete surrogate pair - only trail surrogate found.
uCode = UNASSIGNED;
}

}
}
else if (leadSurrogate !== -1) {
Expand Down Expand Up @@ -356,7 +360,7 @@ DBCSEncoder.prototype.write = function(str) {
var subtable = this.encodeTable[uCode >> 8];
if (subtable !== undefined)
dbcsCode = subtable[uCode & 0xFF];

if (dbcsCode <= SEQ_START) { // Sequence start
seqObj = this.encodeTableSeq[SEQ_START-dbcsCode];
continue;
Expand All @@ -379,7 +383,7 @@ DBCSEncoder.prototype.write = function(str) {
// 3. Write dbcsCode character.
if (dbcsCode === UNASSIGNED)
dbcsCode = this.defaultCharSingleByte;

if (dbcsCode < 0x100) {
newBuf[j++] = dbcsCode;
}
Expand Down Expand Up @@ -426,7 +430,7 @@ DBCSEncoder.prototype.end = function() {
newBuf[j++] = this.defaultCharSingleByte;
this.leadSurrogate = -1;
}

return newBuf.slice(0, j);
}

Expand All @@ -450,21 +454,21 @@ function DBCSDecoder(options, codec) {

DBCSDecoder.prototype.write = function(buf) {
var newBuf = new Buffer(buf.length*2),
nodeIdx = this.nodeIdx,
nodeIdx = this.nodeIdx,
prevBuf = this.prevBuf, prevBufOffset = this.prevBuf.length,
seqStart = -this.prevBuf.length, // idx of the start of current parsed sequence.
uCode;

if (prevBufOffset > 0) // Make prev buf overlap a little to make it easier to slice later.
prevBuf = Buffer.concat([prevBuf, buf.slice(0, 10)]);

for (var i = 0, j = 0; i < buf.length; i++) {
var curByte = (i >= 0) ? buf[i] : prevBuf[i + prevBufOffset];

// Lookup in current trie node.
var uCode = this.decodeTables[nodeIdx][curByte];

if (uCode >= 0) {
if (uCode >= 0) {
// Normal character, just use it.
}
else if (uCode === UNASSIGNED) { // Unknown char.
Expand Down Expand Up @@ -496,7 +500,7 @@ DBCSDecoder.prototype.write = function(buf) {
throw new Error("iconv-lite internal error: invalid decoding table value " + uCode + " at " + nodeIdx + "/" + curByte);

// Write the character to buffer, handling higher planes using surrogate pair.
if (uCode > 0xFFFF) {
if (uCode > 0xFFFF) {
uCode -= 0x10000;
var uCodeLead = 0xD800 + Math.floor(uCode / 0x400);
newBuf[j++] = uCodeLead & 0xFF;
Expand Down Expand Up @@ -551,4 +555,3 @@ function findIdx(table, val) {
}
return l;
}

26 changes: 18 additions & 8 deletions encodings/dbcs-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,21 @@
// Description of supported double byte encodings and aliases.
// Tables are not require()-d until they are needed to speed up library load.
// require()-s are direct to support Browserify.
var savedGbkTable
var gbkTableGetter = function() {
if (savedGbkTable) {
return savedGbkTable
}
savedGbkTable = require('./tables/cp936.json').concat(require('./tables/gbk-added.json'))
return savedGbkTable
}

module.exports = {

// == Japanese/ShiftJIS ====================================================
// All japanese encodings are based on JIS X set of standards:
// JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF.
// JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
// JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes.
// Has several variations in 1978, 1983, 1990 and 1997.
// JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead.
// JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233.
Expand All @@ -27,7 +35,7 @@ module.exports = {
// 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94).
// * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon.
// Used as-is in ISO2022 family.
// * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
// * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII,
// 0201-1976 Roman, 0208-1978, 0208-1983.
// * ISO2022-JP-1: Adds esc seq for 0212-1990.
// * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7.
Expand Down Expand Up @@ -77,7 +85,9 @@ module.exports = {
'isoir58': 'gbk',

// Microsoft's CP936 is a subset and approximation of GBK.
// TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
// Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined)
// We decode Euro (0x80) in all of CP936/GBK/GB18030,
// but we do not encode it as 0x80 when the codec is GB18030 or has the `encodeEuro: true` codec option; 0xA2E3 is used instead.
'windows936': 'cp936',
'936': 'cp936',
'cp936': {
Expand All @@ -88,7 +98,8 @@ module.exports = {
// GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other.
'gbk': {
type: '_dbcs',
table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) },
table: gbkTableGetter,
encodeEuro: true,
},
'xgbk': 'gbk',

Expand All @@ -101,7 +112,6 @@ module.exports = {

'chinese': 'gb18030',

// TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936)
// http://icu-project.org/docs/papers/gb18030.html
// http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
// http://www.khngai.com/chinese/charmap/tblgbk.php?page=0
Expand Down Expand Up @@ -133,7 +143,7 @@ module.exports = {
// * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/
// * Big5-2003 (Taiwan standard) almost superset of cp950.
// * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers.
// * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
// * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard.
// many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years.
// Plus, it has 4 combining sequences.
// Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299
Expand All @@ -144,7 +154,7 @@ module.exports = {
// In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s.
// Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt
// http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt
//
//
// Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder
// Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.

Expand Down
36 changes: 35 additions & 1 deletion test/gbk-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,39 @@ describe("GBK tests", function() {
assert.strictEqual(strToHex(iconv.decode(gbkBuf, "GB18030")), strToHex(uChar));
}
});

});

// Euro sign handling in the GBK family of encodings (gb2312, cp936, gbk, gb18030).
// Uses strict equality, matching the other assertions in this file.
describe('testing the cp936/gbk euro dollor symbol', function () {
    // The single byte 0x80 must decode to the Euro sign in every encoding of the family.
    it('test cp936 decode Euro dollor symbol', function () {
        assert.strictEqual(iconv.decode(new Buffer([0x80]), 'gb2312'), '€');
        assert.strictEqual(iconv.decode(new Buffer([0x80]), 'cp936'), '€');
        assert.strictEqual(iconv.decode(new Buffer([0x80]), 'gbk'), '€');
        assert.strictEqual(iconv.decode(new Buffer([0x80]), 'gb18030'), '€');
    });

    // gb2312 and cp936 encode the Euro sign as the single byte 0x80,
    // while gbk and gb18030 must produce the two-byte code 0xA2E3.
    it('test cp936 encode Euro dollor symbol', function () {
        assert.strictEqual(iconv.encode('€', 'gb2312').toString('hex'), '80');
        assert.strictEqual(iconv.encode('€', 'cp936').toString('hex'), '80');
        assert.strictEqual(iconv.encode('€', 'gbk').toString('hex'), 'a2e3');

        // https://en.wikipedia.org/wiki/GB_18030
        assert.strictEqual(iconv.encode('€', 'gb18030').toString('hex'), 'a2e3');
    });
});

0 comments on commit 0fe89e5

Please sign in to comment.