diff --git a/encodings/dbcs-data.js b/encodings/dbcs-data.js index 2bf7415..c3dc05d 100644 --- a/encodings/dbcs-data.js +++ b/encodings/dbcs-data.js @@ -3,13 +3,21 @@ // Description of supported double byte encodings and aliases. // Tables are not require()-d until they are needed to speed up library load. // require()-s are direct to support Browserify. +var savedGbkTable +var gbkTableGetter = function() { + if (savedGbkTable) { + return savedGbkTable + } + savedGbkTable = require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) + return savedGbkTable +} module.exports = { - + // == Japanese/ShiftJIS ==================================================== // All japanese encodings are based on JIS X set of standards: // JIS X 0201 - Single-byte encoding of ASCII + ¥ + Kana chars at 0xA1-0xDF. - // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. + // JIS X 0208 - Main set of 6879 characters, placed in 94x94 plane, to be encoded by 2 bytes. // Has several variations in 1978, 1983, 1990 and 1997. // JIS X 0212 - Supplementary plane of 6067 chars in 94x94 plane. 1990. Effectively dead. // JIS X 0213 - Extension and modern replacement of 0208 and 0212. Total chars: 11233. @@ -27,7 +35,7 @@ module.exports = { // 0x8F, (0xA1-0xFE)x2 - 0212 plane (94x94). // * JIS X 208: 7-bit, direct encoding of 0208. Byte ranges: 0x21-0x7E (94 values). Uncommon. // Used as-is in ISO2022 family. - // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, + // * ISO2022-JP: Stateful encoding, with escape sequences to switch between ASCII, // 0201-1976 Roman, 0208-1978, 0208-1983. // * ISO2022-JP-1: Adds esc seq for 0212-1990. // * ISO2022-JP-2: Adds esc seq for GB2313-1980, KSX1001-1992, ISO8859-1, ISO8859-7. @@ -77,7 +85,9 @@ module.exports = { 'isoir58': 'gbk', // Microsoft's CP936 is a subset and approximation of GBK. - // TODO: Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined) + // Euro = 0x80 in cp936, but not in GBK (where it's valid but undefined) + // We could be able to decode Euro(0x80) in any of CP936/GBK/GB18030 + // But we would not encoding it to 0x80 when the codec is GB18030 or encodeEuro === true option in decode option 'windows936': 'cp936', '936': 'cp936', 'cp936': { @@ -88,20 +98,19 @@ module.exports = { // GBK (~22000 chars) is an extension of CP936 that added user-mapped chars and some other. 'gbk': { type: '_dbcs', - table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, + table: gbkTableGetter, }, 'xgbk': 'gbk', // GB18030 is an algorithmic extension of GBK. 'gb18030': { type: '_dbcs', - table: function() { return require('./tables/cp936.json').concat(require('./tables/gbk-added.json')) }, + table: gbkTableGetter, gb18030: function() { return require('./tables/gb18030-ranges.json') }, }, 'chinese': 'gb18030', - // TODO: Support GB18030 (~27000 chars + whole unicode mapping, cp54936) // http://icu-project.org/docs/papers/gb18030.html // http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml // http://www.khngai.com/chinese/charmap/tblgbk.php?page=0 @@ -133,7 +142,7 @@ module.exports = { // * Windows CP 951: Microsoft variant of Big5-HKSCS-2001. Seems to be never public. http://me.abelcheung.org/articles/research/what-is-cp951/ // * Big5-2003 (Taiwan standard) almost superset of cp950. // * Unicode-at-on (UAO) / Mozilla 1.8. Falling out of use on the Web. Not supported by other browsers. - // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. + // * Big5-HKSCS (-2001, -2004, -2008). Hong Kong standard. // many unicode code points moved from PUA to Supplementary plane (U+2XXXX) over the years. // Plus, it has 4 combining sequences. // Seems that Mozilla refused to support it for 10 yrs. https://bugzilla.mozilla.org/show_bug.cgi?id=162431 https://bugzilla.mozilla.org/show_bug.cgi?id=310299 @@ -144,7 +153,7 @@ module.exports = { // In the encoder, it might make sense to support encoding old PUA mappings to Big5 bytes seq-s. // Official spec: http://www.ogcio.gov.hk/en/business/tech_promotion/ccli/terms/doc/2003cmp_2008.txt // http://www.ogcio.gov.hk/tc/business/tech_promotion/ccli/terms/doc/hkscs-2008-big5-iso.txt - // + // // Current understanding of how to deal with Big5(-HKSCS) is in the Encoding Standard, http://encoding.spec.whatwg.org/#big5-encoder // Unicode mapping (http://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT) is said to be wrong.