Skip to content

Commit

Permalink
Represent cid chars using integers, not strings.
Browse files Browse the repository at this point in the history
cid chars are 16-bit unsigned integers. Currently we convert them to
single-char strings when inserting them into the CMap, and then convert
them back to integers when extracting them from the CMap. This patch
changes CMap so that cid chars stay in integer format throughout, saving
both time and space.

When loading the PDF from issue #4580, this change reduces peak RSS from
~600 MiB to ~370 MiB. It also reduces overall running time on that PDF by
~26%, going from 724 ms to 533 ms.
  • Loading branch information
nnethercote committed Aug 1, 2014
1 parent ad2ea78 commit adf58ed
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 33 deletions.
54 changes: 32 additions & 22 deletions src/core/cmap.js
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ var CMap = (function CMapClosure() {
// where nBytePairs are ranges e.g. [low1, high1, low2, high2, ...]
this.codespaceRanges = [[], [], [], []];
this.numCodespaceRanges = 0;
// Map entries have one of two forms.
// - cid chars are 16-bit unsigned integers, stored as integers.
// - bf chars are variable-length byte sequences, stored as strings, with
// one byte per character.
this._map = [];
this.vertical = false;
this.useCMap = null;
Expand All @@ -210,25 +214,31 @@ var CMap = (function CMapClosure() {
this.numCodespaceRanges++;
},

mapRange: function(low, high, dstLow) {
mapCidRange: function(low, high, dstLow) {
while (low <= high) {
this._map[low++] = dstLow++;
}
},

mapBfRange: function(low, high, dstLow) {
var lastByte = dstLow.length - 1;
while (low <= high) {
this._map[low] = dstLow;
this._map[low++] = dstLow;
// Only the last byte has to be incremented.
dstLow = dstLow.substr(0, lastByte) +
String.fromCharCode(dstLow.charCodeAt(lastByte) + 1);
++low;
}
},

mapRangeToArray: function(low, high, array) {
mapBfRangeToArray: function(low, high, array) {
var i = 0, ii = array.length;
while (low <= high && i < ii) {
this._map[low] = array[i++];
++low;
}
},

// This is used for both bf and cid chars.
mapOne: function(src, dst) {
this._map[src] = dst;
},
Expand Down Expand Up @@ -302,7 +312,7 @@ var IdentityCMap = (function IdentityCMapClosure() {
CMap.call(this);
this.vertical = vertical;
this.addCodespaceRange(n, 0, 0xffff);
this.mapRange(0, 0xffff, '\u0000');
this.mapCidRange(0, 0xffff, 0);
}
Util.inherit(IdentityCMap, CMap, {});

Expand Down Expand Up @@ -522,24 +532,24 @@ var BinaryCMapReader = (function BinaryCMapReaderClosure() {
case 2: // cidchar
stream.readHex(char, dataSize);
code = stream.readNumber();
cMap.mapOne(hexToInt(char, dataSize), String.fromCharCode(code));
cMap.mapOne(hexToInt(char, dataSize), code);
for (i = 1; i < subitemsCount; i++) {
incHex(char, dataSize);
if (!sequence) {
stream.readHexNumber(tmp, dataSize);
addHex(char, tmp, dataSize);
}
code = stream.readSigned() + (code + 1);
cMap.mapOne(hexToInt(char, dataSize), String.fromCharCode(code));
cMap.mapOne(hexToInt(char, dataSize), code);
}
break;
case 3: // cidrange
stream.readHex(start, dataSize);
stream.readHexNumber(end, dataSize);
addHex(end, start, dataSize);
code = stream.readNumber();
cMap.mapRange(hexToInt(start, dataSize), hexToInt(end, dataSize),
String.fromCharCode(code));
cMap.mapCidRange(hexToInt(start, dataSize), hexToInt(end, dataSize),
code);
for (i = 1; i < subitemsCount; i++) {
incHex(end, dataSize);
if (!sequence) {
Expand All @@ -551,8 +561,8 @@ var BinaryCMapReader = (function BinaryCMapReaderClosure() {
stream.readHexNumber(end, dataSize);
addHex(end, start, dataSize);
code = stream.readNumber();
cMap.mapRange(hexToInt(start, dataSize), hexToInt(end, dataSize),
String.fromCharCode(code));
cMap.mapCidRange(hexToInt(start, dataSize), hexToInt(end, dataSize),
code);
}
break;
case 4: // bfchar
Expand All @@ -578,9 +588,9 @@ var BinaryCMapReader = (function BinaryCMapReaderClosure() {
stream.readHexNumber(end, ucs2DataSize);
addHex(end, start, ucs2DataSize);
stream.readHex(charCode, dataSize);
cMap.mapRange(hexToInt(start, ucs2DataSize),
hexToInt(end, ucs2DataSize),
hexToStr(charCode, dataSize));
cMap.mapBfRange(hexToInt(start, ucs2DataSize),
hexToInt(end, ucs2DataSize),
hexToStr(charCode, dataSize));
for (i = 1; i < subitemsCount; i++) {
incHex(end, ucs2DataSize);
if (!sequence) {
Expand All @@ -592,9 +602,9 @@ var BinaryCMapReader = (function BinaryCMapReaderClosure() {
stream.readHexNumber(end, ucs2DataSize);
addHex(end, start, ucs2DataSize);
stream.readHex(charCode, dataSize);
cMap.mapRange(hexToInt(start, ucs2DataSize),
hexToInt(end, ucs2DataSize),
hexToStr(charCode, dataSize));
cMap.mapBfRange(hexToInt(start, ucs2DataSize),
hexToInt(end, ucs2DataSize),
hexToStr(charCode, dataSize));
}
break;
default:
Expand Down Expand Up @@ -675,15 +685,15 @@ var CMapFactory = (function CMapFactoryClosure() {
obj = lexer.getObj();
if (isInt(obj) || isString(obj)) {
var dstLow = isInt(obj) ? String.fromCharCode(obj) : obj;
cMap.mapRange(low, high, dstLow);
cMap.mapBfRange(low, high, dstLow);
} else if (isCmd(obj, '[')) {
obj = lexer.getObj();
var array = [];
while (!isCmd(obj, ']') && !isEOF(obj)) {
array.push(obj);
obj = lexer.getObj();
}
cMap.mapRangeToArray(low, high, array);
cMap.mapBfRangeToArray(low, high, array);
} else {
break;
}
Expand All @@ -704,7 +714,7 @@ var CMapFactory = (function CMapFactoryClosure() {
var src = strToInt(obj);
obj = lexer.getObj();
expectInt(obj);
var dst = String.fromCharCode(obj);
var dst = obj;
cMap.mapOne(src, dst);
}
}
Expand All @@ -725,8 +735,8 @@ var CMapFactory = (function CMapFactoryClosure() {
var high = strToInt(obj);
obj = lexer.getObj();
expectInt(obj);
var dstLow = String.fromCharCode(obj);
cMap.mapRange(low, high, dstLow);
var dstLow = obj;
cMap.mapCidRange(low, high, dstLow);
}
}

Expand Down
15 changes: 7 additions & 8 deletions src/core/fonts.js
Original file line number Diff line number Diff line change
Expand Up @@ -3899,8 +3899,7 @@ var Font = (function FontClosure() {
var cidToGidMap = properties.cidToGidMap || [];
var cidToGidMapLength = cidToGidMap.length;
properties.cMap.forEach(function(charCode, cid) {
assert(cid.length === 1, 'Max size of CID is 65,535');
cid = cid.charCodeAt(0);
assert(cid <= 0xffff, 'Max size of CID is 65,535');
var glyphId = -1;
if (cidToGidMapLength === 0) {
glyphId = charCode;
Expand Down Expand Up @@ -4370,10 +4369,10 @@ var Font = (function FontClosure() {
var cMap = properties.cMap;
toUnicode = [];
cMap.forEach(function(charcode, cid) {
assert(cid.length === 1, 'Max size of CID is 65,535');
assert(cid <= 0xffff, 'Max size of CID is 65,535');
// e) Map the CID obtained in step (a) according to the CMap obtained
// in step (d), producing a Unicode value.
var ucs2 = ucs2CMap.lookup(cid.charCodeAt(0));
var ucs2 = ucs2CMap.lookup(cid);
if (ucs2) {
toUnicode[charcode] =
String.fromCharCode((ucs2.charCodeAt(0) << 8) +
Expand Down Expand Up @@ -4415,7 +4414,7 @@ var Font = (function FontClosure() {
var charcode = 0;
if (this.composite) {
if (this.cMap.contains(glyphUnicode)) {
charcode = this.cMap.lookup(glyphUnicode).charCodeAt(0);
charcode = this.cMap.lookup(glyphUnicode);
}
}
// ... via toUnicode map
Expand Down Expand Up @@ -4444,7 +4443,7 @@ var Font = (function FontClosure() {

var widthCode = charcode;
if (this.cMap && this.cMap.contains(charcode)) {
widthCode = this.cMap.lookup(charcode).charCodeAt(0);
widthCode = this.cMap.lookup(charcode);
}
width = this.widths[widthCode];
width = isNum(width) ? width : this.defaultWidth;
Expand Down Expand Up @@ -5626,8 +5625,8 @@ var CFFFont = (function CFFFontClosure() {
// If the font is actually a CID font then we should use the charset
// to map CIDs to GIDs.
for (glyphId = 0; glyphId < charsets.length; glyphId++) {
var cidString = String.fromCharCode(charsets[glyphId]);
var charCode = properties.cMap.charCodeOf(cidString);
var cid = charsets[glyphId];
var charCode = properties.cMap.charCodeOf(cid);
charCodeToGlyphId[charCode] = glyphId;
}
} else {
Expand Down
6 changes: 3 additions & 3 deletions test/unit/cmap_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ describe('cmap', function() {
'endcidchar\n';
var stream = new StringStream(str);
var cmap = CMapFactory.create(stream);
expect(cmap.lookup(0x14)).toEqual(String.fromCharCode(0x00));
expect(cmap.lookup(0x14)).toEqual(0x00);
expect(cmap.lookup(0x15)).toBeUndefined();
});
it('parses begincidrange', function() {
Expand All @@ -54,8 +54,8 @@ describe('cmap', function() {
var stream = new StringStream(str);
var cmap = CMapFactory.create(stream);
expect(cmap.lookup(0x15)).toBeUndefined();
expect(cmap.lookup(0x16)).toEqual(String.fromCharCode(0x00));
expect(cmap.lookup(0x1B)).toEqual(String.fromCharCode(0x05));
expect(cmap.lookup(0x16)).toEqual(0x00);
expect(cmap.lookup(0x1B)).toEqual(0x05);
expect(cmap.lookup(0x1C)).toBeUndefined();
});
it('decodes codespace ranges', function() {
Expand Down

0 comments on commit adf58ed

Please sign in to comment.