Skip to content

Commit

Permalink
feat: Various Character Encoding Improvements (#270)
Browse files Browse the repository at this point in the history
* Support HTML5 charset tag

In HTML5, `<meta charset="…">` is shorthand for `<meta http-equiv="content-type" content="text/html; charset=…">`.
https://developer.mozilla.org/en-US/docs/Web/HTML/Element/meta

* Handle more character encoding declaration methods.
  • Loading branch information
benubois authored and adampash committed Feb 12, 2019
1 parent b3fa18b commit 0e27448
Show file tree
Hide file tree
Showing 5 changed files with 402 additions and 209 deletions.
559 changes: 360 additions & 199 deletions fixtures/nock/resource-test.js

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions src/resource/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,13 @@ const Resource = {
let $ = cheerio.load(decodedContent);

// after first cheerio.load, check to see if encoding matches
const metaContentType = $('meta[http-equiv=content-type]').attr('content');
const metaContentType =
$('meta[http-equiv=content-type i]').attr('content') ||
$('meta[charset]').attr('charset');
const properEncoding = getEncoding(metaContentType);

// if encodings in the header/body don't match, use the one in the body
if (properEncoding !== encoding) {
if (metaContentType && properEncoding !== encoding) {
decodedContent = iconv.decode(content, properEncoding);
$ = cheerio.load(decodedContent);
}
Expand Down
30 changes: 30 additions & 0 deletions src/resource/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,36 @@ describe('Resource', () => {
assert.equal(typeof $, 'function');
});

it('fetches with different encoding and case insensitive regex', async () => {
  // The content-type <meta> is looked up with a case-insensitive attribute
  // selector (the trailing `i` flag).
  const $ = await Resource.create(
    'https://www.finam.ru/analysis/newsitem/putin-nagradil-grefa-ordenom-20190208-203615/'
  );

  // NOTE(review): this reads the `value` attribute rather than `content` —
  // confirm that this matches the markup in the recorded fixture.
  const declaredContentType = $('meta[http-equiv=content-type i]').attr('value');
  assert.equal(getEncoding(declaredContentType), 'windows-1251');

  // The test treats a U+FFFD entity in the serialized document as the sign
  // of a bad decode.
  const hasBadEncoding = /&#xFFFD;/g.test($.html());
  assert.equal(hasBadEncoding, false);
  assert.equal(typeof $, 'function');
});

it('fetches with different encoding and HTML5 charset tag', async () => {
  // This case reads the encoding from the HTML5 `<meta charset>` attribute.
  const $ = await Resource.create(
    'https://www.idnes.cz/fotbal/prvni-liga/fotbalova-liga-8-kolo-slovan-liberec-slovacko.A170925_173123_fotbal_min'
  );

  const declaredCharset = $('meta[charset]').attr('charset');
  assert.equal(getEncoding(declaredCharset), 'windows-1250');

  // The test treats a U+FFFD entity in the serialized document as the sign
  // of a bad decode.
  const hasBadEncoding = /&#xFFFD;/g.test($.html());
  assert.equal(hasBadEncoding, false);
  assert.equal(typeof $, 'function');
});

it('handles special encoding', async () => {
const url =
'http://www.elmundo.es/opinion/2016/11/19/582f476846163fc65a8b4578.html';
Expand Down
5 changes: 2 additions & 3 deletions src/test-helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,11 @@ export function record(name, options = {}) {
// eslint-disable-next-line no-console
console.log(
`This is disabled for browser/node interop. To capture fixutres,
open ${'`src/test-helpers.js`'} and comment out lines 57 and 58 and
uncomment the fs import at top of file.`
open ${'`src/test-helpers.js`'} and uncomment lines 58 and 59 and
the fs import at top of file.`
);
// const text = `const nock = require('nock');\n${has_fixtures.join('\n')}`;
// fs.writeFile(fp, text, done);

} else {
done();
}
Expand Down
11 changes: 6 additions & 5 deletions src/utils/text/get-encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ import { DEFAULT_ENCODING, ENCODING_RE } from './constants';
// ensure correctly encoded responses
/**
 * Resolve the character encoding named by `str`.
 *
 * `str` may be a full content-type value matched by ENCODING_RE (its first
 * capture group is taken as the encoding name) or a bare encoding name, such
 * as the value of an HTML5 `<meta charset>` attribute.
 *
 * @param {string} str - Content-type value or bare encoding name.
 * @returns {string} The candidate name when `iconv.encodingExists` accepts
 *   it, otherwise DEFAULT_ENCODING.
 */
export default function getEncoding(str) {
  const matches = ENCODING_RE.exec(str);
  // When the pattern does not match, the whole input is the candidate name.
  const candidate = matches === null ? str : matches[1];
  return iconv.encodingExists(candidate) ? candidate : DEFAULT_ENCODING;
}

0 comments on commit 0e27448

Please sign in to comment.