Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: MIME RFC2047 Decoding #630

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
171 changes: 171 additions & 0 deletions src/core/operations/MIMEDecoding.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/**
* @author mshwed [[email protected]]
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/

import Operation from "../Operation";
import OperationError from "../errors/OperationError";
import Utils from "../Utils";
import { fromHex } from "../lib/Hex.mjs";
import { fromBase64 } from "../lib/Base64";
import cptable from "../vendor/js-codepage/cptable.js";

/**
* MIME Decoding operation
*/
class MIMEDecoding extends Operation {

    /**
     * MIMEDecoding constructor
     */
    constructor() {
        super();

        this.name = "MIME Decoding";
        this.module = "Default";
        this.description = "Enables the decoding of MIME message header extensions for non-ASCII text";
        this.infoURL = "https://tools.ietf.org/html/rfc2047";
        this.inputType = "byteArray";
        this.outputType = "string";
        this.args = [];
    }

    /**
     * Decodes every RFC 2047 encoded-word found in the input.
     *
     * @param {byteArray} input
     * @param {Object[]} args
     * @returns {string}
     */
    run(input, args) {
        const mimeEncodedText = Utils.byteArrayToUtf8(input);
        // Collapse CRLF to LF so the scanner only has to deal with one line-ending form.
        const encodedHeaders = mimeEncodedText.replace(/\r\n/g, "\n");

        return this.decodeHeaders(encodedHeaders);
    }

    /**
     * Decode MIME header strings.
     *
     * Scans for encoded-words of the form "=?charset?encoding?text?=" and
     * replaces each one with its decoded text. Text that is not part of an
     * encoded-word is copied through unchanged, except that whitespace-only
     * runs separating two successfully decoded words are dropped.
     *
     * @param {string} headerString
     * @returns {string}
     * @throws {OperationError} if a word is badly Q-encoded or names an unhandled charset
     */
    decodeHeaders(headerString) {
        // No encoded words detected
        const firstMarker = headerString.indexOf("=?");
        if (firstMarker === -1) return headerString;

        let decodedHeaders = headerString.slice(0, firstMarker);
        let header = headerString.slice(firstMarker);

        // True while the previous token emitted was a decoded encoded-word.
        let isBetweenWords = false;

        while (header.length > 0) {
            const start = header.indexOf("=?");
            if (start === -1) break;
            let cur = start + "=?".length;

            // Charset runs up to the next "?".
            const charsetEnd = header.indexOf("?", cur);
            if (charsetEnd === -1) break;
            const charset = header.slice(cur, charsetEnd);
            cur = charsetEnd + "?".length;

            // There must be room for at least an encoding letter, "?" and the closing "?=".
            if (header.length < cur + "Q??=".length) break;

            const encoding = header[cur].toLowerCase();
            cur += 1;

            if (header[cur] !== "?") break;
            cur += 1;

            const textEnd = header.indexOf("?=", cur);
            if (textEnd === -1) break;

            const encodedText = header.slice(cur, textEnd);
            const end = textEnd + "?=".length;

            if (encoding !== "b" && encoding !== "q") {
                // Not an encoding we understand: emit everything up to and including
                // this "=?" verbatim and rescan from just after it. The offsets computed
                // above refer to the old value of `header`, so the rest of this iteration
                // must be skipped.
                isBetweenWords = false;
                decodedHeaders += header.slice(0, start + "=?".length);
                header = header.slice(start + "=?".length);
                continue;
            }

            const text = encoding === "b" ?
                fromBase64(encodedText) :
                this.parseQEncodedWord(encodedText);

            // Keep the text preceding this word unless it is only whitespace
            // sitting between two decoded words.
            const leading = header.slice(0, start);
            if (start > 0 && (!isBetweenWords || /\S/.test(leading))) {
                decodedHeaders += leading;
            }

            decodedHeaders += this.convertFromCharset(charset, text);

            header = header.slice(end);
            isBetweenWords = true;
        }

        // Whatever could not be parsed as an encoded-word is passed through untouched.
        return decodedHeaders + header;
    }

    /**
     * Converts decoded text for supported charsets.
     * Supports UTF-8, US-ASCII, ISO-8859-*
     *
     * @param {string} charset - charset label taken from the encoded-word (case-insensitive)
     * @param {string} encodedText - transfer-decoded text still in the named charset
     * @returns {string}
     * @throws {OperationError} if the charset is not one of the supported ones
     */
    convertFromCharset(charset, encodedText) {
        const normalised = charset.toLowerCase();

        if (normalised === "utf-8") {
            return cptable.utils.decode(65001, encodedText);
        }

        if (normalised === "us-ascii") {
            return cptable.utils.decode(20127, encodedText);
        }

        const parts = normalised.split("-");
        if (parts.length === 3 && parts[0] === "iso" && parts[1] === "8859") {
            const isoCharset = parseInt(parts[2], 10);
            // ISO-8859-12 was never published, so it is rejected here and reported
            // with the same error as every other unsupported charset.
            if (isoCharset >= 1 && isoCharset <= 16 && isoCharset !== 12) {
                return cptable.utils.decode(28590 + isoCharset, encodedText);
            }
        }

        throw new OperationError("Unhandled Charset");
    }

    /**
     * Parses a Q encoded word
     *
     * "_" becomes a space, "=XX" becomes the character with hex code XX, and
     * printable ASCII, tab, CR and LF are copied through as they are.
     *
     * @param {string} encodedWord
     * @returns {string}
     * @throws {OperationError} on a truncated or non-hex "=XX" escape, or on any other character
     */
    parseQEncodedWord(encodedWord) {
        let decodedWord = "";
        for (let i = 0; i < encodedWord.length; i++) {
            const char = encodedWord[i];

            if (char === "_") {
                decodedWord += " ";
            } else if (char === "=") {
                // Parse hex encoding: "=" must be followed by exactly two hex digits.
                if ((i + 2) >= encodedWord.length) throw new OperationError("Incorrectly Encoded Word");
                const hexPair = encodedWord.substring(i + 1, i + 3);
                if (!/^[0-9a-f]{2}$/i.test(hexPair)) throw new OperationError("Incorrectly Encoded Word");
                decodedWord += Utils.byteArrayToChars(fromHex(hexPair));
                i += 2;
            } else if ((char >= " " && char <= "~") || char === "\n" || char === "\r" || char === "\t") {
                decodedWord += char;
            } else {
                throw new OperationError("Incorrectly Encoded Word");
            }
        }

        return decodedWord;
    }
}

export default MIMEDecoding;
1 change: 1 addition & 0 deletions tests/operations/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ import "./tests/Protobuf";
import "./tests/ParseSSHHostKey";
import "./tests/DefangIP";
import "./tests/ParseUDP";
import "./tests/MIMEDecoding";

// Cannot test operations that use the File type yet
//import "./tests/SplitColourChannels";
Expand Down
89 changes: 89 additions & 0 deletions tests/operations/tests/MIMEDecoding.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/**
* MIME Header Decoding tests
*
* @author mshwed [[email protected]]
* @copyright Crown Copyright 2019
* @license Apache-2.0
*/

import TestRegister from "../../lib/TestRegister.mjs";

// Test cases for the "MIME Decoding" operation. Each case passes `input`
// through a one-step recipe and compares the result with `expectedOutput`.
TestRegister.addTests([
    {
        name: "Encoded comments",
        input: "(=?ISO-8859-1?Q?a?=)",
        expectedOutput: "(a)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        // Whitespace followed by plain text must be kept.
        name: "Encoded adjacent comments whitespace",
        input: "(=?ISO-8859-1?Q?a?= b)",
        expectedOutput: "(a b)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent single whitespace ignored",
        input: "(=?ISO-8859-1?Q?a?= =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        // Two spaces between the words, so this case differs from the single-space one above.
        name: "Encoded adjacent double whitespace ignored",
        input: "(=?ISO-8859-1?Q?a?=  =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "Encoded adjacent CRLF whitespace ignored",
        input: "(=?ISO-8859-1?Q?a?=\r\n =?ISO-8859-1?Q?b?=)",
        expectedOutput: "(ab)",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        name: "UTF-8 Encodings Multiple Headers",
        input: "=?utf-8?q?=C3=89ric?= <[email protected]>, =?utf-8?q?Ana=C3=AFs?= <[email protected]>",
        expectedOutput: "Éric <[email protected]>, Anaïs <[email protected]>",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    },
    {
        // Mixes US-ASCII, ISO-8859-1 and ISO-8859-2 words using both Q and B encodings.
        name: "ISO Decoding",
        input: "From: =?US-ASCII?Q?Keith_Moore?= <[email protected]>\nTo: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <[email protected]>\nCC: =?ISO-8859-1?Q?Andr=E9?= Pirard <[email protected]>\nSubject: =?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=\n=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=",
        expectedOutput: "From: Keith Moore <[email protected]>\nTo: Keld Jørn Simonsen <[email protected]>\nCC: André Pirard <[email protected]>\nSubject: If you can read this you understand the example.",
        recipeConfig: [
            {
                "op": "MIME Decoding",
                "args": []
            }
        ]
    }
]);