Skip to content

Commit

Permalink
Merge pull request #34 from emilhf/cut-operation
Browse files Browse the repository at this point in the history
Cut operation
  • Loading branch information
bee-san authored Dec 22, 2023
2 parents d9dbf75 + 1ab3baf commit 73716a2
Show file tree
Hide file tree
Showing 4 changed files with 320 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/core/config/Categories.json
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@
{
"name": "Utils",
"ops": [
"Cut",
"Diff",
"Remove whitespace",
"Remove null bytes",
Expand Down
217 changes: 217 additions & 0 deletions src/core/operations/Cut.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
/**
* @author emilhf [[email protected]]
* @copyright Crown Copyright 2020
* @license Apache-2.0
*/

import Operation from "../Operation.mjs";
import OperationError from "../errors/OperationError.mjs";
import {SPLIT_DELIM_OPTIONS, JOIN_DELIM_OPTIONS} from "../lib/Delim.mjs";
import XRegExp from "xregexp";

/**
* Cut operation
*/
class Cut extends Operation {

    /**
     * Cut constructor
     *
     * Declares the operation metadata and its six arguments. The order of the
     * arguments matters: run() destructures them positionally.
     */
    constructor() {
        super();

        this.name = "Cut";
        this.module = "Utils";
        this.description = "Extract fields from records similarly to <code>awk</code> and <code>cut</code>. The expression <code>1, 3-4</code> will extract the 2nd, 4th and 5th fields. <code>3, 1 &quot;T&quot; 2</code> will extract the 4th field, then combine the 2nd and 3rd field into a new field (with the letter 'T' separating the original values).<br><br>If no input field delimiter is set, <strong>fixed width mode</strong> is enabled: Fields become the indices of the payload, and ranges will be appended to the current output field instead of creating new fields. This aids in carving e.g. CSVs from fixed width data.";
        this.infoURL = "https://en.wikipedia.org/wiki/Cut_(Unix)";
        this.inputType = "string";
        this.outputType = "string";
        this.args = [
            {
                "name": "Common input type",
                "type": "populateOption",
                "value": [
                    {
                        name: "User defined",
                        value: ""
                    },
                    {
                        name: "CSV",
                        value: ","
                    },
                    {
                        name: "TSV",
                        value: "\\t"
                    },
                    {
                        name: "PSV",
                        value: "\\|"
                    },
                    {
                        name: "Space aligned",
                        value: "\\s+"
                    }
                ],
                // Index 4 in this list is "Input field delimiter"; presumably
                // the argument the chosen preset is written into.
                "target": 4
            },
            {
                "name": "Expression",
                "type": "text",
                "value": "0-"
            },
            {
                "name": "Input record delimiter",
                "type": "editableOptionShort",
                "value": SPLIT_DELIM_OPTIONS,
                "defaultIndex": 2
            },
            {
                "name": "Output record delimiter",
                "type": "editableOptionShort",
                "value": SPLIT_DELIM_OPTIONS,
                "defaultIndex": 2
            },
            {
                // Used as a regular expression by run(); an empty value
                // switches the operation to fixed width mode.
                "name": "Input field delimiter",
                "type": "shortString",
                "value": ""
            },
            {
                "name": "Output field delimiter",
                "type": "editableOptionShort",
                "value": JOIN_DELIM_OPTIONS,
                "defaultIndex": 3
            }
        ];
    }

    /**
     * Splits the input into records, extracts the requested fields from each
     * record and joins everything back together with the output delimiters.
     *
     * @param {string} input
     * @param {Object[]} args - [preset (unused), expression, input record
     *     delimiter, output record delimiter, input field delimiter (regex),
     *     output field delimiter]
     * @returns {string}
     * @throws {OperationError} if the input field delimiter is not a valid
     *     regular expression, or if the expression is rejected by extract()
     */
    run(input, args) {
        const [, expr, inRecordDelim, outRecordDelim, inFieldDelim, outFieldDelim] = args;
        const fixedWidth = inFieldDelim === "";

        // The splitter is only needed when records are actually split into fields.
        let fieldSplitter = null;
        if (!fixedWidth) {
            try {
                fieldSplitter = new XRegExp(inFieldDelim);
            } catch (err) {
                // Report a malformed delimiter the same way as other user input errors.
                throw new OperationError(`Invalid input field delimiter regex: ${err.message}`);
            }
        }

        /**
         * @param {string} record
         * @returns {string}
         */
        const cutRecord = (record) => {
            // In fixed width mode the record itself is indexed character by character.
            const data = fixedWidth ? record : record.split(fieldSplitter);
            return this.extract(data, expr, fixedWidth).join(outFieldDelim);
        };

        return input.split(inRecordDelim).map(cutRecord).join(outRecordDelim);
    }

    /**
     * Extracts fields as specified by the extraction expression. If fixedWidth
     * is true, ranges do not introduce new fields, but rather append to the
     * current field being dealt with.
     *
     * The extract expression is a lightweight DSL similar to the fields flag
     * (-f) of cut in UNIX, and also incorporates elements of the awk print
     * statement. It departs from cut in a few noteworthy ways:
     *
     * - Reverse ranges are supported, e.g. 4-1. Both forward and reverse
     *   ranges are clamped to the data that is actually available.
     *
     * - Negative field values, e.g. -1, are offsets from the end of the data.
     *   Note that negative ranges are not supported.
     *
     * - Fields are numbered from 0 instead of 1.
     *
     * - New fields can be constructed by combining existing fields. This
     *   operation also supports appending strings: '1 "@" 2' will join field 1
     *   and 2 with "@" in between them.
     *
     * An expression without any recognisable token yields no fields.
     *
     * @param {string[]|string} data - the fields of one record, or the raw
     *     record itself in fixed width mode
     * @param {string} expr
     * @param {Boolean} fixedWidth
     * @returns {string[]} the output fields, in expression order
     * @throws {OperationError} if a range is combined with other terms
     *     outside of fixed width mode
     */
    extract(data, expr, fixedWidth) {
        const maxOffset = data.length - 1;

        /**
         * Looks up one field; negative offsets count from the end of the data.
         *
         * @param {Number} n
         * @returns {string|undefined}
         */
        const pick = (n) => n < 0 ? data[maxOffset + n + 1] : data[n];

        const fields = [];
        let currentField = [];
        let previousToken = null;

        /** Emits the field being assembled, if it has any content. */
        const flushCurrentField = () => {
            if (currentField.length) {
                fields.push(currentField.join(""));
                currentField = [];
            }
        };

        // match() returns null when nothing matches (e.g. an empty expression);
        // treat that as an expression with no tokens rather than crashing.
        const tokens = expr.trim().match(/((".*?")|(\d+-\d*)|(-?\d+)|(,))/g) || [];

        for (const token of tokens) {
            // Field separator
            if (token === ",") {
                previousToken = "delimiter";
                flushCurrentField();
                continue;
            }

            if (!fixedWidth && previousToken === "range") {
                throw new OperationError(
                    `Cannot join '${token}', as previous term was a range. Requires fixed width mode.`
                );
            }

            // Single (possibly negative) offset
            if (/^-?[0-9]+$/.test(token)) {
                previousToken = "extraction";
                currentField.push(pick(Number(token)));
                continue;
            }

            // Range: "a-b", or open-ended "a-"
            const range = token.match(/^([0-9]+)-([0-9]*)$/);
            if (range) {
                previousToken = "range";
                if (!fixedWidth && currentField.length) {
                    throw new OperationError(
                        `Cannot join range '${token}' with rest of field: ${currentField.join("")}. Requires fixed width mode.`
                    );
                }
                const start = Number(range[1]);
                const openEnded = range[2] === "";
                const end = openEnded ? maxOffset : Number(range[2]);

                const vals = [];
                if (openEnded || start <= end) {
                    // An open-ended range never runs backwards: "5-" on three
                    // fields selects nothing.
                    for (let i = start; i <= end && i <= maxOffset; i++) {
                        vals.push(pick(i));
                    }
                } else {
                    // Clamp the start so that "5-0" on three fields yields
                    // fields 2, 1, 0, mirroring how "0-5" is clamped.
                    for (let i = Math.min(start, maxOffset); i >= end; i--) {
                        vals.push(pick(i));
                    }
                }

                if (fixedWidth) {
                    currentField.push(...vals);
                } else {
                    fields.push(...vals);
                }
                continue;
            }

            // Quoted string literal, appended verbatim to the current field
            const literal = token.match(/^"(.*)"$/);
            if (literal) {
                previousToken = "string";
                currentField.push(literal[1]);
            }
        }

        // Terminal condition
        flushCurrentField();
        return fields;
    }

}

export default Cut;
1 change: 1 addition & 0 deletions tests/operations/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import "./tests/Compress.mjs";
import "./tests/ConditionalJump.mjs";
import "./tests/Crypt.mjs";
import "./tests/CSV.mjs";
import "./tests/Cut.mjs";
import "./tests/DateTime.mjs";
import "./tests/ExtractEmailAddresses.mjs";
import "./tests/Fork.mjs";
Expand Down
101 changes: 101 additions & 0 deletions tests/operations/tests/Cut.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**
* Cut operation tests
*
* @author emilhf [[email protected]]
*
* @copyright Crown Copyright 2020
* @license Apache-2.0
*/

import TestRegister from "../../lib/TestRegister.mjs";

/**
 * Builds one test record that runs a single "Cut" operation.
 *
 * @param {string} name - test title
 * @param {string} input - recipe input
 * @param {string} expectedOutput - output the recipe must produce
 * @param {string[]} args - Cut arguments: preset, expression, input record
 *     delimiter, output record delimiter, input field delimiter, output
 *     field delimiter
 * @returns {Object}
 */
const cutTest = (name, input, expectedOutput, args) => ({
    name,
    input,
    expectedOutput,
    recipeConfig: [
        {
            op: "Cut",
            args,
        },
    ],
});

// Every case uses line feeds as record delimiters; only the expression and
// the field delimiters vary.
const CSV_SAMPLE = "test1,test2,test3";

TestRegister.addTests([
    cutTest(
        "Extract single field",
        CSV_SAMPLE,
        "test2",
        ["User defined", "1", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract range",
        CSV_SAMPLE,
        "test2,test3",
        ["User defined", "1-2", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract reverse range",
        CSV_SAMPLE,
        "test2,test1",
        ["User defined", "1-0", "\\n", "\\n", ",", ","]
    ),
    cutTest(
        "Extract multiple ranges",
        CSV_SAMPLE,
        "test2,test3,test1",
        ["User defined", "1-2,0", "\\n", "\\n", ",", ","]
    ),
    // A quoted literal between two offsets joins them into one output field.
    cutTest(
        "Combine two existing fields",
        "john.doe,CONTOSO\nadams,CONTOSO",
        "john.doe@CONTOSO\nadams@CONTOSO",
        ["User defined", "0 \"@\" 1", "\\n", "\\n", ",", ","]
    ),
    // Empty input field delimiter: offsets address characters of the record.
    cutTest(
        "Fixed width to CSV",
        "abcdefghijklmnopqrstuvxyz",
        "abc,xyz",
        ["User defined", "0-2, 22-24", "\\n", "\\n", "", ","]
    ),
    cutTest(
        "Extract and convert CSV to TSV",
        "ITEM,VALUE\nflamingo,439\nvodka,14",
        "ITEM\tVALUE\nflamingo\t439\nvodka\t14",
        ["User defined", "0-", "\\n", "\\n", ",", "\\t"]
    ),
    // The delimiter never matches, so the whole record is a single field.
    cutTest(
        "Extract with wrong delimiter",
        "test1,test2",
        "test1,test2",
        ["User defined", "0-", "\\n", "\\n", "\\t", ";"]
    ),
]);

0 comments on commit 73716a2

Please sign in to comment.