Added basic OCR text extraction

gchq · Sep 4, 2019 · f1659af · f1659af
1 parent c6de3eb
commit f1659af
Show file tree

Hide file tree

Showing 4 changed files with 160 additions and 10 deletions.
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -143,6 +143,7 @@
     "sortablejs": "^1.9.0",
     "split.js": "^1.5.11",
     "ssdeep.js": "0.0.2",
+    "tesseract.js": "^2.0.0-alpha.15",
     "ua-parser-js": "^0.7.20",
     "utf8": "^3.0.0",
     "vkbeautify": "^0.99.3",

diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json
@@ -403,7 +403,8 @@
             "Hex Density chart",
             "Scatter chart",
             "Series chart",
-            "Heatmap chart"
+            "Heatmap chart",
+            "OCR"
         ]
     },
     {

diff --git a/src/core/operations/OCR.mjs b/src/core/operations/OCR.mjs
@@ -0,0 +1,87 @@
+/**
+ * @author mshwed [[email protected]]
+ * @copyright Crown Copyright 2019
+ * @license Apache-2.0
+ */
+
+import Operation from "../Operation.mjs";
+import OperationError from "../errors/OperationError.mjs";
+import { isImage } from "../lib/FileType.mjs";
+import { isWorkerEnvironment } from "../Utils.mjs";
+
+import jimp from "jimp";
+import Tesseract from "tesseract.js";
+const { TesseractWorker } = Tesseract;
+
+/**
+ * OCR operation
+ */
+class OCR extends Operation {
+
+    /**
+     * OCR constructor
+     */
+    constructor() {
+        super();
+
+        this.name = "OCR";
+        this.module = "Default";
+        this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text.";
+        this.infoURL = "https://en.wikipedia.org/wiki/Optical_character_recognition";
+        this.inputType = "ArrayBuffer";
+        this.outputType = "string";
+        // This operation takes no arguments.
+        this.args = [];
+    }
+
+    /**
+     * Runs OCR over an image and returns the recognised text.
+     *
+     * @param {ArrayBuffer} input - Raw bytes of the image file.
+     * @param {Object[]} args - Unused; the operation declares no arguments.
+     * @returns {string} The text reported by the Tesseract worker.
+     * @throws {OperationError} If isImage rejects the input, the image
+     *     cannot be loaded, or recognition fails.
+     */
+    async run(input, args) {
+        if (!isImage(input)) {
+            throw new OperationError("Invalid File Type");
+        }
+
+        if (isWorkerEnvironment()) {
+            self.sendStatusMessage("Performing OCR on image...");
+        }
+
+        // Loading is kept outside the OCR try block so a load failure is not
+        // re-wrapped as "Error performing OCR on image.".
+        let image;
+        try {
+            image = await jimp.read(input);
+            image = await image.getBase64Async(jimp.AUTO);
+        } catch (err) {
+            throw new OperationError(`Error loading image. (${err})`);
+        }
+
+        let worker;
+        try {
+            worker = new TesseractWorker();
+
+            const result = await worker.recognize(image)
+                .progress(progress => {
+                    if (!isWorkerEnvironment()) return;
+                    // tesseract.js reports progress as a 0-1 fraction; scale it to a percentage.
+                    const percent = (parseFloat(progress.progress) * 100).toFixed(2);
+                    self.sendStatusMessage(`${progress.status} - ${percent}%`);
+                });
+
+            return result.text;
+        } catch (err) {
+            throw new OperationError(`Error performing OCR on image. (${err})`);
+        } finally {
+            // Always release the worker so repeated runs do not leak it.
+            if (worker) worker.terminate();
+        }
+    }
+}
+
+export default OCR;