From f1659af5e4dc7738d176bfe76a656096b2d691d2 Mon Sep 17 00:00:00 2001 From: mshwed Date: Wed, 4 Sep 2019 14:37:02 -0400 Subject: [PATCH 1/3] Added basic OCR text extraction --- package-lock.json | 79 ++++++++++++++++++++++++++---- package.json | 1 + src/core/config/Categories.json | 3 +- src/core/operations/OCR.mjs | 87 +++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 10 deletions(-) create mode 100644 src/core/operations/OCR.mjs diff --git a/package-lock.json b/package-lock.json index 1ac4451c..d9c6c56b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2165,7 +2165,6 @@ "version": "0.18.1", "resolved": "https://registry.npmjs.org/axios/-/axios-0.18.1.tgz", "integrity": "sha512-0BfJq4NSfQXd+SkFdrvFbG7addhYSBA2mQwISr46pD6E5iqkWg02RAs8vyTT/j0RTnoYmeXauBuSv1qKwR179g==", - "dev": true, "requires": { "follow-redirects": "1.5.10", "is-buffer": "^2.0.2" @@ -5712,7 +5711,6 @@ "version": "1.5.10", "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.5.10.tgz", "integrity": "sha512-0V5l4Cizzvqt5D44aTXbFZz+FtyXV1vrDN6qrelxtfYQKW0KO0W2T/hkE8xvGa/540LkZlkaUjO4ailYTFtHVQ==", - "dev": true, "requires": { "debug": "=3.1.0" }, @@ -5721,7 +5719,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", - "dev": true, "requires": { "ms": "2.0.0" } @@ -7464,6 +7461,11 @@ "postcss": "^7.0.14" } }, + "idb-keyval": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/idb-keyval/-/idb-keyval-3.2.0.tgz", + "integrity": "sha512-slx8Q6oywCCSfKgPgL0sEsXtPVnSbTLWpyiDcu6msHOyKOLari1TD1qocXVCft80umnkk3/Qqh3lwoFt8T/BPQ==" + }, "ieee754": { "version": "1.1.12", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.1.12.tgz", @@ -7756,8 +7758,7 @@ "is-buffer": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.3.tgz", - "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==", - "dev": true + "integrity": "sha512-U15Q7MXTuZlrbymiz95PJpZxu8IlipAp4dtS3wOdgPXx3mqBnslrWU14kxfHB+Py/+2PVKSr37dMAgM2A4uArw==" }, "is-builtin-module": { "version": "1.0.0", @@ -7981,8 +7982,7 @@ "is-url": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", - "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==", - "dev": true + "integrity": "sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==" }, "is-utf8": { "version": "0.2.1", @@ -9284,6 +9284,11 @@ "lower-case": "^1.1.1" } }, + "node-fetch": { + "version": "2.6.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.0.tgz", + "integrity": "sha512-8dG4H5ujfvFiqDmVu9fQ5bOHUC15JMjMY/Zumv26oOvvVJjM67KF8koCWIabKQ1GJIa9r2mMZscBq/TbdOcmNA==" + }, "node-forge": { "version": "0.8.5", "resolved": "https://registry.npmjs.org/node-forge/-/node-forge-0.8.5.tgz", @@ -9719,6 +9724,11 @@ "mimic-fn": "^1.0.0" } }, + "opencollective-postinstall": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/opencollective-postinstall/-/opencollective-postinstall-2.0.2.tgz", + "integrity": "sha512-pVOEP16TrAO2/fjej1IdOyupJY8KDUM1CvsaScRbw6oddvpQoOfGk4ywha0HKKVAD6RkW4x6Q+tNBwhf3Bgpuw==" + }, "opener": { "version": "1.5.1", "resolved": "https://registry.npmjs.org/opener/-/opener-1.5.1.tgz", @@ -11247,8 +11257,7 @@ "resolve-url": { "version": "0.2.1", "resolved": "https://registry.npmjs.org/resolve-url/-/resolve-url-0.2.1.tgz", - "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=", - "dev": true + "integrity": "sha1-LGN/53yJOv0qZj/iGqkIAGjiBSo=" }, "restore-cursor": { "version": "2.0.0", @@ -12647,6 +12656,58 @@ } } }, + "tesseract.js": { + "version": "2.0.0-alpha.15", + "resolved": "https://registry.npmjs.org/tesseract.js/-/tesseract.js-2.0.0-alpha.15.tgz", + "integrity": "sha512-qM1XUFVlTO+tx6oVRpd9QQ8PwQLxo3qhbfIHByUlUVIqWx6y/U9xlHIaG033/Tjfs2EQ0NAehPTOJ+eNElsXEg==", + "requires": { + "axios": "^0.18.0", + "check-types": "^7.4.0", + "is-url": "1.2.2", + "node-fetch": "^2.3.0", + "opencollective-postinstall": "^2.0.2", + "resolve-url": "^0.2.1", + "tesseract.js-core": "^2.0.0-beta.11", + "tesseract.js-utils": "^1.0.0-beta.8" + }, + "dependencies": { + "check-types": { + "version": "7.4.0", + "resolved": "https://registry.npmjs.org/check-types/-/check-types-7.4.0.tgz", + "integrity": "sha512-YbulWHdfP99UfZ73NcUDlNJhEIDgm9Doq9GhpyXbF+7Aegi3CVV7qqMCKTTqJxlvEvnQBp9IA+dxsGN6xK/nSg==" + }, + "is-url": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.2.tgz", + "integrity": "sha1-SYkFpZO/R8wtnn9zg3K792lsfyY=" + } + } + }, + "tesseract.js-core": { + "version": "2.0.0-beta.11", + "resolved": "https://registry.npmjs.org/tesseract.js-core/-/tesseract.js-core-2.0.0-beta.11.tgz", + "integrity": "sha512-07haKH2JYYo0OfIJoioMS9dDiI5Hrl7+r1MqjeNAAT5WpKO0ATe4cpncC8s1kz0e3s1kaC5WOwL3YJcjbJE+hg==" + }, + "tesseract.js-utils": { + "version": "1.0.0-beta.8", + "resolved": "https://registry.npmjs.org/tesseract.js-utils/-/tesseract.js-utils-1.0.0-beta.8.tgz", + "integrity": "sha512-qjHBfWfzo2o1ZY9XI0Wh2hmpp38+mIgCMOk60W5Yyie/pBl421VLBKOZUEwQgpbLnOJ24VU6Q8yXsVgtFFHcFg==", + "requires": { + "axios": "^0.18.0", + "bmp-js": "^0.1.0", + "file-type": "^10.5.0", + "idb-keyval": "^3.1.0", + "is-url": "^1.2.4", + "zlibjs": "^0.3.1" + }, + "dependencies": { + "file-type": { + "version": "10.11.0", + "resolved": "https://registry.npmjs.org/file-type/-/file-type-10.11.0.tgz", + "integrity": "sha512-uzk64HRpUZyTGZtVuvrjP0FYxzQrBf4rojot6J65YMEbwBLB0CWm0CLojVpwpmFmxcE/lkvYICgfcGozbBq6rw==" + } + } + }, "text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", diff --git a/package.json b/package.json index 78dc2c3e..13a041cf 100644 --- a/package.json +++ b/package.json @@ -143,6 +143,7 @@ "sortablejs": "^1.9.0", "split.js": "^1.5.11", "ssdeep.js": "0.0.2", + "tesseract.js": "^2.0.0-alpha.15", "ua-parser-js": "^0.7.20", "utf8": "^3.0.0", "vkbeautify": "^0.99.3", diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index 73ac6c2d..cbe67de1 100755 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -403,7 +403,8 @@ "Hex Density chart", "Scatter chart", "Series chart", - "Heatmap chart" + "Heatmap chart", + "OCR" ] }, { diff --git a/src/core/operations/OCR.mjs b/src/core/operations/OCR.mjs new file mode 100644 index 00000000..35081f14 --- /dev/null +++ b/src/core/operations/OCR.mjs @@ -0,0 +1,87 @@ +/** + * @author mshwed [m@ttshwed.com] + * @copyright Crown Copyright 2019 + * @license Apache-2.0 + */ + +import Operation from "../Operation.mjs"; +import OperationError from "../errors/OperationError.mjs"; +import { isImage } from "../lib/FileType.mjs"; +import { isWorkerEnvironment } from "../Utils.mjs"; + +import jimp from "jimp"; +import Tesseract from "tesseract.js"; +const { TesseractWorker } = Tesseract; + +/** + * OCR operation + */ +class OCR extends Operation { + + /** + * OCR constructor + */ + constructor() { + super(); + + this.name = "OCR"; + this.module = "Default"; + this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text."; + this.infoURL = "https://en.wikipedia.org/wiki/Optical_character_recognition"; + this.inputType = "ArrayBuffer"; + this.outputType = "string"; + this.args = [ + /* Example arguments. See the project wiki for full details. + { + name: "First arg", + type: "string", + value: "Don't Panic" + }, + { + name: "Second arg", + type: "number", + value: 42 + } + */ + ]; + } + + /** + * @param {ArrayBuffer} input + * @param {Object[]} args + * @returns {Object} + */ + async run(input, args) { + if (!isImage(input)) { + throw new OperationError("Invalid File Type"); + } + + try { + if (isWorkerEnvironment()) + self.sendStatusMessage("Performing OCR on image..."); + + let image; + try { + image = await jimp.read(input); + image = await image.getBase64Async(jimp.AUTO); + } catch (err) { + throw new OperationError(`Error loading image. (${err})`); + } + + const worker = new TesseractWorker(); + + const result = await worker.recognize(image) + .progress(progress => { + if (isWorkerEnvironment()) self.sendStatusMessage(`${progress.status} - ${parseFloat(progress.progress).toFixed(2)}%`); + }); + + console.log(result); + + return result.text; + } catch (err) { + throw new OperationError(`Error performing OCR on image. (${err})`); + } + } +} + +export default OCR; From 8dde7325143b6f02992140b47b9ae46e1dd85df3 Mon Sep 17 00:00:00 2001 From: mshwed Date: Thu, 5 Sep 2019 09:20:59 -0400 Subject: [PATCH 2/3] Fixed linting issues --- src/core/operations/OCR.mjs | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/core/operations/OCR.mjs b/src/core/operations/OCR.mjs index 35081f14..5b4e6d79 100644 --- a/src/core/operations/OCR.mjs +++ b/src/core/operations/OCR.mjs @@ -30,26 +30,13 @@ class OCR extends Operation { this.infoURL = "https://en.wikipedia.org/wiki/Optical_character_recognition"; this.inputType = "ArrayBuffer"; this.outputType = "string"; - this.args = [ - /* Example arguments. See the project wiki for full details. - { - name: "First arg", - type: "string", - value: "Don't Panic" - }, - { - name: "Second arg", - type: "number", - value: 42 - } - */ - ]; + this.args = []; } /** * @param {ArrayBuffer} input * @param {Object[]} args - * @returns {Object} + * @returns {string} */ async run(input, args) { if (!isImage(input)) { @@ -72,11 +59,9 @@ class OCR extends Operation { const result = await worker.recognize(image) .progress(progress => { - if (isWorkerEnvironment()) self.sendStatusMessage(`${progress.status} - ${parseFloat(progress.progress).toFixed(2)}%`); + if (isWorkerEnvironment()) self.sendStatusMessage(`${progress.status} - ${(parseFloat(progress.progress)*100).toFixed(2)}%`); }); - console.log(result); - return result.text; } catch (err) { throw new OperationError(`Error performing OCR on image. (${err})`); From 7eabaf0de619bfcc1c3d90ae94b96d746321826b Mon Sep 17 00:00:00 2001 From: n1474335 Date: Fri, 13 Sep 2019 14:34:08 +0100 Subject: [PATCH 3/3] Cleaned up and improved OCR operation --- src/core/config/Categories.json | 4 +- ...CR.mjs => OpticalCharacterRecognition.mjs} | 55 ++++++++++--------- tests/operations/tests/Image.mjs | 17 +++++- 3 files changed, 48 insertions(+), 28 deletions(-) rename src/core/operations/{OCR.mjs => OpticalCharacterRecognition.mjs} (50%) diff --git a/src/core/config/Categories.json b/src/core/config/Categories.json index b990393c..94f7fd30 100755 --- a/src/core/config/Categories.json +++ b/src/core/config/Categories.json @@ -383,6 +383,7 @@ "ops": [ "Render Image", "Play Media", + "Optical Character Recognition", "Remove EXIF", "Extract EXIF", "Split Colour Channels", @@ -405,8 +406,7 @@ "Hex Density chart", "Scatter chart", "Series chart", - "Heatmap chart", - "OCR" + "Heatmap chart" ] }, { diff --git a/src/core/operations/OCR.mjs b/src/core/operations/OpticalCharacterRecognition.mjs similarity index 50% rename from src/core/operations/OCR.mjs rename to src/core/operations/OpticalCharacterRecognition.mjs index 5b4e6d79..ccfbcd1f 100644 --- a/src/core/operations/OCR.mjs +++ b/src/core/operations/OpticalCharacterRecognition.mjs @@ -1,4 +1,5 @@ /** + * @author n1474335 [n1474335@gmail.com] * @author mshwed [m@ttshwed.com] * @copyright Crown Copyright 2019 * @license Apache-2.0 @@ -7,30 +8,36 @@ import Operation from "../Operation.mjs"; import OperationError from "../errors/OperationError.mjs"; import { isImage } from "../lib/FileType.mjs"; +import { toBase64 } from "../lib/Base64.mjs"; import { isWorkerEnvironment } from "../Utils.mjs"; -import jimp from "jimp"; import Tesseract from "tesseract.js"; const { TesseractWorker } = Tesseract; /** - * OCR operation + * Optical Character Recognition operation */ -class OCR extends Operation { +class OpticalCharacterRecognition extends Operation { /** - * OCR constructor + * OpticalCharacterRecognition constructor */ constructor() { super(); - this.name = "OCR"; - this.module = "Default"; - this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text."; - this.infoURL = "https://en.wikipedia.org/wiki/Optical_character_recognition"; + this.name = "Optical Character Recognition"; + this.module = "Image"; + this.description = "Optical character recognition or optical character reader (OCR) is the mechanical or electronic conversion of images of typed, handwritten or printed text into machine-encoded text.

Supported image formats: png, jpg, bmp, pbm."; + this.infoURL = "https://wikipedia.org/wiki/Optical_character_recognition"; this.inputType = "ArrayBuffer"; this.outputType = "string"; - this.args = []; + this.args = [ + { + name: "Show confidence", + type: "boolean", + value: true + } + ]; } /** @@ -39,34 +46,32 @@ class OCR extends Operation { * @returns {string} */ async run(input, args) { - if (!isImage(input)) { + const [showConfidence] = args; + + const type = isImage(input); + if (!type) { throw new OperationError("Invalid File Type"); } try { - if (isWorkerEnvironment()) - self.sendStatusMessage("Performing OCR on image..."); - - let image; - try { - image = await jimp.read(input); - image = await image.getBase64Async(jimp.AUTO); - } catch (err) { - throw new OperationError(`Error loading image. (${err})`); - } - + const image = `data:${type};base64,${toBase64(input)}`; const worker = new TesseractWorker(); - const result = await worker.recognize(image) .progress(progress => { - if (isWorkerEnvironment()) self.sendStatusMessage(`${progress.status} - ${(parseFloat(progress.progress)*100).toFixed(2)}%`); + if (isWorkerEnvironment()) { + self.sendStatusMessage(`Status: ${progress.status} - ${(parseFloat(progress.progress)*100).toFixed(2)}%`); + } }); - return result.text; + if (showConfidence) { + return `Confidence: ${result.confidence}%\n\n${result.text}`; + } else { + return result.text; + } } catch (err) { throw new OperationError(`Error performing OCR on image. (${err})`); } } } -export default OCR; +export default OpticalCharacterRecognition; diff --git a/tests/operations/tests/Image.mjs b/tests/operations/tests/Image.mjs index ea5c64a4..307420f2 100644 --- a/tests/operations/tests/Image.mjs +++ b/tests/operations/tests/Image.mjs @@ -247,5 +247,20 @@ TestRegister.addTests([ args: ["None"] } ] - } + }, + /*{ Commented out as it takes a while to run and drops a file to disk (eng.traineddata) + name: "Optical Character Recognition", + input: "iVBORw0KGgoAAAANSUhEUgAAAUAAAAC0CAIAAABqhmJGAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAAASuSURBVHhe7dftVdswAIbRzsVAzMM0XabDUCOUxLYsWW4Jp+/pvf9w9GH76CHw4x2IJWAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAI9p8G/PbyY8rL2686g8t+vnqHTyfgIYfvz/26veTXn/UKX8+f0EU9bHrtu/6KfAN/AwEXAj7lFf2TBFw4nae8on+SgIvJ01n/KLzpDK+L3bT/Ap4O+HC+V12mTH+M3gzcLbIY/EO6HfxYp13k09nb6r3UqcdnjoCL3ll72J26h+35Oxy2XvZ0wOLaXq9v2+F1UC+7RZtMZ/DnfX1lwDOPzwUCLo7O2trtDK8H3M/iqoc6bj1subT68XTA/F7bGJooyzKbhTvLPHY8eJLHlbNX1DqYUVfdXbqwJjsCLsans37aNNJM6w68OR0wv9f9ymKw3k67yn2ZZpHlg3a3zis60s6oV+ZvlzMCLoanc3Dsdt9TdWT/lM8OmNjr5KY72jmzq1zfrbvXtVtmRMDF8HTWcgaaqIrD1U4G/MFewxrW262s5jS/Fzpmdts6mnHy+Fwl4GJ0OjsNrG1P/y7CNo3+gEt7jW56MVprNed7A/5w+n6YJ+BieDpnj/jO6pweTz0acGWvmZveL9XOmd3x6wKuTt8PEwRczLRw4eje1XX7c/cDruw1uuneOu2c4aOvzI57mJhRh1xZlQ0BF+Oz9vcF96fuB1zYa7R2b5mD6/XSwdfg8snj4q21+W/L02dfzIxhQMDFyTm6Hd7m+JYP7rPKT5sRuzhOBywm91rUkYc3fV9ltchtr8VmzuGOdfDB9N1tFYefNfdXLmyGjNZkhoCLUQufVqd/7z7rUcLW/XieDvg0s9difNOdRV5ePibt5vTuazusWbF9rs2E5v4mH58LBFyMW7g5OID7s9cMuTygmt9rcNPb5MrAz0lHc3Z9Ht7XZsxqxO36ZtLR/c0+PpMEzLOc/4LhrwmYZ6lfywJ+JgHzJPr9DgLmi23/zdXvcwmYL7YKWL1PJ2AIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmAIJmCI9f7+G6yFxVg/GyYwAAAAAElFTkSuQmCC", + expectedOutput: "Tesseract.js\n", + recipeConfig: [ + { + "op": "From Base64", + "args": ["A-Za-z0-9+/=", true] + }, + { + "op": "Optical Character Recognition", + "args": [false] + } + ] + }*/ ]);