ocr/ocrad/ocrocradengine.cpp

0001 /************************************************************************
0002  *                                  *
0003  *  This file is part of Kooka, a scanning/OCR application using    *
0004  *  Qt <http://www.qt.io> and KDE Frameworks <http://www.kde.org>.  *
0005  *                                  *
0006  *  Copyright (C) 2000-2016 Klaas Freitag <freitag@suse.de>     *
0007  *                          Jonathan Marten <jjm@keelhaul.me.uk>    *
0008  *                                  *
0009  *  Kooka is free software; you can redistribute it and/or modify it    *
0010  *  under the terms of the GNU Library General Public License as    *
0011  *  published by the Free Software Foundation and appearing in the  *
0012  *  file COPYING included in the packaging of this file;  either    *
0013  *  version 2 of the License, or (at your option) any later version.    *
0014  *                                  *
0015  *  As a special exception, permission is given to link this program    *
0016  *  with any version of the KADMOS OCR/ICR engine (a product of     *
0017  *  reRecognition GmbH, Kreuzlingen), and distribute the resulting  *
0018  *  executable without including the source code for KADMOS in the  *
0019  *  source distribution.                        *
0020  *                                  *
0021  *  This program is distributed in the hope that it will be useful, *
0022  *  but WITHOUT ANY WARRANTY; without even the implied warranty of  *
0023  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   *
0024  *  GNU General Public License for more details.            *
0025  *                                  *
0026  *  You should have received a copy of the GNU General Public       *
0027  *  License along with this program;  see the file COPYING.  If     *
0028  *  not, see <http://www.gnu.org/licenses/>.                *
0029  *                                  *
0030  ************************************************************************/
0031
0032 #include "ocrocradengine.h"
0033
0034 #include <qregexp.h>
0035 #include <qfile.h>
0036 #include <qdir.h>
0037 #include <qfileinfo.h>
0038 #include <qtemporaryfile.h>
0039 #include <qprocess.h>
0040
0041 #include <klocalizedstring.h>
0042 #include <kpluginfactory.h>
0043
0044 #include "imageformat.h"
0045 #include "kookasettings.h"
0046 #include "ocrocraddialog.h"
0047 #include "executablepathdialogue.h"
0048 #include "ocr_logging.h"
0049
0050
// Declare the plugin factory class OcrOcradEngineFactory, which registers
// OcrOcradEngine as its plugin and takes its metadata from the JSON file
// named here.  The "moc" file is included directly because the factory
// class is declared within this source file.
K_PLUGIN_FACTORY_WITH_JSON(OcrOcradEngineFactory, "kookaocr-ocrad.json", registerPlugin<OcrOcradEngine>();)
#include "ocrocradengine.moc"


// Placeholder used by readORF() in place of a character for which
// the ORF lists no recognition alternatives.
static const char UndetectedChar = '_';
0056
0057
0058 OcrOcradEngine::OcrOcradEngine(QObject *pnt, const QVariantList &args)
0059     : AbstractOcrEngine(pnt, "OcrOcradEngine")
0060 {
0061     m_ocrImagePBM = QString();
0062     m_tempOrfName = QString();
0063     ocradVersion = 0;
0064 }
0065
0066
0067 AbstractOcrDialogue *OcrOcradEngine::createOcrDialogue(AbstractOcrEngine *plugin, QWidget *pnt)
0068 {
0069     return (new OcrOcradDialog(plugin, pnt));
0070 }
0071
0072
0073 bool OcrOcradEngine::createOcrProcess(AbstractOcrDialogue *dia, ScanImage::Ptr img)
0074 {
0075     OcrOcradDialog *parentDialog = static_cast<OcrOcradDialog *>(dia);
0076     ocradVersion = parentDialog->getNumVersion();
0077
0078     const QString cmd = parentDialog->getOCRCmd();
0079
0080     const QString ocrResultFile = tempSaveImage(img, ImageFormat("BMP"), 8);
0081     setResultImage(ocrResultFile);
0082     // TODO: if the input file is local and is readable by OCRAD,
0083     // can use it directly (but don't delete it afterwards!)
0084     m_ocrImagePBM = tempSaveImage(img, ImageFormat("PBM"), 1);
0085
0086     QProcess *proc = initOcrProcess();          // start process for OCR
0087     QStringList args;                   // arguments for process
0088
0089     m_tempOrfName = tempFileName("orf");
0090     args << "-x" << m_tempOrfName;          // the ORF result file
0091
0092     args << QFile::encodeName(m_ocrImagePBM);       // name of the input image
0093
0094     // Layout Detection
0095     int layoutMode = KookaSettings::ocrOcradLayoutDetection();
0096     if (ocradVersion >= 18)             // OCRAD 0.18 or later
0097     {                           // has only on/off
0098         if (layoutMode != 0) args << "-l";
0099     }
0100     else                        // OCRAD 0.17 or earlier
0101     {                           // had 3 options
0102         args << "-l" << QString::number(layoutMode);
0103     }
0104
0105     QString s = KookaSettings::ocrOcradFormat();
0106     if (!s.isEmpty()) args << "-F" << s;
0107
0108     s = KookaSettings::ocrOcradCharset();
0109     if (!s.isEmpty()) args << "-c" << s;
0110
0111     s = KookaSettings::ocrOcradFilter();
0112     if (!s.isEmpty()) args << "-e" << s;
0113
0114     s = KookaSettings::ocrOcradTransform();
0115     if (!s.isEmpty()) args << "-t" << s;
0116
0117     if (KookaSettings::ocrOcradInvert()) args << "-i";
0118
0119     if (KookaSettings::ocrOcradThresholdEnable()) {
0120         s = KookaSettings::ocrOcradThresholdValue();
0121         if (!s.isEmpty()) args << "-T" << (s + "%");
0122     }
0123
0124     if (verboseDebug()) args << "-v";
0125
0126     s = KookaSettings::ocrOcradExtraArguments();
0127     if (!s.isEmpty()) args << s;
0128
0129     proc->setProgram(cmd);
0130     proc->setArguments(args);
0131
0132     proc->setProcessChannelMode(QProcess::SeparateChannels);
0133     m_tempStdoutLog = tempFileName("stdout.log");
0134     proc->setStandardOutputFile(m_tempStdoutLog);
0135
0136     return (runOcrProcess());
0137 }
0138
0139
0140 QStringList OcrOcradEngine::tempFiles(bool retain)
0141 {
0142     QStringList result;
0143     result << m_ocrImagePBM;
0144     result << m_tempOrfName;
0145     result << m_tempStdoutLog;
0146
0147     return (result);
0148 }
0149
0150
0151 bool OcrOcradEngine::finishedOcrProcess(QProcess *proc)
0152 {
0153     qCDebug(OCR_LOG);
0154     QString errStr = readORF(m_tempOrfName);        // parse the OCR results
0155     if (errStr.isEmpty()) return (true);        // parsed successfulyl
0156
0157     setErrorText(errStr);               // record the parse error
0158     return (false);                 // parsing failed
0159 }
0160
0161
0162 /*
0163   From http://kooka.kde.org/news/
0164
0165 ORF Proposal: Ocr Result File    August 20, 2003
0166
0167 Ocrad is the first OCR (Optical Character Recognition) application that implements
0168 output of OCR results in a special file format that could be easily processed by
0169 frontend programs.  To provide a proper frontend connection, ocrad implements the
0170 export of the OCR results into a so called ORF, which simply means Ocr Result File.
0171
0172 The ORF Format is a special file format that contains OCR results like the detected
0173 characters and their position on the source image in a simply parseable format.
0174 Frontend programs can read the file and retrieve information about the OCR engine
0175 run and show up the results visually.
0176
0177 All lines starting with '#' are ignored.
0178
0179 The first valid line has the form 'source file filename', where 'filename' is the
0180 name of the PBM file being processed.
0181
0182 The second valid line has the form 'total blocks n', where 'n' is the total number
0183 of text blocks in the source image.
0184
0185 For each text block in the source image, the following data follows:
0186
0187   A line in the form 'block i x y w h', where 'i' is the block number and 'x y w h'
0188   are the block position and size as described below for character boxes.
0189
0190   A line in the form 'lines n', where 'n' is the number of lines in this block.
0191
0192 For each line in every text block, the following data follows:
0193
0194   A line in the form 'line i chars n height h', where 'i' is the line number,
0195   'n' is the number of characters in this line,
0196   and 'h' is the mean height of the characters in this line (in pixels).
0197
0198   n lines (one for every character) in the form "x y w h b;g[,'c'v]...".
0199   'x' = the left border (x-coordinate) of the char bounding box in the source image (in pixels).
0200   'y' = the top border (y-coordinate).
0201   'w' = the width of the bounding box.
0202   'h' = the height of the bounding box.
0203   'b' = the percent of black pixels in the bounding box.
0204   'g' = the number of different recognition guesses for this character.
0205
0206   The result characters follow after the number of guesses in the form of a
0207   comma-separated list of pairs. Every pair is formed by the actual recognised
0208   char enclosed in single quotes, followed by the confidence value without
0209   space between them.
0210
0211 See the following snippet (the beginning of an orf) as a sample ORF:
0212
0213   # Ocr Results File. Created by GNU ocrad version 0.4
0214   source file test1.pbm
0215   total blocks 1
0216   block 1 0 0 560 792
0217   lines 12
0218   line 1 chars 10 height 26
0219   71 109 17 26;2,'0'1,'o'0
0220   93 109 15 26;2,'1'1,'l'0
0221   110 109 18 26;1,'2'0
0222   131 109 18 26;1,'3'0
0223   151 109 19 26;1,'4'0
0224   172 109 17 26;1,'5'0
0225   193 109 17 26;1,'6'0
0226   213 108 17 27;1,'7'0
0227   232 109 18 26;1,'8'0
0228   253 109 17 26;1,'9'0
0229   line 2 chars 14 height 27
0230   68 153 29 27;1,'A'0
0231   97 153 24 27;1,'B'0
0232   ...
0233
0234 The ORF format was defined by Antonio Diaz and Klaas Freitag. Comments are very
0235 welcome.
0236 */
0237
0238 QString OcrOcradEngine::readORF(const QString &fileName)
0239 {
0240     QFile file(fileName);
0241     // some checks on the ORF
0242     if (!file.exists()) {
0243         return (xi18nc("@info", "File <filename>%1</filename> does not exist", fileName));
0244     }
0245     QFileInfo fi(fileName);
0246     if (!fi.isReadable()) {
0247         return (xi18nc("@info", "File <filename>%1</filename> unreadable", fileName));
0248     }
0249
0250     if (!file.open(QIODevice::ReadOnly)) {
0251         return (xi18nc("@info", "Cannot open file <filename>%1</filename>", fileName));
0252     }
0253     QTextStream stream(&file);
0254
0255     qCDebug(OCR_LOG) << "Starting to analyse ORF" << fileName << "version" << ocradVersion;
0256
0257     // to match "block 1 0 0 560 792"
0258     const QRegExp rx1("^.*block\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
0259     // to match "line 5 chars 13 height 20"
0260     const QRegExp rx2("^line\\s+(\\d+)\\s+chars\\s+(\\d+)\\s+height\\s+\\d+");
0261     // to match " 1, 'r'0"
0262     const QRegExp rx3("^\\s*(\\d+)");
0263     // to match "110 109 18 26"
0264     const QRegExp rx4("(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");
0265
0266     /* use a global line number counter here, not the one from the orf. The orf one
0267      * starts at 0 for every block, but we want line-no counting page global here.
0268      */
0269     int lineNo = 0;
0270     int blockCnt = 0;
0271     QString line;
0272     QRect blockRect;
0273
0274     startResultDocument();
0275
0276     while (!stream.atEnd()) {
0277         line = stream.readLine().trimmed();     // line of text excluding '\n'
0278
0279         if (line.startsWith("#")) {
0280             continue;    // ignore comments
0281         }
0282
0283         if (verboseDebug()) {
0284             qCDebug(OCR_LOG) << "# Line" << line;
0285         }
0286         if (line.startsWith("source file ")) {
0287             continue;                   // source file name, ignore
0288         } else if (line.startsWith("total blocks ")) {  // total count of blocks,
0289                             // must be first line
0290             blockCnt = line.mid(13).toInt();
0291             qCDebug(OCR_LOG) << "Block count (V<10)" << blockCnt;
0292         } else if (line.startsWith("total text blocks ")) {
0293             blockCnt = line.mid(18).toInt();
0294             qCDebug(OCR_LOG) << "Block count (V>10)" << blockCnt;
0295         } else if (line.startsWith("block ") || line.startsWith("text block ")) {
0296                             // start of text block
0297                             // matching "block 1 0 0 560 792"
0298             if (rx1.indexIn(line) == -1) {
0299                 qCDebug(OCR_LOG) << "Failed to match 'block' line" << line;
0300                 continue;
0301             }
0302
0303             int currBlock = (rx1.cap(1).toInt()) - 1;
0304             blockRect.setRect(rx1.cap(2).toInt(), rx1.cap(3).toInt(),
0305                               rx1.cap(4).toInt(), rx1.cap(5).toInt());
0306             if (verboseDebug()) qCDebug(OCR_LOG) << "Current block" << currBlock << "rect" << blockRect;
0307         } else if (line.startsWith("lines ")) {     // lines in this block
0308             if (verboseDebug()) qCDebug(OCR_LOG) << "Block line count" << line.mid(6).toInt();
0309         } else if (line.startsWith("line ")) {      // start of text line
0310             startLine();
0311
0312             if (rx2.indexIn(line) == -1) {
0313                 qCDebug(OCR_LOG) << "Failed to match 'line' line" << line;
0314                 continue;
0315             }
0316
0317             int charCount = rx2.cap(2).toInt();
0318             if (verboseDebug()) qCDebug(OCR_LOG) << "Expecting" << charCount << "chars for line" << lineNo;
0319
0320             QString word;
0321             QRect wordRect;
0322
0323             for (int c = 0; c < charCount && !stream.atEnd(); ++c) {
0324                 // read one line per character
0325                 QString charLine = stream.readLine();
0326                 int semiPos = charLine.indexOf(';');
0327                 if (semiPos == -1) {
0328                     qCDebug(OCR_LOG) << "No ';' in 'char' line" << charLine;
0329                     continue;
0330                 }
0331
0332                 // rectStr contains the rectangle of the character
0333                 QString rectStr = charLine.left(semiPos);
0334                 // resultStr contains the OCRed result character(s)
0335                 QString resultStr = charLine.mid(semiPos + 1);
0336
0337                 QChar detectedChar = UndetectedChar;
0338
0339                 // find how many alternatives, matching " 1, 'r'0"
0340                 if (rx3.indexIn(resultStr) == -1) {
0341                     qCDebug(OCR_LOG) << "Failed to match" << resultStr << "in 'char' line" << charLine;
0342                     continue;
0343                 }
0344
0345                 int altCount = rx3.cap(1).toInt();
0346                 if (altCount == 0) {            // no alternatives,
0347                             // undecipherable character
0348                     if (verboseDebug()) {
0349                         qCDebug(OCR_LOG) << "Undecipherable character in 'char' line" << charLine;
0350                     }
0351                 } else {
0352                     int h = resultStr.indexOf(',');
0353                     if (h == -1) {
0354                         qCDebug(OCR_LOG) << "No ',' in" << resultStr << "in 'char' line" << charLine;
0355                         continue;
0356                     }
0357                     resultStr = resultStr.remove(0, h + 1).trimmed();
0358
0359                     // TODO: this only uses the first alternative
0360                     detectedChar = resultStr.at(1);
0361
0362                     // Analyse the result rectangle
0363                     if (detectedChar != ' ') {
0364                         if (rx4.indexIn(rectStr) == -1) {
0365                             qCDebug(OCR_LOG) << "Failed to match" << rectStr << "in 'char' line" << charLine;
0366                             continue;
0367                         }
0368
0369                         QRect r(rx4.cap(1).toInt(), rx4.cap(2).toInt(),
0370                                 rx4.cap(3).toInt(), rx4.cap(4).toInt());
0371                         wordRect |= r;
0372                     }
0373                 }
0374
0375                 if (detectedChar == ' ') {      // space terminates the word
0376                     if (ocradVersion < 10) {        // offset is relative to block
0377                         wordRect.translate(blockRect.x(), blockRect.y());
0378                     }
0379
0380                     OcrWordData wd;
0381                     wd.setProperty(OcrWordData::Rectangle, wordRect);
0382                     addWord(word, wd);
0383
0384                     word = QString();           // reset for next time
0385                     wordRect = QRect();
0386                 } else {
0387                     word.append(detectedChar);      // append char to word
0388                 }
0389             }                       // end of text line loop
0390             ++lineNo;
0391
0392             if (!word.isEmpty()) {          // last word in line
0393                 if (ocradVersion < 10) {        // offset is relative to block
0394                     wordRect.translate(blockRect.x(), blockRect.y());
0395                 }
0396
0397                 OcrWordData wd;
0398                 wd.setProperty(OcrWordData::Rectangle, wordRect);
0399                 addWord(word, wd);
0400
0401                 word = QString();           // reset for next time
0402                 wordRect = QRect();
0403             }
0404
0405             finishLine();
0406         } else {
0407             qCDebug(OCR_LOG) << "Unknown line format" << line;
0408         }
0409     }
0410
0411     file.close();                   // finished with ORF file
0412     finishResultDocument();
0413     qCDebug(OCR_LOG) << "Finished analysing ORF";
0414
0415     return (QString());                 // no error detected
0416 }
0417
0418
0419 void OcrOcradEngine::openAdvancedSettings()
0420 {
0421     ExecutablePathDialogue d(nullptr);
0422
0423     QString exec = KookaSettings::ocrOcradBinary();
0424     if (exec.isEmpty())
0425     {
0426         KConfigSkeletonItem *ski = KookaSettings::self()->ocrOcradBinaryItem();
0427         ski->setDefault();
0428         exec = KookaSettings::ocrOcradBinary();
0429     }
0430
0431     d.setPath(exec);
0432     d.setLabel(i18n("Name or path of the OCRAD executable:"));
0433     if (!d.exec()) return;
0434
0435     KookaSettings::setOcrOcradBinary(d.path());
0436 }