File indexing completed on 2024-05-12 15:55:08

0001 /************************************************************************
0002  *                                  *
0003  *  This file is part of Kooka, a scanning/OCR application using    *
0004  *  Qt <http://www.qt.io> and KDE Frameworks <http://www.kde.org>.  *
0005  *                                  *
0006  *  Copyright (C) 2020      Jonathan Marten <jjm@keelhaul.me.uk>    *
0007  *                                  *
0008  *  Kooka is free software; you can redistribute it and/or modify it    *
0009  *  under the terms of the GNU Library General Public License as    *
0010  *  published by the Free Software Foundation and appearing in the  *
0011  *  file COPYING included in the packaging of this file;  either    *
0012  *  version 2 of the License, or (at your option) any later version.    *
0013  *                                  *
0014  *  As a special exception, permission is given to link this program    *
0015  *  with any version of the KADMOS OCR/ICR engine (a product of     *
0016  *  reRecognition GmbH, Kreuzlingen), and distribute the resulting  *
0017  *  executable without including the source code for KADMOS in the  *
0018  *  source distribution.                        *
0019  *                                  *
0020  *  This program is distributed in the hope that it will be useful, *
0021  *  but WITHOUT ANY WARRANTY; without even the implied warranty of  *
0022  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   *
0023  *  GNU General Public License for more details.            *
0024  *                                  *
0025  *  You should have received a copy of the GNU General Public       *
0026  *  License along with this program;  see the file COPYING.  If     *
0027  *  not, see <http://www.gnu.org/licenses/>.                *
0028  *                                  *
0029  ************************************************************************/
0030 
0031 #include "ocrtesseractengine.h"
0032 
0033 #include <qregexp.h>
0034 #include <qfile.h>
0035 #include <qfileinfo.h>
0036 
0037 #include <QXmlStreamReader>
0038 
0039 #include <klocalizedstring.h>
0040 #include <kpluginfactory.h>
0041 
0042 #include "imageformat.h"
0043 #include "kookasettings.h"
0044 #include "ocrtesseractdialog.h"
0045 #include "executablepathdialogue.h"
0046 #include "ocr_logging.h"
0047 
0048 
0049 K_PLUGIN_FACTORY_WITH_JSON(OcrTesseractEngineFactory, "kookaocr-tesseract.json", registerPlugin<OcrTesseractEngine>();)
0050 #include "ocrtesseractengine.moc"
0051 
0052 
0053 OcrTesseractEngine::OcrTesseractEngine(QObject *pnt, const QVariantList &args)
0054     : AbstractOcrEngine(pnt, "OcrTesseractEngine")
0055 {
0056     m_ocrImageIn = QString();
0057     m_tempHOCROut = QString();
0058     m_tesseractVersion = 0;
0059 }
0060 
0061 
0062 AbstractOcrDialogue *OcrTesseractEngine::createOcrDialogue(AbstractOcrEngine *plugin, QWidget *pnt)
0063 {
0064     return (new OcrTesseractDialog(plugin, pnt));
0065 }
0066 
0067 
0068 bool OcrTesseractEngine::createOcrProcess(AbstractOcrDialogue *dia, ScanImage::Ptr img)
0069 {
0070     OcrTesseractDialog *parentDialog = static_cast<OcrTesseractDialog *>(dia);
0071     m_tesseractVersion = parentDialog->getNumVersion();
0072 
0073     const QString cmd = parentDialog->getOCRCmd();
0074 
0075     const QString ocrResultFile = tempSaveImage(img, ImageFormat("PGM"), 8);
0076     setResultImage(ocrResultFile);
0077     // TODO: if the input file is local and is readable by Tesseract,
0078     // can use it directly (but don't delete it afterwards!)
0079     m_ocrImageIn = tempSaveImage(img, ImageFormat("PNG"), 8);
0080 
0081     QProcess *proc = initOcrProcess();          // start process for OCR
0082     QStringList args;                   // arguments for process
0083 
0084     // Input file
0085     args << QFile::encodeName(m_ocrImageIn);        // file with the input image
0086 
0087     // Output base name
0088     m_tempHOCROut = tempFileName("");           // Tesseract just wants base name
0089     args << QFile::encodeName(m_tempHOCROut);       // the HOCR result file
0090     m_tempHOCROut += ".hocr";               // suffix that it will have
0091 
0092     // Language
0093     QString s = KookaSettings::ocrTesseractLanguage();
0094     if (!s.isEmpty()) args << "-l" << s;
0095 
0096     // User words
0097     QUrl u = KookaSettings::ocrTesseractUserWords();
0098     if (u.isValid()) args << "--user-words" << u.toLocalFile();
0099 
0100     // User patterns
0101     u = KookaSettings::ocrTesseractUserPatterns();
0102     if (u.isValid()) args << "--user-patterns" << u.toLocalFile();
0103 
0104     // Page segmentation mode
0105     s = KookaSettings::ocrTesseractSegmentationMode();
0106     if (!s.isEmpty()) args << "--psm" << s;
0107 
0108     // OCR engine mode
0109     s = KookaSettings::ocrTesseractEngineMode();
0110     if (!s.isEmpty()) args << "--oem" << s;
0111 
0112     //if (verboseDebug()) args << "-v";
0113 
0114     s = KookaSettings::ocrTesseractExtraArguments();
0115     if (!s.isEmpty()) args << s;
0116 
0117     // Output format.  This option generates HOCR (HTML with OCR markup)
0118     // as specificied at http://kba.cloud/hocr-spec/1.2/
0119     args << "hocr";
0120 
0121     proc->setProgram(cmd);
0122     proc->setArguments(args);
0123 
0124     proc->setProcessChannelMode(QProcess::SeparateChannels);
0125     m_tempStdoutLog = tempFileName("stdout.log");
0126     proc->setStandardOutputFile(m_tempStdoutLog);
0127 
0128     return (runOcrProcess());
0129 }
0130 
0131 
0132 QStringList OcrTesseractEngine::tempFiles(bool retain)
0133 {
0134     QStringList result;
0135     result << m_ocrImageIn;
0136     result << m_tempHOCROut;
0137     result << m_tempStdoutLog;
0138     return (result);
0139 }
0140 
0141 
0142 bool OcrTesseractEngine::finishedOcrProcess(QProcess *proc)
0143 {
0144     qCDebug(OCR_LOG);
0145     QString errStr = readHOCR(m_tempHOCROut);       // parse the OCR results
0146     if (errStr.isEmpty()) return (true);        // parsed successfully
0147 
0148     setErrorText(errStr);               // record the parse error
0149     return (false);                 // parsing failed
0150 }
0151 
0152 
0153 QString OcrTesseractEngine::readHOCR(const QString &fileName)
0154 {
0155     // some basic checks on the file
0156     QFileInfo fi(fileName);
0157     if (!fi.exists()) return (xi18nc("@info", "File <filename>%1</filename> does not exist", fileName));
0158     if (!fi.isReadable()) return (xi18nc("@info", "File <filename>%1</filename> unreadable", fileName));
0159 
0160     qCDebug(OCR_LOG) << "Starting to analyse HOCR" << fileName;
0161 
0162     QFile file(fileName);
0163     if (!file.open(QIODevice::ReadOnly))
0164     {
0165         return (xi18nc("@info", "Cannot open file <filename>%1</filename>", fileName));
0166     }
0167 
0168     startResultDocument();              // start document to receive results
0169 
0170     QXmlStreamReader reader(&file);
0171     while (!reader.atEnd())
0172     {
0173         reader.readNext();              // get next XML token
0174         if (!reader.isStartElement()) continue;     // only interested in element start
0175 
0176         // We only take note of elements defining new paragraphs, new lines and words.
0177         // Examples of these are:
0178         //
0179         //   <p class='ocr_par' id='par_1_1' lang='eng' title="bbox 52 35 1469 84">
0180         //   <span class='ocr_line' id='line_1_1' title="bbox 52 35 1469 84; baseline 0.001 -10; x_size 41; x_descenders 9; x_ascenders 8">
0181         //   <span class='ocrx_word' id='word_1_1' title='bbox 52 42 123 74; x_wconf 92'>The</span> 
0182         //
0183         // These same HTML elements may specify other page layout data which are
0184         // not supported, but the class ID allows them to be distinguished.
0185 
0186         const QStringRef name = reader.name();
0187         if (name=="span" || name=="p")          // may be either SPAN or P element
0188         {
0189             const QStringRef cls = reader.attributes().value("class");
0190             if (cls=="ocr_par" || cls=="ocr_line")  // paragraph or line start
0191             {
0192                 // The start of a paragraph is always followed by the start of a line.
0193                 // Generating a new output line for both means that paragraphs are
0194                 // separated by blank lines, as intended.
0195                 startLine();
0196             }
0197             else if (cls=="ocrx_word")
0198             {
0199                 OcrWordData wd;
0200 
0201                 // The TITLE attribute of the SPAN element indicates the word bounding box.
0202                 const QString ttl = reader.attributes().value("title").toString();
0203 
0204                 QRegExp rx("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+);");
0205                 if (!ttl.isEmpty() && ttl.contains(rx))
0206                 {
0207                     QRect wordRect;
0208                     wordRect.setLeft(rx.cap(1).toInt());
0209                     wordRect.setTop(rx.cap(2).toInt());
0210                     wordRect.setRight(rx.cap(3).toInt());
0211                     wordRect.setBottom(rx.cap(4).toInt());
0212                     wd.setProperty(OcrWordData::Rectangle, wordRect);
0213                 }
0214 
0215                 // The contained text is the recognised word.  No child HTML elements
0216                 // are expected inside a word SPAN.
0217                 QString text = reader.readElementText(QXmlStreamReader::SkipChildElements);
0218                 addWord(text, wd);
0219             }
0220         }
0221     }
0222 
0223     if (reader.hasError())
0224     {
0225         qCDebug(OCR_LOG) << "XML reader error, line" << reader.lineNumber() << reader.error();
0226         return (i18n("HOCR parsing error, %1", reader.errorString()));
0227     }
0228 
0229     finishResultDocument();             // finished with output document
0230     file.close();                   // finished with HOCR file
0231 
0232     qCDebug(OCR_LOG) << "Finished analysing HOCR";
0233 
0234     return (QString());                 // no error detected
0235 }
0236 
0237 
0238 void OcrTesseractEngine::openAdvancedSettings()
0239 {
0240     ExecutablePathDialogue d(nullptr);
0241 
0242     QString exec = KookaSettings::ocrTesseractBinary();
0243     if (exec.isEmpty())
0244     {
0245         KConfigSkeletonItem *ski = KookaSettings::self()->ocrTesseractBinaryItem();
0246         ski->setDefault();
0247         exec = KookaSettings::ocrTesseractBinary();
0248     }
0249 
0250     d.setPath(exec);
0251     d.setLabel(i18n("Name or path of the Tesseract executable:"));
0252     if (!d.exec()) return;
0253 
0254     KookaSettings::setOcrTesseractBinary(d.path());
0255 }