File indexing completed on 2024-05-12 15:55:08
0001 /************************************************************************ 0002 * * 0003 * This file is part of Kooka, a scanning/OCR application using * 0004 * Qt <http://www.qt.io> and KDE Frameworks <http://www.kde.org>. * 0005 * * 0006 * Copyright (C) 2020 Jonathan Marten <jjm@keelhaul.me.uk> * 0007 * * 0008 * Kooka is free software; you can redistribute it and/or modify it * 0009 * under the terms of the GNU Library General Public License as * 0010 * published by the Free Software Foundation and appearing in the * 0011 * file COPYING included in the packaging of this file; either * 0012 * version 2 of the License, or (at your option) any later version. * 0013 * * 0014 * As a special exception, permission is given to link this program * 0015 * with any version of the KADMOS OCR/ICR engine (a product of * 0016 * reRecognition GmbH, Kreuzlingen), and distribute the resulting * 0017 * executable without including the source code for KADMOS in the * 0018 * source distribution. * 0019 * * 0020 * This program is distributed in the hope that it will be useful, * 0021 * but WITHOUT ANY WARRANTY; without even the implied warranty of * 0022 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * 0023 * GNU General Public License for more details. * 0024 * * 0025 * You should have received a copy of the GNU General Public * 0026 * License along with this program; see the file COPYING. If * 0027 * not, see <http://www.gnu.org/licenses/>. * 0028 * * 0029 ************************************************************************/ 0030 0031 #include "ocrtesseractengine.h" 0032 0033 #include <qregexp.h> 0034 #include <qfile.h> 0035 #include <qfileinfo.h> 0036 0037 #include <QXmlStreamReader> 0038 0039 #include <klocalizedstring.h> 0040 #include <kpluginfactory.h> 0041 0042 #include "imageformat.h" 0043 #include "kookasettings.h" 0044 #include "ocrtesseractdialog.h" 0045 #include "executablepathdialogue.h" 0046 #include "ocr_logging.h" 0047 0048 0049 K_PLUGIN_FACTORY_WITH_JSON(OcrTesseractEngineFactory, "kookaocr-tesseract.json", registerPlugin<OcrTesseractEngine>();) 0050 #include "ocrtesseractengine.moc" 0051 0052 0053 OcrTesseractEngine::OcrTesseractEngine(QObject *pnt, const QVariantList &args) 0054 : AbstractOcrEngine(pnt, "OcrTesseractEngine") 0055 { 0056 m_ocrImageIn = QString(); 0057 m_tempHOCROut = QString(); 0058 m_tesseractVersion = 0; 0059 } 0060 0061 0062 AbstractOcrDialogue *OcrTesseractEngine::createOcrDialogue(AbstractOcrEngine *plugin, QWidget *pnt) 0063 { 0064 return (new OcrTesseractDialog(plugin, pnt)); 0065 } 0066 0067 0068 bool OcrTesseractEngine::createOcrProcess(AbstractOcrDialogue *dia, ScanImage::Ptr img) 0069 { 0070 OcrTesseractDialog *parentDialog = static_cast<OcrTesseractDialog *>(dia); 0071 m_tesseractVersion = parentDialog->getNumVersion(); 0072 0073 const QString cmd = parentDialog->getOCRCmd(); 0074 0075 const QString ocrResultFile = tempSaveImage(img, ImageFormat("PGM"), 8); 0076 setResultImage(ocrResultFile); 0077 // TODO: if the input file is local and is readable by Tesseract, 0078 // can use it directly (but don't delete it afterwards!) 0079 m_ocrImageIn = tempSaveImage(img, ImageFormat("PNG"), 8); 0080 0081 QProcess *proc = initOcrProcess(); // start process for OCR 0082 QStringList args; // arguments for process 0083 0084 // Input file 0085 args << QFile::encodeName(m_ocrImageIn); // file with the input image 0086 0087 // Output base name 0088 m_tempHOCROut = tempFileName(""); // Tesseract just wants base name 0089 args << QFile::encodeName(m_tempHOCROut); // the HOCR result file 0090 m_tempHOCROut += ".hocr"; // suffix that it will have 0091 0092 // Language 0093 QString s = KookaSettings::ocrTesseractLanguage(); 0094 if (!s.isEmpty()) args << "-l" << s; 0095 0096 // User words 0097 QUrl u = KookaSettings::ocrTesseractUserWords(); 0098 if (u.isValid()) args << "--user-words" << u.toLocalFile(); 0099 0100 // User patterns 0101 u = KookaSettings::ocrTesseractUserPatterns(); 0102 if (u.isValid()) args << "--user-patterns" << u.toLocalFile(); 0103 0104 // Page segmentation mode 0105 s = KookaSettings::ocrTesseractSegmentationMode(); 0106 if (!s.isEmpty()) args << "--psm" << s; 0107 0108 // OCR engine mode 0109 s = KookaSettings::ocrTesseractEngineMode(); 0110 if (!s.isEmpty()) args << "--oem" << s; 0111 0112 //if (verboseDebug()) args << "-v"; 0113 0114 s = KookaSettings::ocrTesseractExtraArguments(); 0115 if (!s.isEmpty()) args << s; 0116 0117 // Output format. This option generates HOCR (HTML with OCR markup) 0118 // as specificied at http://kba.cloud/hocr-spec/1.2/ 0119 args << "hocr"; 0120 0121 proc->setProgram(cmd); 0122 proc->setArguments(args); 0123 0124 proc->setProcessChannelMode(QProcess::SeparateChannels); 0125 m_tempStdoutLog = tempFileName("stdout.log"); 0126 proc->setStandardOutputFile(m_tempStdoutLog); 0127 0128 return (runOcrProcess()); 0129 } 0130 0131 0132 QStringList OcrTesseractEngine::tempFiles(bool retain) 0133 { 0134 QStringList result; 0135 result << m_ocrImageIn; 0136 result << m_tempHOCROut; 0137 result << m_tempStdoutLog; 0138 return (result); 0139 } 0140 0141 0142 bool OcrTesseractEngine::finishedOcrProcess(QProcess *proc) 0143 { 0144 qCDebug(OCR_LOG); 0145 QString errStr = readHOCR(m_tempHOCROut); // parse the OCR results 0146 if (errStr.isEmpty()) return (true); // parsed successfully 0147 0148 setErrorText(errStr); // record the parse error 0149 return (false); // parsing failed 0150 } 0151 0152 0153 QString OcrTesseractEngine::readHOCR(const QString &fileName) 0154 { 0155 // some basic checks on the file 0156 QFileInfo fi(fileName); 0157 if (!fi.exists()) return (xi18nc("@info", "File <filename>%1</filename> does not exist", fileName)); 0158 if (!fi.isReadable()) return (xi18nc("@info", "File <filename>%1</filename> unreadable", fileName)); 0159 0160 qCDebug(OCR_LOG) << "Starting to analyse HOCR" << fileName; 0161 0162 QFile file(fileName); 0163 if (!file.open(QIODevice::ReadOnly)) 0164 { 0165 return (xi18nc("@info", "Cannot open file <filename>%1</filename>", fileName)); 0166 } 0167 0168 startResultDocument(); // start document to receive results 0169 0170 QXmlStreamReader reader(&file); 0171 while (!reader.atEnd()) 0172 { 0173 reader.readNext(); // get next XML token 0174 if (!reader.isStartElement()) continue; // only interested in element start 0175 0176 // We only take note of elements defining new paragraphs, new lines and words. 0177 // Examples of these are: 0178 // 0179 // <p class='ocr_par' id='par_1_1' lang='eng' title="bbox 52 35 1469 84"> 0180 // <span class='ocr_line' id='line_1_1' title="bbox 52 35 1469 84; baseline 0.001 -10; x_size 41; x_descenders 9; x_ascenders 8"> 0181 // <span class='ocrx_word' id='word_1_1' title='bbox 52 42 123 74; x_wconf 92'>The</span> 0182 // 0183 // These same HTML elements may specify other page layout data which are 0184 // not supported, but the class ID allows them to be distinguished. 0185 0186 const QStringRef name = reader.name(); 0187 if (name=="span" || name=="p") // may be either SPAN or P element 0188 { 0189 const QStringRef cls = reader.attributes().value("class"); 0190 if (cls=="ocr_par" || cls=="ocr_line") // paragraph or line start 0191 { 0192 // The start of a paragraph is always followed by the start of a line. 0193 // Generating a new output line for both means that paragraphs are 0194 // separated by blank lines, as intended. 0195 startLine(); 0196 } 0197 else if (cls=="ocrx_word") 0198 { 0199 OcrWordData wd; 0200 0201 // The TITLE attribute of the SPAN element indicates the word bounding box. 0202 const QString ttl = reader.attributes().value("title").toString(); 0203 0204 QRegExp rx("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+);"); 0205 if (!ttl.isEmpty() && ttl.contains(rx)) 0206 { 0207 QRect wordRect; 0208 wordRect.setLeft(rx.cap(1).toInt()); 0209 wordRect.setTop(rx.cap(2).toInt()); 0210 wordRect.setRight(rx.cap(3).toInt()); 0211 wordRect.setBottom(rx.cap(4).toInt()); 0212 wd.setProperty(OcrWordData::Rectangle, wordRect); 0213 } 0214 0215 // The contained text is the recognised word. No child HTML elements 0216 // are expected inside a word SPAN. 0217 QString text = reader.readElementText(QXmlStreamReader::SkipChildElements); 0218 addWord(text, wd); 0219 } 0220 } 0221 } 0222 0223 if (reader.hasError()) 0224 { 0225 qCDebug(OCR_LOG) << "XML reader error, line" << reader.lineNumber() << reader.error(); 0226 return (i18n("HOCR parsing error, %1", reader.errorString())); 0227 } 0228 0229 finishResultDocument(); // finished with output document 0230 file.close(); // finished with HOCR file 0231 0232 qCDebug(OCR_LOG) << "Finished analysing HOCR"; 0233 0234 return (QString()); // no error detected 0235 } 0236 0237 0238 void OcrTesseractEngine::openAdvancedSettings() 0239 { 0240 ExecutablePathDialogue d(nullptr); 0241 0242 QString exec = KookaSettings::ocrTesseractBinary(); 0243 if (exec.isEmpty()) 0244 { 0245 KConfigSkeletonItem *ski = KookaSettings::self()->ocrTesseractBinaryItem(); 0246 ski->setDefault(); 0247 exec = KookaSettings::ocrTesseractBinary(); 0248 } 0249 0250 d.setPath(exec); 0251 d.setLabel(i18n("Name or path of the Tesseract executable:")); 0252 if (!d.exec()) return; 0253 0254 KookaSettings::setOcrTesseractBinary(d.path()); 0255 }