File indexing completed on 2024-04-28 15:39:59

0001 // SPDX-FileCopyrightText: 2005-2013 Jesper K. Pedersen <jesper.pedersen@kdab.com>
0002 // SPDX-FileCopyrightText: 2006-2010 Tuomas Suutari <tuomas@nepnep.net>
0003 // SPDX-FileCopyrightText: 2007 Dirk Mueller <mueller@kde.org>
0004 // SPDX-FileCopyrightText: 2007-2008 Laurent Montel <montel@kde.org>
0005 // SPDX-FileCopyrightText: 2007-2010 Jan Kundrát <jkt@flaska.net>
0006 // SPDX-FileCopyrightText: 2008-2009 Henner Zeller <h.zeller@acm.org>
0007 // SPDX-FileCopyrightText: 2010 Wes Hardaker <kpa@capturedonearth.com>
0008 // SPDX-FileCopyrightText: 2010-2012 Miika Turkia <miika.turkia@gmail.com>
0009 // SPDX-FileCopyrightText: 2011 Andreas Neustifter <andreas.neustifter@gmail.com>
0010 // SPDX-FileCopyrightText: 2012 Yuri Chornoivan <yurchor@ukr.net>
0011 // SPDX-FileCopyrightText: 2012-2023 Johannes Zarl-Zierl <johannes@zarl-zierl.at>
0012 // SPDX-FileCopyrightText: 2015 Tobias Leupold <tl@stonemx.de>
0013 // SPDX-FileCopyrightText: 2017-2019 Robert Krawitz <rlk@alum.mit.edu>
0014 // SPDX-FileCopyrightText: 2018 Antoni Bella Pérez <antonibella5@yahoo.com>
0015 //
0016 // SPDX-License-Identifier: GPL-2.0-or-later
0017 
0018 #include "NewImageFinder.h"
0019 
0020 #include "FastDir.h"
0021 #include "ImageDB.h"
0022 #include "ImageScout.h"
0023 #include "MD5Map.h"
0024 
0025 #include <BackgroundJobs/HandleVideoThumbnailRequestJob.h>
0026 #include <BackgroundJobs/ReadVideoLengthJob.h>
0027 #include <BackgroundJobs/SearchForVideosWithoutVideoThumbnailsJob.h>
0028 #include <BackgroundTaskManager/JobManager.h>
0029 #include <ImageManager/RawImageDecoder.h>
0030 #include <ImageManager/ThumbnailBuilder.h>
0031 #include <MainWindow/FeatureDialog.h>
0032 #include <MainWindow/Window.h>
0033 #include <Utilities/FileUtil.h>
0034 #include <kpabase/FileExtensions.h>
0035 #include <kpabase/FileNameUtil.h>
0036 #include <kpabase/Logging.h>
0037 #include <kpabase/SettingsData.h>
0038 #include <kpaexif/Database.h>
0039 #include <kpathumbnails/ThumbnailCache.h>
0040 
0041 #include <KLocalizedString>
0042 #include <KMessageBox>
0043 #include <QApplication>
0044 #include <QDataStream>
0045 #include <QElapsedTimer>
0046 #include <QEventLoop>
0047 #include <QFile>
0048 #include <QFileInfo>
0049 #include <QImageReader>
0050 #include <QLoggingCategory>
0051 #include <QMimeDatabase>
0052 #include <QProgressBar>
0053 #include <QProgressDialog>
0054 #include <QStringList>
0055 
0056 using namespace DB;
0057 
0058 /*****************************************************************
0059  *
0060  * NOTES ON PERFORMANCE
0061  * ===== == ===========
0062  *
0063  * - Robert Krawitz <rlk@alum.mit.edu> 2018-05-24
0064  *
0065  *
0066  * GENERAL NOTES ON STORAGE I/O
0067  * ------- ----- -- ------- ---
0068  *
0069  * The two main gates to loading new images are:
0070  *
0071  * 1) I/O (how fast can we read images off mass storage)
0072  *
0073  *    Different I/O devices have different characteristics in terms of
0074  *    throughput, media latency, and protocol latency.
0075  *
0076  *    - Throughput is the raw speed at which data can be transferred,
0077  *      limited by the physical and/or electronic characteristics of
0078  *      the medium and the interface.  Short of reducing the amount of
0079  *      data that's transferred, or clever games with using the most
0080  *      efficient part of the medium (the outer tracks only for HDD's,
0081  *      a practice referred to as "short stroking" because it reduces
0082  *      the distance the head has to seek, at the cost of wasting a
0083  *      lot of capacity), there's nothing that can be done about this.
0084  *
0085  *    - Media latency is the latency component due to characteristics
0086  *      of the underlying storage medium.  For spinning disks, this is
0087  *      a function of rotational latency and seek latency.  In some
0088  *      cases, particularly with hard disks, it is possible to reduce
0089  *      media latency by arranging to access the data in a way that
0090  *      reduces seeking.  See DB/FastDir.cpp for an example of this.
0091  *
0092  *      While media latency can sometimes be hidden by overlapping
0093  *      I/O, it is generally not possible to avoid it.  Sometimes trying too
0094  *      hard can actually increase media latency if it results in I/O
0095  *      operations competing against each other requiring additional
0096  *      seeks.
0097  *
0098  *      Overlapping I/O with computation is another matter; that can
0099  *      easily yield benefit, especially if it eliminates rotational
0100  *      latency.
0101  *
0102  *    - Protocol latency.  This refers to things like SATA overhead,
0103  *      network overhead (for images stored on a network), and so
0104  *      forth.  This can encompass multiple things, and often they can
0105  *      be pipelined by means of multiple queued I/O operations.  For
0106  *      example, multiple commands can be issued to modern interfaces
0107  *      (SATA, NVMe) and many network interfaces without waiting for
0108  *      earlier operations to return.
0109  *
0110  *      If protocol latency is high compared with media latency,
0111  *      having multiple requests outstanding simultaneously can
0112  *      yield significant benefits.
0113  *
0114  *    iostat is a valuable tool for investigating throughput and
0115  *    looking for possible optimizations.  The IO/sec and data
0116  *    read/written per second when compared against known media
0117  *    characteristics (disk and SSD throughput, network bandwidth)
0118  *    provides valuable information about whether we're getting close
0119  *    to full performance from the I/O, and user and system CPU time
0120  *    give us additional clues about whether we're I/O-bound or
0121  *    CPU-bound.
0122  *
0123  *    Historically in the computer field, operations that require
0124  *    relatively simple processing on large volumes of data are I/O
0125  *    bound.  But with very fast I/O devices such as NVMe SSDs, some
0126  *    of which reach 3 GB/sec, that's not always the case.
0127  *
0128  * 2) Image (mostly JPEG) loading.
0129  *
0130  *    This is a function of image characteristics and image processing
0131  *    libraries.  Sometimes it's possible to apply parameters to
0132  *    the underlying image loader to speed it up.  This shows up as user
0133  *    CPU time.  Usually the only way to improve this performance
0134  *    characteristic is to use more or faster CPU cores (sometimes GPUs
0135  *    can assist here) or use better image loading routines (better
0136  *    libraries).
0137  *
0138  *
0139  * DESCRIPTION OF KPHOTOALBUM IMAGE LOAD PROCESS
0140  * ----------- -- ----------- ----- ---- -------
0141  *
0142  * KPhotoAlbum, when it loads an image, performs three processing steps:
0143  *
0144  * 1) Compute the MD5 checksum
0145  *
0146  * 2) Extract the Exif metadata
0147  *
0148  * 3) Generate a thumbnail
0149  *
0150  * Previous to this round of performance tuning, the first two steps
0151  * were performed in the first pass, and thumbnails were generated in
0152  * a separate pass.  Assuming that the set of new images is large enough
0153  * that they cannot all fit in RAM buffers, this results in the I/O
0154  * being performed twice.  The rewrite results in I/O being performed once.
0155  *
0156  * In addition, I have made many other changes:
0157  *
0158  * 1) Prior to the MD5 calculation step, a new thread, called a "scout
0159  *    thread", reads the files into memory.  While this memory is not
0160  *    directly used in the later computations, it results in the images
0161  *    being in RAM when they are later needed, making the I/O very fast
0162  *    (copying data in memory rather than reading it from storage).
0163  *
0164  *    This is a way to overlap I/O with computation.
0165  *
0166  * 2) The MD5 checksum uses its own I/O to read the data in in larger
0167  *    chunks than the Qt MD5 routine does.  The Qt routine reads it in
0168  *    in 4KiB chunks; my experimentation has found that 256KiB chunks
0169  *    are more efficient, even with a scout thread (it reduces the
0170  *    number of system calls).
0171  *
0172  * 3) When searching for other images to stack with the image being
0173  *    loaded, the new image loader no longer attempts to determine
0174  *    whether other candidate filenames are present, nor does it
0175  *    compute the MD5 checksum of any such files it does find.  Rather,
0176  *    it only checks for files that are already in KPhotoAlbum, either
0177  *    previously or as a result of the current load.  Merely checking
0178  *    for the presence of another file is not cheap, and it's not
0179  *    necessary; if an image will belong to a stack, we'll either know
0180  *    it now or when other images that can be stacked are loaded.
0181  *
0182  * 4) The Exif metadata extraction is now done only once; previously
0183  *    it was performed several times at different stages of the loading
0184  *    process.
0185  *
0186  * 5) The thumbnail index is now written out incrementally rather than
0187  *    the entire index (which can be many megabytes in a large image
0188  *    database) being rewritten frequently.  The index is fully rewritten
0189  *    prior to exit.
0190  *
0191  *
0192  * BASELINE PERFORMANCE
0193  * -------- -----------
0194  *
0195  * These measurements were all taken on a Lenovo ThinkPad P70 with 32
0196  * GB of dual-channel DDR4-2400 DRAM, a Xeon E3-1505M CPU (4 cores/8
0197  * total hyperthreads, 2.8-3.7 GHz Skylake; usually runs around
0198  * 3.1-3.2 GHz in practice), a Seagate ST2000LM015-2E8174 2TB HDD, and
0199  * a Crucial MX300 1TB SATA SSD.  Published numbers and measurements I
0200  * took otherwise indicate that the HDD can handle about 105-110
0201  * MB/sec with a maximum of 180 IO/sec (in a favorable case).  The SSD
0202  * is rated to handle 530 MB/sec read, 510 MB/sec write, 92K random
0203  * reads/sec, and 83K random writes/sec.
0204  *
0205  * The image set I used for all measurements, except as noted,
0206  * consists of 10839 total files of which about 85% are 20 MP JPEG and
0207  * the remainder (with a few exceptions) are 20 MP RAW files from a
0208  * Canon EOS 7D mkII camera.  The total dataset is about 92 GB in
0209  * size.
0210  *
0211  * I baselined both drives by reading the same dataset by means of
0212  *
0213  * % ls | xargs cat | dd bs=1048576 of=/dev/null
0214  *
0215  * The HDD required between 850 and 870 seconds (14'10" to 14'30") to
0216  * perform this operation, yielding about 105-108 MB/sec.  The SSD
0217  * achieved about 271 MB/sec, which is well under its rated throughput
0218  * (hdparm -Tt yields 355 MB/sec, which is likewise nowhere close to
0219  * its rated throughput).  hdparm -Tt on the HDD yields about 120
0220  * MB/sec, but throughput to an HDD depends upon which part of the
0221  * disk is being read.  The outer tracks have a greater angular
0222  * density to achieve the same linear density (in other words, the
0223  * circumference of an outer track is longer than that of an inner
0224  * track, and the data is stored at a constant linear density).  So
0225  * hdparm isn't very useful on an HDD except as a best case.
0226  *
0227  * Note also that hdparm does a single stream read from the device.
0228  * It does not take advantage of the ability to queue multiple
0229  * requests.
0230  *
0231  *
0232  * ANALYSIS OF KPHOTOALBUM LOAD PERFORMANCE
0233  * -------- -- ----------- ---- -----------
0234  *
0235  * I analyzed the following cases, with images stored both on the
0236  * HDD and the SSD:
0237  *
0238  * A) Images loaded (All, JPEG only, RAW only)
0239  *
0240  * B) Thumbnail creation (Including, Excluding)
0241  *
0242  * C) Scout threads (0, 1, 2, 3)
0243  *
0244  * The JPG image set constitutes 9293 images totaling about 55 GB.  The
0245  *   JPEG files are mostly 20 MP high quality files, in the range of
0246  *   6-10 MB.
0247  * The RAW image set constitutes 1544 images totaling about 37 GB.  The
0248  *   RAW files are 20 MP files, in the range of 25 MB.
0249  * The ALL set consists of 10839 or 10840 images totaling about 92 GB
0250  *   (the above set plus 2 .MOV files and in some cases one additional
0251  *   JPEG file).
0252  *
0253  * Times are elapsed times; CPU consumption is approximate user+system
0254  * CPU consumption.  Numbers in parentheses are with thumbnail
0255  * building disabled.  Note that in the cases with no scout threads on
0256  * the SSD the times were reproducibly shorter with thumbnail building
0257  * enabled (reasons are not determined at this time).
0258  *
0259  * Cases building RAW thumbnails generally consumed somewhat more
0260  * system CPU (in the range of 10-15%) than JPEG-only cases.  This may
0261  * be due to custom I/O routines used for generating thumbnails with
0262  * JPEG files; RAW files used the I/O provided by libkdcraw, which
0263  * uses smaller I/O operations.
0264  *
0265  * Estimating CPU time for mixed workloads proved very problematic,
0266  * as there were significant changes over time.
0267  *
0268  * Elapsed Time
0269  * ------- ----
0270  *
0271  *                                 SSD                     HDD
0272  *
0273  * JPG - 0 scouts                  4:03 (3:59)
0274  * JPG - 1 scout                   2:46 (2:44)
0275  * JPG - 2 scouts                  2:20 (2:07)
0276  * JPG - 3 scouts                  2:21 (1:58)
0277  *
0278  * ALL - 0 scouts                  6:32 (7:03)            16:01
0279  * ALL - 1 scout                   4:33 (4:33)            15:01
0280  * ALL - 2 scouts                  3:37 (3:28)            16:59
0281  * ALL - 3 scouts                  3:36 (3:15)
0282  *
0283  * RAW - 0 scouts                  2:18 (2:46)
0284  * RAW - 1 scout                   1:46 (1:46)
0285  * RAW - 2 scouts                  1:17 (1:17)
0286  * RAW - 3 scouts                  1:13 (1:13)
0287  *
0288  * User+System CPU
0289  * ----------- ---
0290  *
0291  *                                 SSD                     HDD
0292  *
0293  * JPG - 0 scouts                  40% (12%)
0294  * JPG - 1 scout                   70% (20%)
0295  * JPG - 2 scouts                  85% (15%)
0296  * JPG - 3 scouts                  85% (15%)
0297  *
0298  * RAW - 0 scouts                  15% (10%)
0299  * RAW - 1 scout                   18% (12%)
0300  * RAW - 2 scouts                  25% (15%)
0301  * RAW - 3 scouts                  25% (15%)
0302  *
0303  * I also used kcachegrind to measure CPU consumption on smaller
0304  * subsets of images (with and without thumbnail creation).  In terms
0305  * of user CPU consumption, thumbnail creation constitutes the large
0306  * majority of CPU cycles for processing JPEG files, followed by MD5
0307  * computation, with Exif parsing lagging far behind.  For RAW files,
0308  * MD5 computation consumes more cycles, likely in part due to the
0309  * larger size of RAW files but possibly also related to the smaller
0310  * filesize of embedded thumbnails (on the Canon 7D mkII, the embedded
0311  * thumbnail is full size but low quality).
0312  *
0313  * With thumbnail generation:
0314  * ---- --------- -----------
0315  *
0316  *                                 RAW             JPEG
0317  *
0318  * Thumbnail generation            44%             82%
0319  *   libjpeg processing              43%             82%
0320  * MD5 computation                 51%             13%
0321  * Read Exif                        1%              1.0%
0322  *
0323  * Without thumbnail generation:
0324  * ------- --------- -----------
0325  *
0326  *                                 RAW             JPEG
0327  *
0328  * MD5 computation                 92%             80%
0329  * Read Exif                        4%             10%
0330  *
0331  *
0332  * CONCLUSIONS
0333  * -----------
0334  *
0335  * For loading files from hard disk (likely the most common case),
0336  * there's no reason to consider any loading method other than using a
0337  * single scout thread and computing thumbnails concurrently.  Even
0338  * with thumbnail computation, there is very little CPU utilization.
0339  *
0340  * Loading from SATA SSD benefits from two scout threads, and possibly
0341  * more.  For minimal time to regain control, there is some benefit
0342  * seen from separating thumbnail generation from the rest of the
0343  * processing stages at the cost of more total elapsed time.  This is
0344  * more evident with JPEG files than with RAW files in this test case.
0345  * RAW files typically have smaller thumbnail images which can be
0346  * extracted and processed more quickly than full-size JPEG files.  On
0347  * a slower CPU, it may be desirable to return control to the user
0348  * even if the thumbnails are not built yet.
0349  *
0350  * Two other cases would be NVMe (or other very fast) SSDs and network
0351  * storage.  Since we're seeing evidence of CPU saturation on SATA
0352  * SSDs, we would likely see this even more strongly with NVMe; with
0353  * large numbers of images it may be desirable to separate the
0354  * thumbnail building from the rest of the processing.  It may also be
0355  * beneficial to use more scout threads.
0356  *
0357  * Network storage presents a different problem.  It is likely to have
0358  * lower throughput -- and certainly much higher latency -- than even
0359  * HDD, unless the underlying storage medium is SSD and the data is
0360  * located on a very fast, low latency network.  So there would be no
0361  * benefit to separating thumbnail processing.  However, due to
0362  * protocol vs. media latency discussed above, it may well work to use
0363  * more scout threads.  However, this may saturate the network and the
0364  * storage, to the detriment of other users, and there's probably no
0365  * general (or easily discoverable) optimum for this.
0366  *
0367  * It's my judgment that most images will be stored on HDDs for at
0368  * least the next few years, so tuning for that use case is probably
0369  * the best single choice to be made.
0370  *
0371  *****************************************************************/
0372 
0373 namespace
0374 {
0375 
0376 bool canReadImage(const DB::FileName &fileName)
0377 {
0378     bool fastMode = !Settings::SettingsData::instance()->ignoreFileExtension();
0379     QMimeDatabase::MatchMode mode = fastMode ? QMimeDatabase::MatchExtension : QMimeDatabase::MatchDefault;
0380     QMimeDatabase db;
0381     QMimeType mimeType = db.mimeTypeForFile(fileName.absolute(), mode);
0382 
0383     return QImageReader::supportedMimeTypes().contains(mimeType.name().toUtf8())
0384         || ImageManager::ImageDecoder::mightDecode(fileName);
0385 }
0386 }
0387 
0388 QMutex NewImageFinder::s_imageFinderLock;
0389 
0390 bool NewImageFinder::findImages()
0391 {
0392     using namespace std::chrono_literals;
0393     if (!s_imageFinderLock.try_lock_for(500ms)) {
0394         qCInfo(DBLog) << "NewImageFinder::findImages() called while searching for new images. Try again later...";
0395         return false;
0396     }
0397     // Load the information from the XML file.
0398     DB::FileNameSet loadedFiles;
0399 
0400     QElapsedTimer timer;
0401 
0402     timer.start();
0403     // TODO: maybe the database interface should allow to query if it
0404     // knows about an image ? Here we've to iterate through all of them and it
0405     // might be more efficient do do this in the database without fetching the
0406     // whole info.
0407     const auto knownFiles = DB::ImageDB::instance()->files();
0408     for (const DB::FileName &fileName : knownFiles) {
0409         loadedFiles.insert(fileName);
0410     }
0411 
0412     m_pendingLoad.clear();
0413     searchForNewFiles(loadedFiles, Settings::SettingsData::instance()->imageDirectory());
0414     int filesToLoad = m_pendingLoad.count();
0415     loadExtraFiles();
0416 
0417     qCDebug(TimingLog) << "Loaded " << filesToLoad << " images in " << timer.elapsed() / 1000.0 << " seconds";
0418 
0419     // Man this is not super optimal, but will be changed onces the image finder moves to become a background task.
0420     if (MainWindow::FeatureDialog::hasVideoThumbnailer()) {
0421         BackgroundTaskManager::JobManager::instance()->addJob(
0422             new BackgroundJobs::SearchForVideosWithoutVideoThumbnailsJob);
0423     }
0424 
0425     s_imageFinderLock.unlock();
0426     // To avoid deciding if the new images are shown in a given thumbnail view or in a given search
0427     // we rather just go to home.
0428     return (!m_pendingLoad.isEmpty()); // returns if new images was found.
0429 }
0430 
0431 void NewImageFinder::searchForNewFiles(const DB::FileNameSet &loadedFiles, QString directory)
0432 {
0433     qApp->processEvents(QEventLoop::AllEvents);
0434     directory = Utilities::stripEndingForwardSlash(directory);
0435 
0436     qCDebug(DBFileOpsLog) << "searching for new files in" << directory;
0437     FastDir dir(directory);
0438     const QStringList dirList = dir.entryList();
0439     ImageManager::RAWImageDecoder rawDec;
0440     QStringList excluded;
0441     excluded << Settings::SettingsData::instance()->excludeDirectories();
0442     excluded = excluded.at(0).split(QString::fromLatin1(","));
0443 
0444     bool skipSymlinks = Settings::SettingsData::instance()->skipSymlinks();
0445 
0446     // Keep files within a directory more local by processing all files within the
0447     // directory, and then all subdirectories.
0448     QStringList subdirList;
0449 
0450     for (QStringList::const_iterator it = dirList.constBegin(); it != dirList.constEnd(); ++it) {
0451         const DB::FileName file = DB::FileName::fromAbsolutePath(directory + QString::fromLatin1("/") + *it);
0452         if ((*it) == QString::fromLatin1(".") || (*it) == QString::fromLatin1("..")
0453             || excluded.contains((*it)) || loadedFiles.contains(file)
0454             || KPABase::fileCanBeSkipped(loadedFiles, file)
0455             || (*it) == QString::fromLatin1("CategoryImages"))
0456             continue;
0457 
0458         QFileInfo fi(file.absolute());
0459 
0460         if (!fi.isReadable())
0461             continue;
0462         if (skipSymlinks && fi.isSymLink())
0463             continue;
0464 
0465         if (fi.isFile()) {
0466             if (!DB::ImageDB::instance()->isBlocking(file)) {
0467                 if (canReadImage(file)) {
0468                     qCDebug(DBFileOpsLog) << "Found new image:" << file.relative();
0469                     m_pendingLoad.append(qMakePair(file, DB::Image));
0470                 } else if (KPABase::isVideo(file)) {
0471                     qCDebug(DBFileOpsLog) << "Found new video:" << file.relative();
0472                     m_pendingLoad.append(qMakePair(file, DB::Video));
0473                 }
0474             }
0475         } else if (fi.isDir()) {
0476             subdirList.append(file.absolute());
0477         }
0478     }
0479     for (QStringList::const_iterator it = subdirList.constBegin(); it != subdirList.constEnd(); ++it)
0480         searchForNewFiles(loadedFiles, *it);
0481 }
0482 
0483 void NewImageFinder::loadExtraFiles()
0484 {
0485     // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
0486     QProgressDialog dialog;
0487     QElapsedTimer timeSinceProgressUpdate;
0488     dialog.setLabelText(i18n("<p><b>Loading information from new files</b></p>"
0489                              "<p>Depending on the number of images, this may take some time.<br/>"
0490                              "However, there is only a delay when new images are found.</p>"));
0491     QProgressBar *progressBar = new QProgressBar;
0492     progressBar->setFormat(QLatin1String("%v/%m"));
0493     dialog.setBar(progressBar);
0494     dialog.setMaximum(m_pendingLoad.count());
0495     dialog.setMinimumDuration(1000);
0496     QAtomicInt loadedCount = 0;
0497 
0498     setupFileVersionDetection();
0499 
0500     int count = 0;
0501 
0502     MD5::resetMD5Cache();
0503     ImageScoutQueue asyncPreloadQueue;
0504     for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it) {
0505         asyncPreloadQueue.enqueue((*it).first);
0506     }
0507 
0508     ImageScout scout(asyncPreloadQueue, loadedCount, Settings::SettingsData::instance()->getPreloadThreadCount());
0509     if (Settings::SettingsData::instance()->getOverlapLoadMD5())
0510         scout.setPreloadFunc(DB::PreloadMD5Sum);
0511     scout.start();
0512 
0513     DB::ImageDB::instance()->exifDB()->startInsertTransaction();
0514     dialog.setValue(count); // ensure to call setProgress(0)
0515     timeSinceProgressUpdate.start();
0516     for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it, ++count) {
0517         qApp->processEvents(QEventLoop::AllEvents);
0518 
0519         if (dialog.wasCanceled()) {
0520             m_pendingLoad.clear();
0521             DB::ImageDB::instance()->exifDB()->abortInsertTransaction();
0522             return;
0523         }
0524         // (*it).first: DB::FileName
0525         // (*it).second: DB::MediaType
0526         loadExtraFile((*it).first, (*it).second);
0527         loadedCount++; // Atomic
0528         if (timeSinceProgressUpdate.elapsed() >= 1000) {
0529             dialog.setValue(count);
0530             timeSinceProgressUpdate.restart();
0531         }
0532     }
0533     dialog.setValue(count);
0534     // loadExtraFile() has already inserted all images into the
0535     // database, but without committing the changes
0536     DB::ImageDB::instance()->commitDelayedImages();
0537     DB::ImageDB::instance()->exifDB()->commitInsertTransaction();
0538 
0539     ImageManager::ThumbnailBuilder::instance()->save();
0540 }
0541 
0542 void NewImageFinder::setupFileVersionDetection()
0543 {
0544     // should be cached because loading once per image is expensive
0545     m_modifiedFileCompString = Settings::SettingsData::instance()->modifiedFileComponent();
0546     m_modifiedFileComponent = QRegExp(m_modifiedFileCompString);
0547 
0548     m_originalFileComponents << Settings::SettingsData::instance()->originalFileComponent();
0549     m_originalFileComponents = m_originalFileComponents.at(0).split(QString::fromLatin1(";"));
0550 }
0551 
0552 void NewImageFinder::loadExtraFile(const DB::FileName &newFileName, DB::MediaType type)
0553 {
0554     qCDebug(DBFileOpsLog) << "loadExtraFile(" << newFileName.relative() << ")";
0555     MD5 sum = MD5Sum(newFileName);
0556     if (handleIfImageHasBeenMoved(newFileName, sum))
0557         return;
0558 
0559     // check to see if this is a new version of a previous image
0560     // We'll get the Exif data later, when we get the MD5 checksum.
0561     ImageInfoPtr info = ImageInfoPtr(new ImageInfo(newFileName, type, DB::FileInformation::Ignore));
0562     ImageInfoPtr originalInfo;
0563     DB::FileName originalFileName;
0564 
0565     if (Settings::SettingsData::instance()->detectModifiedFiles()) {
0566         // requires at least *something* in the modifiedFileComponent
0567         if (m_modifiedFileCompString.length() >= 0 && newFileName.relative().contains(m_modifiedFileComponent)) {
0568 
0569             for (QStringList::const_iterator it = m_originalFileComponents.constBegin();
0570                  it != m_originalFileComponents.constEnd(); ++it) {
0571                 QString tmp = newFileName.relative();
0572                 tmp.replace(m_modifiedFileComponent, (*it));
0573                 originalFileName = DB::FileName::fromRelativePath(tmp);
0574 
0575                 MD5 originalSum;
0576                 if (newFileName == originalFileName)
0577                     originalSum = sum;
0578                 else if (DB::ImageDB::instance()->md5Map()->containsFile(originalFileName))
0579                     originalSum = DB::ImageDB::instance()->md5Map()->lookupFile(originalFileName);
0580                 else
0581                     // Do *not* attempt to compute the checksum here.  It forces a filesystem
0582                     // lookup on a file that may not exist and substantially degrades
0583                     // performance by about 25% on an SSD and about 30% on a spinning disk.
0584                     // If one of these other files exist, it will be found later in
0585                     // the image search at which point we'll detect the modified file.
0586                     continue;
0587                 if (DB::ImageDB::instance()->md5Map()->contains(originalSum)) {
0588                     // we have a previous copy of this file; copy it's data
0589                     // from the original.
0590                     originalInfo = DB::ImageDB::instance()->info(originalFileName);
0591                     if (!originalInfo) {
0592                         qCDebug(DBLog) << "Original info not found by name for " << originalFileName.absolute() << ", trying by MD5 sum.";
0593                         originalFileName = DB::ImageDB::instance()->md5Map()->lookup(originalSum);
0594 
0595                         if (!originalFileName.isNull()) {
0596                             qCDebug(DBLog) << "Substitute image " << originalFileName.absolute() << " found.";
0597                             originalInfo = DB::ImageDB::instance()->info(originalFileName);
0598                         }
0599 
0600                         if (!originalInfo) {
0601                             qCWarning(DBLog, "How did that happen? We couldn't find info for the original image %s; can't copy the original data to %s",
0602                                       qPrintable(originalFileName.absolute()), qPrintable(newFileName.absolute()));
0603                             continue;
0604                         }
0605                     }
0606                     info->copyExtraData(*originalInfo);
0607 
0608                     /* if requested to move, then delete old data from original */
0609                     if (Settings::SettingsData::instance()->moveOriginalContents()) {
0610                         originalInfo->removeExtraData();
0611                     }
0612 
0613                     break;
0614                 }
0615             }
0616         }
0617     }
0618     ImageInfoList newImages;
0619     newImages.append(info);
0620     DB::ImageDB::instance()->addImages(newImages, false);
0621 
0622     // also inserts image into exif db if present:
0623     info->setMD5Sum(sum);
0624     DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
0625 
0626     if (originalInfo && Settings::SettingsData::instance()->autoStackNewFiles()) {
0627 
0628         // stack the files together
0629         DB::FileName olderfile = originalFileName;
0630         DB::FileName newerfile = info->fileName();
0631         DB::FileNameList tostack;
0632 
0633         // the newest file should go to the top of the stack
0634         tostack.append(newerfile);
0635 
0636         DB::FileNameList oldStack;
0637         if ((oldStack = DB::ImageDB::instance()->getStackFor(olderfile)).isEmpty()) {
0638             tostack.append(olderfile);
0639         } else {
0640             for (const DB::FileName &tmp : oldStack) {
0641                 tostack.append(tmp);
0642             }
0643         }
0644         DB::ImageDB::instance()->stack(tostack);
0645         MainWindow::Window::theMainWindow()->setStackHead(newerfile);
0646 
0647         // ordering: XXX we ideally want to place the new image right
0648         // after the older one in the list.
0649     }
0650 
0651     markUnTagged(info);
0652     ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
0653     if (info->isVideo() && MainWindow::FeatureDialog::hasVideoThumbnailer()) {
0654         // needs to be done *after* insertion into database
0655         BackgroundTaskManager::JobManager::instance()->addJob(
0656             new BackgroundJobs::ReadVideoLengthJob(info->fileName(), BackgroundTaskManager::BackgroundVideoPreviewRequest));
0657     }
0658 }
0659 
0660 bool NewImageFinder::handleIfImageHasBeenMoved(const FileName &newFileName, const MD5 &sum)
0661 {
0662     if (DB::ImageDB::instance()->md5Map()->contains(sum)) {
0663         const DB::FileName matchedFileName = DB::ImageDB::instance()->md5Map()->lookup(sum);
0664         QFileInfo fi(matchedFileName.absolute());
0665 
0666         if (!fi.exists()) {
0667             // The file we had a collapse with didn't exists anymore so it is likely moved to this new name
0668             ImageInfoPtr info = DB::ImageDB::instance()->info(matchedFileName);
0669             if (!info)
0670                 qCWarning(DBLog, "How did that happen? We couldn't find info for the images %s", qPrintable(matchedFileName.relative()));
0671             else {
0672                 fi = QFileInfo(matchedFileName.relative());
0673                 if (info->label() == fi.completeBaseName()) {
0674                     fi = QFileInfo(newFileName.absolute());
0675                     info->setLabel(fi.completeBaseName());
0676                 }
0677 
0678                 DB::ImageDB::instance()->renameImage(info, newFileName);
0679 
0680                 // We need to insert the new name into the MD5 map,
0681                 // as it is a map, the value for the moved file will automatically be deleted.
0682 
0683                 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName());
0684 
0685                 DB::ImageDB::instance()->exifDB()->remove(matchedFileName);
0686                 DB::ImageDB::instance()->exifDB()->add(newFileName);
0687                 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info);
0688                 return true;
0689             }
0690         }
0691     }
0692     return false; // The image wasn't just moved
0693 }
0694 
0695 bool NewImageFinder::calculateMD5sums(
0696     const DB::FileNameList &list,
0697     DB::MD5Map *md5Map,
0698     bool *wasCanceled)
0699 {
0700     // FIXME: should be converted to a threadpool for SMP stuff and whatnot :]
0701     QProgressDialog dialog;
0702     dialog.setLabelText(
0703         i18np("<p><b>Calculating checksum for %1 file</b></p>", "<p><b>Calculating checksums for %1 files</b></p>", list.size())
0704         + i18n("<p>By storing a checksum for each image "
0705                "KPhotoAlbum is capable of finding images "
0706                "even when you have moved them on the disk.</p>"));
0707     dialog.setMaximum(list.size());
0708     dialog.setMinimumDuration(1000);
0709 
0710     int count = 0;
0711     DB::FileNameList cantRead;
0712     bool dirty = false;
0713 
0714     for (const FileName &fileName : list) {
0715         if (count % 10 == 0) {
0716             dialog.setValue(count); // ensure to call setProgress(0)
0717             qApp->processEvents(QEventLoop::AllEvents);
0718 
0719             if (dialog.wasCanceled()) {
0720                 if (wasCanceled)
0721                     *wasCanceled = true;
0722                 return dirty;
0723             }
0724         }
0725 
0726         MD5 md5 = MD5Sum(fileName);
0727         if (md5.isNull()) {
0728             cantRead << fileName;
0729             continue;
0730         }
0731 
0732         ImageInfoPtr info = ImageDB::instance()->info(fileName);
0733         if (info->MD5Sum() != md5) {
0734             info->setMD5Sum(md5);
0735             dirty = true;
0736             MainWindow::Window::theMainWindow()->thumbnailCache()->removeThumbnail(fileName);
0737             BackgroundJobs::HandleVideoThumbnailRequestJob::removeFullScaleFrame(fileName);
0738         }
0739 
0740         md5Map->insert(md5, fileName);
0741 
0742         ++count;
0743     }
0744     if (wasCanceled)
0745         *wasCanceled = false;
0746 
0747     if (!cantRead.empty())
0748         KMessageBox::informationList(nullptr, i18n("Following files could not be read:"), cantRead.toStringList(DB::RelativeToImageRoot));
0749 
0750     return dirty;
0751 }
0752 
0753 void DB::NewImageFinder::markUnTagged(ImageInfoPtr info)
0754 {
0755     if (DB::ImageDB::instance()->untaggedCategoryFeatureConfigured()) {
0756         info->addCategoryInfo(Settings::SettingsData::instance()->untaggedCategory(),
0757                               Settings::SettingsData::instance()->untaggedTag());
0758     }
0759 }
0760 // vi:expandtab:tabstop=4 shiftwidth=4: