File indexing completed on 2024-04-28 15:39:59
0001 // SPDX-FileCopyrightText: 2005-2013 Jesper K. Pedersen <jesper.pedersen@kdab.com> 0002 // SPDX-FileCopyrightText: 2006-2010 Tuomas Suutari <tuomas@nepnep.net> 0003 // SPDX-FileCopyrightText: 2007 Dirk Mueller <mueller@kde.org> 0004 // SPDX-FileCopyrightText: 2007-2008 Laurent Montel <montel@kde.org> 0005 // SPDX-FileCopyrightText: 2007-2010 Jan Kundrát <jkt@flaska.net> 0006 // SPDX-FileCopyrightText: 2008-2009 Henner Zeller <h.zeller@acm.org> 0007 // SPDX-FileCopyrightText: 2010 Wes Hardaker <kpa@capturedonearth.com> 0008 // SPDX-FileCopyrightText: 2010-2012 Miika Turkia <miika.turkia@gmail.com> 0009 // SPDX-FileCopyrightText: 2011 Andreas Neustifter <andreas.neustifter@gmail.com> 0010 // SPDX-FileCopyrightText: 2012 Yuri Chornoivan <yurchor@ukr.net> 0011 // SPDX-FileCopyrightText: 2012-2023 Johannes Zarl-Zierl <johannes@zarl-zierl.at> 0012 // SPDX-FileCopyrightText: 2015 Tobias Leupold <tl@stonemx.de> 0013 // SPDX-FileCopyrightText: 2017-2019 Robert Krawitz <rlk@alum.mit.edu> 0014 // SPDX-FileCopyrightText: 2018 Antoni Bella Pérez <antonibella5@yahoo.com> 0015 // 0016 // SPDX-License-Identifier: GPL-2.0-or-later 0017 0018 #include "NewImageFinder.h" 0019 0020 #include "FastDir.h" 0021 #include "ImageDB.h" 0022 #include "ImageScout.h" 0023 #include "MD5Map.h" 0024 0025 #include <BackgroundJobs/HandleVideoThumbnailRequestJob.h> 0026 #include <BackgroundJobs/ReadVideoLengthJob.h> 0027 #include <BackgroundJobs/SearchForVideosWithoutVideoThumbnailsJob.h> 0028 #include <BackgroundTaskManager/JobManager.h> 0029 #include <ImageManager/RawImageDecoder.h> 0030 #include <ImageManager/ThumbnailBuilder.h> 0031 #include <MainWindow/FeatureDialog.h> 0032 #include <MainWindow/Window.h> 0033 #include <Utilities/FileUtil.h> 0034 #include <kpabase/FileExtensions.h> 0035 #include <kpabase/FileNameUtil.h> 0036 #include <kpabase/Logging.h> 0037 #include <kpabase/SettingsData.h> 0038 #include <kpaexif/Database.h> 0039 #include <kpathumbnails/ThumbnailCache.h> 0040 0041 #include 
<KLocalizedString> 0042 #include <KMessageBox> 0043 #include <QApplication> 0044 #include <QDataStream> 0045 #include <QElapsedTimer> 0046 #include <QEventLoop> 0047 #include <QFile> 0048 #include <QFileInfo> 0049 #include <QImageReader> 0050 #include <QLoggingCategory> 0051 #include <QMimeDatabase> 0052 #include <QProgressBar> 0053 #include <QProgressDialog> 0054 #include <QStringList> 0055 0056 using namespace DB; 0057 0058 /***************************************************************** 0059 * 0060 * NOTES ON PERFORMANCE 0061 * ===== == =========== 0062 * 0063 * - Robert Krawitz <rlk@alum.mit.edu> 2018-05-24 0064 * 0065 * 0066 * GENERAL NOTES ON STORAGE I/O 0067 * ------- ----- -- ------- --- 0068 * 0069 * The two main gates to loading new images are: 0070 * 0071 * 1) I/O (how fast can we read images off mass storage) 0072 * 0073 * Different I/O devices have different characteristics in terms of 0074 * throughput, media latency, and protocol latency. 0075 * 0076 * - Throughput is the raw speed at which data can be transferred, 0077 * limited by the physical and/or electronic characteristics of 0078 * the medium and the interface. Short of reducing the amount of 0079 * data that's transferred, or clever games with using the most 0080 * efficient part of the medium (the outer tracks only for HDD's, 0081 * a practice referred to as "short stroking" because it reduces 0082 * the distance the head has to seek, at the cost of wasting a 0083 * lot of capacity), there's nothing that can be done about this. 0084 * 0085 * - Media latency is the latency component due to characteristics 0086 * of the underlying storage medium. For spinning disks, this is 0087 * a function of rotational latency and seek latency. In some 0088 * cases, particularly with hard disks, it is possible to reduce 0089 * media latency by arranging to access the data in a way that 0090 * reduces seeking. See DB/FastDir.cpp for an example of this. 
0091 * 0092 * While media latency can sometimes be hidden by overlapping 0093 * I/O, it is generally not possible to avoid it. Sometimes trying too 0094 * hard can actually increase media latency if it results in I/O 0095 * operations competing against each other requiring additional 0096 * seeks. 0097 * 0098 * Overlapping I/O with computation is another matter; that can 0099 * easily yield benefit, especially if it eliminates rotational 0100 * latency. 0101 * 0102 * - Protocol latency. This refers to things like SATA overhead, 0103 * network overhead (for images stored on a network), and so 0104 * forth. This can encompass multiple things, and often they can 0105 * be pipelined by means of multiple queued I/O operations. For 0106 * example, multiple commands can be issued to modern interfaces 0107 * (SATA, NVMe) and many network interfaces without waiting for 0108 * earlier operations to return. 0109 * 0110 * If protocol latency is high compared with media latency, 0111 * having multiple requests outstanding simultaneously can 0112 * yield significant benefits. 0113 * 0114 * iostat is a valuable tool for investigating throughput and 0115 * looking for possible optimizations. The IO/sec and data 0116 * read/written per second when compared against known media 0117 * characteristics (disk and SSD throughput, network bandwidth) 0118 * provide valuable information about whether we're getting close 0119 * to full performance from the I/O, and user and system CPU time 0120 * give us additional clues about whether we're I/O-bound or 0121 * CPU-bound. 0122 * 0123 * Historically in the computer field, operations that require 0124 * relatively simple processing on large volumes of data are I/O 0125 * bound. But with very fast I/O devices such as NVMe SSDs, some 0126 * of which reach 3 GB/sec, that's not always the case. 0127 * 0128 * 2) Image (mostly JPEG) loading. 0129 * 0130 * This is a function of image characteristics and image processing 0131 * libraries. 
Sometimes it's possible to apply parameters to 0132 * the underlying image loader to speed it up. This shows up as user 0133 * CPU time. Usually the only way to improve this performance 0134 * characteristic is to use more or faster CPU cores (sometimes GPUs 0135 * can assist here) or use better image loading routines (better 0136 * libraries). 0137 * 0138 * 0139 * DESCRIPTION OF KPHOTOALBUM IMAGE LOAD PROCESS 0140 * ----------- -- ----------- ----- ---- ------- 0141 * 0142 * KPhotoAlbum, when it loads an image, performs three processing steps: 0143 * 0144 * 1) Compute the MD5 checksum 0145 * 0146 * 2) Extract the Exif metadata 0147 * 0148 * 3) Generate a thumbnail 0149 * 0150 * Previous to this round of performance tuning, the first two steps 0151 * were performed in the first pass, and thumbnails were generated in 0152 * a separate pass. Assuming that the set of new images is large enough 0153 * that they cannot all fit in RAM buffers, this results in the I/O 0154 * being performed twice. The rewrite results in I/O being performed once. 0155 * 0156 * In addition, I have made many other changes: 0157 * 0158 * 1) Prior to the MD5 calculation step, a new thread, called a "scout 0159 * thread", reads the files into memory. While this memory is not 0160 * directly used in the later computations, it results in the images 0161 * being in RAM when they are later needed, making the I/O very fast 0162 * (copying data in memory rather than reading it from storage). 0163 * 0164 * This is a way to overlap I/O with computation. 0165 * 0166 * 2) The MD5 checksum uses its own I/O to read the data in in larger 0167 * chunks than the Qt MD5 routine does. The Qt routine reads it in 0168 * in 4KiB chunks; my experimentation has found that 256KiB chunks 0169 * are more efficient, even with a scout thread (it reduces the 0170 * number of system calls). 
0171 * 0172 * 3) When searching for other images to stack with the image being 0173 * loaded, the new image loader no longer attempts to determine 0174 * whether other candidate filenames are present, nor does it 0175 * compute the MD5 checksum of any such files it does find. Rather, 0176 * it only checks for files that are already in KPhotoAlbum, either 0177 * previously or as a result of the current load. Merely checking 0178 * for the presence of another file is not cheap, and it's not 0179 * necessary; if an image will belong to a stack, we'll either know 0180 * it now or when other images that can be stacked are loaded. 0181 * 0182 * 4) The Exif metadata extraction is now done only once; previously 0183 * it was performed several times at different stages of the loading 0184 * process. 0185 * 0186 * 5) The thumbnail index is now written out incrementally rather than 0187 * the entire index (which can be many megabytes in a large image 0188 * database) being rewritten frequently. The index is fully rewritten 0189 * prior to exit. 0190 * 0191 * 0192 * BASELINE PERFORMANCE 0193 * -------- ----------- 0194 * 0195 * These measurements were all taken on a Lenovo ThinkPad P70 with 32 0196 * GB of dual-channel DDR4-2400 DRAM, a Xeon E3-1505M CPU (4 cores/8 0197 * total hyperthreads, 2.8-3.7 GHz Skylake; usually runs around 0198 * 3.1-3.2 GHz in practice), a Seagate ST2000LM015-2E8174 2TB HDD, and 0199 * a Crucial MX300 1TB SATA SSD. Published numbers and measurements I 0200 * took otherwise indicate that the HDD can handle about 105-110 0201 * MB/sec with a maximum of 180 IO/sec (in a favorable case). The SSD 0202 * is rated to handle 530 MB/sec read, 510 MB/sec write, 92K random 0203 * reads/sec, and 83K random writes/sec. 
0204 * 0205 * The image set I used for all measurements, except as noted, 0206 * consists of 10839 total files of which about 85% are 20 MP JPEG and 0207 * the remainder (with a few exceptions) are 20 MP RAW files from a 0208 * Canon EOS 7D mkII camera. The total dataset is about 92 GB in 0209 * size. 0210 * 0211 * I baselined both drives by reading the same dataset by means of 0212 * 0213 * % ls | xargs cat | dd bs=1048576 of=/dev/null 0214 * 0215 * The HDD required between 850 and 870 seconds (14'10" to 14'30") to 0216 * perform this operation, yielding about 105-108 MB/sec. The SSD 0217 * achieved about 271 MB/sec, which is well under its rated throughput 0218 * (hdparm -Tt yields 355 MB/sec, which is likewise nowhere close to 0219 * its rated throughput). hdparm -Tt on the HDD yields about 120 0220 * MB/sec, but throughput to an HDD depends upon which part of the 0221 * disk is being read. The outer tracks have a greater angular 0222 * density to achieve the same linear density (in other words, the 0223 * circumference of an outer track is longer than that of an inner 0224 * track, and the data is stored at a constant linear density). So 0225 * hdparm isn't very useful on an HDD except as a best case. 0226 * 0227 * Note also that hdparm does a single stream read from the device. 0228 * It does not take advantage of the ability to queue multiple 0229 * requests. 0230 * 0231 * 0232 * ANALYSIS OF KPHOTOALBUM LOAD PERFORMANCE 0233 * -------- -- ----------- ---- ----------- 0234 * 0235 * I analyzed the following cases, with images stored both on the 0236 * HDD and the SSD: 0237 * 0238 * A) Images loaded (All, JPEG only, RAW only) 0239 * 0240 * B) Thumbnail creation (Including, Excluding) 0241 * 0242 * C) Scout threads (0, 1, 2, 3) 0243 * 0244 * The JPG image set constitutes 9293 images totaling about 55 GB. The 0245 * JPEG files are mostly 20 MP high quality files, in the range of 0246 * 6-10 MB. 0247 * The RAW image set constitutes 1544 images totaling about 37 GB. 
The 0248 * RAW files are 20 MP files, in the range of 25 MB. 0249 * The ALL set consists of 10839 or 10840 images totaling about 92 GB 0250 * (the above set plus 2 .MOV files and in some cases one additional 0251 * JPEG file). 0252 * 0253 * Times are elapsed times; CPU consumption is approximate user+system 0254 * CPU consumption. Numbers in parentheses are with thumbnail 0255 * building disabled. Note that in the cases with no scout threads on 0256 * the SSD the times were reproducibly shorter with thumbnail building 0257 * enabled (reasons are not determined at this time). 0258 * 0259 * Cases building RAW thumbnails generally consumed somewhat more 0260 * system CPU (in the range of 10-15%) than JPEG-only cases. This may 0261 * be due to custom I/O routines used for generating thumbnails with 0262 * JPEG files; RAW files used the I/O provided by libkdcraw, which 0263 * uses smaller I/O operations. 0264 * 0265 * Estimating CPU time for mixed workloads proved very problematic, 0266 * as there were significant changes over time. 
0267 * 0268 * Elapsed Time 0269 * ------- ---- 0270 * 0271 * SSD HDD 0272 * 0273 * JPG - 0 scouts 4:03 (3:59) 0274 * JPG - 1 scout 2:46 (2:44) 0275 * JPG - 2 scouts 2:20 (2:07) 0276 * JPG - 3 scouts 2:21 (1:58) 0277 * 0278 * ALL - 0 scouts 6:32 (7:03) 16:01 0279 * ALL - 1 scout 4:33 (4:33) 15:01 0280 * ALL - 2 scouts 3:37 (3:28) 16:59 0281 * ALL - 3 scouts 3:36 (3:15) 0282 * 0283 * RAW - 0 scouts 2:18 (2:46) 0284 * RAW - 1 scout 1:46 (1:46) 0285 * RAW - 2 scouts 1:17 (1:17) 0286 * RAW - 3 scouts 1:13 (1:13) 0287 * 0288 * User+System CPU 0289 * ----------- --- 0290 * 0291 * SSD HDD 0292 * 0293 * JPG - 0 scouts 40% (12%) 0294 * JPG - 1 scout 70% (20%) 0295 * JPG - 2 scouts 85% (15%) 0296 * JPG - 3 scouts 85% (15%) 0297 * 0298 * RAW - 0 scouts 15% (10%) 0299 * RAW - 1 scout 18% (12%) 0300 * RAW - 2 scouts 25% (15%) 0301 * RAW - 3 scouts 25% (15%) 0302 * 0303 * I also used kcachegrind to measure CPU consumption on smaller 0304 * subsets of images (with and without thumbnail creation). In terms 0305 * of user CPU consumption, thumbnail creation constitutes the large 0306 * majority of CPU cycles for processing JPEG files, followed by MD5 0307 * computation, with Exif parsing lagging far behind. For RAW files, 0308 * MD5 computation consumes more cycles, likely in part due to the 0309 * larger size of RAW files but possibly also related to the smaller 0310 * filesize of embedded thumbnails (on the Canon 7D mkII, the embedded 0311 * thumbnail is full size but low quality). 
0312 * 0313 * With thumbnail generation: 0314 * ---- --------- ----------- 0315 * 0316 * RAW JPEG 0317 * 0318 * Thumbnail generation 44% 82% 0319 * libjpeg processing 43% 82% 0320 * MD5 computation 51% 13% 0321 * Read Exif 1% 1.0% 0322 * 0323 * Without thumbnail generation: 0324 * ------- --------- ----------- 0325 * 0326 * RAW JPEG 0327 * 0328 * MD5 computation 92% 80% 0329 * Read Exif 4% 10% 0330 * 0331 * 0332 * CONCLUSIONS 0333 * ----------- 0334 * 0335 * For loading files from hard disk (likely the most common case), 0336 * there's no reason to consider any loading method other than using a 0337 * single scout thread and computing thumbnails concurrently. Even 0338 * with thumbnail computation, there is very little CPU utilization. 0339 * 0340 * Loading from SATA SSD benefits from two scout threads, and possibly 0341 * more. For minimal time to regain control, there is some benefit 0342 * seen from separating thumbnail generation from the rest of the 0343 * processing stages at the cost of more total elapsed time. This is 0344 * more evident with JPEG files than with RAW files in this test case. 0345 * RAW files typically have smaller thumbnail images which can be 0346 * extracted and processed more quickly than full-size JPEG files. On 0347 * a slower CPU, it may be desirable to return control to the user 0348 * even if the thumbnails are not built yet. 0349 * 0350 * Two other cases would be NVMe (or other very fast) SSDs and network 0351 * storage. Since we're seeing evidence of CPU saturation on SATA 0352 * SSDs, we would likely see this even more strongly with NVMe; with 0353 * large numbers of images it may be desirable to separate the 0354 * thumbnail building from the rest of the processing. It may also be 0355 * beneficial to use more scout threads. 0356 * 0357 * Network storage presents a different problem. 
It is likely to have 0358 * lower throughput -- and certainly much higher latency -- than even 0359 * HDD, unless the underlying storage medium is SSD and the data is 0360 * located on a very fast, low latency network. So there would be no 0361 * benefit to separating thumbnail processing. However, due to 0362 * protocol vs. media latency discussed above, it may well work to use 0363 * more scout threads. However, this may saturate the network and the 0364 * storage, to the detriment of other users, and there's probably no 0365 * general (or easily discoverable) optimum for this. 0366 * 0367 * It's my judgment that most images will be stored on HDDs for at 0368 * least the next few years, so tuning for that use case is probably 0369 * the best single choice to be made. 0370 * 0371 *****************************************************************/ 0372 0373 namespace 0374 { 0375 0376 bool canReadImage(const DB::FileName &fileName) 0377 { 0378 bool fastMode = !Settings::SettingsData::instance()->ignoreFileExtension(); 0379 QMimeDatabase::MatchMode mode = fastMode ? QMimeDatabase::MatchExtension : QMimeDatabase::MatchDefault; 0380 QMimeDatabase db; 0381 QMimeType mimeType = db.mimeTypeForFile(fileName.absolute(), mode); 0382 0383 return QImageReader::supportedMimeTypes().contains(mimeType.name().toUtf8()) 0384 || ImageManager::ImageDecoder::mightDecode(fileName); 0385 } 0386 } 0387 0388 QMutex NewImageFinder::s_imageFinderLock; 0389 0390 bool NewImageFinder::findImages() 0391 { 0392 using namespace std::chrono_literals; 0393 if (!s_imageFinderLock.try_lock_for(500ms)) { 0394 qCInfo(DBLog) << "NewImageFinder::findImages() called while searching for new images. Try again later..."; 0395 return false; 0396 } 0397 // Load the information from the XML file. 0398 DB::FileNameSet loadedFiles; 0399 0400 QElapsedTimer timer; 0401 0402 timer.start(); 0403 // TODO: maybe the database interface should allow to query if it 0404 // knows about an image ? 
Here we've to iterate through all of them and it 0405 // might be more efficient do do this in the database without fetching the 0406 // whole info. 0407 const auto knownFiles = DB::ImageDB::instance()->files(); 0408 for (const DB::FileName &fileName : knownFiles) { 0409 loadedFiles.insert(fileName); 0410 } 0411 0412 m_pendingLoad.clear(); 0413 searchForNewFiles(loadedFiles, Settings::SettingsData::instance()->imageDirectory()); 0414 int filesToLoad = m_pendingLoad.count(); 0415 loadExtraFiles(); 0416 0417 qCDebug(TimingLog) << "Loaded " << filesToLoad << " images in " << timer.elapsed() / 1000.0 << " seconds"; 0418 0419 // Man this is not super optimal, but will be changed onces the image finder moves to become a background task. 0420 if (MainWindow::FeatureDialog::hasVideoThumbnailer()) { 0421 BackgroundTaskManager::JobManager::instance()->addJob( 0422 new BackgroundJobs::SearchForVideosWithoutVideoThumbnailsJob); 0423 } 0424 0425 s_imageFinderLock.unlock(); 0426 // To avoid deciding if the new images are shown in a given thumbnail view or in a given search 0427 // we rather just go to home. 0428 return (!m_pendingLoad.isEmpty()); // returns if new images was found. 
0429 } 0430 0431 void NewImageFinder::searchForNewFiles(const DB::FileNameSet &loadedFiles, QString directory) 0432 { 0433 qApp->processEvents(QEventLoop::AllEvents); 0434 directory = Utilities::stripEndingForwardSlash(directory); 0435 0436 qCDebug(DBFileOpsLog) << "searching for new files in" << directory; 0437 FastDir dir(directory); 0438 const QStringList dirList = dir.entryList(); 0439 ImageManager::RAWImageDecoder rawDec; 0440 QStringList excluded; 0441 excluded << Settings::SettingsData::instance()->excludeDirectories(); 0442 excluded = excluded.at(0).split(QString::fromLatin1(",")); 0443 0444 bool skipSymlinks = Settings::SettingsData::instance()->skipSymlinks(); 0445 0446 // Keep files within a directory more local by processing all files within the 0447 // directory, and then all subdirectories. 0448 QStringList subdirList; 0449 0450 for (QStringList::const_iterator it = dirList.constBegin(); it != dirList.constEnd(); ++it) { 0451 const DB::FileName file = DB::FileName::fromAbsolutePath(directory + QString::fromLatin1("/") + *it); 0452 if ((*it) == QString::fromLatin1(".") || (*it) == QString::fromLatin1("..") 0453 || excluded.contains((*it)) || loadedFiles.contains(file) 0454 || KPABase::fileCanBeSkipped(loadedFiles, file) 0455 || (*it) == QString::fromLatin1("CategoryImages")) 0456 continue; 0457 0458 QFileInfo fi(file.absolute()); 0459 0460 if (!fi.isReadable()) 0461 continue; 0462 if (skipSymlinks && fi.isSymLink()) 0463 continue; 0464 0465 if (fi.isFile()) { 0466 if (!DB::ImageDB::instance()->isBlocking(file)) { 0467 if (canReadImage(file)) { 0468 qCDebug(DBFileOpsLog) << "Found new image:" << file.relative(); 0469 m_pendingLoad.append(qMakePair(file, DB::Image)); 0470 } else if (KPABase::isVideo(file)) { 0471 qCDebug(DBFileOpsLog) << "Found new video:" << file.relative(); 0472 m_pendingLoad.append(qMakePair(file, DB::Video)); 0473 } 0474 } 0475 } else if (fi.isDir()) { 0476 subdirList.append(file.absolute()); 0477 } 0478 } 0479 for 
(QStringList::const_iterator it = subdirList.constBegin(); it != subdirList.constEnd(); ++it) 0480 searchForNewFiles(loadedFiles, *it); 0481 } 0482 0483 void NewImageFinder::loadExtraFiles() 0484 { 0485 // FIXME: should be converted to a threadpool for SMP stuff and whatnot :] 0486 QProgressDialog dialog; 0487 QElapsedTimer timeSinceProgressUpdate; 0488 dialog.setLabelText(i18n("<p><b>Loading information from new files</b></p>" 0489 "<p>Depending on the number of images, this may take some time.<br/>" 0490 "However, there is only a delay when new images are found.</p>")); 0491 QProgressBar *progressBar = new QProgressBar; 0492 progressBar->setFormat(QLatin1String("%v/%m")); 0493 dialog.setBar(progressBar); 0494 dialog.setMaximum(m_pendingLoad.count()); 0495 dialog.setMinimumDuration(1000); 0496 QAtomicInt loadedCount = 0; 0497 0498 setupFileVersionDetection(); 0499 0500 int count = 0; 0501 0502 MD5::resetMD5Cache(); 0503 ImageScoutQueue asyncPreloadQueue; 0504 for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it) { 0505 asyncPreloadQueue.enqueue((*it).first); 0506 } 0507 0508 ImageScout scout(asyncPreloadQueue, loadedCount, Settings::SettingsData::instance()->getPreloadThreadCount()); 0509 if (Settings::SettingsData::instance()->getOverlapLoadMD5()) 0510 scout.setPreloadFunc(DB::PreloadMD5Sum); 0511 scout.start(); 0512 0513 DB::ImageDB::instance()->exifDB()->startInsertTransaction(); 0514 dialog.setValue(count); // ensure to call setProgress(0) 0515 timeSinceProgressUpdate.start(); 0516 for (LoadList::Iterator it = m_pendingLoad.begin(); it != m_pendingLoad.end(); ++it, ++count) { 0517 qApp->processEvents(QEventLoop::AllEvents); 0518 0519 if (dialog.wasCanceled()) { 0520 m_pendingLoad.clear(); 0521 DB::ImageDB::instance()->exifDB()->abortInsertTransaction(); 0522 return; 0523 } 0524 // (*it).first: DB::FileName 0525 // (*it).second: DB::MediaType 0526 loadExtraFile((*it).first, (*it).second); 0527 loadedCount++; // Atomic 0528 if 
(timeSinceProgressUpdate.elapsed() >= 1000) { 0529 dialog.setValue(count); 0530 timeSinceProgressUpdate.restart(); 0531 } 0532 } 0533 dialog.setValue(count); 0534 // loadExtraFile() has already inserted all images into the 0535 // database, but without committing the changes 0536 DB::ImageDB::instance()->commitDelayedImages(); 0537 DB::ImageDB::instance()->exifDB()->commitInsertTransaction(); 0538 0539 ImageManager::ThumbnailBuilder::instance()->save(); 0540 } 0541 0542 void NewImageFinder::setupFileVersionDetection() 0543 { 0544 // should be cached because loading once per image is expensive 0545 m_modifiedFileCompString = Settings::SettingsData::instance()->modifiedFileComponent(); 0546 m_modifiedFileComponent = QRegExp(m_modifiedFileCompString); 0547 0548 m_originalFileComponents << Settings::SettingsData::instance()->originalFileComponent(); 0549 m_originalFileComponents = m_originalFileComponents.at(0).split(QString::fromLatin1(";")); 0550 } 0551 0552 void NewImageFinder::loadExtraFile(const DB::FileName &newFileName, DB::MediaType type) 0553 { 0554 qCDebug(DBFileOpsLog) << "loadExtraFile(" << newFileName.relative() << ")"; 0555 MD5 sum = MD5Sum(newFileName); 0556 if (handleIfImageHasBeenMoved(newFileName, sum)) 0557 return; 0558 0559 // check to see if this is a new version of a previous image 0560 // We'll get the Exif data later, when we get the MD5 checksum. 
0561 ImageInfoPtr info = ImageInfoPtr(new ImageInfo(newFileName, type, DB::FileInformation::Ignore)); 0562 ImageInfoPtr originalInfo; 0563 DB::FileName originalFileName; 0564 0565 if (Settings::SettingsData::instance()->detectModifiedFiles()) { 0566 // requires at least *something* in the modifiedFileComponent 0567 if (m_modifiedFileCompString.length() >= 0 && newFileName.relative().contains(m_modifiedFileComponent)) { 0568 0569 for (QStringList::const_iterator it = m_originalFileComponents.constBegin(); 0570 it != m_originalFileComponents.constEnd(); ++it) { 0571 QString tmp = newFileName.relative(); 0572 tmp.replace(m_modifiedFileComponent, (*it)); 0573 originalFileName = DB::FileName::fromRelativePath(tmp); 0574 0575 MD5 originalSum; 0576 if (newFileName == originalFileName) 0577 originalSum = sum; 0578 else if (DB::ImageDB::instance()->md5Map()->containsFile(originalFileName)) 0579 originalSum = DB::ImageDB::instance()->md5Map()->lookupFile(originalFileName); 0580 else 0581 // Do *not* attempt to compute the checksum here. It forces a filesystem 0582 // lookup on a file that may not exist and substantially degrades 0583 // performance by about 25% on an SSD and about 30% on a spinning disk. 0584 // If one of these other files exist, it will be found later in 0585 // the image search at which point we'll detect the modified file. 0586 continue; 0587 if (DB::ImageDB::instance()->md5Map()->contains(originalSum)) { 0588 // we have a previous copy of this file; copy it's data 0589 // from the original. 
0590 originalInfo = DB::ImageDB::instance()->info(originalFileName); 0591 if (!originalInfo) { 0592 qCDebug(DBLog) << "Original info not found by name for " << originalFileName.absolute() << ", trying by MD5 sum."; 0593 originalFileName = DB::ImageDB::instance()->md5Map()->lookup(originalSum); 0594 0595 if (!originalFileName.isNull()) { 0596 qCDebug(DBLog) << "Substitute image " << originalFileName.absolute() << " found."; 0597 originalInfo = DB::ImageDB::instance()->info(originalFileName); 0598 } 0599 0600 if (!originalInfo) { 0601 qCWarning(DBLog, "How did that happen? We couldn't find info for the original image %s; can't copy the original data to %s", 0602 qPrintable(originalFileName.absolute()), qPrintable(newFileName.absolute())); 0603 continue; 0604 } 0605 } 0606 info->copyExtraData(*originalInfo); 0607 0608 /* if requested to move, then delete old data from original */ 0609 if (Settings::SettingsData::instance()->moveOriginalContents()) { 0610 originalInfo->removeExtraData(); 0611 } 0612 0613 break; 0614 } 0615 } 0616 } 0617 } 0618 ImageInfoList newImages; 0619 newImages.append(info); 0620 DB::ImageDB::instance()->addImages(newImages, false); 0621 0622 // also inserts image into exif db if present: 0623 info->setMD5Sum(sum); 0624 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName()); 0625 0626 if (originalInfo && Settings::SettingsData::instance()->autoStackNewFiles()) { 0627 0628 // stack the files together 0629 DB::FileName olderfile = originalFileName; 0630 DB::FileName newerfile = info->fileName(); 0631 DB::FileNameList tostack; 0632 0633 // the newest file should go to the top of the stack 0634 tostack.append(newerfile); 0635 0636 DB::FileNameList oldStack; 0637 if ((oldStack = DB::ImageDB::instance()->getStackFor(olderfile)).isEmpty()) { 0638 tostack.append(olderfile); 0639 } else { 0640 for (const DB::FileName &tmp : oldStack) { 0641 tostack.append(tmp); 0642 } 0643 } 0644 DB::ImageDB::instance()->stack(tostack); 0645 
MainWindow::Window::theMainWindow()->setStackHead(newerfile); 0646 0647 // ordering: XXX we ideally want to place the new image right 0648 // after the older one in the list. 0649 } 0650 0651 markUnTagged(info); 0652 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info); 0653 if (info->isVideo() && MainWindow::FeatureDialog::hasVideoThumbnailer()) { 0654 // needs to be done *after* insertion into database 0655 BackgroundTaskManager::JobManager::instance()->addJob( 0656 new BackgroundJobs::ReadVideoLengthJob(info->fileName(), BackgroundTaskManager::BackgroundVideoPreviewRequest)); 0657 } 0658 } 0659 0660 bool NewImageFinder::handleIfImageHasBeenMoved(const FileName &newFileName, const MD5 &sum) 0661 { 0662 if (DB::ImageDB::instance()->md5Map()->contains(sum)) { 0663 const DB::FileName matchedFileName = DB::ImageDB::instance()->md5Map()->lookup(sum); 0664 QFileInfo fi(matchedFileName.absolute()); 0665 0666 if (!fi.exists()) { 0667 // The file we had a collapse with didn't exists anymore so it is likely moved to this new name 0668 ImageInfoPtr info = DB::ImageDB::instance()->info(matchedFileName); 0669 if (!info) 0670 qCWarning(DBLog, "How did that happen? We couldn't find info for the images %s", qPrintable(matchedFileName.relative())); 0671 else { 0672 fi = QFileInfo(matchedFileName.relative()); 0673 if (info->label() == fi.completeBaseName()) { 0674 fi = QFileInfo(newFileName.absolute()); 0675 info->setLabel(fi.completeBaseName()); 0676 } 0677 0678 DB::ImageDB::instance()->renameImage(info, newFileName); 0679 0680 // We need to insert the new name into the MD5 map, 0681 // as it is a map, the value for the moved file will automatically be deleted. 
0682 0683 DB::ImageDB::instance()->md5Map()->insert(sum, info->fileName()); 0684 0685 DB::ImageDB::instance()->exifDB()->remove(matchedFileName); 0686 DB::ImageDB::instance()->exifDB()->add(newFileName); 0687 ImageManager::ThumbnailBuilder::instance()->buildOneThumbnail(info); 0688 return true; 0689 } 0690 } 0691 } 0692 return false; // The image wasn't just moved 0693 } 0694 0695 bool NewImageFinder::calculateMD5sums( 0696 const DB::FileNameList &list, 0697 DB::MD5Map *md5Map, 0698 bool *wasCanceled) 0699 { 0700 // FIXME: should be converted to a threadpool for SMP stuff and whatnot :] 0701 QProgressDialog dialog; 0702 dialog.setLabelText( 0703 i18np("<p><b>Calculating checksum for %1 file</b></p>", "<p><b>Calculating checksums for %1 files</b></p>", list.size()) 0704 + i18n("<p>By storing a checksum for each image " 0705 "KPhotoAlbum is capable of finding images " 0706 "even when you have moved them on the disk.</p>")); 0707 dialog.setMaximum(list.size()); 0708 dialog.setMinimumDuration(1000); 0709 0710 int count = 0; 0711 DB::FileNameList cantRead; 0712 bool dirty = false; 0713 0714 for (const FileName &fileName : list) { 0715 if (count % 10 == 0) { 0716 dialog.setValue(count); // ensure to call setProgress(0) 0717 qApp->processEvents(QEventLoop::AllEvents); 0718 0719 if (dialog.wasCanceled()) { 0720 if (wasCanceled) 0721 *wasCanceled = true; 0722 return dirty; 0723 } 0724 } 0725 0726 MD5 md5 = MD5Sum(fileName); 0727 if (md5.isNull()) { 0728 cantRead << fileName; 0729 continue; 0730 } 0731 0732 ImageInfoPtr info = ImageDB::instance()->info(fileName); 0733 if (info->MD5Sum() != md5) { 0734 info->setMD5Sum(md5); 0735 dirty = true; 0736 MainWindow::Window::theMainWindow()->thumbnailCache()->removeThumbnail(fileName); 0737 BackgroundJobs::HandleVideoThumbnailRequestJob::removeFullScaleFrame(fileName); 0738 } 0739 0740 md5Map->insert(md5, fileName); 0741 0742 ++count; 0743 } 0744 if (wasCanceled) 0745 *wasCanceled = false; 0746 0747 if (!cantRead.empty()) 0748 
KMessageBox::informationList(nullptr, i18n("Following files could not be read:"), cantRead.toStringList(DB::RelativeToImageRoot)); 0749 0750 return dirty; 0751 } 0752 0753 void DB::NewImageFinder::markUnTagged(ImageInfoPtr info) 0754 { 0755 if (DB::ImageDB::instance()->untaggedCategoryFeatureConfigured()) { 0756 info->addCategoryInfo(Settings::SettingsData::instance()->untaggedCategory(), 0757 Settings::SettingsData::instance()->untaggedTag()); 0758 } 0759 } 0760 // vi:expandtab:tabstop=4 shiftwidth=4: