File indexing completed on 2025-03-09 03:54:58
0001 /* ============================================================ 0002 * 0003 * This file is a part of digiKam 0004 * 0005 * Date : 2019-08-08 0006 * Description : Derived class to perform YOLO neural network inference 0007 * for face detection. Credit: Ayoosh Kathuria (for Yolov3 blog post), 0008 * sthanhng (for example of face detection with Yolov3). 0009 * More information with Yolov3: 0010 * https://towardsdatascience.com/yolo-v3-object-detection-53fb7d3bfe6b 0011 * sthanhng github on face detection with Yolov3: 0012 * https://github.com/sthanhng/yoloface 0013 * 0014 * SPDX-FileCopyrightText: 2019 by Thanh Trung Dinh <dinhthanhtrung1996 at gmail dot com> 0015 * SPDX-FileCopyrightText: 2020-2024 by Gilles Caulier <caulier dot gilles at gmail dot com> 0016 * 0017 * SPDX-License-Identifier: GPL-2.0-or-later 0018 * 0019 * ============================================================ */ 0020 0021 #include "dnnfacedetectoryolo.h" 0022 0023 // Qt includes 0024 0025 #include <QList> 0026 #include <QRect> 0027 #include <QString> 0028 #include <QFileInfo> 0029 #include <QMutexLocker> 0030 #include <QElapsedTimer> 0031 #include <QStandardPaths> 0032 0033 // Local includes 0034 0035 #include "digikam_debug.h" 0036 #include "digikam_config.h" 0037 0038 namespace Digikam 0039 { 0040 0041 DNNFaceDetectorYOLO::DNNFaceDetectorYOLO() 0042 : DNNFaceDetectorBase(1.0F / 255.0F, cv::Scalar(0.0, 0.0, 0.0), cv::Size(416, 416)) 0043 { 0044 loadModels(); 0045 } 0046 0047 DNNFaceDetectorYOLO::~DNNFaceDetectorYOLO() 0048 { 0049 } 0050 0051 bool DNNFaceDetectorYOLO::loadModels() 0052 { 0053 QString appPath = QStandardPaths::locate(QStandardPaths::GenericDataLocation, 0054 QLatin1String("digikam/facesengine"), 0055 QStandardPaths::LocateDirectory); 0056 0057 QString model = QLatin1String("yolov3-face.cfg"); 0058 QString data = QLatin1String("yolov3-wider_16000.weights"); 0059 0060 QString nnmodel = appPath + QLatin1Char('/') + model; 0061 QString nndata = appPath + QLatin1Char('/') + data; 0062 0063 if (QFileInfo::exists(nnmodel) && QFileInfo::exists(nndata)) 0064 { 0065 try 0066 { 0067 qCDebug(DIGIKAM_FACEDB_LOG) << "YOLO model:" << model << ", YOLO data:" << data; 0068 0069 #ifdef Q_OS_WIN 0070 0071 net = cv::dnn::readNetFromDarknet(nnmodel.toLocal8Bit().constData(), 0072 nndata.toLocal8Bit().constData()); 0073 0074 #else 0075 0076 net = cv::dnn::readNetFromDarknet(nnmodel.toStdString(), 0077 nndata.toStdString()); 0078 0079 #endif 0080 0081 net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT); 0082 net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU); 0083 0084 #if (OPENCV_VERSION == QT_VERSION_CHECK(4, 7, 0)) 0085 0086 net.enableWinograd(false); 0087 0088 #endif 0089 0090 } 0091 catch (cv::Exception& e) 0092 { 0093 qCWarning(DIGIKAM_FACEDB_LOG) << "cv::Exception:" << e.what(); 0094 0095 return false; 0096 } 0097 catch (...) 0098 { 0099 qCWarning(DIGIKAM_FACEDB_LOG) << "Default exception from OpenCV"; 0100 0101 return false; 0102 } 0103 } 0104 else 0105 { 0106 qCCritical(DIGIKAM_FACEDB_LOG) << "Cannot found faces engine DNN model" << model << "or" << data; 0107 qCCritical(DIGIKAM_FACEDB_LOG) << "Faces detection feature cannot be used!"; 0108 0109 return false; 0110 } 0111 0112 return true; 0113 } 0114 0115 void DNNFaceDetectorYOLO::detectFaces(const cv::Mat& inputImage, 0116 const cv::Size& paddedSize, 0117 std::vector<cv::Rect>& detectedBboxes) 0118 { 0119 QElapsedTimer timer; 0120 0121 if (inputImage.empty()) 0122 { 0123 qCDebug(DIGIKAM_FACESENGINE_LOG) << "Invalid image given, not detecting faces."; 0124 return; 0125 } 0126 0127 cv::Mat inputBlob = cv::dnn::blobFromImage(inputImage, scaleFactor, inputImageSize, meanValToSubtract, true, false); 0128 std::vector<cv::Mat> outs; 0129 0130 if (!net.empty()) 0131 { 0132 QMutexLocker lock(&mutex); 0133 net.setInput(inputBlob); 0134 timer.start(); 0135 net.forward(outs, getOutputsNames()); 0136 qCDebug(DIGIKAM_FACESENGINE_LOG) << "forward YOLO detection in" << timer.elapsed() << "ms"; 0137 } 0138 0139 timer.start(); 0140 0141 postprocess(outs, paddedSize, detectedBboxes); 0142 0143 qCDebug(DIGIKAM_FACESENGINE_LOG) << "postprocess YOLO detection in" << timer.elapsed() << "ms"; 0144 } 0145 0146 void DNNFaceDetectorYOLO::postprocess(const std::vector<cv::Mat>& outs, 0147 const cv::Size& paddedSize, 0148 std::vector<cv::Rect>& detectedBboxes) const 0149 { 0150 std::vector<float> goodConfidences; 0151 std::vector<float> doubtConfidences; 0152 std::vector<float> confidences; 0153 std::vector<cv::Rect> goodBoxes; 0154 std::vector<cv::Rect> doubtBoxes; 0155 std::vector<cv::Rect> boxes; 0156 0157 for (size_t i = 0 ; i < outs.size() ; ++i) 0158 { 0159 // Scan through all the bounding boxes output from the network and keep only the 0160 // ones with high confidence scores. Assign the box's class label as the class 0161 // with the highest score for the box. 0162 0163 float* data = reinterpret_cast<float*>(outs[i].data); 0164 0165 for (int j = 0 ; j < outs[i].rows ; ++j, data += outs[i].cols) 0166 { 0167 cv::Mat scores = outs[i].row(j).colRange(5, outs[i].cols); 0168 0169 // Get the value and location of the maximum score 0170 0171 double confidence; 0172 cv::minMaxLoc(scores, nullptr, &confidence, nullptr, nullptr); 0173 0174 if (confidence > confidenceThreshold) 0175 { 0176 int centerX = (int)(data[0] * inputImageSize.width); 0177 int centerY = (int)(data[1] * inputImageSize.height); 0178 int width = (int)(data[2] * inputImageSize.width); 0179 int height = (int)(data[3] * inputImageSize.height); 0180 0181 int left = centerX - width / 2; 0182 int right = centerX + width / 2; 0183 int top = centerY - height / 2; 0184 int bottom = centerY + height / 2; 0185 0186 selectBbox(paddedSize, 0187 confidence, 0188 left, 0189 right, 0190 top, 0191 bottom, 0192 goodConfidences, 0193 goodBoxes, 0194 doubtConfidences, 0195 doubtBoxes); 0196 } 0197 } 0198 } 0199 0200 qCDebug(DIGIKAM_FACESENGINE_LOG) << "nb of doubtbox = " << doubtBoxes.size(); 0201 qCDebug(DIGIKAM_FACESENGINE_LOG) << "nb of goodbox = " << goodBoxes.size(); 0202 0203 if (goodBoxes.empty()) 0204 { 0205 boxes = doubtBoxes; 0206 confidences = doubtConfidences; 0207 } 0208 else 0209 { 0210 boxes = goodBoxes; 0211 confidences = goodConfidences; 0212 } 0213 0214 // Perform non maximum suppression to eliminate redundant overlapping boxes with lower confidences 0215 0216 std::vector<int> indices; 0217 cv::dnn::NMSBoxes(boxes, confidences, confidenceThreshold, nmsThreshold, indices); 0218 0219 // Get detected bounding boxes 0220 0221 for (size_t i = 0 ; i < indices.size() ; ++i) 0222 { 0223 cv::Rect bbox = boxes[indices[i]]; 0224 correctBbox(bbox, paddedSize); 0225 detectedBboxes.push_back(cv::Rect(bbox.x, bbox.y, bbox.width, bbox.height)); 0226 } 0227 } 0228 0229 /** Get the names of the output layers 0230 */ 0231 std::vector<cv::String> DNNFaceDetectorYOLO::getOutputsNames() const 0232 { 0233 static std::vector<cv::String> names; 0234 0235 if (!net.empty() && names.empty()) 0236 { 0237 // Get the indices of the output layers, i.e. the layers with unconnected outputs 0238 0239 std::vector<int> outLayers = net.getUnconnectedOutLayers(); 0240 0241 // Get the names of all the layers in the network 0242 0243 std::vector<cv::String> layersNames = net.getLayerNames(); 0244 0245 // Get the names of the output layers in names 0246 0247 names.resize(outLayers.size()); 0248 0249 for (size_t i = 0 ; i < outLayers.size() ; ++i) 0250 { 0251 names[i] = layersNames[outLayers[i] - 1]; 0252 } 0253 } 0254 0255 return names; 0256 } 0257 0258 } // namespace Digikam