/* ============================================================
 *
 * This file is a part of digiKam
 *
 * Date        : 2019-08-08
 * Description : Derived class to perform YOLO neural network inference
 *               for face detection. Credit: Ayoosh Kathuria (for the YOLOv3
 *               blog post), sthanhng (for the example of face detection
 *               with YOLOv3).
 *               More information about YOLOv3:
 *               https://towardsdatascience.com/yolo-v3-object-detection-53fb7d3bfe6b
 *               sthanhng's GitHub repository on face detection with YOLOv3:
 *               https://github.com/sthanhng/yoloface
 *
 * SPDX-FileCopyrightText: 2019      by Thanh Trung Dinh <dinhthanhtrung1996 at gmail dot com>
 * SPDX-FileCopyrightText: 2020-2024 by Gilles Caulier <caulier dot gilles at gmail dot com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 *
 * ============================================================ */

#include "dnnfacedetectoryolo.h"

// Qt includes

#include <QList>
#include <QRect>
#include <QString>
#include <QFileInfo>
#include <QMutexLocker>
#include <QElapsedTimer>
#include <QStandardPaths>

// Local includes

#include "digikam_debug.h"
#include "digikam_config.h"

namespace Digikam
{

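// The base class is initialized with the standard YOLOv3 pre-processing settings:
// pixel values scaled to [0, 1] (factor 1/255), no mean subtraction, and a
// 416x416 network input size.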
DNNFaceDetectorYOLO::DNNFaceDetectorYOLO()
    : DNNFaceDetectorBase(1.0F / 255.0F, cv::Scalar(0.0, 0.0, 0.0), cv::Size(416, 416))
{
    loadModels();
}

DNNFaceDetectorYOLO::~DNNFaceDetectorYOLO()
{
}

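/** Locate the YOLOv3 face detection model files (Darknet .cfg and .weights)
 *  in the digiKam faces engine data directory and load them into the network.
 */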
bool DNNFaceDetectorYOLO::loadModels()
{
    QString appPath = QStandardPaths::locate(QStandardPaths::GenericDataLocation,
                                             QLatin1String("digikam/facesengine"),
                                             QStandardPaths::LocateDirectory);

    QString model   = QLatin1String("yolov3-face.cfg");
    QString data    = QLatin1String("yolov3-wider_16000.weights");

    QString nnmodel = appPath + QLatin1Char('/') + model;
    QString nndata  = appPath + QLatin1Char('/') + data;

    if (QFileInfo::exists(nnmodel) && QFileInfo::exists(nndata))
    {
        try
        {
            qCDebug(DIGIKAM_FACEDB_LOG) << "YOLO model:" << model << ", YOLO data:" << data;

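            // On Windows the paths go through the local 8-bit codec, likely so that
            // non-ASCII file paths are resolved correctly by OpenCV's file loader.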
#ifdef Q_OS_WIN

            net = cv::dnn::readNetFromDarknet(nnmodel.toLocal8Bit().constData(),
                                              nndata.toLocal8Bit().constData());

#else

            net = cv::dnn::readNetFromDarknet(nnmodel.toStdString(),
                                              nndata.toStdString());

#endif

            net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
            net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);

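            // Workaround specific to OpenCV 4.7.0: disable Winograd-based
            // convolution optimizations for this release.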
#if (OPENCV_VERSION == QT_VERSION_CHECK(4, 7, 0))

            net.enableWinograd(false);

#endif

        }
        catch (cv::Exception& e)
        {
            qCWarning(DIGIKAM_FACEDB_LOG) << "cv::Exception:" << e.what();

            return false;
        }
        catch (...)
        {
            qCWarning(DIGIKAM_FACEDB_LOG) << "Default exception from OpenCV";

            return false;
        }
    }
    else
    {
        qCCritical(DIGIKAM_FACEDB_LOG) << "Cannot find faces engine DNN model" << model << "or" << data;
        qCCritical(DIGIKAM_FACEDB_LOG) << "Face detection feature cannot be used!";

        return false;
    }

    return true;
}

void DNNFaceDetectorYOLO::detectFaces(const cv::Mat& inputImage,
                                      const cv::Size& paddedSize,
                                      std::vector<cv::Rect>& detectedBboxes)
{
    QElapsedTimer timer;

    if (inputImage.empty())
    {
        qCDebug(DIGIKAM_FACESENGINE_LOG) << "Invalid image given, not detecting faces.";
        return;
    }

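    // Convert the image into a 4D blob: pixel values scaled by 1/255, resized to the
    // 416x416 network input, R and B channels swapped, and no cropping.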
    cv::Mat inputBlob = cv::dnn::blobFromImage(inputImage, scaleFactor, inputImageSize, meanValToSubtract, true, false);
    std::vector<cv::Mat> outs;

    if (!net.empty())
    {
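        // Guard the shared cv::dnn::Net with a mutex: running the forward pass
        // concurrently on the same network object is not thread-safe.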
        QMutexLocker lock(&mutex);
        net.setInput(inputBlob);
        timer.start();
        net.forward(outs, getOutputsNames());
        qCDebug(DIGIKAM_FACESENGINE_LOG) << "forward YOLO detection in" << timer.elapsed() << "ms";
    }

    timer.start();

    postprocess(outs, paddedSize, detectedBboxes);

    qCDebug(DIGIKAM_FACESENGINE_LOG) << "postprocess YOLO detection in" << timer.elapsed() << "ms";
}

void DNNFaceDetectorYOLO::postprocess(const std::vector<cv::Mat>& outs,
                                      const cv::Size& paddedSize,
                                      std::vector<cv::Rect>& detectedBboxes) const
{
    std::vector<float>    goodConfidences;
    std::vector<float>    doubtConfidences;
    std::vector<float>    confidences;
    std::vector<cv::Rect> goodBoxes;
    std::vector<cv::Rect> doubtBoxes;
    std::vector<cv::Rect> boxes;

    for (size_t i = 0 ; i < outs.size() ; ++i)
    {
        // Scan through all the bounding boxes output from the network and keep only the
        // ones with high confidence scores. Assign the box's class label as the class
        // with the highest score for the box.

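        // Each output row is laid out as [center_x, center_y, width, height, objectness,
        // class scores...], with the box geometry normalized to the network input size.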
        float* data = reinterpret_cast<float*>(outs[i].data);

        for (int j = 0 ; j < outs[i].rows ; ++j, data += outs[i].cols)
        {
            cv::Mat scores = outs[i].row(j).colRange(5, outs[i].cols);

            // Get the value and location of the maximum score

            double confidence;
            cv::minMaxLoc(scores, nullptr, &confidence, nullptr, nullptr);

            if (confidence > confidenceThreshold)
            {
                int centerX = (int)(data[0] * inputImageSize.width);
                int centerY = (int)(data[1] * inputImageSize.height);
                int width   = (int)(data[2] * inputImageSize.width);
                int height  = (int)(data[3] * inputImageSize.height);

                int left    = centerX - width  / 2;
                int right   = centerX + width  / 2;
                int top     = centerY - height / 2;
                int bottom  = centerY + height / 2;

                selectBbox(paddedSize,
                           confidence,
                           left,
                           right,
                           top,
                           bottom,
                           goodConfidences,
                           goodBoxes,
                           doubtConfidences,
                           doubtBoxes);
            }
        }
    }

    qCDebug(DIGIKAM_FACESENGINE_LOG) << "number of doubt boxes =" << doubtBoxes.size();
    qCDebug(DIGIKAM_FACESENGINE_LOG) << "number of good boxes  =" << goodBoxes.size();

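    // Prefer the confident detections; fall back to the doubtful candidates only
    // when no confident box was found at all.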
    if (goodBoxes.empty())
    {
        boxes       = doubtBoxes;
        confidences = doubtConfidences;
    }
    else
    {
        boxes       = goodBoxes;
        confidences = goodConfidences;
    }

    // Perform non-maximum suppression to eliminate redundant overlapping boxes with lower confidences

    std::vector<int> indices;
    cv::dnn::NMSBoxes(boxes, confidences, confidenceThreshold, nmsThreshold, indices);

    // Get detected bounding boxes

    for (size_t i = 0 ; i < indices.size() ; ++i)
    {
        cv::Rect bbox = boxes[indices[i]];
        correctBbox(bbox, paddedSize);
        detectedBboxes.push_back(bbox);
    }
}

/** Get the names of the output layers
 */
std::vector<cv::String> DNNFaceDetectorYOLO::getOutputsNames() const
{
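    // The lookup is performed once and the result is cached in a static vector
    // for all subsequent calls.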
    static std::vector<cv::String> names;

    if (!net.empty() && names.empty())
    {
        // Get the indices of the output layers, i.e. the layers with unconnected outputs

        std::vector<int> outLayers          = net.getUnconnectedOutLayers();

        // Get the names of all the layers in the network

        std::vector<cv::String> layersNames = net.getLayerNames();

        // Store the names of the output layers in 'names'

        names.resize(outLayers.size());

        for (size_t i = 0 ; i < outLayers.size() ; ++i)
        {
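            // Layer indices returned by getUnconnectedOutLayers() are 1-based,
            // hence the offset of -1 into layersNames.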
            names[i] = layersNames[outLayers[i] - 1];
        }
    }

    return names;
}

} // namespace Digikam