/*
    SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org>

    SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
*/

#include "speechdialog.h"

#include "bin/model/subtitlemodel.hpp"
#include "core.h"
#include "kdenlive_debug.h"
#include "kdenlivesettings.h"
#include "mainwindow.h"
#include "monitor/monitor.h"

#include "mlt++/MltConsumer.h"
#include "mlt++/MltProfile.h"
#include "mlt++/MltTractor.h"

#include <KLocalizedString>
#include <KMessageWidget>
#include <QButtonGroup>
#include <QDir>
#include <QFontDatabase>
#include <QProcess>
#include <QTemporaryFile>

#include <memory>
#include <utility>

SpeechDialog::SpeechDialog(std::shared_ptr<TimelineItemModel> timeline, QPoint zone, int tid, bool, bool, QWidget *parent)
    : QDialog(parent)
    , m_timeline(timeline)
    , m_zone(zone)
    , m_tid(-1)
{
    setFont(QFontDatabase::systemFont(QFontDatabase::SmallestReadableFont));
    setupUi(this);
    m_stt = new SpeechToText();
    buttonBox->button(QDialogButtonBox::Apply)->setText(i18n("Process"));
    speech_info->hide();
    m_voskConfig = new QAction(i18n("Configure"), this);
    connect(m_voskConfig, &QAction::triggered, []() { pCore->window()->slotPreferences(8); });
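    // Refresh the language list whenever the set of installed Vosk models changes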
    m_modelsConnection = connect(pCore.get(), &Core::voskModelUpdate, this, [&](const QStringList &models) {
        language_box->clear();
        language_box->addItems(models);
        if (models.isEmpty()) {
            speech_info->addAction(m_voskConfig);
            speech_info->setMessageType(KMessageWidget::Information);
            speech_info->setText(i18n("Please install speech recognition models"));
            speech_info->animatedShow();
        } else {
            if (!KdenliveSettings::vosk_srt_model().isEmpty() && models.contains(KdenliveSettings::vosk_srt_model())) {
                int ix = language_box->findText(KdenliveSettings::vosk_srt_model());
                if (ix > -1) {
                    language_box->setCurrentIndex(ix);
                }
            }
        }
    });
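    // Radio buttons select the audio source to analyze: the timeline zone, a single track, or the selected clips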
    QButtonGroup *buttonGroup = new QButtonGroup(this);
    buttonGroup->addButton(timeline_zone);
    buttonGroup->addButton(timeline_track);
    buttonGroup->addButton(timeline_clips);
    connect(buttonGroup, QOverload<QAbstractButton *>::of(&QButtonGroup::buttonClicked), [=, selectedTrack = tid, sourceZone = zone](QAbstractButton *button) {
        speech_info->animatedHide();
        buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
        if (button == timeline_clips) {
            std::unordered_set<int> selection = timeline->getCurrentSelection();
            int cid = -1;
            m_tid = -1;
            int firstPos = -1;
            for (const auto &s : selection) {
                // Find first clip
                if (!timeline->isClip(s)) {
                    continue;
                }
                int pos = timeline->getClipPosition(s);
                if (firstPos == -1 || pos < firstPos) {
                    cid = s;
                    firstPos = pos;
                    m_tid = timeline->getClipTrackId(cid);
                    if (!timeline->isAudioTrack(m_tid)) {
                        m_tid = timeline->getMirrorAudioTrackId(m_tid);
                    }
                }
            }
            if (m_tid == -1) {
                speech_info->setMessageType(KMessageWidget::Information);
                speech_info->setText(i18n("No audio track available for selected clip"));
                speech_info->animatedShow();
                buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
                return;
            }
            if (timeline->isClip(cid)) {
                m_zone.setX(timeline->getClipPosition(cid));
                m_zone.setY(m_zone.x() + timeline->getClipPlaytime(cid));
            } else {
                speech_info->setMessageType(KMessageWidget::Information);
                speech_info->setText(i18n("Select a clip in timeline to perform analysis"));
                speech_info->animatedShow();
                buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
            }
        } else {
            if (button == timeline_track) {
                m_tid = selectedTrack;
                if (!timeline->isAudioTrack(m_tid)) {
                    m_tid = timeline->getMirrorAudioTrackId(m_tid);
                }
                if (m_tid == -1) {
                    speech_info->setMessageType(KMessageWidget::Information);
                    speech_info->setText(i18n("No audio track found"));
                    speech_info->animatedShow();
                    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
                }
            } else {
                m_tid = -1;
            }
            m_zone = sourceZone;
        }
    });
    connect(language_box, static_cast<void (QComboBox::*)(int)>(&QComboBox::activated), this,
            [this]() { KdenliveSettings::setVosk_srt_model(language_box->currentText()); });
    connect(buttonBox->button(QDialogButtonBox::Apply), &QPushButton::clicked, this, [this]() { slotProcessSpeech(); });
    m_stt->parseVoskDictionaries();
    frame_progress->setVisible(false);
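    // Allow the user to abort a running recognition job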
    connect(button_abort, &QToolButton::clicked, this, [this]() {
        if (m_speechJob && m_speechJob->state() == QProcess::Running) {
            m_speechJob->kill();
        }
    });
}

SpeechDialog::~SpeechDialog()
{
    QObject::disconnect(m_modelsConnection);
}

void SpeechDialog::slotProcessSpeech()
{
    m_stt->checkDependencies();
    if (!m_stt->checkSetup() || !m_stt->missingDependencies().isEmpty()) {
        speech_info->setMessageType(KMessageWidget::Warning);
        speech_info->setText(i18n("Please configure speech to text."));
        speech_info->animatedShow();
        speech_info->addAction(m_voskConfig);
        return;
    }
    speech_info->removeAction(m_voskConfig);
    speech_info->setMessageType(KMessageWidget::Information);
    speech_info->setText(i18n("Starting audio export"));
    speech_info->show();
    qApp->processEvents();
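    // Create temporary files for the exported playlist, the generated subtitles and the extracted audio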
    QString sceneList;
    QString speech;
    QString audio;
    QTemporaryFile tmpPlaylist(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.mlt")));
    m_tmpSrt = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.srt")));
    m_tmpAudio = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.wav")));
    if (tmpPlaylist.open()) {
        sceneList = tmpPlaylist.fileName();
    }
    tmpPlaylist.close();
    if (m_tmpSrt->open()) {
        speech = m_tmpSrt->fileName();
    }
    m_tmpSrt->close();
    if (m_tmpAudio->open()) {
        audio = m_tmpAudio->fileName();
    }
    m_tmpAudio->close();
    m_timeline->sceneList(QDir::temp().absolutePath(), sceneList);
    // TODO: do the rendering in another thread to not block the UI

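    // Reload the saved playlist as an MLT XML producer so unwanted tracks can be muted before the audio export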
    Mlt::Producer producer(*m_timeline->tractor()->profile(), "xml", sceneList.toUtf8().constData());
    int tracksCount = m_timeline->tractor()->count();
    std::shared_ptr<Mlt::Service> s(new Mlt::Service(producer));
    std::shared_ptr<Mlt::Multitrack> multi = nullptr;
    bool multitrackFound = false;
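    // Walk down the service chain (at most 10 levels) looking for the multitrack that matches the timeline track count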
    for (int i = 0; i < 10; i++) {
        s.reset(s->producer());
        if (s == nullptr || !s->is_valid()) {
            break;
        }
        if (s->type() == mlt_service_multitrack_type) {
            multi.reset(new Mlt::Multitrack(*s.get()));
            if (multi->count() == tracksCount) {
                // Match
                multitrackFound = true;
                break;
            }
        }
    }
    if (multitrackFound) {
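        // Hide video tracks and mute every audio track except the requested one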
        int trackPos = -1;
        if (m_tid > -1) {
            trackPos = m_timeline->getTrackMltIndex(m_tid);
        }
        int tid = 0;
        for (int i = 0; i < multi->count(); i++) {
            std::shared_ptr<Mlt::Producer> tk(multi->track(i));
            if (tk->get_int("hide") == 1) {
                // Video track, hide it
                tk->set("hide", 3);
            } else if (tid == 0 || (trackPos > -1 && trackPos != tid)) {
                // We only want a specific audio track
                tk->set("hide", 3);
            }
            tid++;
        }
    }
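    // Render the selected zone to the temporary WAV file through the avformat consumer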
    Mlt::Consumer xmlConsumer(*m_timeline->tractor()->profile(), "avformat", audio.toUtf8().constData());
    if (!xmlConsumer.is_valid() || !producer.is_valid()) {
        qDebug() << "=== STARTING CONSUMER ERROR";
        if (!producer.is_valid()) {
            qDebug() << "=== PRODUCER INVALID";
        }
        speech_info->setMessageType(KMessageWidget::Warning);
        speech_info->setText(i18n("Audio export failed"));
        qApp->processEvents();
        return;
    }
    speech_progress->setValue(0);
    frame_progress->setVisible(true);
    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
    qApp->processEvents();
    xmlConsumer.set("terminate_on_pause", 1);
    xmlConsumer.set("properties", "WAV");
    producer.set_in_and_out(m_zone.x(), m_zone.y());
    xmlConsumer.connect(producer);

    qDebug() << "=== STARTING RENDER C, IN:" << m_zone.x() << " - " << m_zone.y();
    m_duration = m_zone.y() - m_zone.x();
    qApp->processEvents();
    xmlConsumer.run();
    qApp->processEvents();
    qDebug() << "=== STARTING RENDER D";
    QString language = language_box->currentText();
    speech_info->setMessageType(KMessageWidget::Information);
    speech_info->setText(i18n("Starting speech recognition"));
    qApp->processEvents();
    QString modelDirectory = m_stt->voskModelPath();
    qDebug() << "==== ANALYSIS SPEECH: " << modelDirectory << " - " << language << " - " << audio << " - " << speech;
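    // Run the speech recognition script in a separate process and monitor its output and exit status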
    m_speechJob = std::make_unique<QProcess>(this);
    connect(m_speechJob.get(), &QProcess::readyReadStandardOutput, this, &SpeechDialog::slotProcessProgress);
    connect(m_speechJob.get(), static_cast<void (QProcess::*)(int, QProcess::ExitStatus)>(&QProcess::finished), this,
            [this, speech](int, QProcess::ExitStatus status) { slotProcessSpeechStatus(status, speech); });
    m_speechJob->start(m_stt->pythonExec(), {m_stt->subtitleScript(), modelDirectory, language, audio, speech});
}

void SpeechDialog::slotProcessSpeechStatus(QProcess::ExitStatus status, const QString &srtFile)
{
    if (status == QProcess::CrashExit) {
        speech_info->setMessageType(KMessageWidget::Warning);
        speech_info->setText(i18n("Speech recognition aborted."));
        speech_info->animatedShow();
    } else {
        if (QFile::exists(srtFile)) {
            m_timeline->getSubtitleModel()->importSubtitle(srtFile, m_zone.x(), true);
            speech_info->setMessageType(KMessageWidget::Positive);
            speech_info->setText(i18n("Subtitles imported"));
        } else {
            speech_info->setMessageType(KMessageWidget::Warning);
            speech_info->setText(i18n("Speech recognition failed"));
        }
    }
    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
    frame_progress->setVisible(false);
}

void SpeechDialog::slotProcessProgress()
{
    QString saveData = QString::fromUtf8(m_speechJob->readAll());
    qDebug() << "==== GOT SPEECH DATA: " << saveData;
    if (saveData.startsWith(QStringLiteral("progress:"))) {
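        // The script prints "progress:<n>" as it consumes audio chunks; scale that to a percentage of the zone length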
        double prog = saveData.section(QLatin1Char(':'), 1).toInt() * 3.12;
        qDebug() << "=== GOT DATA:\n" << saveData;
        speech_progress->setValue(static_cast<int>(100 * prog / m_duration));
    }
}