// File indexing completed on 2024-04-14 04:46:23

0001 /*
0002     SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org>
0003 
0004     SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0005 */
0006 
0007 #include "speechdialog.h"
0008 
0009 #include "bin/model/subtitlemodel.hpp"
0010 #include "core.h"
0011 #include "kdenlive_debug.h"
0012 #include "kdenlivesettings.h"
0013 #include "mainwindow.h"
0014 #include "monitor/monitor.h"
0015 
0016 #include "mlt++/MltConsumer.h"
0017 #include "mlt++/MltProfile.h"
0018 #include "mlt++/MltTractor.h"
0019 
0020 #include <KLocalizedString>
0021 #include <KMessageBox>
0022 #include <KMessageWidget>
0023 #include <QButtonGroup>
0024 #include <QDir>
0025 #include <QFontDatabase>
0026 #include <QProcess>
0027 #include <kwidgetsaddons_version.h>
0028 
0029 #include <memory>
0030 #include <utility>
0031 
// Dialog driving automatic subtitle generation through an external
// speech-to-text backend (Whisper or Vosk, chosen via KdenliveSettings).
// NOTE(review): the two unnamed bool parameters are ignored here; the
// selected track id (tid) is only used through the lambda capture below.
// m_tid starts at -1, meaning "no specific audio track selected".
SpeechDialog::SpeechDialog(std::shared_ptr<TimelineItemModel> timeline, QPoint zone, int tid, bool, bool, QWidget *parent)
    : QDialog(parent)
    , m_timeline(timeline)
    , m_zone(zone)
    , m_tid(-1)

{
    setFont(QFontDatabase::systemFont(QFontDatabase::SmallestReadableFont));
    setupUi(this);
    speech_info->hide();
    setWindowTitle(i18n("Automatic Subtitling"));
    // "Configure" action shown in the info bar when setup is incomplete:
    // opens the speech settings page and closes this dialog.
    m_voskConfig = new QAction(i18n("Configure"), this);
    connect(m_voskConfig, &QAction::triggered, [this]() {
        pCore->window()->slotShowPreferencePage(Kdenlive::PageSpeech);
        close();
    });
    // "Show log" action: displays the output collected from the
    // recognition job (m_errorLog) in a detailed error box.
    m_logAction = new QAction(i18n("Show log"), this);
    connect(m_logAction, &QAction::triggered, [&]() { KMessageBox::detailedError(QApplication::activeWindow(), i18n("Speech Recognition log"), m_errorLog); });

    if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) {
        // Whisper model
        m_stt = new SpeechToText(SpeechToText::EngineType::EngineWhisper);
        QList<std::pair<QString, QString>> whisperModels = m_stt->whisperModels();
        for (auto &w : whisperModels) {
            // pair.first = display name, pair.second = model id kept as item data
            speech_model->addItem(w.first, w.second);
        }
        int ix = speech_model->findData(KdenliveSettings::whisperModel());
        if (ix > -1) {
            speech_model->setCurrentIndex(ix);
        }
        if (speech_language->count() == 0) {
            // Fill whisper languages
            QMap<QString, QString> languages = m_stt->whisperLanguages();
            QMapIterator<QString, QString> j(languages);
            while (j.hasNext()) {
                j.next();
                speech_language->addItem(j.key(), j.value());
            }
            int ix = speech_language->findData(KdenliveSettings::whisperLanguage());
            if (ix > -1) {
                speech_language->setCurrentIndex(ix);
            }
        }
        // Models whose name ends in ".en" are English-only, so the language
        // selector is disabled for them (same rule as the activated handler below).
        speech_language->setEnabled(!KdenliveSettings::whisperModel().endsWith(QLatin1String(".en")));
        translate_box->setChecked(KdenliveSettings::whisperTranslate());

    } else {
        // Vosk model
        whisper_settings->setVisible(false);
        m_stt = new SpeechToText(SpeechToText::EngineType::EngineVosk);
        // Vosk dictionaries are scanned asynchronously; the combo is filled
        // via updateVoskModels() when Core emits voskModelUpdate.
        connect(pCore.get(), &Core::voskModelUpdate, this, &SpeechDialog::updateVoskModels);
        m_stt->parseVoskDictionaries();
    }
    buttonBox->button(QDialogButtonBox::Apply)->setText(i18n("Process"));

    // Radio group selecting the analysis scope: full timeline, current zone,
    // a single track, or the currently selected clips.
    QButtonGroup *buttonGroup = new QButtonGroup(this);
    buttonGroup->addButton(timeline_full);
    buttonGroup->addButton(timeline_zone);
    buttonGroup->addButton(timeline_track);
    buttonGroup->addButton(timeline_clips);
    connect(buttonGroup, QOverload<QAbstractButton *>::of(&QButtonGroup::buttonClicked), [=, selectedTrack = tid, sourceZone = zone](QAbstractButton *button) {
        speech_info->animatedHide();
        buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
        if (button == timeline_full) {
            // Whole project: no track filter, zone spans the full duration.
            m_tid = -1;
            m_zone = QPoint(0, pCore->projectDuration() - 1);
        } else if (button == timeline_clips) {
            // Selected clips: locate the earliest selected clip and resolve
            // the audio track that should be analyzed.
            std::unordered_set<int> selection = timeline->getCurrentSelection();
            int cid = -1;
            m_tid = -1;
            int firstPos = -1;
            for (const auto &s : selection) {
                // Find first clip
                if (!timeline->isClip(s)) {
                    continue;
                }
                int pos = timeline->getClipPosition(s);
                if (firstPos == -1 || pos < firstPos) {
                    cid = s;
                    firstPos = pos;
                    m_tid = timeline->getClipTrackId(cid);
                    if (!timeline->isAudioTrack(m_tid)) {
                        // Video clip: fall back to its mirrored audio track.
                        m_tid = timeline->getMirrorAudioTrackId(m_tid);
                    }
                }
            }
            if (m_tid == -1) {
                speech_info->setMessageType(KMessageWidget::Information);
                speech_info->setText(i18n("No audio track available for selected clip"));
                speech_info->animatedShow();
                buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
                return;
            }
            if (timeline->isClip(cid)) {
                // Restrict the analyzed zone to the earliest selected clip.
                m_zone.setX(timeline->getClipPosition(cid));
                m_zone.setY(m_zone.x() + timeline->getClipPlaytime(cid));
            } else {
                speech_info->setMessageType(KMessageWidget::Information);
                speech_info->setText(i18n("Select a clip in timeline to perform analysis"));
                speech_info->animatedShow();
                buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
            }
        } else {
            if (button == timeline_track) {
                // Track mode: analyze the track that was active when the
                // dialog was opened (captured as selectedTrack).
                m_tid = selectedTrack;
                if (timeline->isSubtitleTrack(m_tid)) {
                    m_tid = -1;
                } else if (!timeline->isAudioTrack(m_tid)) {
                    m_tid = timeline->getMirrorAudioTrackId(m_tid);
                }
                if (m_tid == -1) {
                    speech_info->setMessageType(KMessageWidget::Information);
                    speech_info->setText(i18n("No audio track found"));
                    speech_info->animatedShow();
                    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
                }
            } else {
                // Zone mode: no track filter.
                m_tid = -1;
            }
            // Both track and zone modes restore the zone passed at construction.
            m_zone = sourceZone;
        }
    });
    // Persist the engine-specific model choice when the user picks one.
    connect(speech_model, static_cast<void (QComboBox::*)(int)>(&QComboBox::activated), this, [this]() {
        if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) {
            const QString modelName = speech_model->currentData().toString();
            KdenliveSettings::setWhisperModel(modelName);
            speech_language->setEnabled(!modelName.endsWith(QLatin1String(".en")));
        } else {
            KdenliveSettings::setVosk_srt_model(speech_model->currentText());
        }
    });
    connect(buttonBox->button(QDialogButtonBox::Apply), &QPushButton::clicked, this, [this]() { slotProcessSpeech(); });
    frame_progress->setVisible(false);
    // Abort button kills the running recognition process, if any.
    connect(button_abort, &QToolButton::clicked, this, [this]() {
        if (m_speechJob && m_speechJob->state() == QProcess::Running) {
            m_speechJob->kill();
        }
    });
    m_stt->checkDependencies();
}
0172 
0173 SpeechDialog::~SpeechDialog() {}
0174 
0175 void SpeechDialog::updateVoskModels(const QStringList models)
0176 {
0177     speech_model->clear();
0178     speech_model->addItems(models);
0179     if (models.isEmpty()) {
0180         speech_info->addAction(m_voskConfig);
0181         speech_info->setMessageType(KMessageWidget::Information);
0182         speech_info->setText(i18n("Please install speech recognition models"));
0183         speech_info->show();
0184         buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
0185     } else {
0186         if (!KdenliveSettings::vosk_srt_model().isEmpty() && models.contains(KdenliveSettings::vosk_srt_model())) {
0187             int ix = speech_model->findText(KdenliveSettings::vosk_srt_model());
0188             if (ix > -1) {
0189                 speech_model->setCurrentIndex(ix);
0190             }
0191         }
0192         buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
0193     }
0194 }
0195 
// Export the selected zone's audio to a temporary WAV file through an MLT
// avformat consumer, then launch the engine-specific python script that
// writes an SRT file (imported later by slotProcessSpeechStatus).
// NOTE(review): the audio render runs synchronously on the UI thread (see
// the TODO below); the dialog only stays responsive via processEvents().
void SpeechDialog::slotProcessSpeech()
{
    speech_info->setMessageType(KMessageWidget::Information);
    speech_info->setText(i18nc("@label:textbox", "Checking setup…"));
    speech_info->show();
    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
    if (!m_stt->checkSetup() || !m_stt->missingDependencies().isEmpty()) {
        // Missing python/deps: surface the Configure action and bail out.
        speech_info->setMessageType(KMessageWidget::Warning);
        speech_info->setText(i18n("Please configure speech to text."));
        speech_info->animatedShow();
        speech_info->addAction(m_voskConfig);
        return;
    }
    speech_info->removeAction(m_voskConfig);
    speech_info->setMessageType(KMessageWidget::Information);
    speech_info->setText(i18n("Starting audio export"));
    speech_info->show();
    qApp->processEvents();
    // Temp files: an MLT playlist of the timeline, the SRT output, and the
    // rendered WAV. The SRT/WAV files are members so they outlive this call
    // (the QProcess reads/writes them asynchronously).
    // NOTE(review): open() failures leave these paths empty and are not
    // reported — presumably rare, but worth confirming.
    QString sceneList;
    QString speech;
    QString audio;
    QTemporaryFile tmpPlaylist(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.mlt")));
    m_tmpSrt = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.srt")));
    m_tmpAudio = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.wav")));
    if (tmpPlaylist.open()) {
        sceneList = tmpPlaylist.fileName();
    }
    tmpPlaylist.close();
    if (m_tmpSrt->open()) {
        speech = m_tmpSrt->fileName();
    }
    m_tmpSrt->close();
    if (m_tmpAudio->open()) {
        audio = m_tmpAudio->fileName();
    }
    m_tmpAudio->close();
    // Serialize the current timeline to the temporary playlist.
    m_timeline->sceneList(QDir::temp().absolutePath(), sceneList);
    // TODO: do the rendering in another thread to not block the UI

    Mlt::Producer producer(m_timeline->tractor()->get_profile(), "xml", sceneList.toUtf8().constData());
    int tracksCount = m_timeline->tractor()->count();
    // Walk down the producer chain (bounded to 10 hops) looking for the
    // multitrack whose track count matches the timeline's tractor.
    std::shared_ptr<Mlt::Service> s(new Mlt::Service(producer));
    std::shared_ptr<Mlt::Multitrack> multi = nullptr;
    bool multitrackFound = false;
    for (int i = 0; i < 10; i++) {
        s.reset(s->producer());
        if (s == nullptr || !s->is_valid()) {
            break;
        }
        if (s->type() == mlt_service_multitrack_type) {
            multi.reset(new Mlt::Multitrack(*s.get()));
            if (multi->count() == tracksCount) {
                // Match
                multitrackFound = true;
                break;
            }
        }
    }
    if (multitrackFound) {
        // Mute everything except the audio track selected for analysis
        // (hide=3 disables both audio and video of a track).
        int trackPos = -1;
        if (m_tid > -1) {
            trackPos = m_timeline->getTrackMltIndex(m_tid);
        }
        int tid = 0;
        for (int i = 0; i < multi->count(); i++) {
            std::shared_ptr<Mlt::Producer> tk(multi->track(i));
            if (tk->get_int("hide") == 1) {
                // Video track, hide it
                tk->set("hide", 3);
            } else if (tid == 0 || (trackPos > -1 && trackPos != tid)) {
                // We only want a specific audio track
                tk->set("hide", 3);
            }
            tid++;
        }
    }
    Mlt::Consumer xmlConsumer(m_timeline->tractor()->get_profile(), "avformat", audio.toUtf8().constData());
    if (!xmlConsumer.is_valid() || !producer.is_valid()) {
        qDebug() << "=== STARTING CONSUMER ERROR";
        if (!producer.is_valid()) {
            qDebug() << "=== PRODUCER INVALID";
        }
        speech_info->setMessageType(KMessageWidget::Warning);
        speech_info->setText(i18n("Audio export failed"));
        qApp->processEvents();
        return;
    }
    speech_progress->setValue(0);
    m_errorLog.clear();
#if KWIDGETSADDONS_VERSION >= QT_VERSION_CHECK(5, 100, 0)
    speech_info->clearActions();
#else
    speech_info->removeAction(m_logAction);
#endif
    frame_progress->setVisible(true);
    buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false);
    qApp->processEvents();
    xmlConsumer.set("terminate_on_pause", 1);
    xmlConsumer.set("properties", "WAV");
    // Limit rendering to the requested zone, then run the consumer to
    // completion (blocking; see TODO above).
    producer.set_in_and_out(m_zone.x(), m_zone.y());
    xmlConsumer.connect(producer);

    qDebug() << "=== STARTING RENDER C, IN:" << m_zone.x() << " - " << m_zone.y();
    m_duration = m_zone.y() - m_zone.x();
    qApp->processEvents();
    xmlConsumer.run();
    qApp->processEvents();
    qDebug() << "=== STARTING RENDER D";
    speech_info->setMessageType(KMessageWidget::Information);
    speech_info->setText(i18n("Starting speech recognition"));
    qApp->processEvents();
    QString modelDirectory = m_stt->voskModelPath();
    // Launch the recognition script; completion is handled asynchronously
    // in slotProcessSpeechStatus with the SRT path captured here.
    m_speechJob = std::make_unique<QProcess>(this);
    connect(m_speechJob.get(), static_cast<void (QProcess::*)(int, QProcess::ExitStatus)>(&QProcess::finished), this,
            [this, speech](int, QProcess::ExitStatus status) { slotProcessSpeechStatus(status, speech); });
    if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) {
        // Whisper
        QString modelName = speech_model->currentData().toString();
        // Merge stdout/stderr so progress and errors arrive on one channel.
        m_speechJob->setProcessChannelMode(QProcess::MergedChannels);
        connect(m_speechJob.get(), &QProcess::readyReadStandardOutput, this, &SpeechDialog::slotProcessWhisperProgress);
        QString language = speech_language->isEnabled() && !speech_language->currentData().isNull()
                               ? QString("language=%1").arg(speech_language->currentData().toString())
                               : QString();
        qDebug() << "==== ANALYSIS SPEECH: " << m_stt->subtitleScript() << " " << audio << " " << modelName << " " << speech << " "
                 << KdenliveSettings::whisperDevice() << " " << (translate_box->isChecked() ? QStringLiteral("translate") : QStringLiteral("transcribe")) << " "
                 << language;
        if (KdenliveSettings::whisperDisableFP16()) {
            language.append(QStringLiteral(" fp16=False"));
        }
        m_speechJob->start(m_stt->pythonExec(), {m_stt->subtitleScript(), audio, modelName, speech, KdenliveSettings::whisperDevice(),
                                                 translate_box->isChecked() ? QStringLiteral("translate") : QStringLiteral("transcribe"), language});
    } else {
        // Vosk
        QString modelName = speech_model->currentText();
        connect(m_speechJob.get(), &QProcess::readyReadStandardOutput, this, &SpeechDialog::slotProcessProgress);
        m_speechJob->start(m_stt->pythonExec(), {m_stt->subtitleScript(), modelDirectory, modelName, audio, speech});
    }
}
0334 
0335 void SpeechDialog::slotProcessSpeechStatus(QProcess::ExitStatus status, const QString &srtFile)
0336 {
0337     if (!m_errorLog.isEmpty()) {
0338         speech_info->addAction(m_logAction);
0339     }
0340     if (status == QProcess::CrashExit) {
0341         speech_info->setMessageType(KMessageWidget::Warning);
0342         speech_info->setText(i18n("Speech recognition aborted."));
0343         speech_info->animatedShow();
0344     } else {
0345         if (QFile::exists(srtFile)) {
0346             m_timeline->getSubtitleModel()->importSubtitle(srtFile, m_zone.x(), true);
0347             speech_info->setMessageType(KMessageWidget::Positive);
0348             speech_info->setText(i18n("Subtitles imported"));
0349         } else {
0350             speech_info->setMessageType(KMessageWidget::Warning);
0351             speech_info->setText(i18n("Speech recognition failed"));
0352         }
0353     }
0354     buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true);
0355     frame_progress->setVisible(false);
0356 }
0357 
0358 void SpeechDialog::slotProcessProgress()
0359 {
0360     QString saveData = QString::fromUtf8(m_speechJob->readAll());
0361     if (saveData.startsWith(QStringLiteral("progress:"))) {
0362         double prog = saveData.section(QLatin1Char(':'), 1).toInt() * 3.12;
0363         speech_progress->setValue(static_cast<int>(100 * prog / m_duration));
0364     }
0365 }
0366 
0367 void SpeechDialog::slotProcessWhisperProgress()
0368 {
0369     QString saveData = QString::fromUtf8(m_speechJob->readAll());
0370     if (saveData.contains(QStringLiteral("%|"))) {
0371         int prog = saveData.section(QLatin1Char('%'), 0, 0).toInt();
0372         qDebug() << "=== GOT DATA:\n" << saveData << " = " << prog;
0373         speech_progress->setValue(prog);
0374     } else {
0375         m_errorLog.append(saveData);
0376     }
0377 }