File indexing completed on 2024-04-14 04:46:23
0001 /* 0002 SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org> 0003 0004 SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL 0005 */ 0006 0007 #include "speechdialog.h" 0008 0009 #include "bin/model/subtitlemodel.hpp" 0010 #include "core.h" 0011 #include "kdenlive_debug.h" 0012 #include "kdenlivesettings.h" 0013 #include "mainwindow.h" 0014 #include "monitor/monitor.h" 0015 0016 #include "mlt++/MltConsumer.h" 0017 #include "mlt++/MltProfile.h" 0018 #include "mlt++/MltTractor.h" 0019 0020 #include <KLocalizedString> 0021 #include <KMessageBox> 0022 #include <KMessageWidget> 0023 #include <QButtonGroup> 0024 #include <QDir> 0025 #include <QFontDatabase> 0026 #include <QProcess> 0027 #include <kwidgetsaddons_version.h> 0028 0029 #include <memory> 0030 #include <utility> 0031 0032 SpeechDialog::SpeechDialog(std::shared_ptr<TimelineItemModel> timeline, QPoint zone, int tid, bool, bool, QWidget *parent) 0033 : QDialog(parent) 0034 , m_timeline(timeline) 0035 , m_zone(zone) 0036 , m_tid(-1) 0037 0038 { 0039 setFont(QFontDatabase::systemFont(QFontDatabase::SmallestReadableFont)); 0040 setupUi(this); 0041 speech_info->hide(); 0042 setWindowTitle(i18n("Automatic Subtitling")); 0043 m_voskConfig = new QAction(i18n("Configure"), this); 0044 connect(m_voskConfig, &QAction::triggered, [this]() { 0045 pCore->window()->slotShowPreferencePage(Kdenlive::PageSpeech); 0046 close(); 0047 }); 0048 m_logAction = new QAction(i18n("Show log"), this); 0049 connect(m_logAction, &QAction::triggered, [&]() { KMessageBox::detailedError(QApplication::activeWindow(), i18n("Speech Recognition log"), m_errorLog); }); 0050 0051 if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) { 0052 // Whisper model 0053 m_stt = new SpeechToText(SpeechToText::EngineType::EngineWhisper); 0054 QList<std::pair<QString, QString>> whisperModels = m_stt->whisperModels(); 0055 for (auto &w : whisperModels) { 0056 speech_model->addItem(w.first, w.second); 0057 } 0058 int ix = speech_model->findData(KdenliveSettings::whisperModel()); 0059 if (ix > -1) { 0060 speech_model->setCurrentIndex(ix); 0061 } 0062 if (speech_language->count() == 0) { 0063 // Fill whisper languages 0064 QMap<QString, QString> languages = m_stt->whisperLanguages(); 0065 QMapIterator<QString, QString> j(languages); 0066 while (j.hasNext()) { 0067 j.next(); 0068 speech_language->addItem(j.key(), j.value()); 0069 } 0070 int ix = speech_language->findData(KdenliveSettings::whisperLanguage()); 0071 if (ix > -1) { 0072 speech_language->setCurrentIndex(ix); 0073 } 0074 } 0075 speech_language->setEnabled(!KdenliveSettings::whisperModel().endsWith(QLatin1String(".en"))); 0076 translate_box->setChecked(KdenliveSettings::whisperTranslate()); 0077 0078 } else { 0079 // Vosk model 0080 whisper_settings->setVisible(false); 0081 m_stt = new SpeechToText(SpeechToText::EngineType::EngineVosk); 0082 connect(pCore.get(), &Core::voskModelUpdate, this, &SpeechDialog::updateVoskModels); 0083 m_stt->parseVoskDictionaries(); 0084 } 0085 buttonBox->button(QDialogButtonBox::Apply)->setText(i18n("Process")); 0086 0087 QButtonGroup *buttonGroup = new QButtonGroup(this); 0088 buttonGroup->addButton(timeline_full); 0089 buttonGroup->addButton(timeline_zone); 0090 buttonGroup->addButton(timeline_track); 0091 buttonGroup->addButton(timeline_clips); 0092 connect(buttonGroup, QOverload<QAbstractButton *>::of(&QButtonGroup::buttonClicked), [=, selectedTrack = tid, sourceZone = zone](QAbstractButton *button) { 0093 speech_info->animatedHide(); 0094 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true); 0095 if (button == timeline_full) { 0096 m_tid = -1; 0097 m_zone = QPoint(0, pCore->projectDuration() - 1); 0098 } else if (button == timeline_clips) { 0099 std::unordered_set<int> selection = timeline->getCurrentSelection(); 0100 int cid = -1; 0101 m_tid = -1; 0102 int firstPos = -1; 0103 for (const auto &s : selection) { 0104 // Find first clip 0105 if (!timeline->isClip(s)) { 0106 continue; 0107 } 0108 int pos = timeline->getClipPosition(s); 0109 if (firstPos == -1 || pos < firstPos) { 0110 cid = s; 0111 firstPos = pos; 0112 m_tid = timeline->getClipTrackId(cid); 0113 if (!timeline->isAudioTrack(m_tid)) { 0114 m_tid = timeline->getMirrorAudioTrackId(m_tid); 0115 } 0116 } 0117 } 0118 if (m_tid == -1) { 0119 speech_info->setMessageType(KMessageWidget::Information); 0120 speech_info->setText(i18n("No audio track available for selected clip")); 0121 speech_info->animatedShow(); 0122 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0123 return; 0124 } 0125 if (timeline->isClip(cid)) { 0126 m_zone.setX(timeline->getClipPosition(cid)); 0127 m_zone.setY(m_zone.x() + timeline->getClipPlaytime(cid)); 0128 } else { 0129 speech_info->setMessageType(KMessageWidget::Information); 0130 speech_info->setText(i18n("Select a clip in timeline to perform analysis")); 0131 speech_info->animatedShow(); 0132 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0133 } 0134 } else { 0135 if (button == timeline_track) { 0136 m_tid = selectedTrack; 0137 if (timeline->isSubtitleTrack(m_tid)) { 0138 m_tid = -1; 0139 } else if (!timeline->isAudioTrack(m_tid)) { 0140 m_tid = timeline->getMirrorAudioTrackId(m_tid); 0141 } 0142 if (m_tid == -1) { 0143 speech_info->setMessageType(KMessageWidget::Information); 0144 speech_info->setText(i18n("No audio track found")); 0145 speech_info->animatedShow(); 0146 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0147 } 0148 } else { 0149 m_tid = -1; 0150 } 0151 m_zone = sourceZone; 0152 } 0153 }); 0154 connect(speech_model, static_cast<void (QComboBox::*)(int)>(&QComboBox::activated), this, [this]() { 0155 if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) { 0156 const QString modelName = speech_model->currentData().toString(); 0157 KdenliveSettings::setWhisperModel(modelName); 0158 speech_language->setEnabled(!modelName.endsWith(QLatin1String(".en"))); 0159 } else { 0160 KdenliveSettings::setVosk_srt_model(speech_model->currentText()); 0161 } 0162 }); 0163 connect(buttonBox->button(QDialogButtonBox::Apply), &QPushButton::clicked, this, [this]() { slotProcessSpeech(); }); 0164 frame_progress->setVisible(false); 0165 connect(button_abort, &QToolButton::clicked, this, [this]() { 0166 if (m_speechJob && m_speechJob->state() == QProcess::Running) { 0167 m_speechJob->kill(); 0168 } 0169 }); 0170 m_stt->checkDependencies(); 0171 } 0172 0173 SpeechDialog::~SpeechDialog() {} 0174 0175 void SpeechDialog::updateVoskModels(const QStringList models) 0176 { 0177 speech_model->clear(); 0178 speech_model->addItems(models); 0179 if (models.isEmpty()) { 0180 speech_info->addAction(m_voskConfig); 0181 speech_info->setMessageType(KMessageWidget::Information); 0182 speech_info->setText(i18n("Please install speech recognition models")); 0183 speech_info->show(); 0184 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0185 } else { 0186 if (!KdenliveSettings::vosk_srt_model().isEmpty() && models.contains(KdenliveSettings::vosk_srt_model())) { 0187 int ix = speech_model->findText(KdenliveSettings::vosk_srt_model()); 0188 if (ix > -1) { 0189 speech_model->setCurrentIndex(ix); 0190 } 0191 } 0192 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true); 0193 } 0194 } 0195 0196 void SpeechDialog::slotProcessSpeech() 0197 { 0198 speech_info->setMessageType(KMessageWidget::Information); 0199 speech_info->setText(i18nc("@label:textbox", "Checking setup…")); 0200 speech_info->show(); 0201 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0202 if (!m_stt->checkSetup() || !m_stt->missingDependencies().isEmpty()) { 0203 speech_info->setMessageType(KMessageWidget::Warning); 0204 speech_info->setText(i18n("Please configure speech to text.")); 0205 speech_info->animatedShow(); 0206 speech_info->addAction(m_voskConfig); 0207 return; 0208 } 0209 speech_info->removeAction(m_voskConfig); 0210 speech_info->setMessageType(KMessageWidget::Information); 0211 speech_info->setText(i18n("Starting audio export")); 0212 speech_info->show(); 0213 qApp->processEvents(); 0214 QString sceneList; 0215 QString speech; 0216 QString audio; 0217 QTemporaryFile tmpPlaylist(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.mlt"))); 0218 m_tmpSrt = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.srt"))); 0219 m_tmpAudio = std::make_unique<QTemporaryFile>(QDir::temp().absoluteFilePath(QStringLiteral("XXXXXX.wav"))); 0220 if (tmpPlaylist.open()) { 0221 sceneList = tmpPlaylist.fileName(); 0222 } 0223 tmpPlaylist.close(); 0224 if (m_tmpSrt->open()) { 0225 speech = m_tmpSrt->fileName(); 0226 } 0227 m_tmpSrt->close(); 0228 if (m_tmpAudio->open()) { 0229 audio = m_tmpAudio->fileName(); 0230 } 0231 m_tmpAudio->close(); 0232 m_timeline->sceneList(QDir::temp().absolutePath(), sceneList); 0233 // TODO: do the rendering in another thread to not block the UI 0234 0235 Mlt::Producer producer(m_timeline->tractor()->get_profile(), "xml", sceneList.toUtf8().constData()); 0236 int tracksCount = m_timeline->tractor()->count(); 0237 std::shared_ptr<Mlt::Service> s(new Mlt::Service(producer)); 0238 std::shared_ptr<Mlt::Multitrack> multi = nullptr; 0239 bool multitrackFound = false; 0240 for (int i = 0; i < 10; i++) { 0241 s.reset(s->producer()); 0242 if (s == nullptr || !s->is_valid()) { 0243 break; 0244 } 0245 if (s->type() == mlt_service_multitrack_type) { 0246 multi.reset(new Mlt::Multitrack(*s.get())); 0247 if (multi->count() == tracksCount) { 0248 // Match 0249 multitrackFound = true; 0250 break; 0251 } 0252 } 0253 } 0254 if (multitrackFound) { 0255 int trackPos = -1; 0256 if (m_tid > -1) { 0257 trackPos = m_timeline->getTrackMltIndex(m_tid); 0258 } 0259 int tid = 0; 0260 for (int i = 0; i < multi->count(); i++) { 0261 std::shared_ptr<Mlt::Producer> tk(multi->track(i)); 0262 if (tk->get_int("hide") == 1) { 0263 // Video track, hide it 0264 tk->set("hide", 3); 0265 } else if (tid == 0 || (trackPos > -1 && trackPos != tid)) { 0266 // We only want a specific audio track 0267 tk->set("hide", 3); 0268 } 0269 tid++; 0270 } 0271 } 0272 Mlt::Consumer xmlConsumer(m_timeline->tractor()->get_profile(), "avformat", audio.toUtf8().constData()); 0273 if (!xmlConsumer.is_valid() || !producer.is_valid()) { 0274 qDebug() << "=== STARTING CONSUMER ERROR"; 0275 if (!producer.is_valid()) { 0276 qDebug() << "=== PRODUCER INVALID"; 0277 } 0278 speech_info->setMessageType(KMessageWidget::Warning); 0279 speech_info->setText(i18n("Audio export failed")); 0280 qApp->processEvents(); 0281 return; 0282 } 0283 speech_progress->setValue(0); 0284 m_errorLog.clear(); 0285 #if KWIDGETSADDONS_VERSION >= QT_VERSION_CHECK(5, 100, 0) 0286 speech_info->clearActions(); 0287 #else 0288 speech_info->removeAction(m_logAction); 0289 #endif 0290 frame_progress->setVisible(true); 0291 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(false); 0292 qApp->processEvents(); 0293 xmlConsumer.set("terminate_on_pause", 1); 0294 xmlConsumer.set("properties", "WAV"); 0295 producer.set_in_and_out(m_zone.x(), m_zone.y()); 0296 xmlConsumer.connect(producer); 0297 0298 qDebug() << "=== STARTING RENDER C, IN:" << m_zone.x() << " - " << m_zone.y(); 0299 m_duration = m_zone.y() - m_zone.x(); 0300 qApp->processEvents(); 0301 xmlConsumer.run(); 0302 qApp->processEvents(); 0303 qDebug() << "=== STARTING RENDER D"; 0304 speech_info->setMessageType(KMessageWidget::Information); 0305 speech_info->setText(i18n("Starting speech recognition")); 0306 qApp->processEvents(); 0307 QString modelDirectory = m_stt->voskModelPath(); 0308 m_speechJob = std::make_unique<QProcess>(this); 0309 connect(m_speechJob.get(), static_cast<void (QProcess::*)(int, QProcess::ExitStatus)>(&QProcess::finished), this, 0310 [this, speech](int, QProcess::ExitStatus status) { slotProcessSpeechStatus(status, speech); }); 0311 if (KdenliveSettings::speechEngine() == QLatin1String("whisper")) { 0312 // Whisper 0313 QString modelName = speech_model->currentData().toString(); 0314 m_speechJob->setProcessChannelMode(QProcess::MergedChannels); 0315 connect(m_speechJob.get(), &QProcess::readyReadStandardOutput, this, &SpeechDialog::slotProcessWhisperProgress); 0316 QString language = speech_language->isEnabled() && !speech_language->currentData().isNull() 0317 ? QString("language=%1").arg(speech_language->currentData().toString()) 0318 : QString(); 0319 qDebug() << "==== ANALYSIS SPEECH: " << m_stt->subtitleScript() << " " << audio << " " << modelName << " " << speech << " " 0320 << KdenliveSettings::whisperDevice() << " " << (translate_box->isChecked() ? QStringLiteral("translate") : QStringLiteral("transcribe")) << " " 0321 << language; 0322 if (KdenliveSettings::whisperDisableFP16()) { 0323 language.append(QStringLiteral(" fp16=False")); 0324 } 0325 m_speechJob->start(m_stt->pythonExec(), {m_stt->subtitleScript(), audio, modelName, speech, KdenliveSettings::whisperDevice(), 0326 translate_box->isChecked() ? QStringLiteral("translate") : QStringLiteral("transcribe"), language}); 0327 } else { 0328 // Vosk 0329 QString modelName = speech_model->currentText(); 0330 connect(m_speechJob.get(), &QProcess::readyReadStandardOutput, this, &SpeechDialog::slotProcessProgress); 0331 m_speechJob->start(m_stt->pythonExec(), {m_stt->subtitleScript(), modelDirectory, modelName, audio, speech}); 0332 } 0333 } 0334 0335 void SpeechDialog::slotProcessSpeechStatus(QProcess::ExitStatus status, const QString &srtFile) 0336 { 0337 if (!m_errorLog.isEmpty()) { 0338 speech_info->addAction(m_logAction); 0339 } 0340 if (status == QProcess::CrashExit) { 0341 speech_info->setMessageType(KMessageWidget::Warning); 0342 speech_info->setText(i18n("Speech recognition aborted.")); 0343 speech_info->animatedShow(); 0344 } else { 0345 if (QFile::exists(srtFile)) { 0346 m_timeline->getSubtitleModel()->importSubtitle(srtFile, m_zone.x(), true); 0347 speech_info->setMessageType(KMessageWidget::Positive); 0348 speech_info->setText(i18n("Subtitles imported")); 0349 } else { 0350 speech_info->setMessageType(KMessageWidget::Warning); 0351 speech_info->setText(i18n("Speech recognition failed")); 0352 } 0353 } 0354 buttonBox->button(QDialogButtonBox::Apply)->setEnabled(true); 0355 frame_progress->setVisible(false); 0356 } 0357 0358 void SpeechDialog::slotProcessProgress() 0359 { 0360 QString saveData = QString::fromUtf8(m_speechJob->readAll()); 0361 if (saveData.startsWith(QStringLiteral("progress:"))) { 0362 double prog = saveData.section(QLatin1Char(':'), 1).toInt() * 3.12; 0363 speech_progress->setValue(static_cast<int>(100 * prog / m_duration)); 0364 } 0365 } 0366 0367 void SpeechDialog::slotProcessWhisperProgress() 0368 { 0369 QString saveData = QString::fromUtf8(m_speechJob->readAll()); 0370 if (saveData.contains(QStringLiteral("%|"))) { 0371 int prog = saveData.section(QLatin1Char('%'), 0, 0).toInt(); 0372 qDebug() << "=== GOT DATA:\n" << saveData << " = " << prog; 0373 speech_progress->setValue(prog); 0374 } else { 0375 m_errorLog.append(saveData); 0376 } 0377 }