File indexing completed on 2024-04-28 08:44:23
0001 /* 0002 SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org> 0003 SPDX-FileCopyrightText: 2022 Julius Künzel <jk.kdedev@smartlab.uber.space> 0004 0005 SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL 0006 */ 0007 0008 #include "speechtotext.h" 0009 #include "core.h" 0010 #include "kdenlivesettings.h" 0011 0012 #include <KLocalizedString> 0013 #include <QDebug> 0014 #include <QDir> 0015 #include <QStandardPaths> 0016 0017 SpeechToText::SpeechToText(EngineType engineType, QObject *parent) 0018 : AbstractPythonInterface(parent) 0019 , m_engineType(engineType) 0020 { 0021 if (engineType == EngineType::EngineVosk) { 0022 addDependency(QStringLiteral("vosk"), i18n("speech features")); 0023 addDependency(QStringLiteral("srt"), i18n("automated subtitling")); 0024 addScript(QStringLiteral("speech.py")); 0025 addScript(QStringLiteral("speechtotext.py")); 0026 } else if (engineType == EngineType::EngineWhisper) { 0027 addDependency(QStringLiteral("openai-whisper"), i18n("speech features")); 0028 addDependency(QStringLiteral("srt"), i18n("automated subtitling")); 0029 addDependency(QStringLiteral("torch"), i18n("machine learning framework")); 0030 addScript(QStringLiteral("whispertotext.py")); 0031 addScript(QStringLiteral("whispertosrt.py")); 0032 } 0033 } 0034 0035 QString SpeechToText::featureName() 0036 { 0037 return i18n("Speech to text"); 0038 } 0039 0040 QString SpeechToText::voskModelPath() 0041 { 0042 QString modelDirectory = KdenliveSettings::vosk_folder_path(); 0043 if (modelDirectory.isEmpty()) { 0044 modelDirectory = QStandardPaths::locate(QStandardPaths::AppDataLocation, QStringLiteral("speechmodels"), QStandardPaths::LocateDirectory); 0045 } 0046 return modelDirectory; 0047 } 0048 0049 QList<std::pair<QString, QString>> SpeechToText::whisperModels() 0050 { 0051 QList<std::pair<QString, QString>> models; 0052 models.append({i18n("Tiny"), QStringLiteral("tiny")}); 0053 models.append({i18n("Base"), QStringLiteral("base")}); 0054 models.append({i18n("Small"), QStringLiteral("small")}); 0055 models.append({i18n("Medium"), QStringLiteral("medium")}); 0056 models.append({i18n("Large"), QStringLiteral("large")}); 0057 models.append({i18n("Tiny - English only"), QStringLiteral("tiny.en")}); 0058 models.append({i18n("Base - English only"), QStringLiteral("base.en")}); 0059 models.append({i18n("Small - English only"), QStringLiteral("small.en")}); 0060 models.append({i18n("Medium - English only"), QStringLiteral("medium.en")}); 0061 return models; 0062 } 0063 0064 QMap<QString, QString> SpeechToText::whisperLanguages() 0065 { 0066 QMap<QString, QString> models; 0067 models.insert(i18n("Audodetect"), QString()); 0068 models.insert(i18n("Afrikaans"), QStringLiteral("Afrikaans")); 0069 models.insert(i18n("Albanian"), QStringLiteral("Albanian")); 0070 models.insert(i18n("Amharic"), QStringLiteral("Amharic")); 0071 models.insert(i18n("Arabic"), QStringLiteral("Arabic")); 0072 models.insert(i18n("Armenian"), QStringLiteral("Armenian")); 0073 models.insert(i18n("Assamese"), QStringLiteral("Assamese")); 0074 models.insert(i18n("Azerbaijani"), QStringLiteral("Azerbaijani")); 0075 models.insert(i18n("Bashkir"), QStringLiteral("Bashkir")); 0076 models.insert(i18n("Basque"), QStringLiteral("Basque")); 0077 models.insert(i18n("Belarusian"), QStringLiteral("Belarusian")); 0078 models.insert(i18n("Bengali"), QStringLiteral("Bengali")); 0079 models.insert(i18n("Bosnian"), QStringLiteral("Bosnian")); 0080 models.insert(i18n("Breton"), QStringLiteral("Breton")); 0081 models.insert(i18n("Bulgarian"), QStringLiteral("Bulgarian")); 0082 models.insert(i18n("Burmese"), QStringLiteral("Burmese")); 0083 models.insert(i18n("Castilian"), QStringLiteral("Castilian")); 0084 models.insert(i18n("Catalan"), QStringLiteral("Catalan")); 0085 models.insert(i18n("Chinese"), QStringLiteral("Chinese")); 0086 models.insert(i18n("Croatian"), QStringLiteral("Croatian")); 0087 models.insert(i18n("Czech"), QStringLiteral("Czech")); 0088 models.insert(i18n("Danish"), QStringLiteral("Danish")); 0089 models.insert(i18n("Dutch"), QStringLiteral("Dutch")); 0090 models.insert(i18n("English"), QStringLiteral("English")); 0091 models.insert(i18n("Estonian"), QStringLiteral("Estonian")); 0092 models.insert(i18n("Faroese"), QStringLiteral("Faroese")); 0093 models.insert(i18n("Finnish"), QStringLiteral("Finnish")); 0094 models.insert(i18n("Flemish"), QStringLiteral("Flemish")); 0095 models.insert(i18n("French"), QStringLiteral("French")); 0096 models.insert(i18n("Galician"), QStringLiteral("Galician")); 0097 models.insert(i18n("Georgian"), QStringLiteral("Georgian")); 0098 models.insert(i18n("German"), QStringLiteral("German")); 0099 models.insert(i18n("Greek"), QStringLiteral("Greek")); 0100 models.insert(i18n("Gujarati"), QStringLiteral("Gujarati")); 0101 models.insert(i18n("Haitian"), QStringLiteral("Haitian")); 0102 models.insert(i18n("Haitian Creole"), QStringLiteral("Haitian Creole")); 0103 models.insert(i18n("Hausa"), QStringLiteral("Hausa")); 0104 models.insert(i18n("Hawaiian"), QStringLiteral("Hawaiian")); 0105 models.insert(i18n("Hebrew"), QStringLiteral("Hebrew")); 0106 models.insert(i18n("Hindi"), QStringLiteral("Hindi")); 0107 models.insert(i18n("Hungarian"), QStringLiteral("Hungarian")); 0108 models.insert(i18n("Icelandic"), QStringLiteral("Icelandic")); 0109 models.insert(i18n("Indonesian"), QStringLiteral("Indonesian")); 0110 models.insert(i18n("Italian"), QStringLiteral("Italian")); 0111 models.insert(i18n("Japanese"), QStringLiteral("Japanese")); 0112 models.insert(i18n("Javanese"), QStringLiteral("Javanese")); 0113 models.insert(i18n("Kannada"), QStringLiteral("Kannada")); 0114 models.insert(i18n("Kazakh"), QStringLiteral("Kazakh")); 0115 models.insert(i18n("Khmer"), QStringLiteral("Khmer")); 0116 models.insert(i18n("Korean"), QStringLiteral("Korean")); 0117 models.insert(i18n("Lao"), QStringLiteral("Lao")); 0118 models.insert(i18n("Latin"), QStringLiteral("Latin")); 0119 models.insert(i18n("Latvian"), QStringLiteral("Latvian")); 0120 models.insert(i18n("Letzeburgesch"), QStringLiteral("Letzeburgesch")); 0121 models.insert(i18n("Lingala"), QStringLiteral("Lingala")); 0122 models.insert(i18n("Lithuanian"), QStringLiteral("Lithuanian")); 0123 models.insert(i18n("Luxembourgish"), QStringLiteral("Luxembourgish")); 0124 models.insert(i18n("Macedonian"), QStringLiteral("Macedonian")); 0125 models.insert(i18n("Malagasy"), QStringLiteral("Malagasy")); 0126 models.insert(i18n("Malay"), QStringLiteral("Malay")); 0127 models.insert(i18n("Malayalam"), QStringLiteral("Malayalam")); 0128 models.insert(i18n("Maltese"), QStringLiteral("Maltese")); 0129 models.insert(i18n("Maori"), QStringLiteral("Maori")); 0130 models.insert(i18n("Marathi"), QStringLiteral("Marathi")); 0131 models.insert(i18n("Moldavian"), QStringLiteral("Moldavian")); 0132 models.insert(i18n("Moldovan"), QStringLiteral("Moldovan")); 0133 models.insert(i18n("Mongolian"), QStringLiteral("Mongolian")); 0134 models.insert(i18n("Myanmar"), QStringLiteral("Myanmar")); 0135 models.insert(i18n("Nepali"), QStringLiteral("Nepali")); 0136 models.insert(i18n("Norwegian"), QStringLiteral("Norwegian")); 0137 models.insert(i18n("Nynorsk"), QStringLiteral("Nynorsk")); 0138 models.insert(i18n("Occitan"), QStringLiteral("Occitan")); 0139 models.insert(i18n("Panjabi"), QStringLiteral("Panjabi")); 0140 models.insert(i18n("Pashto"), QStringLiteral("Pashto")); 0141 models.insert(i18n("Persian"), QStringLiteral("Persian")); 0142 models.insert(i18n("Polish"), QStringLiteral("Polish")); 0143 models.insert(i18n("Portuguese"), QStringLiteral("Portuguese")); 0144 models.insert(i18n("Punjabi"), QStringLiteral("Punjabi")); 0145 models.insert(i18n("Pushto"), QStringLiteral("Pushto")); 0146 models.insert(i18n("Romanian"), QStringLiteral("Romanian")); 0147 models.insert(i18n("Russian"), QStringLiteral("Russian")); 0148 models.insert(i18n("Sanskrit"), QStringLiteral("Sanskrit")); 0149 models.insert(i18n("Serbian"), QStringLiteral("Serbian")); 0150 models.insert(i18n("Shona"), QStringLiteral("Shona")); 0151 models.insert(i18n("Sindhi"), QStringLiteral("Sindhi")); 0152 models.insert(i18n("Sinhala"), QStringLiteral("Sinhala")); 0153 models.insert(i18n("Sinhalese"), QStringLiteral("Sinhalese")); 0154 models.insert(i18n("Slovak"), QStringLiteral("Slovak")); 0155 models.insert(i18n("Slovenian"), QStringLiteral("Slovenian")); 0156 models.insert(i18n("Somali"), QStringLiteral("Somali")); 0157 models.insert(i18n("Spanish"), QStringLiteral("Spanish")); 0158 models.insert(i18n("Sundanese"), QStringLiteral("Sundanese")); 0159 models.insert(i18n("Swahili"), QStringLiteral("Swahili")); 0160 models.insert(i18n("Swedish"), QStringLiteral("Swedish")); 0161 models.insert(i18n("Tagalog"), QStringLiteral("Tagalog")); 0162 models.insert(i18n("Tajik"), QStringLiteral("Tajik")); 0163 models.insert(i18n("Tamil"), QStringLiteral("Tamil")); 0164 models.insert(i18n("Tatar"), QStringLiteral("Tatar")); 0165 models.insert(i18n("Telugu"), QStringLiteral("Telugu")); 0166 models.insert(i18n("Thai"), QStringLiteral("Thai")); 0167 models.insert(i18n("Tibetan"), QStringLiteral("Tibetan")); 0168 models.insert(i18n("Turkish"), QStringLiteral("Turkish")); 0169 models.insert(i18n("Turkmen"), QStringLiteral("Turkmen")); 0170 models.insert(i18n("Ukrainian"), QStringLiteral("Ukrainian")); 0171 models.insert(i18n("Urdu"), QStringLiteral("Urdu")); 0172 models.insert(i18n("Uzbek"), QStringLiteral("Uzbek")); 0173 models.insert(i18n("Valencian"), QStringLiteral("Valencian")); 0174 models.insert(i18n("Vietnamese"), QStringLiteral("Vietnamese")); 0175 models.insert(i18n("Welsh"), QStringLiteral("Welsh")); 0176 models.insert(i18n("Yiddish"), QStringLiteral("Yiddish")); 0177 models.insert(i18n("Yoruba"), QStringLiteral("Yoruba")); 0178 return models; 0179 } 0180 0181 QStringList SpeechToText::parseVoskDictionaries() 0182 { 0183 QString modelDirectory = voskModelPath(); 0184 if (modelDirectory.isEmpty()) { 0185 qDebug() << "=== /// CANNOT ACCESS SPEECH DICTIONARIES FOLDER"; 0186 Q_EMIT pCore->voskModelUpdate({}); 0187 return {}; 0188 } 0189 QDir dir = QDir(modelDirectory); 0190 QStringList dicts = dir.entryList(QDir::Dirs | QDir::NoDotAndDotDot); 0191 QStringList final; 0192 for (auto &d : dicts) { 0193 QDir sub(dir.absoluteFilePath(d)); 0194 if (sub.exists(QStringLiteral("mfcc.conf")) || (sub.exists(QStringLiteral("conf/mfcc.conf")))) { 0195 final << d; 0196 } 0197 } 0198 Q_EMIT pCore->voskModelUpdate(final); 0199 return final; 0200 } 0201 0202 QString SpeechToText::subtitleScript() 0203 { 0204 if (m_engineType == EngineType::EngineWhisper) { 0205 return m_scripts->value(QStringLiteral("whispertosrt.py")); 0206 } 0207 return m_scripts->value(QStringLiteral("speech.py")); 0208 } 0209 0210 QString SpeechToText::speechScript() 0211 { 0212 if (m_engineType == EngineType::EngineWhisper) { 0213 return m_scripts->value(QStringLiteral("whispertotext.py")); 0214 } 0215 return m_scripts->value(QStringLiteral("speechtotext.py")); 0216 }