File indexing completed on 2024-04-14 04:46:58

0001 /*
0002     SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org>
0003     SPDX-FileCopyrightText: 2022 Julius Künzel <jk.kdedev@smartlab.uber.space>
0004 
0005     SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0006 */
0007 
0008 #include "speechtotext.h"
0009 #include "core.h"
0010 #include "kdenlivesettings.h"
0011 
0012 #include <KLocalizedString>
0013 #include <QDebug>
0014 #include <QDir>
0015 #include <QStandardPaths>
0016 
0017 SpeechToText::SpeechToText(EngineType engineType, QObject *parent)
0018     : AbstractPythonInterface(parent)
0019     , m_engineType(engineType)
0020 {
0021     if (engineType == EngineType::EngineVosk) {
0022         addDependency(QStringLiteral("vosk"), i18n("speech features"));
0023         addDependency(QStringLiteral("srt"), i18n("automated subtitling"));
0024         addScript(QStringLiteral("speech.py"));
0025         addScript(QStringLiteral("speechtotext.py"));
0026     } else if (engineType == EngineType::EngineWhisper) {
0027         addDependency(QStringLiteral("openai-whisper"), i18n("speech features"));
0028         addDependency(QStringLiteral("srt"), i18n("automated subtitling"));
0029         addDependency(QStringLiteral("torch"), i18n("machine learning framework"));
0030         addScript(QStringLiteral("whispertotext.py"));
0031         addScript(QStringLiteral("whispertosrt.py"));
0032     }
0033 }
0034 
0035 QString SpeechToText::featureName()
0036 {
0037     return i18n("Speech to text");
0038 }
0039 
0040 QString SpeechToText::voskModelPath()
0041 {
0042     QString modelDirectory = KdenliveSettings::vosk_folder_path();
0043     if (modelDirectory.isEmpty()) {
0044         modelDirectory = QStandardPaths::locate(QStandardPaths::AppDataLocation, QStringLiteral("speechmodels"), QStandardPaths::LocateDirectory);
0045     }
0046     return modelDirectory;
0047 }
0048 
0049 QList<std::pair<QString, QString>> SpeechToText::whisperModels()
0050 {
0051     QList<std::pair<QString, QString>> models;
0052     models.append({i18n("Tiny"), QStringLiteral("tiny")});
0053     models.append({i18n("Base"), QStringLiteral("base")});
0054     models.append({i18n("Small"), QStringLiteral("small")});
0055     models.append({i18n("Medium"), QStringLiteral("medium")});
0056     models.append({i18n("Large"), QStringLiteral("large")});
0057     models.append({i18n("Tiny - English only"), QStringLiteral("tiny.en")});
0058     models.append({i18n("Base - English only"), QStringLiteral("base.en")});
0059     models.append({i18n("Small - English only"), QStringLiteral("small.en")});
0060     models.append({i18n("Medium - English only"), QStringLiteral("medium.en")});
0061     return models;
0062 }
0063 
0064 QMap<QString, QString> SpeechToText::whisperLanguages()
0065 {
0066     QMap<QString, QString> models;
0067     models.insert(i18n("Audodetect"), QString());
0068     models.insert(i18n("Afrikaans"), QStringLiteral("Afrikaans"));
0069     models.insert(i18n("Albanian"), QStringLiteral("Albanian"));
0070     models.insert(i18n("Amharic"), QStringLiteral("Amharic"));
0071     models.insert(i18n("Arabic"), QStringLiteral("Arabic"));
0072     models.insert(i18n("Armenian"), QStringLiteral("Armenian"));
0073     models.insert(i18n("Assamese"), QStringLiteral("Assamese"));
0074     models.insert(i18n("Azerbaijani"), QStringLiteral("Azerbaijani"));
0075     models.insert(i18n("Bashkir"), QStringLiteral("Bashkir"));
0076     models.insert(i18n("Basque"), QStringLiteral("Basque"));
0077     models.insert(i18n("Belarusian"), QStringLiteral("Belarusian"));
0078     models.insert(i18n("Bengali"), QStringLiteral("Bengali"));
0079     models.insert(i18n("Bosnian"), QStringLiteral("Bosnian"));
0080     models.insert(i18n("Breton"), QStringLiteral("Breton"));
0081     models.insert(i18n("Bulgarian"), QStringLiteral("Bulgarian"));
0082     models.insert(i18n("Burmese"), QStringLiteral("Burmese"));
0083     models.insert(i18n("Castilian"), QStringLiteral("Castilian"));
0084     models.insert(i18n("Catalan"), QStringLiteral("Catalan"));
0085     models.insert(i18n("Chinese"), QStringLiteral("Chinese"));
0086     models.insert(i18n("Croatian"), QStringLiteral("Croatian"));
0087     models.insert(i18n("Czech"), QStringLiteral("Czech"));
0088     models.insert(i18n("Danish"), QStringLiteral("Danish"));
0089     models.insert(i18n("Dutch"), QStringLiteral("Dutch"));
0090     models.insert(i18n("English"), QStringLiteral("English"));
0091     models.insert(i18n("Estonian"), QStringLiteral("Estonian"));
0092     models.insert(i18n("Faroese"), QStringLiteral("Faroese"));
0093     models.insert(i18n("Finnish"), QStringLiteral("Finnish"));
0094     models.insert(i18n("Flemish"), QStringLiteral("Flemish"));
0095     models.insert(i18n("French"), QStringLiteral("French"));
0096     models.insert(i18n("Galician"), QStringLiteral("Galician"));
0097     models.insert(i18n("Georgian"), QStringLiteral("Georgian"));
0098     models.insert(i18n("German"), QStringLiteral("German"));
0099     models.insert(i18n("Greek"), QStringLiteral("Greek"));
0100     models.insert(i18n("Gujarati"), QStringLiteral("Gujarati"));
0101     models.insert(i18n("Haitian"), QStringLiteral("Haitian"));
0102     models.insert(i18n("Haitian Creole"), QStringLiteral("Haitian Creole"));
0103     models.insert(i18n("Hausa"), QStringLiteral("Hausa"));
0104     models.insert(i18n("Hawaiian"), QStringLiteral("Hawaiian"));
0105     models.insert(i18n("Hebrew"), QStringLiteral("Hebrew"));
0106     models.insert(i18n("Hindi"), QStringLiteral("Hindi"));
0107     models.insert(i18n("Hungarian"), QStringLiteral("Hungarian"));
0108     models.insert(i18n("Icelandic"), QStringLiteral("Icelandic"));
0109     models.insert(i18n("Indonesian"), QStringLiteral("Indonesian"));
0110     models.insert(i18n("Italian"), QStringLiteral("Italian"));
0111     models.insert(i18n("Japanese"), QStringLiteral("Japanese"));
0112     models.insert(i18n("Javanese"), QStringLiteral("Javanese"));
0113     models.insert(i18n("Kannada"), QStringLiteral("Kannada"));
0114     models.insert(i18n("Kazakh"), QStringLiteral("Kazakh"));
0115     models.insert(i18n("Khmer"), QStringLiteral("Khmer"));
0116     models.insert(i18n("Korean"), QStringLiteral("Korean"));
0117     models.insert(i18n("Lao"), QStringLiteral("Lao"));
0118     models.insert(i18n("Latin"), QStringLiteral("Latin"));
0119     models.insert(i18n("Latvian"), QStringLiteral("Latvian"));
0120     models.insert(i18n("Letzeburgesch"), QStringLiteral("Letzeburgesch"));
0121     models.insert(i18n("Lingala"), QStringLiteral("Lingala"));
0122     models.insert(i18n("Lithuanian"), QStringLiteral("Lithuanian"));
0123     models.insert(i18n("Luxembourgish"), QStringLiteral("Luxembourgish"));
0124     models.insert(i18n("Macedonian"), QStringLiteral("Macedonian"));
0125     models.insert(i18n("Malagasy"), QStringLiteral("Malagasy"));
0126     models.insert(i18n("Malay"), QStringLiteral("Malay"));
0127     models.insert(i18n("Malayalam"), QStringLiteral("Malayalam"));
0128     models.insert(i18n("Maltese"), QStringLiteral("Maltese"));
0129     models.insert(i18n("Maori"), QStringLiteral("Maori"));
0130     models.insert(i18n("Marathi"), QStringLiteral("Marathi"));
0131     models.insert(i18n("Moldavian"), QStringLiteral("Moldavian"));
0132     models.insert(i18n("Moldovan"), QStringLiteral("Moldovan"));
0133     models.insert(i18n("Mongolian"), QStringLiteral("Mongolian"));
0134     models.insert(i18n("Myanmar"), QStringLiteral("Myanmar"));
0135     models.insert(i18n("Nepali"), QStringLiteral("Nepali"));
0136     models.insert(i18n("Norwegian"), QStringLiteral("Norwegian"));
0137     models.insert(i18n("Nynorsk"), QStringLiteral("Nynorsk"));
0138     models.insert(i18n("Occitan"), QStringLiteral("Occitan"));
0139     models.insert(i18n("Panjabi"), QStringLiteral("Panjabi"));
0140     models.insert(i18n("Pashto"), QStringLiteral("Pashto"));
0141     models.insert(i18n("Persian"), QStringLiteral("Persian"));
0142     models.insert(i18n("Polish"), QStringLiteral("Polish"));
0143     models.insert(i18n("Portuguese"), QStringLiteral("Portuguese"));
0144     models.insert(i18n("Punjabi"), QStringLiteral("Punjabi"));
0145     models.insert(i18n("Pushto"), QStringLiteral("Pushto"));
0146     models.insert(i18n("Romanian"), QStringLiteral("Romanian"));
0147     models.insert(i18n("Russian"), QStringLiteral("Russian"));
0148     models.insert(i18n("Sanskrit"), QStringLiteral("Sanskrit"));
0149     models.insert(i18n("Serbian"), QStringLiteral("Serbian"));
0150     models.insert(i18n("Shona"), QStringLiteral("Shona"));
0151     models.insert(i18n("Sindhi"), QStringLiteral("Sindhi"));
0152     models.insert(i18n("Sinhala"), QStringLiteral("Sinhala"));
0153     models.insert(i18n("Sinhalese"), QStringLiteral("Sinhalese"));
0154     models.insert(i18n("Slovak"), QStringLiteral("Slovak"));
0155     models.insert(i18n("Slovenian"), QStringLiteral("Slovenian"));
0156     models.insert(i18n("Somali"), QStringLiteral("Somali"));
0157     models.insert(i18n("Spanish"), QStringLiteral("Spanish"));
0158     models.insert(i18n("Sundanese"), QStringLiteral("Sundanese"));
0159     models.insert(i18n("Swahili"), QStringLiteral("Swahili"));
0160     models.insert(i18n("Swedish"), QStringLiteral("Swedish"));
0161     models.insert(i18n("Tagalog"), QStringLiteral("Tagalog"));
0162     models.insert(i18n("Tajik"), QStringLiteral("Tajik"));
0163     models.insert(i18n("Tamil"), QStringLiteral("Tamil"));
0164     models.insert(i18n("Tatar"), QStringLiteral("Tatar"));
0165     models.insert(i18n("Telugu"), QStringLiteral("Telugu"));
0166     models.insert(i18n("Thai"), QStringLiteral("Thai"));
0167     models.insert(i18n("Tibetan"), QStringLiteral("Tibetan"));
0168     models.insert(i18n("Turkish"), QStringLiteral("Turkish"));
0169     models.insert(i18n("Turkmen"), QStringLiteral("Turkmen"));
0170     models.insert(i18n("Ukrainian"), QStringLiteral("Ukrainian"));
0171     models.insert(i18n("Urdu"), QStringLiteral("Urdu"));
0172     models.insert(i18n("Uzbek"), QStringLiteral("Uzbek"));
0173     models.insert(i18n("Valencian"), QStringLiteral("Valencian"));
0174     models.insert(i18n("Vietnamese"), QStringLiteral("Vietnamese"));
0175     models.insert(i18n("Welsh"), QStringLiteral("Welsh"));
0176     models.insert(i18n("Yiddish"), QStringLiteral("Yiddish"));
0177     models.insert(i18n("Yoruba"), QStringLiteral("Yoruba"));
0178     return models;
0179 }
0180 
0181 QStringList SpeechToText::parseVoskDictionaries()
0182 {
0183     QString modelDirectory = voskModelPath();
0184     if (modelDirectory.isEmpty()) {
0185         qDebug() << "=== /// CANNOT ACCESS SPEECH DICTIONARIES FOLDER";
0186         Q_EMIT pCore->voskModelUpdate({});
0187         return {};
0188     }
0189     QDir dir = QDir(modelDirectory);
0190     QStringList dicts = dir.entryList(QDir::Dirs | QDir::NoDotAndDotDot);
0191     QStringList final;
0192     for (auto &d : dicts) {
0193         QDir sub(dir.absoluteFilePath(d));
0194         if (sub.exists(QStringLiteral("mfcc.conf")) || (sub.exists(QStringLiteral("conf/mfcc.conf")))) {
0195             final << d;
0196         }
0197     }
0198     Q_EMIT pCore->voskModelUpdate(final);
0199     return final;
0200 }
0201 
0202 QString SpeechToText::subtitleScript()
0203 {
0204     if (m_engineType == EngineType::EngineWhisper) {
0205         return m_scripts->value(QStringLiteral("whispertosrt.py"));
0206     }
0207     return m_scripts->value(QStringLiteral("speech.py"));
0208 }
0209 
0210 QString SpeechToText::speechScript()
0211 {
0212     if (m_engineType == EngineType::EngineWhisper) {
0213         return m_scripts->value(QStringLiteral("whispertotext.py"));
0214     }
0215     return m_scripts->value(QStringLiteral("speechtotext.py"));
0216 }