File indexing completed on 2024-12-22 04:40:15
0001 /* 0002 SPDX-FileCopyrightText: 2010-2022 Mladen Milinkovic <max@smoothware.net> 0003 0004 SPDX-License-Identifier: GPL-2.0-or-later 0005 */ 0006 0007 #include "plugin-config.h" 0008 0009 #include "pocketsphinxplugin.h" 0010 #include "pocketsphinxconfigwidget.h" 0011 #include "pocketsphinxconfig.h" 0012 #include "videoplayer/waveformat.h" 0013 0014 #include <pocketsphinx.h> 0015 0016 0017 using namespace SubtitleComposer; 0018 0019 PocketSphinxPlugin::PocketSphinxPlugin() 0020 : SpeechPlugin(), 0021 m_psConfig(nullptr), 0022 m_psDecoder(nullptr) 0023 { 0024 } 0025 0026 /*virtual*/ const QString & 0027 PocketSphinxPlugin::name() 0028 { 0029 static const QString name(QStringLiteral("PocketSphinx")); 0030 return name; 0031 } 0032 0033 const WaveFormat & 0034 PocketSphinxPlugin::waveFormat() const 0035 { 0036 static const WaveFormat wf(16000, 1, 16, true); 0037 return wf; 0038 } 0039 0040 /*virtual*/ bool 0041 PocketSphinxPlugin::init() 0042 { 0043 m_psConfig = cmd_ln_init(nullptr, ps_args(), true, 0044 "-hmm", QUrl(PocketSphinxConfig::acousticModelPath()).toLocalFile().toUtf8().constData(), 0045 "-lm", QUrl(PocketSphinxConfig::trigramModelFile()).toLocalFile().toUtf8().constData(), 0046 "-dict", QUrl(PocketSphinxConfig::lexiconFile()).toLocalFile().toUtf8().constData(), 0047 // "-frate", "100", 0048 // Num of silence frames to keep after speech to silence transition. (pocketsphinx default: 50) 0049 "-vad_postspeech", QByteArray::number(PocketSphinxConfig::vadPostSpeech()).constData(), 0050 // Num of speech frames to keep before silence to speech transition. (pocketsphinx default: 20) 0051 "-vad_prespeech", QByteArray::number(PocketSphinxConfig::vadPreSpeech()).constData(), 0052 // Num of speech frames to trigger VAD from silence to speech. (pocketsphinx default: 10) 0053 "-vad_startspeech", QByteArray::number(PocketSphinxConfig::vadStartSpeech()).constData(), 0054 // Threshold for decision between noise and silence frames. 0055 // Log-ratio between signal level and noise level. (pocketsphinx default: 2.0) 0056 "-vad_threshold", QByteArray::number(PocketSphinxConfig::vadTreshold()).constData(), 0057 nullptr); 0058 if(m_psConfig == nullptr) { 0059 qWarning() << "Failed to create PocketSphinx config object"; 0060 return false; 0061 } 0062 0063 m_psDecoder = ps_init(m_psConfig); 0064 if(m_psDecoder == nullptr) { 0065 qWarning() << "Failed to create PocketSphinx recognizer"; 0066 return false; 0067 } 0068 0069 m_psFrameRate = cmd_ln_int32_r(m_psConfig, "-frate"); 0070 0071 m_lineText.clear(); 0072 m_lineIn = m_lineOut = 0; 0073 0074 m_utteranceStarted = false; 0075 m_speechStarted = false; 0076 0077 return true; 0078 } 0079 0080 /*virtual*/ void 0081 PocketSphinxPlugin::cleanup() 0082 { 0083 if(m_psDecoder != nullptr) { 0084 ps_free(m_psDecoder); 0085 m_psDecoder = nullptr; 0086 } 0087 if(m_psConfig != nullptr) { 0088 cmd_ln_free_r(m_psConfig); 0089 m_psConfig = nullptr; 0090 } 0091 } 0092 0093 void 0094 PocketSphinxPlugin::processUtterance() 0095 { 0096 qint32 score; 0097 char const *hyp = ps_get_hyp(m_psDecoder, &score); 0098 if(!hyp || !*hyp) 0099 return; 0100 0101 #ifdef HAS_NEW_PS_SEG_ITER 0102 ps_seg_t *iter = ps_seg_iter(m_psDecoder); 0103 #else 0104 ps_seg_t *iter = ps_seg_iter(m_psDecoder, nullptr); 0105 #endif 0106 while(iter != nullptr) { 0107 const char *word = ps_seg_word(iter); 0108 int wordIn, wordOut; 0109 ps_seg_frames(iter, &wordIn, &wordOut); 0110 if(*word == '<' || *word == '[') { 0111 // "<s>" "</s>" "<sil>" "[SPEECH]" 0112 if(!m_lineText.isEmpty()) { 0113 emit textRecognized(m_lineText, 0114 double(m_lineIn) * 1000. / double(m_psFrameRate), 0115 double(m_lineOut) * 1000. / double(m_psFrameRate)); 0116 m_lineText.clear(); 0117 } 0118 } else { 0119 QString sWord = QString::fromLatin1(word); 0120 0121 // strip number suffix 0122 const char *pos = word; 0123 while(*pos && *pos != '(') 0124 pos++; 0125 sWord.truncate(pos - word); 0126 0127 if(m_lineText.isEmpty()) { 0128 m_lineText = sWord; 0129 m_lineIn = wordIn; 0130 } else { 0131 m_lineText += ' '; 0132 m_lineText += sWord; 0133 } 0134 m_lineOut = wordOut; 0135 } 0136 0137 iter = ps_seg_next(iter); 0138 } 0139 if(!m_lineText.isEmpty()) { 0140 emit textRecognized(m_lineText, 0141 double(m_lineIn) * 1000. / double(m_psFrameRate), 0142 double(m_lineOut) * 1000. / double(m_psFrameRate)); 0143 m_lineText.clear(); 0144 } 0145 } 0146 0147 /*virtual*/ void 0148 PocketSphinxPlugin::processSamples(const void *sampleData, qint32 sampleCount) 0149 { 0150 if(!m_utteranceStarted) { 0151 ps_start_utt(m_psDecoder); 0152 m_utteranceStarted = true; 0153 m_speechStarted = false; 0154 } 0155 0156 ps_process_raw(m_psDecoder, reinterpret_cast<const int16 *>(sampleData), sampleCount, false, false); 0157 0158 if(ps_get_in_speech(m_psDecoder)) { 0159 m_speechStarted = true; 0160 } else { 0161 if(m_utteranceStarted && m_speechStarted) { 0162 ps_end_utt(m_psDecoder); 0163 processUtterance(); 0164 m_speechStarted = false; 0165 m_utteranceStarted = false; 0166 } 0167 } 0168 } 0169 0170 /*virtual*/ void 0171 PocketSphinxPlugin::processComplete() 0172 { 0173 if(m_psDecoder) { 0174 if(m_utteranceStarted) 0175 ps_end_utt(m_psDecoder); 0176 processUtterance(); 0177 } 0178 } 0179 0180 QWidget * 0181 PocketSphinxPlugin::newConfigWidget(QWidget *parent) 0182 { 0183 return new PocketSphinxConfigWidget(parent); 0184 } 0185 0186 KCoreConfigSkeleton * 0187 PocketSphinxPlugin::config() const 0188 { 0189 return PocketSphinxConfig::self(); 0190 }