File indexing completed on 2024-12-22 04:40:15

0001 /*
0002     SPDX-FileCopyrightText: 2010-2022 Mladen Milinkovic <max@smoothware.net>
0003 
0004     SPDX-License-Identifier: GPL-2.0-or-later
0005 */
0006 
0007 #include "plugin-config.h"
0008 
0009 #include "pocketsphinxplugin.h"
0010 #include "pocketsphinxconfigwidget.h"
0011 #include "pocketsphinxconfig.h"
0012 #include "videoplayer/waveformat.h"
0013 
0014 #include <pocketsphinx.h>
0015 
0016 
0017 using namespace SubtitleComposer;
0018 
0019 PocketSphinxPlugin::PocketSphinxPlugin()
0020     : SpeechPlugin(),
0021       m_psConfig(nullptr),
0022       m_psDecoder(nullptr)
0023 {
0024 }
0025 
0026 /*virtual*/ const QString &
0027 PocketSphinxPlugin::name()
0028 {
0029     static const QString name(QStringLiteral("PocketSphinx"));
0030     return name;
0031 }
0032 
0033 const WaveFormat &
0034 PocketSphinxPlugin::waveFormat() const
0035 {
0036     static const WaveFormat wf(16000, 1, 16, true);
0037     return wf;
0038 }
0039 
0040 /*virtual*/ bool
0041 PocketSphinxPlugin::init()
0042 {
0043     m_psConfig = cmd_ln_init(nullptr, ps_args(), true,
0044                  "-hmm", QUrl(PocketSphinxConfig::acousticModelPath()).toLocalFile().toUtf8().constData(),
0045                  "-lm", QUrl(PocketSphinxConfig::trigramModelFile()).toLocalFile().toUtf8().constData(),
0046                  "-dict", QUrl(PocketSphinxConfig::lexiconFile()).toLocalFile().toUtf8().constData(),
0047 //               "-frate", "100",
0048                  // Num of silence frames to keep after speech to silence transition. (pocketsphinx default: 50)
0049                  "-vad_postspeech", QByteArray::number(PocketSphinxConfig::vadPostSpeech()).constData(),
0050                  // Num of speech frames to keep before silence to speech transition. (pocketsphinx default: 20)
0051                  "-vad_prespeech", QByteArray::number(PocketSphinxConfig::vadPreSpeech()).constData(),
0052                  // Num of speech frames to trigger VAD from silence to speech. (pocketsphinx default: 10)
0053                  "-vad_startspeech", QByteArray::number(PocketSphinxConfig::vadStartSpeech()).constData(),
0054                  // Threshold for decision between noise and silence frames.
0055                  // Log-ratio between signal level and noise level. (pocketsphinx default: 2.0)
0056                  "-vad_threshold", QByteArray::number(PocketSphinxConfig::vadTreshold()).constData(),
0057                  nullptr);
0058     if(m_psConfig == nullptr) {
0059         qWarning() << "Failed to create PocketSphinx config object";
0060         return false;
0061     }
0062 
0063     m_psDecoder = ps_init(m_psConfig);
0064     if(m_psDecoder == nullptr) {
0065         qWarning() << "Failed to create PocketSphinx recognizer";
0066         return false;
0067     }
0068 
0069     m_psFrameRate = cmd_ln_int32_r(m_psConfig, "-frate");
0070 
0071     m_lineText.clear();
0072     m_lineIn = m_lineOut = 0;
0073 
0074     m_utteranceStarted = false;
0075     m_speechStarted = false;
0076 
0077     return true;
0078 }
0079 
0080 /*virtual*/ void
0081 PocketSphinxPlugin::cleanup()
0082 {
0083     if(m_psDecoder != nullptr) {
0084         ps_free(m_psDecoder);
0085         m_psDecoder = nullptr;
0086     }
0087     if(m_psConfig != nullptr) {
0088         cmd_ln_free_r(m_psConfig);
0089         m_psConfig = nullptr;
0090     }
0091 }
0092 
0093 void
0094 PocketSphinxPlugin::processUtterance()
0095 {
0096     qint32 score;
0097     char const *hyp = ps_get_hyp(m_psDecoder, &score);
0098     if(!hyp || !*hyp)
0099         return;
0100 
0101 #ifdef HAS_NEW_PS_SEG_ITER
0102     ps_seg_t *iter = ps_seg_iter(m_psDecoder);
0103 #else
0104     ps_seg_t *iter = ps_seg_iter(m_psDecoder, nullptr);
0105 #endif
0106     while(iter != nullptr) {
0107         const char *word = ps_seg_word(iter);
0108         int wordIn, wordOut;
0109         ps_seg_frames(iter, &wordIn, &wordOut);
0110         if(*word == '<' || *word == '[') {
0111             // "<s>" "</s>" "<sil>" "[SPEECH]"
0112             if(!m_lineText.isEmpty()) {
0113                 emit textRecognized(m_lineText,
0114                                       double(m_lineIn) * 1000. / double(m_psFrameRate),
0115                                       double(m_lineOut) * 1000. / double(m_psFrameRate));
0116                 m_lineText.clear();
0117             }
0118         } else {
0119             QString sWord = QString::fromLatin1(word);
0120 
0121             // strip number suffix
0122             const char *pos = word;
0123             while(*pos && *pos != '(')
0124                 pos++;
0125             sWord.truncate(pos - word);
0126 
0127             if(m_lineText.isEmpty()) {
0128                 m_lineText = sWord;
0129                 m_lineIn = wordIn;
0130             } else {
0131                 m_lineText += ' ';
0132                 m_lineText += sWord;
0133             }
0134             m_lineOut = wordOut;
0135         }
0136 
0137         iter = ps_seg_next(iter);
0138     }
0139     if(!m_lineText.isEmpty()) {
0140         emit textRecognized(m_lineText,
0141                               double(m_lineIn) * 1000. / double(m_psFrameRate),
0142                               double(m_lineOut) * 1000. / double(m_psFrameRate));
0143         m_lineText.clear();
0144     }
0145 }
0146 
0147 /*virtual*/ void
0148 PocketSphinxPlugin::processSamples(const void *sampleData, qint32 sampleCount)
0149 {
0150     if(!m_utteranceStarted) {
0151         ps_start_utt(m_psDecoder);
0152         m_utteranceStarted = true;
0153         m_speechStarted = false;
0154     }
0155 
0156     ps_process_raw(m_psDecoder, reinterpret_cast<const int16 *>(sampleData), sampleCount, false, false);
0157 
0158     if(ps_get_in_speech(m_psDecoder)) {
0159         m_speechStarted = true;
0160     } else {
0161         if(m_utteranceStarted && m_speechStarted) {
0162             ps_end_utt(m_psDecoder);
0163             processUtterance();
0164             m_speechStarted = false;
0165             m_utteranceStarted = false;
0166         }
0167     }
0168 }
0169 
0170 /*virtual*/ void
0171 PocketSphinxPlugin::processComplete()
0172 {
0173     if(m_psDecoder) {
0174         if(m_utteranceStarted)
0175             ps_end_utt(m_psDecoder);
0176         processUtterance();
0177     }
0178 }
0179 
0180 QWidget *
0181 PocketSphinxPlugin::newConfigWidget(QWidget *parent)
0182 {
0183     return new PocketSphinxConfigWidget(parent);
0184 }
0185 
0186 KCoreConfigSkeleton *
0187 PocketSphinxPlugin::config() const
0188 {
0189     return PocketSphinxConfig::self();
0190 }