data/scripts/speech.py

0001 #!/usr/bin/env python3
0002
0003 # SPDX-FileCopyrightText: 2021 Jean-Baptiste Mardelle <jb@kdenlive.org>
0004 # SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0005
0006 #pip3 install vosk
0007 #pip3 install srt
0008
0009 from vosk import Model, KaldiRecognizer, SetLogLevel
0010 import sys
0011 import os
0012 import wave
0013 import subprocess
0014 import srt
0015 import json
0016 import datetime
0017
# Lower Vosk's log level (-1 is passed to suppress its log output).
SetLogLevel(-1)

# argv[1] is the working directory; the model path in argv[2] is checked
# (and later loaded) relative to it.
os.chdir(sys.argv[1])

if not os.path.exists(sys.argv[2]):
    print ("Please download the model from https://alphacephei.com/vosk/models and unpack as ", sys.argv[2]," in the current folder.")
    # Use sys.exit() rather than the bare exit() helper: exit() is injected
    # by the site module for interactive use and is not guaranteed to exist
    # in every interpreter setup (e.g. `python -S` or frozen builds).
    sys.exit(1)
0025
# Locate the ffmpeg executable: by default the bare command name is used;
# on macOS it is resolved to ../../MacOS/ffmpeg relative to this script.
path = 'ffmpeg'
if sys.platform == 'darwin':
    script_dir = os.path.dirname(__file__)
    path = os.path.abspath(os.path.join(script_dir, '../../MacOS/ffmpeg'))
0031
# Maximum number of recognized words grouped into one subtitle entry.
WORDS_PER_LINE = 7

# Audio sample rate shared by the recognizer and the ffmpeg output (-ar).
sample_rate = 16000

# Load the model given as argv[2] and build a recognizer on top of it.
# SetWords(True) requests the per-word entries that transcribe() reads
# from each result.
model = Model(sys.argv[2])
rec = KaldiRecognizer(model, sample_rate)
rec.SetWords(True)

# ffmpeg reads the media file given as argv[3] and writes mono ('-ac 1')
# raw 's16le' audio at sample_rate to its stdout, which is captured through
# a pipe for transcribe() to consume.
process = subprocess.Popen(
    [
        path,
        '-loglevel', 'quiet',
        '-i', sys.argv[3],
        '-ar', str(sample_rate),
        '-ac', '1',
        '-f', 's16le',
        '-',
    ],
    stdout=subprocess.PIPE,
)
0042
def transcribe():
    """Recognize the decoded audio stream and return it as subtitles.

    Reads the module-level ``process`` stdout pipe in 4000-byte chunks,
    feeds every chunk to the module-level recognizer ``rec`` and prints a
    ``progress:<n>`` line to stdout for each read. Every recognizer result
    that contains a ``result`` word list is then split into groups of at
    most ``WORDS_PER_LINE`` words, and each group becomes one subtitle.

    Returns:
        A list of ``srt.Subtitle`` objects. Each one spans from the
        ``start`` of its first word to the ``end`` of its last word and
        contains the words joined by single spaces.
    """
    results = []
    progress = 0
    while True:
        data = process.stdout.read(4000)
        # Emitted before the EOF check, so the final (empty) read is
        # reported as well. Flushed on every line so a reader of stdout
        # is not held back by buffering.
        print("progress:" + str(progress), file = sys.stdout, flush=True)
        progress += 1
        if not data:
            break
        if rec.AcceptWaveform(data):
            results.append(rec.Result())
    # The pipe reached EOF, so ffmpeg has no more output to deliver; wait
    # for it so the child process is reaped instead of being left behind.
    process.wait()
    results.append(rec.FinalResult())

    subs = []
    for res in results:
        jres = json.loads(res)
        # Results without a word list carry nothing to subtitle.
        if 'result' not in jres:
            continue
        words = jres['result']
        for j in range(0, len(words), WORDS_PER_LINE):
            line = words[j : j + WORDS_PER_LINE]
            s = srt.Subtitle(index=len(subs),
                             content=" ".join([l['word'] for l in line]),
                             start=datetime.timedelta(seconds=line[0]['start']),
                             end=datetime.timedelta(seconds=line[-1]['end']))
            subs.append(s)
    return subs
0070
# Run the transcription, render the subtitles as SRT text and store them
# in the output file given as argv[4].
subtitle = srt.compose(transcribe())
with open(sys.argv[4], 'w',encoding='utf8') as f:
    # compose() produces one string, so write it in a single call;
    # writelines() would iterate over it character by character. The
    # with-block closes the file, so no explicit close() is needed.
    f.write(subtitle)