File indexing completed on 2024-05-05 04:52:35

0001 #!/usr/bin/env python3
0002 # SPDX-FileCopyrightText: 2023 Jean-Baptiste Mardelle <jb@kdenlive.org>
0003 # SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0004 
0005 import codecs
0006 import datetime
0007 import os
0008 import re
0009 import subprocess
0010 import sys
0011 import wave
0012 
0013 import torch
0014 import whisper
0015 
0016 # Call this script with the following arguments
0017 # 1. source av file
0018 # 2. model name (tiny, base, small, medium, large)
0019 # 3. Device (cpu, cuda)
0020 # 4. translate or transcribe
0021 # 5. Language
0022 # 6. in point (optional)
0023 # 7. out point
0024 # 8. tmp file name to extract a clip's part
0025 
def avoid_fp16(device):
    """Return True when fp16 must be disabled for *device*.

    fp16 doesn't work on some GPUs, such as Nvidia GTX 16xx. See bug 467573.
    """
    if device == "cpu":  # fp16 option doesn't matter for CPU
        return False
    # Use a separate name instead of clobbering the 'device' parameter.
    gpu_name = torch.cuda.get_device_name(device)
    if re.search(r"GTX 16\d\d", gpu_name):
        sys.stderr.write("GTX 16xx series GPU detected, disabling fp16\n")
        return True
    # Fix: previously fell through and implicitly returned None.
    return False
0034 
def ffmpeg_path():
    """Locate the ffmpeg binary: the bundled copy on macOS, PATH lookup elsewhere."""
    if sys.platform != 'darwin':
        return 'ffmpeg'
    from os.path import abspath, dirname, join
    # macOS app bundle ships its own ffmpeg next to the executable.
    return abspath(join(dirname(__file__), '../../MacOS/ffmpeg'))
0041 
0042 
def extract_zone(source, outfile, in_point, out_point):
    """Extract a portion of *source* into *outfile* as 16 kHz mono WAV.

    :param source: input audio/video file path
    :param outfile: destination WAV file path
    :param in_point: start position, passed to ffmpeg '-ss' (seconds, as string)
    :param out_point: passed to ffmpeg '-t' — NOTE(review): '-t' is a *duration*,
        not an absolute out point; the caller apparently passes a length here
        despite the argument name. Confirm against the caller.
    """
    sample_rate = 16000  # Whisper models expect 16 kHz mono input
    path = ffmpeg_path()
    # Fix: drop the unused 'process' binding; we ignore ffmpeg's exit status
    # on purpose (best-effort extraction, original behavior preserved).
    subprocess.run([path, '-loglevel', 'quiet', '-y', '-i',
                    source, '-ss', in_point, '-t', out_point,
                    '-vn', '-ar', str(sample_rate), '-ac', '1', '-f', 'wav', outfile],
                   stdout=subprocess.PIPE)
0050 
0051 
def run_whisper(source, model, device="cpu", task="transcribe", extraparams=""):
    """Run Whisper on *source* and return its transcription result dict.

    :param source: audio file path (16 kHz WAV produced by extract_zone, or any
        file ffmpeg can decode)
    :param model: Whisper model name (tiny, base, small, medium, large)
    :param device: "cpu" or "cuda"
    :param task: "transcribe" or "translate"
    :param extraparams: whitespace-separated 'key=value' pairs merged into the
        transcribe() keyword arguments (e.g. 'language=fr')
    """
    model = whisper.load_model(model, device)

    transcribe_kwargs = {
        "task": task,
        "verbose": False,
        'patience': None,
        'length_penalty': None,
        'suppress_tokens': '-1',
        'condition_on_previous_text': True,
        'word_timestamps': True,
    }

    if len(extraparams) > 1:
        for item in extraparams.split():
            # Fix: split only on the first '=' so values containing '='
            # (previously truncated to the first segment) survive intact.
            key, sep, value = item.partition('=')
            if sep:
                transcribe_kwargs[key] = value

    if avoid_fp16(device):
        transcribe_kwargs["fp16"] = False  # see bug 467573

    return model.transcribe(source, **transcribe_kwargs)
0076 
0077 
def main():
    """Entry point: parse CLI args, run Whisper, print word-level timestamps.

    Output format (one UTF-8 block per segment):
        [seg_start>seg_end]
        [word_start>word_end]word
        ...
    """
    source = sys.argv[1]
    # When in/out points (args 6-7) are given, extract that zone to a temp
    # file (arg 8) first and transcribe the extract instead of the full clip.
    if len(sys.argv) > 8 and (float(sys.argv[6]) > 0 or float(sys.argv[7]) > 0):
        tmp_file = sys.argv[8]
        extract_zone(source, tmp_file, sys.argv[6], sys.argv[7])
        source = tmp_file

    model = sys.argv[2]
    device = sys.argv[3]
    task = sys.argv[4]
    # Arg 5 is forwarded as extra 'key=value' transcribe parameters
    # (e.g. 'language=fr'), despite being documented as just "Language".
    language = sys.argv[5]
    result = run_whisper(source, model, device, task, language)

    for segment in result["segments"]:
        # Fix: dropped the unused locals (duration, timestamp, text) and
        # replaced quadratic '+=' string building with a list + join.
        lines = [f'[{segment["start"]}>{segment["end"]}]\n']
        for word in segment["words"]:
            lines.append(f'[{word["start"]}>{word["end"]}]{word["word"]}\n')
        # Write raw UTF-8 bytes so output survives non-UTF-8 stdout locales.
        sys.stdout.buffer.write(''.join(lines).encode('utf-8'))
    sys.stdout.flush()
    return 0
0104 
0105 
# Propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())