File indexing completed on 2024-05-05 04:52:35

0001 #!/usr/bin/env python3
0002 # SPDX-FileCopyrightText: 2023 Jean-Baptiste Mardelle <jb@kdenlive.org>
0003 # SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL
0004 
0005 import codecs
0006 import datetime
0007 import os
0008 import re
0009 import subprocess
0010 import sys
0011 import wave
0012 
0013 import torch
0014 import whisper
0015 
0016 # Call this script with the following arguments
0017 # 1. source av file
0018 # 2. model name (tiny, base, small, medium, large)
0019 # 3. Device (cpu, cuda)
0020 # 4. translate or transcribe
0021 # 5. Language
0022 # 6. in point (optional)
0023 # 7. out point
0024 # 8. tmp file name to extract a clip's part
0025 
def avoid_fp16(device):
    """Return True when fp16 must be disabled for *device*.

    fp16 doesn't work on some GPUs, such as Nvidia GTX 16xx. See bug 467573.
    """
    if device == "cpu":  # fp16 option doesn't matter for CPU
        return False
    # Use a separate name instead of clobbering the 'device' parameter.
    gpu_name = torch.cuda.get_device_name(device)
    if re.search(r"GTX 16\d\d", gpu_name):
        sys.stderr.write("GTX 16xx series GPU detected, disabling fp16\n")
        return True
    # Fix: previously fell through and implicitly returned None.
    return False
0034 
def ffmpeg_path():
    """Locate the ffmpeg binary: the bundled copy on macOS, PATH lookup elsewhere."""
    if sys.platform != 'darwin':
        return 'ffmpeg'
    from os.path import abspath, dirname, join
    # macOS app bundle ships its own ffmpeg next to the executable.
    return abspath(join(dirname(__file__), '../../MacOS/ffmpeg'))
0041 
0042 
def extract_zone(source, outfile, in_point, out_point):
    """Extract a portion of *source* into *outfile* as 16 kHz mono WAV.

    :param source: input audio/video file path
    :param outfile: destination WAV file path
    :param in_point: start position, passed to ffmpeg '-ss' (seconds, as string)
    :param out_point: passed to ffmpeg '-t' — NOTE(review): '-t' is a *duration*,
        not an absolute out point; the caller apparently passes a length here
        despite the argument name. Confirm against the caller.
    """
    sample_rate = 16000  # Whisper models expect 16 kHz mono input
    path = ffmpeg_path()
    # Fix: drop the unused 'process' binding; we ignore ffmpeg's exit status
    # on purpose (best-effort extraction, original behavior preserved).
    subprocess.run([path, '-loglevel', 'quiet', '-y', '-i',
                    source, '-ss', in_point, '-t', out_point,
                    '-vn', '-ar', str(sample_rate), '-ac', '1', '-f', 'wav', outfile],
                   stdout=subprocess.PIPE)
0050 
0051 
def run_whisper(source, model, device="cpu", task="transcribe", extraparams=""):
    """Run Whisper on *source* and return its transcription result dict.

    :param source: audio file path (16 kHz WAV produced by extract_zone, or any
        file ffmpeg can decode)
    :param model: Whisper model name (tiny, base, small, medium, large)
    :param device: "cpu" or "cuda"
    :param task: "transcribe" or "translate"
    :param extraparams: whitespace-separated 'key=value' pairs merged into the
        transcribe() keyword arguments (e.g. 'language=fr')
    """
    model = whisper.load_model(model, device)

    transcribe_kwargs = {
        "task": task,
        "verbose": False,
        'patience': None,
        'length_penalty': None,
        'suppress_tokens': '-1',
        'condition_on_previous_text': True,
        'word_timestamps': True,
    }

    if len(extraparams) > 1:
        for item in extraparams.split():
            # Fix: split only on the first '=' so values containing '='
            # (previously truncated to the first segment) survive intact.
            key, sep, value = item.partition('=')
            if sep:
                transcribe_kwargs[key] = value

    if avoid_fp16(device):
        transcribe_kwargs["fp16"] = False  # see bug 467573

    return model.transcribe(source, **transcribe_kwargs)
0076 
0077 
def main():
    """Entry point: parse CLI args, run Whisper, print word-level timestamps.

    Output format (one UTF-8 block per segment):
        [seg_start>seg_end]
        [word_start>word_end]word
        ...
    """
    source = sys.argv[1]
    # When in/out points (args 6-7) are given, extract that zone to a temp
    # file (arg 8) first and transcribe the extract instead of the full clip.
    if len(sys.argv) > 8 and (float(sys.argv[6]) > 0 or float(sys.argv[7]) > 0):
        tmp_file = sys.argv[8]
        extract_zone(source, tmp_file, sys.argv[6], sys.argv[7])
        source = tmp_file

    model = sys.argv[2]
    device = sys.argv[3]
    task = sys.argv[4]
    # Arg 5 is forwarded as extra 'key=value' transcribe parameters
    # (e.g. 'language=fr'), despite being documented as just "Language".
    language = sys.argv[5]
    result = run_whisper(source, model, device, task, language)

    for segment in result["segments"]:
        # Fix: dropped the unused locals (duration, timestamp, text) and
        # replaced quadratic '+=' string building with a list + join.
        lines = [f'[{segment["start"]}>{segment["end"]}]\n']
        for word in segment["words"]:
            lines.append(f'[{word["start"]}>{word["end"]}]{word["word"]}\n')
        # Write raw UTF-8 bytes so output survives non-UTF-8 stdout locales.
        sys.stdout.buffer.write(''.join(lines).encode('utf-8'))
    sys.stdout.flush()
    return 0
0104 
0105 
# Propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())