File indexing completed on 2024-05-19 08:43:40
#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2023 Jean-Baptiste Mardelle <jb@kdenlive.org>
# SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-KDE-Accepted-GPL

"""Transcribe or translate an AV file with OpenAI Whisper (Kdenlive helper).

Call this script with the following arguments:
    1. source av file
    2. model name (tiny, base, small, medium, large)
    3. Device (cpu, cuda)
    4. translate or transcribe
    5. Language
    6. in point (optional)
    7. out point
    8. tmp file name to extract a clip's part
"""

import codecs
import datetime
import os
import re
import subprocess
import sys
import wave

import torch
import whisper


def avoid_fp16(device):
    """Return True when fp16 must be disabled for *device*.

    fp16 doesn't work on some GPUs, such as Nvidia GTX 16xx. See bug 467573.
    """
    if device == "cpu":  # fp16 option doesn't matter for CPU
        return False
    gpu_name = torch.cuda.get_device_name(device)
    if re.search(r"GTX 16\d\d", gpu_name):
        sys.stderr.write("GTX 16xx series GPU detected, disabling fp16\n")
        return True
    return False  # fix: previously fell through returning an implicit None


def ffmpeg_path():
    """Return the ffmpeg executable to use (bundled binary on macOS)."""
    if sys.platform == 'darwin':
        from os.path import abspath, dirname, join
        return abspath(join(dirname(__file__), '../../MacOS/ffmpeg'))
    return 'ffmpeg'


def extract_zone(source, outfile, in_point, out_point):
    """Extract a mono 16 kHz WAV clip of *source* into *outfile*.

    NOTE(review): ffmpeg's ``-t`` takes a *duration*, while the CLI arg is
    documented as an "out point" — presumably the caller passes the zone
    length here; confirm against the Kdenlive caller before changing.
    """
    sample_rate = 16000  # Whisper expects 16 kHz input
    path = ffmpeg_path()
    subprocess.run([path, '-loglevel', 'quiet', '-y', '-i',
                    source, '-ss', in_point, '-t', out_point,
                    '-vn', '-ar', str(sample_rate), '-ac', '1', '-f', 'wav',
                    outfile],
                   stdout=subprocess.PIPE)


def run_whisper(source, model, device="cpu", task="transcribe", extraparams=""):
    """Run Whisper on *source* and return its transcription result dict.

    *extraparams* is a whitespace-separated list of ``key=value`` pairs
    forwarded to ``model.transcribe``.
    """
    model = whisper.load_model(model, device)

    transcribe_kwargs = {
        "task": task,
        "verbose": False,
        'patience': None,
        'length_penalty': None,
        'suppress_tokens': '-1',
        'condition_on_previous_text': True,
        'word_timestamps': True
    }

    if len(extraparams) > 1:
        for item in extraparams.split():
            param = item.split('=')
            if len(param) > 1:
                # NOTE(review): values are kept as strings; whisper may
                # expect numeric types for some options — confirm.
                transcribe_kwargs[param[0]] = param[1]

    if avoid_fp16(device):
        transcribe_kwargs["fp16"] = False

    return model.transcribe(source, **transcribe_kwargs)


def main():
    """Entry point: parse argv, optionally extract a zone, print word timings."""
    source = sys.argv[1]
    # When a zone is given (8+ args, non-zero in/out), transcribe an
    # extracted temporary clip instead of the whole source file.
    if len(sys.argv) > 8 and (float(sys.argv[6]) > 0 or float(sys.argv[7]) > 0):
        tmp_file = sys.argv[8]
        extract_zone(source, tmp_file, sys.argv[6], sys.argv[7])
        source = tmp_file

    model = sys.argv[2]
    device = sys.argv[3]
    task = sys.argv[4]
    # NOTE(review): argv[5] is documented as "Language" but is forwarded as
    # whisper extra parameters, so it only takes effect when formatted as
    # "language=xx" — verify against the caller.
    language = sys.argv[5]
    result = run_whisper(source, model, device, task, language)

    # Emit one "[start>end]" header per segment, then one line per word.
    for segment in result["segments"]:
        res = '[' + str(segment["start"]) + '>' + str(segment["end"]) + ']' + '\n'
        for word in segment["words"]:
            res += '[' + str(word["start"]) + '>' + str(word["end"]) + ']' + word["word"] + '\n'
        # Write raw UTF-8 bytes so console encoding quirks cannot break output.
        sys.stdout.buffer.write(res.encode('utf-8'))
        sys.stdout.flush()
    return 0


if __name__ == "__main__":
    sys.exit(main())