#!python
# cython: language_level=3
# distutils: language = c++
# distutils: sources= ./whisper.cpp/whisper.cpp ./whisper.cpp/ggml.c

"""Cython wrapper around whisper.cpp for transcribing audio files.

Audio is decoded to mono 16-bit PCM by the ``ffmpeg`` command-line tool,
converted to float32 samples and handed to ``whisper_full``.

NOTE(review): the ``whisper_*`` C declarations and the ``audio_data`` struct
are not defined in this file; presumably they come from a companion ``.pxd``.
"""

import os

import ffmpeg
import numpy as np

cimport numpy as cnp

# Initialise the NumPy C API before any typed ndarray is used.
cnp.import_array()

cdef int SAMPLE_RATE = 16000
cdef char* TEST_FILE = b'test.wav'
cdef char* DEFAULT_MODEL = b'ggml-tiny.bin'
cdef char* LANGUAGE = b'fr'

# Keeps the sample buffer behind the most recent ``load_audio`` result alive,
# so that the raw pointer stored in the returned struct does not dangle.
cdef object _pinned_frames = None


cdef cnp.ndarray _decode_audio(bytes file, int sr):
    """Decode ``file`` with ffmpeg into a 1-D float32 sample array.

    The audio is down-mixed to one channel, resampled to ``sr`` Hz and read
    as signed 16-bit little-endian PCM from ffmpeg's stdout.  Samples are
    scaled by 1/32768 so they lie in [-1.0, 1.0).

    Any exception raised by ffmpeg or NumPy propagates to the caller.
    """
    out = (
        ffmpeg.input(file, threads=0)
        .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        .run(
            cmd=["ffmpeg", "-nostdin"],
            capture_stdout=True,
            capture_stderr=True,
        )
    )[0]

    # ``frombuffer`` already yields a 1-D view; ``astype`` makes a fresh,
    # C-contiguous float32 copy.  32768.0 == 2 ** 15 (int16 full scale).
    return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0


cdef audio_data load_audio(bytes file, int sr = SAMPLE_RATE):
    """Decode ``file`` and return a raw (pointer, count) view of its samples.

    The returned pointer refers to a buffer owned by this module.  It stays
    valid only until the next call to ``load_audio``, which replaces the
    buffer.  For empty audio the pointer is NULL and the count is 0.

    Prefer ``_decode_audio`` in new code: it returns the owning array, so the
    buffer lifetime is explicit.
    """
    global _pinned_frames

    cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames
    cdef Py_ssize_t count
    cdef audio_data data

    frames = _decode_audio(file, sr)
    # Pin the array: without this reference it would be freed as soon as this
    # function returns, leaving ``data.frames`` pointing at released memory.
    _pinned_frames = frames

    count = frames.shape[0]
    data.frames = NULL
    data.n_frames = count
    if count > 0:
        data.frames = &frames[0]
    return data


cdef whisper_full_params default_params():
    """Return greedy-sampling parameters with this module's defaults applied.

    Realtime and progress printing are enabled, translation is disabled and
    the language is set to ``LANGUAGE``.
    """
    cdef whisper_full_params params = whisper_full_default_params(
        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
    )
    params.print_realtime = True
    params.print_progress = True
    params.translate = False
    params.language = LANGUAGE
    return params


cdef class Whisper:
    """Owns a whisper.cpp context and transcribes audio files with it."""

    # Native context; NULL until ``__init__`` has loaded a model.
    cdef whisper_context * ctx
    # Parameters passed to every ``whisper_full`` call.
    cdef whisper_full_params params

    def __init__(self, char* model=DEFAULT_MODEL):
        """Load the model file at ``model`` (bytes path).

        Raises:
            RuntimeError: if whisper.cpp could not create a context.
        """
        # C attributes of an extension type start out zeroed, so ``ctx`` is
        # NULL on first use.  Free any previous context if ``__init__`` is
        # invoked again on the same object.
        if self.ctx != NULL:
            whisper_free(self.ctx)
            self.ctx = NULL

        self.ctx = whisper_init(model)
        if self.ctx == NULL:
            raise RuntimeError(
                "failed to initialise whisper model from %r" % (<bytes> model,)
            )
        self.params = default_params()

    def __dealloc__(self):
        # ``ctx`` is NULL when ``__init__`` never ran or failed.
        if self.ctx != NULL:
            whisper_free(self.ctx)
            self.ctx = NULL

    cpdef str transcribe(self, filename=None):
        """Transcribe an audio file and return the text, one segment per line.

        Args:
            filename: path of the audio file as ``bytes`` or ``str``.
                Defaults to ``TEST_FILE`` when omitted.

        Raises:
            RuntimeError: if no model is loaded, the file contains no audio
                samples, or ``whisper_full`` returns a non-zero status.
        """
        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames
        cdef Py_ssize_t n_frames
        cdef int res
        cdef int n_segments

        if self.ctx == NULL:
            raise RuntimeError("whisper context is not initialised")

        if filename is None:
            filename = TEST_FILE
        elif isinstance(filename, str):
            filename = os.fsencode(filename)

        # Hold the owning array in a local for the whole ``whisper_full``
        # call so the sample buffer cannot be freed underneath it.
        frames = _decode_audio(filename, SAMPLE_RATE)
        n_frames = frames.shape[0]
        if n_frames == 0:
            raise RuntimeError("no audio samples were decoded from the input")

        res = whisper_full(self.ctx, self.params, &frames[0], <int> n_frames)
        if res != 0:
            raise RuntimeError("whisper_full failed with status %d" % res)

        n_segments = whisper_full_n_segments(self.ctx)
        return b'\n'.join([
            whisper_full_get_segment_text(self.ctx, i)
            for i in range(n_segments)
        ]).decode()