init

Changes made (not exhaustive): - changed defaults - whisper.cpp submodule set to 1.2.0 - removed `requests` dependency - models dir can be changed in constructor - added support for setting params - added back support for `large-v1` model - added support for english-only models
2023-02-18 22:59:42 +00:00 · 2023-02-18 22:59:42 +00:00 · af035ea355
commit af035ea355
parent e2581c8aad
8 changed files with 268 additions and 229 deletions
--- a/.github/workflows/build_wheels.yml
+++ b/.github/workflows/build_wheels.yml
@ -1,40 +0,0 @@
-name: build_wheels
-run-name: ${{ github.actor }} is building wheels
-on: [push]
-jobs:
-  build_wheels:
-    runs-on: ${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [ubuntu-latest, macos-latest, windows-latest]
-    steps:
-      - uses: actions/checkout@v2
-      - name: Checkout submodules
-        run: |
-          git submodule update --init --recursive
-  
-      - uses: actions/setup-python@v2
-        with:
-          python-version: '3.10'
-
-      - name: Setup pip
-        run: |
-           python -m pip install --upgrade pip
-           python -m pip install cibuildwheel==1.6.4
-         
-      - name:  Install
-        run:   |
-               if [ "$RUNNER_OS" == "Linux" ]; then
-                    sudo apt-get update
-                    sudo apt-get install gcc g++
-               fi
-        shell: bash
-
-      - name: Build wheel
-        run: python -m cibuildwheel --output-dir dist/
-        env:
-          CIBW_BUILD: cp36-* cp37-* cp38-*
-
-      - uses: actions/upload-artifact@v2
-        with:
-          path: ./dist/*.whl
--- a/.gitignore
+++ b/.gitignore
@ -127,3 +127,5 @@ dmypy.json

 # Pyre type checker
 .pyre/
+
+whispercpp.cpp
--- a/.gitmodules
+++ b/.gitmodules
@ -1,3 +1,4 @@
 [submodule "whisper.cpp"]
 	path = whisper.cpp
 	url = https://github.com/ggerganov/whisper.cpp
+	branch = b2083c5d02db9a1e6dbb3d58254fd65ebfff4b5d
--- a/README.md
+++ b/README.md
@ -1,16 +1,36 @@
 Python bindings for whisper.cpp
 ===============================

-`pip install git+https://github.com/o4dev/whispercpp.py`
+```
+git clone --recurse-submodules https://git.ecker.tech/lightmare/whispercpp.py
+cd whispercpp.py
+pip install .
+```
+or
+```
+git clone https://git.ecker.tech/lightmare/whispercpp.py
+cd whispercpp.py
+git submodule update --init
+pip install .
+```

 ```python
 from whispercpp import Whisper

-w = Whisper('tiny')
+w = Whisper('tiny', models_dir='./models/', language=b'en')

 result = w.transcribe("myfile.mp3")
 text = w.extract_text(result)
 ```

-Note: default parameters might need to be tweaked.
+Note: default parameters might need to be tweaked.  
-See Whispercpp.pyx.
+See `whispercpp.pyx`.
+
+Changes made (not exhaustive):
+- changed defaults
+- whisper.cpp submodule set to 1.2.0
+- removed `requests` dependency
+- models dir can be changed in constructor
+- added support for setting params
+- added back support for `large-v1` model
+- added support for english-only models
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,3 @@
+Cython
+numpy
+ffmpeg-python
--- a/setup.py
+++ b/setup.py
@ -34,7 +34,6 @@ setup(
    include_dirs = ['./whisper.cpp/', numpy.get_include()],
    install_requires=[
      'numpy',
-      'ffmpeg-python',
-      'requests'
+      'ffmpeg-python'
    ],
 )
--- a/whispercpp.pxd
+++ b/whispercpp.pxd
@ -3,111 +3,111 @@
 from libc.stdint cimport int64_t

 cdef nogil:
-    int WHISPER_SAMPLE_RATE = 16000
-    int WHISPER_N_FFT = 400
-    int WHISPER_N_MEL = 80
-    int WHISPER_HOP_LENGTH = 160
-    int WHISPER_CHUNK_SIZE = 30
-    int SAMPLE_RATE = 16000
-    char* TEST_FILE = b'test.wav'
-    char* DEFAULT_MODEL = b'ggml-tiny.bin'
-    char* LANGUAGE = b'fr'
-    ctypedef struct audio_data:
-        float* frames;
-        int n_frames;
+	int WHISPER_SAMPLE_RATE = 16000
+	int WHISPER_N_FFT = 400
+	int WHISPER_N_MEL = 80
+	int WHISPER_HOP_LENGTH = 160
+	int WHISPER_CHUNK_SIZE = 30
+	int SAMPLE_RATE = 16000
+	char* TEST_FILE = b'test.wav'
+	char* DEFAULT_MODEL = b'ggml-base.bin'
+	char* LANGUAGE = b'en'
+	ctypedef struct audio_data:
+		float* frames;
+		int n_frames;

 cdef extern from "whisper.h" nogil:
-    enum whisper_sampling_strategy:
-        WHISPER_SAMPLING_GREEDY = 0,
-        WHISPER_SAMPLING_BEAM_SEARCH,
-    ctypedef bint _Bool
-    ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
-    ctypedef _Bool whisper_encoder_begin_callback(whisper_context*, void*)
-    ctypedef int whisper_token
-    ctypedef struct whisper_token_data:
-        whisper_token id
-        whisper_token tid
-        float p
-        float pt
-        float ptsum
-        int64_t t0
-        int64_t t1
-        float vlen
-    ctypedef struct whisper_context:
-        pass
-    ctypedef struct anon_2:
-        int n_past
-    ctypedef struct anon_3:
-        int n_past
-        int beam_width
-        int n_best
-    ctypedef struct whisper_full_params:
-        int strategy
-        int n_threads
-        int n_max_text_ctx
-        int offset_ms
-        int duration_ms
-        _Bool translate
-        _Bool no_context
-        _Bool single_segment
-        _Bool print_special
-        _Bool print_progress
-        _Bool print_realtime
-        _Bool print_timestamps
-        _Bool token_timestamps
-        float thold_pt
-        float thold_ptsum
-        int max_len
-        int max_tokens
-        _Bool speed_up
-        int audio_ctx
-        whisper_token* prompt_tokens
-        int prompt_n_tokens
-        char* language
-        anon_2 greedy
-        anon_3 beam_search
-        whisper_new_segment_callback new_segment_callback
-        void* new_segment_callback_user_data
-        whisper_encoder_begin_callback encoder_begin_callback
-        void* encoder_begin_callback_user_data
-    whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
-    cdef whisper_context* whisper_init(char*)
-    cdef void whisper_free(whisper_context*)
-    cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
-    cdef int whisper_set_mel(whisper_context*, float*, int, int)
-    cdef int whisper_encode(whisper_context*, int, int)
-    cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
-    cdef whisper_token_data whisper_sample_best(whisper_context*)
-    cdef whisper_token whisper_sample_timestamp(whisper_context*)
-    cdef int whisper_lang_id(char*)
-    cdef int whisper_n_len(whisper_context*)
-    cdef int whisper_n_vocab(whisper_context*)
-    cdef int whisper_n_text_ctx(whisper_context*)
-    cdef int whisper_is_multilingual(whisper_context*)
-    cdef float* whisper_get_probs(whisper_context*)
-    # Unknown CtypesSpecial name='c_char_p'
-    cdef whisper_token whisper_token_eot(whisper_context*)
-    cdef whisper_token whisper_token_sot(whisper_context*)
-    cdef whisper_token whisper_token_prev(whisper_context*)
-    cdef whisper_token whisper_token_solm(whisper_context*)
-    cdef whisper_token whisper_token_not(whisper_context*)
-    cdef whisper_token whisper_token_beg(whisper_context*)
-    cdef whisper_token whisper_token_translate()
-    cdef whisper_token whisper_token_transcribe()
-    cdef void whisper_print_timings(whisper_context*)
-    cdef void whisper_reset_timings(whisper_context*)
-    # Unsupported base Klass='CtypesEnum'
-    cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
-    cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
-    cdef int whisper_full_n_segments(whisper_context*)
-    cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
-    cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
-    # Unknown CtypesSpecial name='c_char_p'
-    cdef int whisper_full_n_tokens(whisper_context*, int)
-    # Unknown CtypesSpecial name='c_char_p'
-    cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
-    cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
-    cdef float whisper_full_get_token_p(whisper_context*, int, int)
-    const char* whisper_print_system_info()
-    const char* whisper_full_get_segment_text(whisper_context*, int)
+	enum whisper_sampling_strategy:
+		WHISPER_SAMPLING_GREEDY = 0,
+		WHISPER_SAMPLING_BEAM_SEARCH,
+	ctypedef bint _Bool
+	ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
+	ctypedef _Bool whisper_encoder_begin_callback(whisper_context*, void*)
+	ctypedef int whisper_token
+	ctypedef struct whisper_token_data:
+		whisper_token id
+		whisper_token tid
+		float p
+		float pt
+		float ptsum
+		int64_t t0
+		int64_t t1
+		float vlen
+	ctypedef struct whisper_context:
+		pass
+	ctypedef struct anon_2:
+		int n_past
+	ctypedef struct anon_3:
+		int n_past
+		int beam_width
+		int n_best
+	ctypedef struct whisper_full_params:
+		int strategy
+		int n_threads
+		int n_max_text_ctx
+		int offset_ms
+		int duration_ms
+		_Bool translate
+		_Bool no_context
+		_Bool single_segment
+		_Bool print_special
+		_Bool print_progress
+		_Bool print_realtime
+		_Bool print_timestamps
+		_Bool token_timestamps
+		float thold_pt
+		float thold_ptsum
+		int max_len
+		int max_tokens
+		_Bool speed_up
+		int audio_ctx
+		whisper_token* prompt_tokens
+		int prompt_n_tokens
+		char* language
+		anon_2 greedy
+		anon_3 beam_search
+		whisper_new_segment_callback new_segment_callback
+		void* new_segment_callback_user_data
+		whisper_encoder_begin_callback encoder_begin_callback
+		void* encoder_begin_callback_user_data
+	whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
+	cdef whisper_context* whisper_init(char*)
+	cdef void whisper_free(whisper_context*)
+	cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
+	cdef int whisper_set_mel(whisper_context*, float*, int, int)
+	cdef int whisper_encode(whisper_context*, int, int)
+	cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
+	cdef whisper_token_data whisper_sample_best(whisper_context*)
+	cdef whisper_token whisper_sample_timestamp(whisper_context*)
+	cdef int whisper_lang_id(char*)
+	cdef int whisper_n_len(whisper_context*)
+	cdef int whisper_n_vocab(whisper_context*)
+	cdef int whisper_n_text_ctx(whisper_context*)
+	cdef int whisper_is_multilingual(whisper_context*)
+	cdef float* whisper_get_probs(whisper_context*)
+	# Unknown CtypesSpecial name='c_char_p'
+	cdef whisper_token whisper_token_eot(whisper_context*)
+	cdef whisper_token whisper_token_sot(whisper_context*)
+	cdef whisper_token whisper_token_prev(whisper_context*)
+	cdef whisper_token whisper_token_solm(whisper_context*)
+	cdef whisper_token whisper_token_not(whisper_context*)
+	cdef whisper_token whisper_token_beg(whisper_context*)
+	cdef whisper_token whisper_token_translate()
+	cdef whisper_token whisper_token_transcribe()
+	cdef void whisper_print_timings(whisper_context*)
+	cdef void whisper_reset_timings(whisper_context*)
+	# Unsupported base Klass='CtypesEnum'
+	cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
+	cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
+	cdef int whisper_full_n_segments(whisper_context*)
+	cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
+	cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
+	# Unknown CtypesSpecial name='c_char_p'
+	cdef int whisper_full_n_tokens(whisper_context*, int)
+	# Unknown CtypesSpecial name='c_char_p'
+	cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
+	cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
+	cdef float whisper_full_get_token_p(whisper_context*, int, int)
+	const char* whisper_print_system_info()
+	const char* whisper_full_get_segment_text(whisper_context*, int)

--- a/whispercpp.pyx
+++ b/whispercpp.pyx
@ -3,113 +3,167 @@

 import ffmpeg
 import numpy as np
-import requests
+import urllib.request
 import os
 from pathlib import Path

 MODELS_DIR = str(Path('~/.ggml-models').expanduser())
-print("Saving models to:", MODELS_DIR)


 cimport numpy as cnp

 cdef int SAMPLE_RATE = 16000
 cdef char* TEST_FILE = 'test.wav'
-cdef char* DEFAULT_MODEL = 'tiny'
-cdef char* LANGUAGE = b'fr'
+cdef char* DEFAULT_MODEL = 'base'
+cdef char* LANGUAGE = b'en'
-cdef int N_THREADS = os.cpu_count()
+# os.cpu_count() returns None when the CPU count cannot be determined, and
+# None cannot be coerced to a C int; fall back to a single thread then.
+cdef int N_THREADS = os.cpu_count() or 1
+# Defaults for the Whisper constructor's whisper.cpp output/translation switches.
+cdef _Bool PRINT_REALTIME = False
+cdef _Bool PRINT_PROGRESS = False
+cdef _Bool TRANSLATE = False
+

+# Maps each supported ggml model filename to the URL it is downloaded from.
+# download_model() looks entries up by filename (e.g. 'ggml-base.bin').
 MODELS = {
-    'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
-    'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
-    'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin',
-    'ggml-medium.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin',
-    'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin',
+	'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
+	'ggml-tiny.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin',
+	'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
+	'ggml-base.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin',
+	'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin',
+	'ggml-small.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin',
+	'ggml-medium.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin',
+	'ggml-medium.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin',
+	'ggml-large-v1.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin',
+	'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin',
 }

-def model_exists(model):
-    return os.path.exists(Path(MODELS_DIR).joinpath(model))
+def model_exists(model, models_dir=MODELS_DIR):
+	"""Reports whether a model file is already present on disk.
+
+	Args:
+	    model: The model filename, e.g. 'ggml-base.bin'
+	    models_dir: The directory that is searched for the file
+
+	Returns:
+	    True if the path models_dir/model exists, False otherwise.
+	"""
+	model_path = Path(models_dir) / model
+	return os.path.exists(model_path)

-def download_model(model):
-    if model_exists(model):
-        return
+def download_model(model, models_dir=MODELS_DIR):
+	"""Downloads the ggml model file with the given filename.

-    print(f'Downloading {model}...')
-    url = MODELS[model]
-    r = requests.get(url, allow_redirects=True)
-    os.makedirs(MODELS_DIR, exist_ok=True)
-    with open(Path(MODELS_DIR).joinpath(model), 'wb') as f:
-        f.write(r.content)
+	The filenames mirror the ones given in ggerganov's repos,
+	e.g. 'ggml-small.bin' (see the keys of MODELS).
+
+	Does nothing if the file already exists in models_dir. Otherwise the
+	download is streamed in chunks into a temporary '.part' file that is
+	renamed to its final name only once complete, so the whole model is
+	never held in memory and an interrupted download is not mistaken for
+	a finished model on the next call.
+
+	Args:
+	    model: The model filename, a key of MODELS
+	    models_dir: The path where the file is written to
+
+	Raises:
+	    KeyError: The given filename is not a key of MODELS
+	"""
+	if model_exists(model, models_dir=models_dir):
+		return
+
+	print(f'Downloading {model} to {models_dir}...')
+	url = MODELS[model]
+	os.makedirs(models_dir, exist_ok=True)
+	target = Path(models_dir).joinpath(model)
+	partial = Path(models_dir).joinpath(model + '.part')
+	try:
+		with urllib.request.urlopen(url) as r:
+			with open(partial, 'wb') as f:
+				while True:
+					# Models can be several GB; copy 1 MiB at a time.
+					chunk = r.read(1 << 20)
+					if not chunk:
+						break
+					f.write(chunk)
+		# Only a fully written file ever appears under the final name.
+		os.replace(partial, target)
+	finally:
+		# After a successful rename the '.part' file is gone and this is a
+		# no-op; after a failure it removes the incomplete download.
+		if os.path.exists(partial):
+			os.remove(partial)


 cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE):
+	# Decodes `file` with the ffmpeg command line tool to mono signed 16-bit
+	# PCM at `sr` Hz and returns the samples as a float32 array scaled by
+	# 1 / 2**15.
+	#
+	# Raises RuntimeError when the ffmpeg invocation fails for any reason; the
+	# original exception is chained as the cause.
-    try:
-        out = (
-            ffmpeg.input(file, threads=0)
-            .output(
-                "-", format="s16le",
-                acodec="pcm_s16le",
-                ac=1, ar=sr
-            )
-            .run(
-                cmd=["ffmpeg", "-nostdin"],
-                capture_stdout=True,
-                capture_stderr=True
-            )
-        )[0]
-    except:
-        raise RuntimeError(f"File '{file}' not found")
+	try:
+		out = (
+			ffmpeg.input(file, threads=0)
+			.output(
+				"-", format="s16le",
+				acodec="pcm_s16le",
+				ac=1, ar=sr
+			)
+			.run(
+				cmd=["ffmpeg", "-nostdin"],
+				capture_stdout=True,
+				capture_stderr=True
+			)
+		)[0]
+	except Exception as err:
+		# Every failure is reported with this message, not only a missing
+		# file, so chain the original exception to keep the real cause
+		# visible in the traceback.
+		raise RuntimeError(f"File '{file}' not found") from err

-    cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
-        np.frombuffer(out, np.int16)
-        .flatten()
-        .astype(np.float32)
-    ) / pow(2, 15)
+	cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
+		np.frombuffer(out, np.int16)
+		.flatten()
+		.astype(np.float32)
+	) / pow(2, 15)

-    return frames
-
-cdef whisper_full_params default_params() nogil:
-    cdef whisper_full_params params = whisper_full_default_params(
-        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
-    )
-    params.print_realtime = True
-    params.print_progress = True
-    params.translate = False
-    params.language = <const char *> LANGUAGE
-    n_threads = N_THREADS
-    return params
+	return frames

+cdef whisper_full_params set_params(_Bool print_realtime, _Bool print_progress, _Bool translate, char* language, int n_threads) nogil:
+	# Builds a whisper_full_params struct from whisper.cpp's defaults for the
+	# greedy sampling strategy and overrides the caller-configurable fields.
+	#
+	# The `language` pointer is stored as-is (no copy is made here), so the
+	# buffer it points to has to stay alive for as long as the params are used.
+	cdef whisper_full_params params = whisper_full_default_params(
+		whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
+	)
+	params.print_realtime = print_realtime
+	params.print_progress = print_progress
+	params.translate = translate
+	params.language = <const char *> language
+	# Must be written to the struct: assigning to the bare name only
+	# re-assigns the argument and silently drops the requested thread count.
+	params.n_threads = n_threads
+	return params

 cdef class Whisper:
-    cdef whisper_context * ctx
-    cdef whisper_full_params params
+	cdef whisper_context * ctx
+	cdef whisper_full_params params

-    def __init__(self, model=DEFAULT_MODEL, pb=None):
-        model_fullname = f'ggml-{model}.bin'.encode('utf8')
-        download_model(model_fullname)
-        model_path = Path(MODELS_DIR).joinpath(model_fullname)
-        cdef bytes model_b = str(model_path).encode('utf8')
-        self.ctx = whisper_init(model_b)
-        self.params = default_params()
-        whisper_print_system_info()
+	def __init__(self, model = DEFAULT_MODEL, models_dir = MODELS_DIR, _Bool print_realtime = PRINT_REALTIME, _Bool print_progress = PRINT_PROGRESS, _Bool translate = TRANSLATE, char* language = LANGUAGE, int n_threads = N_THREADS, _Bool print_system_info = False): # not pretty, look for a way to use kwargs?
+		"""Constructor for Whisper class.

-    def __dealloc__(self):
-        whisper_free(self.ctx)
+		Automatically checks for model and downloads it if necessary.

-    def transcribe(self, filename=TEST_FILE):
-        print("Loading data..")
-        cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename)
+		Args:
+		    model: Model identifier, e.g. 'base' (see MODELS); str or bytes
+		    models_dir: The path where the models should be stored
+		    print_realtime: whisper.cpp's real time transcription output
+		    print_progress: whisper.cpp's progress indicator
+		    translate: whisper.cpp's translation option
+		    language: Which language to use. Must be a byte string.
+		    n_threads: Amount of threads to use
+		    print_system_info: whisper.cpp's system info output
+
+		Raises:
+		    KeyError: No download URL is known for the given model
+		    RuntimeError: whisper.cpp could not load the model file
+		"""
+		cdef bytes model_b
+		# The default comes from a C `char*` constant, which reaches Python as
+		# bytes; formatting bytes into the f-string below would produce
+		# "ggml-b'base'.bin", so normalise to str first.
+		if isinstance(model, bytes):
+			model = model.decode('utf8')
+		model_fullname = f'ggml-{model}.bin'
+		download_model(model_fullname, models_dir=models_dir)
+		model_path = Path(models_dir).joinpath(model_fullname)
+		model_b = str(model_path).encode('utf8')
+		self.ctx = whisper_init(model_b)
+		# Fail here with a clear error instead of handing a NULL context to
+		# later whisper.cpp calls.
+		if self.ctx == NULL:
+			raise RuntimeError(f"Failed to load model from '{model_path}'")
+		# NOTE(review): set_params stores the raw `language` pointer without
+		# copying it; presumably the caller's bytes object must outlive this
+		# instance - confirm, or keep a reference on the instance.
+		self.params = set_params(print_realtime, print_progress, translate, language, n_threads)
+		if print_system_info:
+			# whisper_print_system_info() only returns the text, so print it here.
+			print(whisper_print_system_info().decode())

-        print("Transcribing..")
-        return whisper_full(self.ctx, self.params, &frames[0], len(frames))
-    
-    def extract_text(self, int res):
-        print("Extracting text...")
-        if res != 0:
-            raise RuntimeError
-        cdef int n_segments = whisper_full_n_segments(self.ctx)
-        return [
-            whisper_full_get_segment_text(self.ctx, i).decode() for i in range(n_segments)
-        ]
+	def __dealloc__(self):
+		# Hands the context obtained from whisper_init() in __init__ back to
+		# whisper.cpp when the object is deallocated.
+		whisper_free(self.ctx)
+
+	def transcribe(self, filename = TEST_FILE):
+		"""Runs whisper.cpp on the audio of the given file.
+
+		The file is decoded by load_audio(...) and the samples are passed
+		to whisper_full together with the params set in the constructor.
+
+		Args:
+		    filename: Path to file
+
+		Returns:
+		    The status code of whisper_full; pass it to extract_text(...),
+		    which accepts only 0.
+
+		Raises:
+		    RuntimeError: The given file could not be loaded
+		"""
+		cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] samples = load_audio(<bytes>filename)
+		cdef int status = whisper_full(self.ctx, self.params, &samples[0], len(samples))
+		return status
+	
+	def extract_text(self, int res):
+		"""Extracts the text from a transcription.
+
+		Args:
+		    res: The status code returned by transcribe(...)
+
+		Returns:
+		    A list with the text of each transcribed segment.
+
+		Raises:
+		    RuntimeError: The status code is non-zero.
+		"""
+		if res != 0:
+			# Include the code so a failure can be told apart from others.
+			raise RuntimeError(f"Transcription failed with status code {res}")
+		cdef int n_segments = whisper_full_n_segments(self.ctx)
+		return [
+			whisper_full_get_segment_text(self.ctx, i).decode() for i in range(n_segments)
+		]