init
Changes made (not exhaustive): - changed defaults - whisper.cpp submodule set to 1.2.0 - removed `requests` dependency - models dir can be changed in constructor - added support for setting params - added back support for `large-v1` model - added support for english-only models
This commit is contained in:
parent
e2581c8aad
commit
af035ea355
40
.github/workflows/build_wheels.yml
vendored
40
.github/workflows/build_wheels.yml
vendored
@ -1,40 +0,0 @@
|
|||||||
name: build_wheels
|
|
||||||
run-name: ${{ github.actor }} is building wheels
|
|
||||||
on: [push]
|
|
||||||
jobs:
|
|
||||||
build_wheels:
|
|
||||||
runs-on: ${{ matrix.os }}
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
os: [ubuntu-latest, macos-latest, windows-latest]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v2
|
|
||||||
- name: Checkout submodules
|
|
||||||
run: |
|
|
||||||
git submodule update --init --recursive
|
|
||||||
|
|
||||||
- uses: actions/setup-python@v2
|
|
||||||
with:
|
|
||||||
python-version: '3.10'
|
|
||||||
|
|
||||||
- name: Setup pip
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
python -m pip install cibuildwheel==1.6.4
|
|
||||||
|
|
||||||
- name: Install
|
|
||||||
run: |
|
|
||||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
|
||||||
sudo apt-get update
|
|
||||||
sudo apt-get install gcc g++
|
|
||||||
fi
|
|
||||||
shell: bash
|
|
||||||
|
|
||||||
- name: Build wheel
|
|
||||||
run: python -m cibuildwheel --output-dir dist/
|
|
||||||
env:
|
|
||||||
CIBW_BUILD: cp36-* cp37-* cp38-*
|
|
||||||
|
|
||||||
- uses: actions/upload-artifact@v2
|
|
||||||
with:
|
|
||||||
path: ./dist/*.whl
|
|
2
.gitignore
vendored
2
.gitignore
vendored
@ -127,3 +127,5 @@ dmypy.json
|
|||||||
|
|
||||||
# Pyre type checker
|
# Pyre type checker
|
||||||
.pyre/
|
.pyre/
|
||||||
|
|
||||||
|
whispercpp.cpp
|
||||||
|
1
.gitmodules
vendored
1
.gitmodules
vendored
@ -1,3 +1,4 @@
|
|||||||
[submodule "whisper.cpp"]
|
[submodule "whisper.cpp"]
|
||||||
path = whisper.cpp
|
path = whisper.cpp
|
||||||
url = https://github.com/ggerganov/whisper.cpp
|
url = https://github.com/ggerganov/whisper.cpp
|
||||||
|
branch = b2083c5d02db9a1e6dbb3d58254fd65ebfff4b5d
|
||||||
|
24
README.md
24
README.md
@ -1,12 +1,23 @@
|
|||||||
Python bindings for whisper.cpp
|
Python bindings for whisper.cpp
|
||||||
===============================
|
===============================
|
||||||
|
|
||||||
`pip install git+https://github.com/o4dev/whispercpp.py`
|
```
|
||||||
|
git clone --recurse-submodules https://git.ecker.tech/lightmare/whispercpp.py
|
||||||
|
cd whispercpp.py
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
or
|
||||||
|
```
|
||||||
|
git clone https://git.ecker.tech/lightmare/whispercpp.py
|
||||||
|
cd whispercpp.py
|
||||||
|
git submodule update --init
|
||||||
|
pip install .
|
||||||
|
```
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from whispercpp import Whisper
|
from whispercpp import Whisper
|
||||||
|
|
||||||
w = Whisper('tiny')
|
w = Whisper('tiny', models_dir='./models/', language=b'en')
|
||||||
|
|
||||||
result = w.transcribe("myfile.mp3")
|
result = w.transcribe("myfile.mp3")
|
||||||
text = w.extract_text(result)
|
text = w.extract_text(result)
|
||||||
@ -14,3 +25,12 @@ text = w.extract_text(result)
|
|||||||
|
|
||||||
Note: default parameters might need to be tweaked.
|
Note: default parameters might need to be tweaked.
|
||||||
See Whispercpp.pyx.
|
See Whispercpp.pyx.
|
||||||
|
|
||||||
|
Changes made (not exhaustive):
|
||||||
|
- changed defaults
|
||||||
|
- whisper.cpp submodule set to 1.2.0
|
||||||
|
- removed `requests` dependency
|
||||||
|
- models dir can be changed in constructor
|
||||||
|
- added support for setting params
|
||||||
|
- added back support for `large-v1` model
|
||||||
|
- added support for english-only models
|
||||||
|
3
requirements.txt
Normal file
3
requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
Cython
|
||||||
|
numpy
|
||||||
|
ffmpeg-python
|
3
setup.py
3
setup.py
@ -34,7 +34,6 @@ setup(
|
|||||||
include_dirs = ['./whisper.cpp/', numpy.get_include()],
|
include_dirs = ['./whisper.cpp/', numpy.get_include()],
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'numpy',
|
'numpy',
|
||||||
'ffmpeg-python',
|
'ffmpeg-python'
|
||||||
'requests'
|
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
210
whispercpp.pxd
210
whispercpp.pxd
@ -3,111 +3,111 @@
|
|||||||
from libc.stdint cimport int64_t
|
from libc.stdint cimport int64_t
|
||||||
|
|
||||||
cdef nogil:
|
cdef nogil:
|
||||||
int WHISPER_SAMPLE_RATE = 16000
|
int WHISPER_SAMPLE_RATE = 16000
|
||||||
int WHISPER_N_FFT = 400
|
int WHISPER_N_FFT = 400
|
||||||
int WHISPER_N_MEL = 80
|
int WHISPER_N_MEL = 80
|
||||||
int WHISPER_HOP_LENGTH = 160
|
int WHISPER_HOP_LENGTH = 160
|
||||||
int WHISPER_CHUNK_SIZE = 30
|
int WHISPER_CHUNK_SIZE = 30
|
||||||
int SAMPLE_RATE = 16000
|
int SAMPLE_RATE = 16000
|
||||||
char* TEST_FILE = b'test.wav'
|
char* TEST_FILE = b'test.wav'
|
||||||
char* DEFAULT_MODEL = b'ggml-tiny.bin'
|
char* DEFAULT_MODEL = b'ggml-base.bin'
|
||||||
char* LANGUAGE = b'fr'
|
char* LANGUAGE = b'en'
|
||||||
ctypedef struct audio_data:
|
ctypedef struct audio_data:
|
||||||
float* frames;
|
float* frames;
|
||||||
int n_frames;
|
int n_frames;
|
||||||
|
|
||||||
cdef extern from "whisper.h" nogil:
|
cdef extern from "whisper.h" nogil:
|
||||||
enum whisper_sampling_strategy:
|
enum whisper_sampling_strategy:
|
||||||
WHISPER_SAMPLING_GREEDY = 0,
|
WHISPER_SAMPLING_GREEDY = 0,
|
||||||
WHISPER_SAMPLING_BEAM_SEARCH,
|
WHISPER_SAMPLING_BEAM_SEARCH,
|
||||||
ctypedef bint _Bool
|
ctypedef bint _Bool
|
||||||
ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
|
ctypedef void (*whisper_new_segment_callback)(whisper_context*, int, void*)
|
||||||
ctypedef _Bool whisper_encoder_begin_callback(whisper_context*, void*)
|
ctypedef _Bool whisper_encoder_begin_callback(whisper_context*, void*)
|
||||||
ctypedef int whisper_token
|
ctypedef int whisper_token
|
||||||
ctypedef struct whisper_token_data:
|
ctypedef struct whisper_token_data:
|
||||||
whisper_token id
|
whisper_token id
|
||||||
whisper_token tid
|
whisper_token tid
|
||||||
float p
|
float p
|
||||||
float pt
|
float pt
|
||||||
float ptsum
|
float ptsum
|
||||||
int64_t t0
|
int64_t t0
|
||||||
int64_t t1
|
int64_t t1
|
||||||
float vlen
|
float vlen
|
||||||
ctypedef struct whisper_context:
|
ctypedef struct whisper_context:
|
||||||
pass
|
pass
|
||||||
ctypedef struct anon_2:
|
ctypedef struct anon_2:
|
||||||
int n_past
|
int n_past
|
||||||
ctypedef struct anon_3:
|
ctypedef struct anon_3:
|
||||||
int n_past
|
int n_past
|
||||||
int beam_width
|
int beam_width
|
||||||
int n_best
|
int n_best
|
||||||
ctypedef struct whisper_full_params:
|
ctypedef struct whisper_full_params:
|
||||||
int strategy
|
int strategy
|
||||||
int n_threads
|
int n_threads
|
||||||
int n_max_text_ctx
|
int n_max_text_ctx
|
||||||
int offset_ms
|
int offset_ms
|
||||||
int duration_ms
|
int duration_ms
|
||||||
_Bool translate
|
_Bool translate
|
||||||
_Bool no_context
|
_Bool no_context
|
||||||
_Bool single_segment
|
_Bool single_segment
|
||||||
_Bool print_special
|
_Bool print_special
|
||||||
_Bool print_progress
|
_Bool print_progress
|
||||||
_Bool print_realtime
|
_Bool print_realtime
|
||||||
_Bool print_timestamps
|
_Bool print_timestamps
|
||||||
_Bool token_timestamps
|
_Bool token_timestamps
|
||||||
float thold_pt
|
float thold_pt
|
||||||
float thold_ptsum
|
float thold_ptsum
|
||||||
int max_len
|
int max_len
|
||||||
int max_tokens
|
int max_tokens
|
||||||
_Bool speed_up
|
_Bool speed_up
|
||||||
int audio_ctx
|
int audio_ctx
|
||||||
whisper_token* prompt_tokens
|
whisper_token* prompt_tokens
|
||||||
int prompt_n_tokens
|
int prompt_n_tokens
|
||||||
char* language
|
char* language
|
||||||
anon_2 greedy
|
anon_2 greedy
|
||||||
anon_3 beam_search
|
anon_3 beam_search
|
||||||
whisper_new_segment_callback new_segment_callback
|
whisper_new_segment_callback new_segment_callback
|
||||||
void* new_segment_callback_user_data
|
void* new_segment_callback_user_data
|
||||||
whisper_encoder_begin_callback encoder_begin_callback
|
whisper_encoder_begin_callback encoder_begin_callback
|
||||||
void* encoder_begin_callback_user_data
|
void* encoder_begin_callback_user_data
|
||||||
whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
|
whisper_full_params whisper_full_default_params(whisper_sampling_strategy)
|
||||||
cdef whisper_context* whisper_init(char*)
|
cdef whisper_context* whisper_init(char*)
|
||||||
cdef void whisper_free(whisper_context*)
|
cdef void whisper_free(whisper_context*)
|
||||||
cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
|
cdef int whisper_pcm_to_mel(whisper_context*, float*, int, int)
|
||||||
cdef int whisper_set_mel(whisper_context*, float*, int, int)
|
cdef int whisper_set_mel(whisper_context*, float*, int, int)
|
||||||
cdef int whisper_encode(whisper_context*, int, int)
|
cdef int whisper_encode(whisper_context*, int, int)
|
||||||
cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
|
cdef int whisper_decode(whisper_context*, whisper_token*, int, int, int)
|
||||||
cdef whisper_token_data whisper_sample_best(whisper_context*)
|
cdef whisper_token_data whisper_sample_best(whisper_context*)
|
||||||
cdef whisper_token whisper_sample_timestamp(whisper_context*)
|
cdef whisper_token whisper_sample_timestamp(whisper_context*)
|
||||||
cdef int whisper_lang_id(char*)
|
cdef int whisper_lang_id(char*)
|
||||||
cdef int whisper_n_len(whisper_context*)
|
cdef int whisper_n_len(whisper_context*)
|
||||||
cdef int whisper_n_vocab(whisper_context*)
|
cdef int whisper_n_vocab(whisper_context*)
|
||||||
cdef int whisper_n_text_ctx(whisper_context*)
|
cdef int whisper_n_text_ctx(whisper_context*)
|
||||||
cdef int whisper_is_multilingual(whisper_context*)
|
cdef int whisper_is_multilingual(whisper_context*)
|
||||||
cdef float* whisper_get_probs(whisper_context*)
|
cdef float* whisper_get_probs(whisper_context*)
|
||||||
# Unknown CtypesSpecial name='c_char_p'
|
# Unknown CtypesSpecial name='c_char_p'
|
||||||
cdef whisper_token whisper_token_eot(whisper_context*)
|
cdef whisper_token whisper_token_eot(whisper_context*)
|
||||||
cdef whisper_token whisper_token_sot(whisper_context*)
|
cdef whisper_token whisper_token_sot(whisper_context*)
|
||||||
cdef whisper_token whisper_token_prev(whisper_context*)
|
cdef whisper_token whisper_token_prev(whisper_context*)
|
||||||
cdef whisper_token whisper_token_solm(whisper_context*)
|
cdef whisper_token whisper_token_solm(whisper_context*)
|
||||||
cdef whisper_token whisper_token_not(whisper_context*)
|
cdef whisper_token whisper_token_not(whisper_context*)
|
||||||
cdef whisper_token whisper_token_beg(whisper_context*)
|
cdef whisper_token whisper_token_beg(whisper_context*)
|
||||||
cdef whisper_token whisper_token_translate()
|
cdef whisper_token whisper_token_translate()
|
||||||
cdef whisper_token whisper_token_transcribe()
|
cdef whisper_token whisper_token_transcribe()
|
||||||
cdef void whisper_print_timings(whisper_context*)
|
cdef void whisper_print_timings(whisper_context*)
|
||||||
cdef void whisper_reset_timings(whisper_context*)
|
cdef void whisper_reset_timings(whisper_context*)
|
||||||
# Unsupported base Klass='CtypesEnum'
|
# Unsupported base Klass='CtypesEnum'
|
||||||
cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
|
cdef int whisper_full(whisper_context*, whisper_full_params, float*, int)
|
||||||
cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
|
cdef int whisper_full_parallel(whisper_context*, whisper_full_params, float*, int, int)
|
||||||
cdef int whisper_full_n_segments(whisper_context*)
|
cdef int whisper_full_n_segments(whisper_context*)
|
||||||
cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
|
cdef int64_t whisper_full_get_segment_t0(whisper_context*, int)
|
||||||
cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
|
cdef int64_t whisper_full_get_segment_t1(whisper_context*, int)
|
||||||
# Unknown CtypesSpecial name='c_char_p'
|
# Unknown CtypesSpecial name='c_char_p'
|
||||||
cdef int whisper_full_n_tokens(whisper_context*, int)
|
cdef int whisper_full_n_tokens(whisper_context*, int)
|
||||||
# Unknown CtypesSpecial name='c_char_p'
|
# Unknown CtypesSpecial name='c_char_p'
|
||||||
cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
|
cdef whisper_token whisper_full_get_token_id(whisper_context*, int, int)
|
||||||
cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
|
cdef whisper_token_data whisper_full_get_token_data(whisper_context*, int, int)
|
||||||
cdef float whisper_full_get_token_p(whisper_context*, int, int)
|
cdef float whisper_full_get_token_p(whisper_context*, int, int)
|
||||||
const char* whisper_print_system_info()
|
const char* whisper_print_system_info()
|
||||||
const char* whisper_full_get_segment_text(whisper_context*, int)
|
const char* whisper_full_get_segment_text(whisper_context*, int)
|
||||||
|
|
||||||
|
210
whispercpp.pyx
210
whispercpp.pyx
@ -3,113 +3,167 @@
|
|||||||
|
|
||||||
import ffmpeg
|
import ffmpeg
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import requests
|
import urllib.request
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
MODELS_DIR = str(Path('~/.ggml-models').expanduser())
|
MODELS_DIR = str(Path('~/.ggml-models').expanduser())
|
||||||
print("Saving models to:", MODELS_DIR)
|
|
||||||
|
|
||||||
|
|
||||||
cimport numpy as cnp
|
cimport numpy as cnp
|
||||||
|
|
||||||
cdef int SAMPLE_RATE = 16000
|
cdef int SAMPLE_RATE = 16000
|
||||||
cdef char* TEST_FILE = 'test.wav'
|
cdef char* TEST_FILE = 'test.wav'
|
||||||
cdef char* DEFAULT_MODEL = 'tiny'
|
cdef char* DEFAULT_MODEL = 'base'
|
||||||
cdef char* LANGUAGE = b'fr'
|
cdef char* LANGUAGE = b'en'
|
||||||
cdef int N_THREADS = os.cpu_count()
|
cdef int N_THREADS = os.cpu_count()
|
||||||
|
cdef _Bool PRINT_REALTIME = False
|
||||||
|
cdef _Bool PRINT_PROGRESS = False
|
||||||
|
cdef _Bool TRANSLATE = False
|
||||||
|
|
||||||
|
|
||||||
MODELS = {
|
MODELS = {
|
||||||
'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
|
'ggml-tiny.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin',
|
||||||
'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
|
'ggml-tiny.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin',
|
||||||
'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin',
|
'ggml-base.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.bin',
|
||||||
'ggml-medium.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin',
|
'ggml-base.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin',
|
||||||
'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin',
|
'ggml-small.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.bin',
|
||||||
|
'ggml-small.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin',
|
||||||
|
'ggml-medium.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin',
|
||||||
|
'ggml-medium.en.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin',
|
||||||
|
'ggml-large-v1.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin',
|
||||||
|
'ggml-large.bin': 'https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-large.bin',
|
||||||
}
|
}
|
||||||
|
|
||||||
def model_exists(model):
|
def model_exists(model, models_dir=MODELS_DIR):
|
||||||
return os.path.exists(Path(MODELS_DIR).joinpath(model))
|
return os.path.exists(Path(models_dir).joinpath(model))
|
||||||
|
|
||||||
def download_model(model):
|
def download_model(model, models_dir=MODELS_DIR):
|
||||||
if model_exists(model):
|
"""Downloads ggml model with the given identifier
|
||||||
return
|
|
||||||
|
|
||||||
print(f'Downloading {model}...')
|
The filenames mirror the ones given in ggerganov's repos.
|
||||||
url = MODELS[model]
|
e.g. 'small' becomes 'ggml-small.bin'
|
||||||
r = requests.get(url, allow_redirects=True)
|
|
||||||
os.makedirs(MODELS_DIR, exist_ok=True)
|
Args:
|
||||||
with open(Path(MODELS_DIR).joinpath(model), 'wb') as f:
|
model: The model file name, i.e. a key of MODELS such as 'ggml-small.bin'
|
||||||
f.write(r.content)
|
models_dir: The path where the file is written to
|
||||||
|
"""
|
||||||
|
if model_exists(model, models_dir=models_dir):
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f'Downloading {model} to {models_dir}...')
|
||||||
|
url = MODELS[model]
|
||||||
|
os.makedirs(models_dir, exist_ok=True)
|
||||||
|
with urllib.request.urlopen(url) as r:
|
||||||
|
with open(Path(models_dir).joinpath(model), 'wb') as f:
|
||||||
|
f.write(r.read())
|
||||||
|
|
||||||
|
|
||||||
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE):
|
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] load_audio(bytes file, int sr = SAMPLE_RATE):
|
||||||
try:
|
try:
|
||||||
out = (
|
out = (
|
||||||
ffmpeg.input(file, threads=0)
|
ffmpeg.input(file, threads=0)
|
||||||
.output(
|
.output(
|
||||||
"-", format="s16le",
|
"-", format="s16le",
|
||||||
acodec="pcm_s16le",
|
acodec="pcm_s16le",
|
||||||
ac=1, ar=sr
|
ac=1, ar=sr
|
||||||
)
|
)
|
||||||
.run(
|
.run(
|
||||||
cmd=["ffmpeg", "-nostdin"],
|
cmd=["ffmpeg", "-nostdin"],
|
||||||
capture_stdout=True,
|
capture_stdout=True,
|
||||||
capture_stderr=True
|
capture_stderr=True
|
||||||
)
|
)
|
||||||
)[0]
|
)[0]
|
||||||
except:
|
except Exception:
|
||||||
raise RuntimeError(f"File '{file}' not found")
|
raise RuntimeError(f"File '{file}' not found")
|
||||||
|
|
||||||
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
|
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = (
|
||||||
np.frombuffer(out, np.int16)
|
np.frombuffer(out, np.int16)
|
||||||
.flatten()
|
.flatten()
|
||||||
.astype(np.float32)
|
.astype(np.float32)
|
||||||
) / pow(2, 15)
|
) / pow(2, 15)
|
||||||
|
|
||||||
return frames
|
return frames
|
||||||
|
|
||||||
cdef whisper_full_params default_params() nogil:
|
|
||||||
cdef whisper_full_params params = whisper_full_default_params(
|
|
||||||
whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
|
|
||||||
)
|
|
||||||
params.print_realtime = True
|
|
||||||
params.print_progress = True
|
|
||||||
params.translate = False
|
|
||||||
params.language = <const char *> LANGUAGE
|
|
||||||
n_threads = N_THREADS
|
|
||||||
return params
|
|
||||||
|
|
||||||
|
cdef whisper_full_params set_params(_Bool print_realtime, _Bool print_progress, _Bool translate, char* language, int n_threads) nogil:
    # Build the whisper_full_params used for transcription.
    #
    # Starts from whisper.cpp's defaults for the greedy sampling strategy and
    # overrides only the fields exposed through the Whisper constructor.
    #
    # Args:
    #     print_realtime: copied into params.print_realtime
    #     print_progress: copied into params.print_progress
    #     translate: copied into params.translate
    #     language: C string stored in params.language
    #     n_threads: copied into params.n_threads
    #
    # Returns:
    #     The populated whisper_full_params struct (returned by value).
    cdef whisper_full_params params = whisper_full_default_params(
        whisper_sampling_strategy.WHISPER_SAMPLING_GREEDY
    )
    params.print_realtime = print_realtime
    params.print_progress = print_progress
    params.translate = translate
    # Only the pointer is stored, not a copy of the string.
    # NOTE(review): the caller presumably has to keep the underlying buffer
    # alive for as long as `params` is used -- confirm at the call site.
    params.language = <const char *> language
    # Previously `n_threads = n_threads`, a self-assignment that never reached
    # the struct, so the requested thread count was silently ignored.
    params.n_threads = n_threads
    return params
|
||||||
|
|
||||||
cdef class Whisper:
|
cdef class Whisper:
|
||||||
cdef whisper_context * ctx
|
cdef whisper_context * ctx
|
||||||
cdef whisper_full_params params
|
cdef whisper_full_params params
|
||||||
|
|
||||||
def __init__(self, model=DEFAULT_MODEL, pb=None):
|
def __init__(self, model = DEFAULT_MODEL, models_dir = MODELS_DIR, _Bool print_realtime = PRINT_REALTIME, _Bool print_progress = PRINT_PROGRESS, _Bool translate = TRANSLATE, char* language = LANGUAGE, int n_threads = N_THREADS, _Bool print_system_info = False): # not pretty, look for a way to use kwargs?
|
||||||
model_fullname = f'ggml-{model}.bin'.encode('utf8')
|
"""Constructor for Whisper class.
|
||||||
download_model(model_fullname)
|
|
||||||
model_path = Path(MODELS_DIR).joinpath(model_fullname)
|
|
||||||
cdef bytes model_b = str(model_path).encode('utf8')
|
|
||||||
self.ctx = whisper_init(model_b)
|
|
||||||
self.params = default_params()
|
|
||||||
whisper_print_system_info()
|
|
||||||
|
|
||||||
def __dealloc__(self):
|
Automatically checks for model and downloads it if necessary.
|
||||||
whisper_free(self.ctx)
|
|
||||||
|
|
||||||
def transcribe(self, filename=TEST_FILE):
|
Args:
|
||||||
print("Loading data..")
|
model: Model identifier, e.g. 'base' (see MODELS)
|
||||||
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename)
|
models_dir: The path where the models should be stored
|
||||||
|
print_realtime: whisper.cpp's real time transcription output
|
||||||
|
print_progress: whisper.cpp's progress indicator
|
||||||
|
translate: whisper.cpp's translation option
|
||||||
|
language: Which language to use. Must be a byte string.
|
||||||
|
n_threads: Number of threads to use
|
||||||
|
print_system_info: whisper.cpp's system info output
|
||||||
|
"""
|
||||||
|
model_fullname = f'ggml-{model}.bin' #.encode('utf8')
|
||||||
|
download_model(model_fullname, models_dir=models_dir)
|
||||||
|
model_path = Path(models_dir).joinpath(model_fullname)
|
||||||
|
cdef bytes model_b = str(model_path).encode('utf8')
|
||||||
|
self.ctx = whisper_init(model_b)
|
||||||
|
self.params = set_params(print_realtime, print_progress, translate, language, n_threads)
|
||||||
|
if print_system_info:
|
||||||
|
whisper_print_system_info()
|
||||||
|
|
||||||
print("Transcribing..")
|
def __dealloc__(self):
|
||||||
return whisper_full(self.ctx, self.params, &frames[0], len(frames))
|
whisper_free(self.ctx)
|
||||||
|
|
||||||
def extract_text(self, int res):
|
def transcribe(self, filename = TEST_FILE):
|
||||||
print("Extracting text...")
|
"""Transcribes from given file.
|
||||||
if res != 0:
|
|
||||||
raise RuntimeError
|
Args:
|
||||||
cdef int n_segments = whisper_full_n_segments(self.ctx)
|
filename: Path to file
|
||||||
return [
|
|
||||||
whisper_full_get_segment_text(self.ctx, i).decode() for i in range(n_segments)
|
Returns:
|
||||||
]
|
The integer status returned by whisper_full; extract_text(...) accepts it only when it is 0
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: The file could not be loaded through ffmpeg (any failure is reported as "not found")
|
||||||
|
"""
|
||||||
|
|
||||||
|
#print(f"Loading data from '{filename}'...")
|
||||||
|
cdef cnp.ndarray[cnp.float32_t, ndim=1, mode="c"] frames = load_audio(<bytes>filename)
|
||||||
|
|
||||||
|
#print("Transcribing..")
|
||||||
|
return whisper_full(self.ctx, self.params, &frames[0], len(frames))
|
||||||
|
|
||||||
|
def extract_text(self, int res):
|
||||||
|
"""Extracts the text from a transcription.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
res: The integer status returned by transcribe(...); must be 0
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A list of transcribed strings.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RuntimeError: The given status `res` was not 0.
|
||||||
|
"""
|
||||||
|
#print("Extracting text...")
|
||||||
|
if res != 0:
|
||||||
|
raise RuntimeError
|
||||||
|
cdef int n_segments = whisper_full_n_segments(self.ctx)
|
||||||
|
return [
|
||||||
|
whisper_full_get_segment_text(self.ctx, i).decode() for i in range(n_segments)
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user