more windows specific fixes, limit gradio to <5.0.0 on linux (it works on windows, but not on my linux machine tm)

This commit is contained in:
mrq 2024-11-04 18:00:33 -06:00
parent c83670c38c
commit 9e65e05e83
4 changed files with 11 additions and 4 deletions

View File

@ -12,8 +12,12 @@ Besides a working PyTorch environment, the only hard requirement is [`espeak-ng`
- Linux users can consult their package managers on installing `espeak`/`espeak-ng`.
- Windows users are required to install [`espeak-ng`](https://github.com/espeak-ng/espeak-ng/releases/tag/1.51#Assets).
+ additionally, you may be required to set the `PHONEMIZER_ESPEAK_LIBRARY` environment variable to specify the path to `libespeak-ng.dll`.
+ Simply running `set PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll"` beforehand should fix this.
- In the future, an internal homebrew to replace this would be fantastic.
AMD systems with ROCm are *mostly* supported, but performance ***will*** vary.
- ROCm is simply too inconsistent with outputs.
## Install
Simply run `pip install git+https://git.ecker.tech/mrq/vall-e` or `pip install git+https://github.com/e-c-k-e-r/vall-e`.

View File

@ -72,8 +72,8 @@ setup(
"vocos",
"descript-audio-codec",
# gradio web UI
"gradio"
# gradio web UI (my linux install doesn't like 5.x, windows is fine)
f"gradio{'<5.0.0' if not sys.platform.startswith('win') else ''}"
],
extras_require = {

View File

@ -289,12 +289,12 @@ def main():
# generate demo output
for dir in tqdm(speakers, desc=f"Generating demo for {k}"):
text = open(dir / "prompt.txt").read()
text = open(dir / "prompt.txt", encoding="utf-8").read()
language = open(dir / "language.txt").read() if (dir / "language.txt").exists() else "en"
prompt = dir / "prompt.wav"
reference = dir / "reference.wav"
out_path = dir / "out" / "ours.wav"
out_path_comparison = dir / "out" / f"ours_{comparison_kwargs["suffix"]}.wav"
out_path_comparison = dir / "out" / f"ours_{comparison_kwargs['suffix']}.wav"
external_sources = [ dir / "out" / f"{source}.wav" for source in sources ]
audio_samples = [ prompt, out_path ]

View File

@ -539,10 +539,13 @@ class Base(nn.Module):
self.len_emb = Embedding(11, d_model) if "len" in self.capabilities else None
if attention_backend == "auto":
attention_backend = "sdpa"
"""
if AVAILABLE_ATTENTIONS:
attention_backend = AVAILABLE_ATTENTIONS[0]
else:
attention_backend = "default"
"""
hf_attention = attention_backend
HF_ATTENTIONS = ["eager", "sdpa", "flash_attention_2"]