diff --git a/README.md b/README.md index 1f46535..fb15d96 100755 --- a/README.md +++ b/README.md @@ -12,8 +12,12 @@ Besides a working PyTorch environment, the only hard requirement is [`espeak-ng` - Linux users can consult their package managers on installing `espeak`/`espeak-ng`. - Windows users are required to install [`espeak-ng`](https://github.com/espeak-ng/espeak-ng/releases/tag/1.51#Assets). + Additionally, you may be required to set the `PHONEMIZER_ESPEAK_LIBRARY` environment variable to specify the path to `libespeak-ng.dll`. + + Simply running `set PHONEMIZER_ESPEAK_LIBRARY="C:\Program Files\eSpeak NG\libespeak-ng.dll"` beforehand should fix this. - In the future, an internal homebrew to replace this would be fantastic. +AMD systems with ROCm are *mostly* supported, but performance ***will*** vary. +- ROCm is simply too inconsistent with outputs. + ## Install Simply run `pip install git+https://git.ecker.tech/mrq/vall-e` or `pip install git+https://github.com/e-c-k-e-r/vall-e`. 
diff --git a/setup.py b/setup.py index f4487fe..622d6ab 100755 --- a/setup.py +++ b/setup.py @@ -72,8 +72,8 @@ setup( "vocos", "descript-audio-codec", - # gradio web UI - "gradio" + # gradio web UI (my linux install doesn't like 5.x, windows is fine) + f"gradio{"<5.0.0" if not sys.platform.startswith("win") else ""}" ], extras_require = { diff --git a/vall_e/demo.py b/vall_e/demo.py index ccb5d8c..400cdd8 100644 --- a/vall_e/demo.py +++ b/vall_e/demo.py @@ -289,12 +289,12 @@ def main(): # generate demo output for dir in tqdm(speakers, desc=f"Generating demo for {k}"): - text = open(dir / "prompt.txt").read() + text = open(dir / "prompt.txt", encoding="utf-8").read() language = open(dir / "language.txt").read() if (dir / "language.txt").exists() else "en" prompt = dir / "prompt.wav" reference = dir / "reference.wav" out_path = dir / "out" / "ours.wav" - out_path_comparison = dir / "out" / f"ours_{comparison_kwargs["suffix"]}.wav" + out_path_comparison = dir / "out" / f"ours_{comparison_kwargs['suffix']}.wav" external_sources = [ dir / "out" / f"{source}.wav" for source in sources ] audio_samples = [ prompt, out_path ] diff --git a/vall_e/models/base.py b/vall_e/models/base.py index 8a2fe19..80078ec 100755 --- a/vall_e/models/base.py +++ b/vall_e/models/base.py @@ -539,10 +539,13 @@ class Base(nn.Module): self.len_emb = Embedding(11, d_model) if "len" in self.capabilities else None if attention_backend == "auto": + attention_backend = "sdpa" + """ if AVAILABLE_ATTENTIONS: attention_backend = AVAILABLE_ATTENTIONS[0] else: attention_backend = "default" + """ hf_attention = attention_backend HF_ATTENTIONS = ["eager", "sdpa", "flash_attention_2"]