diff --git a/README.md b/README.md index ca7942d..57174ac 100755 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ If you're interested in creating an HDF5 copy of your dataset, simply invoke: `p 5. Train the model using the following scripts: `python -m vall_e.train yaml=./data/config.yaml` * If distributing your training (for example, multi-GPU), use `deepspeed --module vall_e.train yaml="./data/config.yaml"` + + If you're not using the `deepspeed` backend, set `trainer.ddp = True` in the config YAML, then launch with `torchrun --nnodes=1 --nproc-per-node=4 -m vall_e.train yaml="./data/config.yaml"` You may quit your training any time by just entering `quit` in your CLI. The latest checkpoint will be automatically saved. diff --git a/vall_e/utils/trainer.py b/vall_e/utils/trainer.py index 31b0b12..21e061b 100755 --- a/vall_e/utils/trainer.py +++ b/vall_e/utils/trainer.py @@ -173,7 +173,10 @@ def train( elapsed_time = stats.get("elapsed_time", 0) - metrics = json.dumps(stats) + try: + metrics = json.dumps(stats) + except Exception as e: + metrics = str(stats) if cfg.trainer.no_logger: tqdm.write(f"Training Metrics: {truncate_json(metrics)}.")