From 277dcec484f607947c730751752e1f78e8161809 Mon Sep 17 00:00:00 2001 From: mrq Date: Sat, 4 May 2024 12:33:43 -0500 Subject: [PATCH] apparently I got an error when trying to serialize an errant tensor that made its way into the JSON; this could be remedied easily by recursively traversing the dict and coercing any objects to primitives, but I'm tired and I just want to start training and nap --- README.md | 1 + vall_e/utils/trainer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ca7942d..57174ac 100755 --- a/README.md +++ b/README.md @@ -64,6 +64,7 @@ If you're interested in creating an HDF5 copy of your dataset, simply invoke: `p 5. Train the model using the following scripts: `python -m vall_e.train yaml=./data/config.yaml` * If distributing your training (for example, multi-GPU), use `deepspeed --module vall_e.train yaml="./data/config.yaml"` + + if you're not using the `deepspeed` backend, set `trainer.ddp = True` in the config YAML, then launch with `torchrun --nnodes=1 --nproc-per-node=4 -m vall_e.train yaml="./data/config.yaml"` You may quit your training any time by just entering `quit` in your CLI. The latest checkpoint will be automatically saved. diff --git a/vall_e/utils/trainer.py b/vall_e/utils/trainer.py index 31b0b12..21e061b 100755 --- a/vall_e/utils/trainer.py +++ b/vall_e/utils/trainer.py @@ -173,7 +173,10 @@ def train( elapsed_time = stats.get("elapsed_time", 0) - metrics = json.dumps(stats) + try: + metrics = json.dumps(stats) + except Exception as e: + metrics = str(stats) if cfg.trainer.no_logger: tqdm.write(f"Training Metrics: {truncate_json(metrics)}.")