From 132a02c48b05174f3c82f1af6de860c2d95216f9 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 9 Jun 2024 11:22:52 -0500
Subject: [PATCH] sanity cleanup, backup config yaml for each log file

---
 vall_e/config.py              | 36 ++++++++++++++++++++----------------
 vall_e/data.py                |  4 ++--
 vall_e/engines/__init__.py    |  2 +-
 vall_e/models/ar_nar.py       |  6 +++---
 vall_e/models/experimental.py |  6 +++---
 vall_e/models/nar.py          |  6 +++---
 vall_e/plot.py                |  4 ++--
 vall_e/train.py               |  5 +++++
 vall_e/utils/trainer.py       |  4 ++--
 9 files changed, 41 insertions(+), 32 deletions(-)

diff --git a/vall_e/config.py b/vall_e/config.py
index 33148bd..e91e5d8 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -23,31 +23,35 @@ from transformers import PreTrainedTokenizerFast
 
 @dataclass()
 class BaseConfig:
-	cfg_path: str | None = None
+	yaml_path: str | None = None
 
 	@property
-	def relpath(self):
+	def cfg_path(self):
+		return Path(self.yaml_path.parent) if self.yaml_path is not None else None
+
+	@property
+	def rel_path(self):
 		return Path(self.cfg_path)
 
 	@property
 	def cache_dir(self):
-		return self.relpath / ".cache"
+		return self.rel_path / ".cache"
 
 	@property
 	def data_dir(self):
-		return self.relpath / "data"
+		return self.rel_path / "data"
 
 	@property
 	def metadata_dir(self):
-		return self.relpath / "metadata"
+		return self.rel_path / "metadata"
 
 	@property
 	def ckpt_dir(self):
-		return self.relpath / "ckpt"
+		return self.rel_path / "ckpt"
 
 	@property
 	def log_dir(self):
-		return self.relpath / "logs" / str(self.start_time)
+		return self.rel_path / "logs" / str(self.start_time)
 
 	@cached_property
 	def start_time(self):
@@ -98,9 +102,9 @@ class BaseConfig:
 		state = {}
 
 		if args.yaml:
-			cfg_path = args.yaml
-			state = yaml.safe_load(open(cfg_path, "r", encoding="utf-8"))
-			state.setdefault("cfg_path", cfg_path.parent)
+			yaml_path = args.yaml
+			state = yaml.safe_load(open(yaml_path, "r", encoding="utf-8"))
+			state.setdefault("yaml_path", yaml_path)
 
 		return cls(**state)
 
@@ -376,10 +380,10 @@ class DeepSpeed:
 			autotune_params['enabled'] = True
 
 		if "results_dir" not in autotune_params:
-			autotune_params['results_dir'] = str( cfg.relpath / "autotune" / "results" )
+			autotune_params['results_dir'] = str( cfg.rel_path / "autotune" / "results" )
 
 		if "exps_dir" not in autotune_params:
-			autotune_params['exps_dir'] = str( cfg.relpath / "autotune" / "exps_" )
+			autotune_params['exps_dir'] = str( cfg.rel_path / "autotune" / "exps_" )
 
 		# DeepSpeed fp16 is incompatible with its AMP
 		if cfg.trainer.weight_dtype.lower() == "float16":
@@ -653,7 +657,7 @@ class Config(BaseConfig):
 
 	@cached_property
 	def diskcache(self):
-		if self.cfg_path is not None and self.dataset.cache:
+		if self.yaml_path is not None and self.dataset.cache:
 			return diskcache.Cache(self.cache_dir).memoize
 		return lambda: lambda x: x
 
@@ -669,9 +673,9 @@ class Config(BaseConfig):
 		if self.distributed:
 			self.dataset.hdf5_flag = "r"
 		try:
-			self.hdf5 = h5py.File(f'{self.relpath}/{self.dataset.hdf5_name}', 'a' if write else self.dataset.hdf5_flag) # to-do, have an easy to set flag that determines if training or creating the dataset
+			self.hdf5 = h5py.File(f'{self.rel_path}/{self.dataset.hdf5_name}', 'a' if write else self.dataset.hdf5_flag) # to-do, have an easy to set flag that determines if training or creating the dataset
 		except Exception as e:
-			print("Error while opening HDF5 file:", f'{self.relpath}/{self.dataset.hdf5_name}', str(e))
+			print("Error while opening HDF5 file:", f'{self.rel_path}/{self.dataset.hdf5_name}', str(e))
 			self.dataset.use_hdf5 = False
 
 	def format( self ):
@@ -790,7 +794,7 @@ except Exception as e:
 
 try:
 	from transformers import PreTrainedTokenizerFast
-	cfg.tokenizer = (cfg.relpath if cfg.cfg_path is not None else Path("./data/")) / cfg.tokenizer
+	cfg.tokenizer = (cfg.rel_path if cfg.yaml_path is not None else Path("./data/")) / cfg.tokenizer
 	cfg.tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(cfg.tokenizer))
 except Exception as e:
 	cfg.tokenizer = NaiveTokenizer()
diff --git a/vall_e/data.py b/vall_e/data.py
index 0f3143b..0c369dc 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -965,7 +965,7 @@ def create_datasets():
 	train_dataset = Dataset( training=True )
 	val_dataset = Dataset( phone_symmap=train_dataset.phone_symmap, training=False )
 
-	train_state_path = cfg.relpath / f"sampler.rank{global_rank()}.pt"
+	train_state_path = cfg.rel_path / f"sampler.rank{global_rank()}.pt"
 	if train_state_path.exists():
 		train_dataset.load_state_dict( train_state_path )
 
@@ -1286,7 +1286,7 @@ if __name__ == "__main__":
 			for i in range(len(v)):
 				print(f'{k}[{i}]:', v[i])
 
-		#train_dl.dataset.save_state_dict(cfg.relpath / "train_dataset.pt")
+		#train_dl.dataset.save_state_dict(cfg.rel_path / "train_dataset.pt")
 	elif args.action == "tasks":
 		index = 0
 
diff --git a/vall_e/engines/__init__.py b/vall_e/engines/__init__.py
index 97bd9fc..6142b67 100755
--- a/vall_e/engines/__init__.py
+++ b/vall_e/engines/__init__.py
@@ -186,7 +186,7 @@ def load_engines(training=True):
 
 		# copy embeddings if requested
 		if cfg.model._embeddings is not None:
-			embeddings_path = cfg.relpath / cfg.model._embeddings
+			embeddings_path = cfg.rel_path / cfg.model._embeddings
 
 			if embeddings_path.exists():
 				embeddings = torch.load(embeddings_path, map_location=torch.device(cfg.device))
diff --git a/vall_e/models/ar_nar.py b/vall_e/models/ar_nar.py
index 96e0793..03258a2 100644
--- a/vall_e/models/ar_nar.py
+++ b/vall_e/models/ar_nar.py
@@ -432,9 +432,9 @@ def example_usage():
 	model = AR_NAR(**kwargs).to(device)
 	steps = 200
 
-	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.cfg_path is not None else "prodigy"
-	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.cfg_path is not None else ""
-	learning_rate = cfg.hyperparameters.learning_rate if cfg.cfg_path is not None else None
+	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy"
+	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""
+	learning_rate = cfg.hyperparameters.learning_rate if cfg.yaml_path is not None else None
 
 	if cfg.optimizations.dadaptation:
 		# do not combine the two
diff --git a/vall_e/models/experimental.py b/vall_e/models/experimental.py
index d2a8dbd..0e8e68a 100644
--- a/vall_e/models/experimental.py
+++ b/vall_e/models/experimental.py
@@ -257,9 +257,9 @@ def example_usage():
 	elif cfg.model.interleave:
 		steps = 250
 
-	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.cfg_path is not None else "prodigy"
-	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.cfg_path is not None else ""
-	learning_rate = cfg.hyperparameters.learning_rate if cfg.cfg_path is not None else None
+	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy"
+	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""
+	learning_rate = cfg.hyperparameters.learning_rate if cfg.yaml_path is not None else None
 
 	if cfg.optimizations.dadaptation:
 		# do not combine the two
diff --git a/vall_e/models/nar.py b/vall_e/models/nar.py
index e1ed1d6..0f20baf 100644
--- a/vall_e/models/nar.py
+++ b/vall_e/models/nar.py
@@ -357,9 +357,9 @@ def example_usage():
 	model = NAR(**kwargs).to(device)
 	steps = 200
 
-	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.cfg_path is not None else "prodigy"
-	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.cfg_path is not None else ""
-	learning_rate = cfg.hyperparameters.learning_rate if cfg.cfg_path is not None else None
+	optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy"
+	scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""
+	learning_rate = cfg.hyperparameters.learning_rate if cfg.yaml_path is not None else None
 
 	if cfg.optimizations.dadaptation:
 		# do not combine the two
diff --git a/vall_e/plot.py b/vall_e/plot.py
index 0c1b61c..06f0ff1 100644
--- a/vall_e/plot.py
+++ b/vall_e/plot.py
@@ -106,7 +106,7 @@ if __name__ == "__main__":
 	parser.add_argument("--group-level", default=1)
 	args = parser.parse_args()
 
-	path = cfg.relpath / "logs"
+	path = cfg.rel_path / "logs"
 	paths = path.rglob(f"./*/{args.filename}")
 
 	args.models = [ model for model in cfg.model.get() if model.training and (args.model == "*" or model.name in args.model) ]
@@ -116,5 +116,5 @@ if __name__ == "__main__":
 
 	plot(paths, args)
 
-	out_path = cfg.relpath / "metrics.png"
+	out_path = cfg.rel_path / "metrics.png"
 	plt.savefig(out_path, bbox_inches="tight")
\ No newline at end of file
diff --git a/vall_e/train.py b/vall_e/train.py
index b6368d1..99e1a44 100755
--- a/vall_e/train.py
+++ b/vall_e/train.py
@@ -14,6 +14,7 @@ import random
 import torch
 import torch.nn.functional as F
 import traceback
+import shutil
 
 from collections import defaultdict
 
@@ -201,7 +202,11 @@ def train():
 	parser.add_argument("--eval", action="store_true", default=None)
 	args, unknown = parser.parse_known_args()
 
+	# create log folder
 	setup_logging(cfg.log_dir)
+	# copy config yaml to backup
+	if cfg.yaml_path is not None:
+		shutil.copy( cfg.yaml_path, cfg.log_dir / "config.yaml" )
 
 	train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
diff --git a/vall_e/utils/trainer.py b/vall_e/utils/trainer.py
index 9d4e64b..909832a 100755
--- a/vall_e/utils/trainer.py
+++ b/vall_e/utils/trainer.py
@@ -218,7 +218,7 @@ def train(
 			print("Failed to set LR rate to:", rate, str(e))
 
 		if "export" in command:
-			train_dl.dataset.save_state_dict(cfg.relpath / f"sampler.rank{global_rank()}.pt")
+			train_dl.dataset.save_state_dict(cfg.rel_path / f"sampler.rank{global_rank()}.pt")
 			engines.save_checkpoint()
 			last_save_step = engines.global_step
 
@@ -241,7 +241,7 @@ def train(
 
 		if engines.global_step != last_save_step:
 			if engines.global_step % save_ckpt_every == 0 or command in saving_commands:
-				train_dl.dataset.save_state_dict(cfg.relpath / f"sampler.rank{global_rank()}.pt")
+				train_dl.dataset.save_state_dict(cfg.rel_path / f"sampler.rank{global_rank()}.pt")
 				engines.save_checkpoint()
 				last_save_step = engines.global_step
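
Note (not part of the patch): the sketch below is a minimal, self-contained illustration of the path layout the renamed BaseConfig properties produce after this change; ExampleConfig and the ./training/config.yaml path are hypothetical stand-ins, not code from the repo. yaml_path now points at the YAML file itself, cfg_path resolves to its parent directory, rel_path is what the rest of the codebase joins against, and the new block in vall_e/train.py copies the YAML into the per-run log_dir (rel_path / "logs" / start_time) as config.yaml.

# Sketch only — mirrors the renamed properties from vall_e/config.py above.
from dataclasses import dataclass
from pathlib import Path

@dataclass
class ExampleConfig:
    yaml_path: Path | None = None  # path to the loaded YAML file itself

    @property
    def cfg_path(self):
        # directory containing the YAML, or None when no YAML was given
        return Path(self.yaml_path.parent) if self.yaml_path is not None else None

    @property
    def rel_path(self):
        return Path(self.cfg_path)

cfg = ExampleConfig(yaml_path=Path("./training/config.yaml"))
print(cfg.cfg_path)           # training
print(cfg.rel_path / "logs")  # training/logs — the patched train.py backs up config.yaml
                              # into the timestamped per-run log folder beneath this directory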