diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index baea0e89..531dd7d9 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,13 @@ -experiments/* +dlas/experiments/* +dlas/codes/*.txt +dlas/codes/wandb/* +dlas/codes/pretrained_models/* +dlas/codes/scripts/audio/pretrained_models/* + results/* tb_logger/* datasets/* options/* -codes/*.txt -codes/wandb/* -codes/pretrained_models/* -codes/scripts/audio/pretrained_models/* data/* .vscode diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100755 index 00000000..1085ae5e --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +recursive-include codes * diff --git a/README.md b/README.old.md old mode 100644 new mode 100755 similarity index 100% rename from README.md rename to README.old.md diff --git a/codes/__init__.py b/codes/__init__.py new file mode 100755 index 00000000..e69de29b diff --git a/experiments/EXAMPLE_gpt.yml b/experiments/EXAMPLE_gpt.yml new file mode 100755 index 00000000..123fd882 --- /dev/null +++ b/experiments/EXAMPLE_gpt.yml @@ -0,0 +1,144 @@ +name: CHANGEME_your_experiment_name +model: extensibletrainer +scale: 1 +gpu_ids: [0] # <-- unless you have multiple gpus, use this +start_step: -1 +checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training. +fp16: false # might want to check this out +wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled. +use_tb_logger: true + +datasets: + train: + name: CHANGEME_training_dataset_name + n_workers: 8 # TODO: document what this controls + batch_size: 128 # This leads to ~16GB of vram usage on my 3090. 
+ mode: paired_voice_audio + path: CHANGEME_path_to_training_dataset + fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format + phase: train + max_wav_length: 255995 + max_text_length: 200 + sample_rate: 22050 + load_conditioning: True + num_conditioning_candidates: 2 + conditioning_length: 44000 + use_bpe_tokenizer: True + load_aligned_codes: False + val: + name: CHANGEME_validation_dataset_name + n_workers: 1 + batch_size: 32 # could probably be raised + mode: paired_voice_audio + path: CHANGEME_path_to_validation_dataset + fetcher_mode: ['lj'] + phase: val # NOTE: the validation phase is untested and may be broken + max_wav_length: 255995 + max_text_length: 200 + sample_rate: 22050 + load_conditioning: True + num_conditioning_candidates: 2 + conditioning_length: 44000 + use_bpe_tokenizer: True + load_aligned_codes: False + +steps: + gpt_train: + training: gpt + loss_log_buffer: 500 # TODO: document what this controls + + # Generally follows the recipe from the DALLE paper. + optimizer: adamw # this should be adamw_zero if you're using distributed training + optimizer_params: + lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value** + weight_decay: !!float 1e-2 + beta1: 0.9 + beta2: 0.96 + clip_grad_eps: 4 + + injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector + paired_to_mel: + type: torch_mel_spectrogram + mel_norm_file: ../experiments/clips_mel_norms.pth + in: wav + out: paired_mel + paired_cond_to_mel: + type: for_each + subtype: torch_mel_spectrogram + mel_norm_file: ../experiments/clips_mel_norms.pth + in: conditioning + out: paired_conditioning_mel + to_codes: + type: discrete_token + in: paired_mel + out: paired_mel_codes + dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT + paired_fwd_text: + type: generator + generator: gpt + in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths] + out: 
[loss_text_ce, loss_mel_ce, logits] + losses: + text_ce: + type: direct + weight: .01 + key: loss_text_ce + mel_ce: + type: direct + weight: 1 + key: loss_mel_ce + +networks: + gpt: + type: generator + which_model_G: unified_voice2 # none of the unified_voice*.py files exactly match the tortoise inference code: 3 and 4 have an "alignment_head" (purpose unclear), and 2 lacks the types=1 parameter. + kwargs: + layers: 30 # WAS 8 + model_dim: 1024 # WAS 512 + heads: 16 # WAS 8 + max_text_tokens: 402 # WAS 120 + max_mel_tokens: 604 # WAS 250 + max_conditioning_inputs: 2 # WAS 1 + mel_length_compression: 1024 + number_text_tokens: 256 # supposed to be 255 for newer unified_voice files + number_mel_codes: 8194 + start_mel_token: 8192 + stop_mel_token: 8193 + start_text_token: 255 + train_solo_embeddings: False # missing in uv3/4 + use_mel_codes_as_input: True # also missing in uv3/4 + checkpointing: True + #types: 1 # this parameter is missing here, but by my analysis types=1 is equivalent to omitting it. + #only_alignment_head: False # uv3/4 + +path: + pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache + strict_load: true + #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. + +# As far as I can tell, all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit) +train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH + niter: 50000 + warmup_iter: -1 + mega_batch_factor: 4 # <-- Gradient accumulation factor. If you run out of memory (OOM), increase this; try values from [2,4,8]. 
+ val_freq: 500 + + default_lr_scheme: MultiStepLR + gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000] + lr_gamma: 0.5 + +eval: + output_state: gen + injectors: + gen_inj_eval: + type: generator + generator: generator + in: hq + out: [gen, codebook_commitment_loss] + +logger: + print_freq: 100 + save_checkpoint_freq: 500 # CHANGEME: especially you should increase this it's really slow + visuals: [gen, mel] + visual_debug_rate: 500 + is_mel_spectrogram: true diff --git a/experiments/bpe_lowercase_asr_256.json b/experiments/bpe_lowercase_asr_256.json new file mode 100755 index 00000000..a128f273 --- /dev/null +++ b/experiments/bpe_lowercase_asr_256.json @@ -0,0 +1 @@ +{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":
86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h 
is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file diff --git a/experiments/train_diffusion_vocoder_22k_level.yml b/experiments/train_diffusion_vocoder_22k_level.yml new file mode 100755 index 00000000..2032f1ed --- /dev/null +++ b/experiments/train_diffusion_vocoder_22k_level.yml @@ -0,0 +1,18 @@ +path: + pretrain_model_dvae: '../experiments/dvae.pth' + strict_load: true + #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. 
+networks:
+  dvae:
+    type: generator
+    which_model_G: lucidrains_dvae
+    kwargs:
+      channels: 80
+      codebook_dim: 512
+      hidden_dim: 512
+      kernel_size: 3
+      num_layers: 2
+      num_resnet_blocks: 3
+      num_tokens: 8192
+      positional_dims: 1
+      use_transposed_convs: false
diff --git a/requirements.txt b/requirements.txt
new file mode 100755
index 00000000..877d9869
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,46 @@
+# Fundamentals
+numpy
+pyyaml
+tb-nightly
+future
+scp
+tqdm
+matplotlib
+scipy
+munch
+tensorboard
+orjson
+einops
+lambda-networks
+mup
+
+# For image generation stuff
+opencv-python
+kornia
+pytorch_ssim
+gsa-pytorch
+pytorch_fid
+
+# For audio generation stuff
+inflect
+librosa
+Unidecode
+tgt
+pyworld
+audio2numpy
+SoundFile
+
+# For text stuff
+transformers
+tokenizers
+jiwer # calculating WER
+omegaconf
+
+# lucidrains stuff
+vector_quantize_pytorch
+linear_attention_transformer
+rotary-embedding-torch
+axial_positional_embedding
+g-mlp-pytorch
+x-clip
+x_transformers==1.0.4
\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100755
index 00000000..cba4ff04
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,47 @@
+"""Packaging script: installs the sources under ./codes as the ``dlas`` package."""
+import setuptools
+
+with open("README.old.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+
+def _as_dlas(package_name):
+    """Return ``package_name`` with a leading ``codes`` component renamed to ``dlas``.
+
+    Only the top-level component is rewritten, so a subpackage whose own name
+    merely contains the substring "codes" keeps its real directory name.
+    Names outside the ``codes`` tree are returned unchanged.
+    """
+    if package_name == "codes" or package_name.startswith("codes."):
+        return "dlas" + package_name[len("codes"):]
+    return package_name
+
+
+# The sources live in ./codes but are distributed under the import name
+# "dlas"; package_dir below maps the renamed top-level package back to them.
+packages = [_as_dlas(name) for name in setuptools.find_packages()]
+
+setuptools.setup(
+    name="DL-Art-School",
+    packages=packages,
+    package_dir={
+        "dlas": "./codes"
+    },
+    version="0.0.1",
+    author="James Betker",
+    author_email="james@adamant.ai",
+    description="",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://git.ecker.tech/mrq/DL-Art-School",
+    project_urls={},
+    scripts=[],
+    include_package_data=True,
+    install_requires=[],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)