From a09cf98c7fed711e75e9d87d4fc63192eeaf2ee9 Mon Sep 17 00:00:00 2001 From: mrq Date: Fri, 17 Feb 2023 15:47:55 +0000 Subject: [PATCH] more cleanup, pip-ifying won't work, got an alternative --- codes/models/lucidrains/__init__.py | 0 experiments/.gitkeep | 0 experiments/EXAMPLE_gpt.yml | 144 ------------------ experiments/bpe_lowercase_asr_256.json | 1 - .../train_diffusion_vocoder_22k_level.yml | 18 --- setup.py | 51 ++++++- 6 files changed, 50 insertions(+), 164 deletions(-) create mode 100644 codes/models/lucidrains/__init__.py create mode 100644 experiments/.gitkeep delete mode 100644 experiments/EXAMPLE_gpt.yml delete mode 100644 experiments/bpe_lowercase_asr_256.json delete mode 100644 experiments/train_diffusion_vocoder_22k_level.yml diff --git a/codes/models/lucidrains/__init__.py b/codes/models/lucidrains/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/experiments/.gitkeep b/experiments/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/experiments/EXAMPLE_gpt.yml b/experiments/EXAMPLE_gpt.yml deleted file mode 100644 index 123fd882..00000000 --- a/experiments/EXAMPLE_gpt.yml +++ /dev/null @@ -1,144 +0,0 @@ -name: CHANGEME_your_experiment_name -model: extensibletrainer -scale: 1 -gpu_ids: [0] # <-- unless you have multiple gpus, use this -start_step: -1 -checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training. -fp16: false # might want to check this out -wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled. -use_tb_logger: true - -datasets: - train: - name: CHANGEME_training_dataset_name - n_workers: 8 # idk what this does - batch_size: 128 # This leads to ~16GB of vram usage on my 3090. - mode: paired_voice_audio - path: CHANGEME_path_to_training_dataset - fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format - phase: train - max_wav_length: 255995 - max_text_length: 200 - sample_rate: 22050 - load_conditioning: True - num_conditioning_candidates: 2 - conditioning_length: 44000 - use_bpe_tokenizer: True - load_aligned_codes: False - val: - name: CHANGEME_validation_dataset_name - n_workers: 1 - batch_size: 32 # this could be higher probably - mode: paired_voice_audio - path: CHANGEME_path_to_validation_dataset - fetcher_mode: ['lj'] - phase: val # might be broken idk - max_wav_length: 255995 - max_text_length: 200 - sample_rate: 22050 - load_conditioning: True - num_conditioning_candidates: 2 - conditioning_length: 44000 - use_bpe_tokenizer: True - load_aligned_codes: False - -steps: - gpt_train: - training: gpt - loss_log_buffer: 500 # no idea what this does - - # Generally follows the recipe from the DALLE paper. - optimizer: adamw # this should be adamw_zero if you're using distributed training - optimizer_params: - lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value** - weight_decay: !!float 1e-2 - beta1: 0.9 - beta2: 0.96 - clip_grad_eps: 4 - - injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector - paired_to_mel: - type: torch_mel_spectrogram - mel_norm_file: ../experiments/clips_mel_norms.pth - in: wav - out: paired_mel - paired_cond_to_mel: - type: for_each - subtype: torch_mel_spectrogram - mel_norm_file: ../experiments/clips_mel_norms.pth - in: conditioning - out: paired_conditioning_mel - to_codes: - type: discrete_token - in: paired_mel - out: paired_mel_codes - dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT - paired_fwd_text: - type: generator - generator: gpt - in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths] - out: [loss_text_ce, loss_mel_ce, logits] - losses: - text_ce: - type: direct - weight: .01 - key: loss_text_ce - mel_ce: - type: direct - weight: 1 - key: loss_mel_ce - -networks: - gpt: - type: generator - which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter. - kwargs: - layers: 30 # WAS 8 - model_dim: 1024 # WAS 512 - heads: 16 # WAS 8 - max_text_tokens: 402 # WAS 120 - max_mel_tokens: 604 # WAS 250 - max_conditioning_inputs: 2 # WAS 1 - mel_length_compression: 1024 - number_text_tokens: 256 # supposed to be 255 for newer unified_voice files - number_mel_codes: 8194 - start_mel_token: 8192 - stop_mel_token: 8193 - start_text_token: 255 - train_solo_embeddings: False # missing in uv3/4 - use_mel_codes_as_input: True # ditto - checkpointing: True - #types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it. - #only_alignment_head: False # uv3/4 - -path: - pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache - strict_load: true - #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. - -# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit) -train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH - niter: 50000 - warmup_iter: -1 - mega_batch_factor: 4 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8]. - val_freq: 500 - - default_lr_scheme: MultiStepLR - gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000] - lr_gamma: 0.5 - -eval: - output_state: gen - injectors: - gen_inj_eval: - type: generator - generator: generator - in: hq - out: [gen, codebook_commitment_loss] - -logger: - print_freq: 100 - save_checkpoint_freq: 500 # CHANGEME: especially you should increase this it's really slow - visuals: [gen, mel] - visual_debug_rate: 500 - is_mel_spectrogram: true diff --git a/experiments/bpe_lowercase_asr_256.json b/experiments/bpe_lowercase_asr_256.json deleted file mode 100644 index a128f273..00000000 --- a/experiments/bpe_lowercase_asr_256.json +++ /dev/null @@ -1 +0,0 @@ -{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}} \ No newline at end of file diff --git a/experiments/train_diffusion_vocoder_22k_level.yml b/experiments/train_diffusion_vocoder_22k_level.yml deleted file mode 100644 index 2032f1ed..00000000 --- a/experiments/train_diffusion_vocoder_22k_level.yml +++ /dev/null @@ -1,18 +0,0 @@ -path: - pretrain_model_dvae: '../experiments/dvae.pth' - strict_load: true - #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state. -networks: - dvae: - type: generator - which_model_G: lucidrains_dvae - kwargs: - channels: 80 - codebook_dim: 512 - hidden_dim: 512 - kernel_size: 3 - num_layers: 2 - num_resnet_blocks: 3 - num_tokens: 8192 - positional_dims: 1 - use_transposed_convs: false diff --git a/setup.py b/setup.py index 16c19702..1073fb00 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,56 @@ setuptools.setup( project_urls={}, scripts=[], include_package_data=True, - install_requires=parse_requirements('requirements.txt', session='hack'), + install_requires=[ + # Fundamentals + "numpy", + "pyyaml", + "tb-nightly", + "future", + "scp", + "tqdm", + "matplotlib", + "scipy", + "munch", + "tqdm", + "scp", + "tensorboard", + "orjson", + "einops", + "lambda-networks", + "mup", + + # For image generation stuff + "opencv-python", + "kornia", + "pytorch_ssim", + "gsa-pytorch", + "pytorch_fid", + + # For audio generation stuff + "inflect", + "librosa", + "Unidecode", + "tgt", + "pyworld", + "audio2numpy", + "SoundFile", + + # For text stuff + "transformers", + "tokenizers", + "jiwer", # calculating WER + "omegaconf", + + # lucidrains stuff + "vector_quantize_pytorch", + "linear_attention_transformer", + "rotary-embedding-torch", + "axial_positional_embedding", + "g-mlp-pytorch", + "x-clip", + "x_transformers==1.0.4", + ], classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: Apache Software License",