more cleanup, pip-ifying won't work, got an alternative

2023-02-17 15:47:55 +00:00 · 2023-02-17 15:47:55 +00:00 · a09cf98c7f
commit a09cf98c7f
parent 6afa2c299e
6 changed files with 50 additions and 164 deletions
--- a/codes/models/lucidrains/__init__.py
+++ b/codes/models/lucidrains/__init__.py
--- a/experiments/.gitkeep
+++ b/experiments/.gitkeep
--- a/experiments/EXAMPLE_gpt.yml
+++ b/experiments/EXAMPLE_gpt.yml
@@ -1,144 +0,0 @@
-name: CHANGEME_your_experiment_name
-model: extensibletrainer
-scale: 1
-gpu_ids: [0] # <-- unless you have multiple gpus, use this
-start_step: -1
-checkpointing_enabled: true  # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
-fp16: false # might want to check this out
-wandb: false  # <-- enable to log to wandb. tensorboard logging is always enabled.
-use_tb_logger: true
-
-datasets:
-  train:
-    name: CHANGEME_training_dataset_name
-    n_workers: 8 # idk what this does
-    batch_size: 128 # This leads to ~16GB of vram usage on my 3090.
-    mode: paired_voice_audio
-    path: CHANGEME_path_to_training_dataset
-    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
-    phase: train
-    max_wav_length: 255995
-    max_text_length: 200
-    sample_rate: 22050
-    load_conditioning: True
-    num_conditioning_candidates: 2
-    conditioning_length: 44000
-    use_bpe_tokenizer: True
-    load_aligned_codes: False
-  val:
-    name: CHANGEME_validation_dataset_name
-    n_workers: 1
-    batch_size: 32 # this could be higher probably
-    mode: paired_voice_audio
-    path: CHANGEME_path_to_validation_dataset
-    fetcher_mode: ['lj']
-    phase: val # might be broken idk
-    max_wav_length: 255995
-    max_text_length: 200
-    sample_rate: 22050
-    load_conditioning: True
-    num_conditioning_candidates: 2
-    conditioning_length: 44000
-    use_bpe_tokenizer: True
-    load_aligned_codes: False
-
-steps:        
-  gpt_train:
-    training: gpt
-    loss_log_buffer: 500 # no idea what this does
-
-    # Generally follows the recipe from the DALLE paper.
-    optimizer: adamw # this should be adamw_zero if you're using distributed training
-    optimizer_params:
-      lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
-      weight_decay: !!float 1e-2
-      beta1: 0.9
-      beta2: 0.96
-    clip_grad_eps: 4
-
-    injectors:  # TODO: replace this entire sequence with the GptVoiceLatentInjector
-      paired_to_mel:
-        type: torch_mel_spectrogram
-        mel_norm_file: ../experiments/clips_mel_norms.pth
-        in: wav
-        out: paired_mel
-      paired_cond_to_mel:
-        type: for_each
-        subtype: torch_mel_spectrogram
-        mel_norm_file: ../experiments/clips_mel_norms.pth
-        in: conditioning
-        out: paired_conditioning_mel
-      to_codes:
-        type: discrete_token
-        in: paired_mel
-        out: paired_mel_codes
-        dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
-      paired_fwd_text:
-        type: generator
-        generator: gpt
-        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
-        out: [loss_text_ce, loss_mel_ce, logits]      
-    losses:
-      text_ce:
-        type: direct
-        weight: .01
-        key: loss_text_ce
-      mel_ce:
-        type: direct
-        weight: 1
-        key: loss_mel_ce
-
-networks:
-  gpt:
-    type: generator 
-    which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
-    kwargs:
-      layers: 30 # WAS 8
-      model_dim: 1024 # WAS 512
-      heads: 16 # WAS 8
-      max_text_tokens: 402 # WAS 120
-      max_mel_tokens: 604 # WAS 250
-      max_conditioning_inputs: 2 # WAS 1
-      mel_length_compression: 1024
-      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files 
-      number_mel_codes: 8194
-      start_mel_token: 8192
-      stop_mel_token: 8193
-      start_text_token: 255
-      train_solo_embeddings: False # missing in uv3/4
-      use_mel_codes_as_input: True # ditto
-      checkpointing: True
-      #types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
-      #only_alignment_head: False  # uv3/4
-
-path:
-  pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
-  strict_load: true
-  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state   # <-- Set this to resume from a previous training state.
-
-# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
-train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
-  niter: 50000
-  warmup_iter: -1
-  mega_batch_factor: 4    # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
-  val_freq: 500
-
-  default_lr_scheme: MultiStepLR
-  gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000]
-  lr_gamma: 0.5
-
-eval:
-  output_state: gen
-  injectors:
-    gen_inj_eval:
-      type: generator
-      generator: generator
-      in: hq
-      out: [gen, codebook_commitment_loss]
-
-logger: 
-  print_freq: 100
-  save_checkpoint_freq: 500 # CHANGEME: especially you should increase this it's really slow
-  visuals: [gen, mel]
-  visual_debug_rate: 500
-  is_mel_spectrogram: true
--- a/experiments/bpe_lowercase_asr_256.json
+++ b/experiments/bpe_lowercase_asr_256.json
@@ -1 +0,0 @@
-{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our
":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c 
k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
--- a/experiments/train_diffusion_vocoder_22k_level.yml
+++ b/experiments/train_diffusion_vocoder_22k_level.yml
@@ -1,18 +0,0 @@
-path:
-  pretrain_model_dvae: '../experiments/dvae.pth'
-  strict_load: true
-  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state   # <-- Set this to resume from a previous training state.
-networks:
-  dvae:
-    type: generator
-    which_model_G: lucidrains_dvae
-    kwargs:
-      channels: 80
-      codebook_dim: 512
-      hidden_dim: 512
-      kernel_size: 3
-      num_layers: 2
-      num_resnet_blocks: 3
-      num_tokens: 8192
-      positional_dims: 1
-      use_transposed_convs: false
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,56 @@ setuptools.setup(
    project_urls={},
    scripts=[],
    include_package_data=True,
-    install_requires=parse_requirements('requirements.txt', session='hack'),
+    install_requires=[
+        # Fundamentals
+        "numpy",
+        "pyyaml",
+        "tb-nightly",
+        "future",
+        "scp",
+        "tqdm",
+        "matplotlib",
+        "scipy",
+        "munch",
+        "tqdm",
+        "scp",
+        "tensorboard",
+        "orjson",
+        "einops",
+        "lambda-networks",
+        "mup",
+
+        # For image generation stuff
+        "opencv-python",
+        "kornia",
+        "pytorch_ssim",
+        "gsa-pytorch",
+        "pytorch_fid",
+
+        # For audio generation stuff
+        "inflect",
+        "librosa",
+        "Unidecode",
+        "tgt",
+        "pyworld",
+        "audio2numpy",
+        "SoundFile",
+
+        # For text stuff
+        "transformers",
+        "tokenizers",
+        "jiwer",  # calculating WER
+        "omegaconf",
+
+        # lucidrains stuff
+        "vector_quantize_pytorch",
+        "linear_attention_transformer",
+        "rotary-embedding-torch",
+        "axial_positional_embedding",
+        "g-mlp-pytorch",
+        "x-clip",
+        "x_transformers==1.0.4",
+    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
				`@ -1 +0,0 @@`
				{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"
our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i 
ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}