pip-ify

2023-02-17 00:33:50 +00:00 · 2023-02-17 00:33:50 +00:00 · 49e23b226b
commit 49e23b226b
parent f31a333c4f
9 changed files with 252 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,12 +1,13 @@
-experiments/*
+dlas/experiments/*
+dlas/codes/*.txt
+dlas/codes/wandb/*
+dlas/codes/pretrained_models/*
+dlas/codes/scripts/audio/pretrained_models/*
+
 results/*
 tb_logger/*
 datasets/*
 options/*
-codes/*.txt
-codes/wandb/*
-codes/pretrained_models/*
-codes/scripts/audio/pretrained_models/*
 data/*
 .vscode

--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1 @@
+# Bundle every file under codes/ into the source distribution.
+# Syntax is `recursive-include <dir> <pattern> [...]`: a directory plus at least one pattern.
+recursive-include codes *
--- a/README.old.md
+++ b/README.old.md
--- a/codes/init.py
+++ b/codes/init.py
--- a/experiments/EXAMPLE_gpt.yml
+++ b/experiments/EXAMPLE_gpt.yml
@ -0,0 +1,144 @@
+# Example configuration for fine-tuning the `gpt` network from a pretrained checkpoint.
+# Every value marked CHANGEME must be edited before use.
+name: CHANGEME_your_experiment_name
+model: extensibletrainer
+scale: 1
+gpu_ids: [0] # <-- unless you have multiple gpus, use this
+start_step: -1
+checkpointing_enabled: true  # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
+fp16: false # NOTE(review): presumably toggles half-precision training; worth experimenting with -- confirm
+wandb: false  # <-- enable to log to wandb. tensorboard logging is always enabled.
+use_tb_logger: true
+
+# Data sources for the training and validation splits.
+datasets:
+  train:
+    name: CHANGEME_training_dataset_name
+    n_workers: 8  # NOTE(review): presumably the number of data-loading workers -- confirm
+    batch_size: 128  # This leads to ~16GB of vram usage on my 3090.
+    mode: paired_voice_audio
+    path: CHANGEME_path_to_training_dataset
+    fetcher_mode:  # CHANGEME if your dataset isn't in LJSpeech format
+      - lj
+    phase: train
+    max_wav_length: 255995
+    max_text_length: 200
+    sample_rate: 22050
+    load_conditioning: true
+    num_conditioning_candidates: 2
+    conditioning_length: 44000
+    use_bpe_tokenizer: true
+    load_aligned_codes: false
+  val:
+    name: CHANGEME_validation_dataset_name
+    n_workers: 1
+    batch_size: 32  # this could probably be higher
+    mode: paired_voice_audio
+    path: CHANGEME_path_to_validation_dataset
+    fetcher_mode:
+      - lj
+    phase: val  # NOTE(review): the original author suspected validation might be broken -- confirm
+    max_wav_length: 255995
+    max_text_length: 200
+    sample_rate: 22050
+    load_conditioning: true
+    num_conditioning_candidates: 2
+    conditioning_length: 44000
+    use_bpe_tokenizer: true
+    load_aligned_codes: false
+
+# Training steps: optimizer settings, the injector chain, and the losses read from its outputs.
+steps:
+  gpt_train:
+    training: gpt
+    loss_log_buffer: 500  # NOTE(review): purpose is not documented here -- confirm against the trainer
+
+    # Generally follows the recipe from the DALLE paper.
+    optimizer: adamw  # this should be adamw_zero if you're using distributed training
+    optimizer_params:
+      # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning,
+      # but **you should experiment with this value**
+      lr: !!float 1e-5
+      weight_decay: !!float 1e-2
+      beta1: 0.9
+      beta2: 0.96
+    clip_grad_eps: 4
+
+    injectors:  # TODO: replace this entire sequence with the GptVoiceLatentInjector
+      paired_to_mel:
+        type: torch_mel_spectrogram
+        mel_norm_file: ../experiments/clips_mel_norms.pth
+        in: wav
+        out: paired_mel
+      paired_cond_to_mel:
+        type: for_each
+        subtype: torch_mel_spectrogram
+        mel_norm_file: ../experiments/clips_mel_norms.pth
+        in: conditioning
+        out: paired_conditioning_mel
+      to_codes:
+        type: discrete_token
+        in: paired_mel
+        out: paired_mel_codes
+        dvae_config: '../experiments/train_diffusion_vocoder_22k_level.yml'  # EXTREMELY IMPORTANT
+      paired_fwd_text:
+        type: generator
+        generator: gpt
+        in:
+          - paired_conditioning_mel
+          - padded_text
+          - text_lengths
+          - paired_mel_codes
+          - wav_lengths
+        out:
+          - loss_text_ce
+          - loss_mel_ce
+          - logits
+    losses:
+      text_ce:
+        type: direct
+        weight: 0.01
+        key: loss_text_ce
+      mel_ce:
+        type: direct
+        weight: 1
+        key: loss_mel_ce
+
+# Model definitions. `gpt` is the network that the `gpt_train` step trains.
+networks:
+  gpt:
+    type: generator
+    # None of the unified_voice*.py files actually match the tortoise inference code:
+    # 4 and 3 have "alignment_head" (purpose unclear), 2 lacks the types=1 parameter.
+    which_model_G: unified_voice2
+    kwargs:
+      layers: 30  # WAS 8
+      model_dim: 1024  # WAS 512
+      heads: 16  # WAS 8
+      max_text_tokens: 402  # WAS 120
+      max_mel_tokens: 604  # WAS 250
+      max_conditioning_inputs: 2  # WAS 1
+      mel_length_compression: 1024
+      number_text_tokens: 256  # supposed to be 255 for newer unified_voice files
+      number_mel_codes: 8194
+      start_mel_token: 8192
+      stop_mel_token: 8193
+      start_text_token: 255
+      train_solo_embeddings: false  # missing in uv3/4
+      use_mel_codes_as_input: true  # ditto
+      checkpointing: true
+      # types: 1  # this is MISSING, but in my analysis 1 is equivalent to not having it.
+      # only_alignment_head: False  # uv3/4
+
+# Checkpoint locations for this run.
+path:
+  pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
+  strict_load: true
+  # resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state   # <-- Set this to resume from a previous training state.
+
+# As far as I know, all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit).
+train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
+  niter: 50000
+  warmup_iter: -1
+  mega_batch_factor: 4    # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
+  val_freq: 500
+
+  # NOTE(review): presumably the learning rate is scaled by lr_gamma at each gen_lr_steps milestone -- confirm.
+  default_lr_scheme: MultiStepLR
+  gen_lr_steps: [500, 1000, 1400, 1800] # alternative schedule left commented out: [50000, 100000, 140000, 180000]
+  lr_gamma: 0.5
+
+# Evaluation-time injector settings.
+# NOTE(review): this section refers to a generator named `generator` and an input `hq`,
+# but `networks` in this file only defines `gpt` -- it looks carried over from another
+# config; confirm it is still valid for this experiment.
+eval:
+  output_state: gen
+  injectors:
+    gen_inj_eval:
+      type: generator
+      generator: generator
+      in: hq
+      out: [gen, codebook_commitment_loss]
+
+# Logging, checkpointing and visualisation cadence.
+logger:
+  print_freq: 100
+  save_checkpoint_freq: 500  # CHANGEME: you should especially increase this; it's really slow
+  visuals:
+    - gen
+    - mel
+  visual_debug_rate: 500
+  is_mel_spectrogram: true
--- a/experiments/bpe_lowercase_asr_256.json
+++ b/experiments/bpe_lowercase_asr_256.json
@ -0,0 +1 @@
+{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our
":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c 
k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
--- a/experiments/train_diffusion_vocoder_22k_level.yml
+++ b/experiments/train_diffusion_vocoder_22k_level.yml
@ -0,0 +1,18 @@
+# Checkpoint path and constructor arguments for the `dvae` network.
+# EXAMPLE_gpt.yml points at this file through the `dvae_config` key of its `to_codes` injector.
+path:
+  pretrain_model_dvae: '../experiments/dvae.pth'
+  strict_load: true
+  # resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state   # <-- Set this to resume from a previous training state.
+networks:
+  dvae:
+    type: generator
+    which_model_G: lucidrains_dvae
+    # Keyword arguments for the model selected by which_model_G.
+    kwargs:
+      channels: 80
+      codebook_dim: 512
+      hidden_dim: 512
+      kernel_size: 3
+      num_layers: 2
+      num_resnet_blocks: 3
+      num_tokens: 8192
+      positional_dims: 1
+      use_transposed_convs: false
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,48 @@
+# Fundamentals
+numpy
+pyyaml
+# NOTE(review): tb-nightly and tensorboard both ship the `tensorboard` module;
+# confirm that both really need to be installed together.
+tb-nightly
+future
+scp
+tqdm
+matplotlib
+scipy
+munch
+tensorboard
+orjson
+einops
+lambda-networks
+mup
+
+# For image generation stuff
+opencv-python
+kornia
+pytorch_ssim
+gsa-pytorch
+pytorch_fid
+
+# For audio generation stuff
+inflect
+librosa
+Unidecode
+tgt
+pyworld
+audio2numpy
+SoundFile
+
+# For text stuff
+transformers
+tokenizers
+jiwer  # calculating WER
+omegaconf
+
+# lucidrains stuff
+vector_quantize_pytorch
+linear_attention_transformer
+rotary-embedding-torch
+axial_positional_embedding
+g-mlp-pytorch
+x-clip
+x_transformers==1.0.4
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,34 @@
+"""Packaging script that installs the ``codes/`` source tree under the import name ``dlas``."""
+import setuptools
+
+with open("README.old.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+
+def _as_dlas(package_name):
+    """Swap a leading ``codes`` component for ``dlas``; return any other name unchanged."""
+    if package_name == "codes":
+        return "dlas"
+    if package_name.startswith("codes."):
+        return "dlas" + package_name[len("codes"):]
+    return package_name
+
+
+# The sources live in ./codes but are exposed as "dlas" (see package_dir below).
+# Only the leading component is renamed, so a package whose own name merely
+# contains "codes" is not mangled the way a blanket str.replace would.
+packages = [_as_dlas(name) for name in setuptools.find_packages()]
+
+setuptools.setup(
+    name="DL-Art-School",
+    packages=packages,
+    package_dir={
+        "dlas": "./codes"
+    },
+    version="0.0.1",
+    author="James Betker",
+    author_email="james@adamant.ai",
+    description="",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://git.ecker.tech/mrq/DL-Art-School",
+    project_urls={},
+    scripts=[],
+    # Also ship the non-Python files selected by MANIFEST.in.
+    include_package_data=True,
+    # Left empty: the dependencies listed in requirements.txt are not installed automatically.
+    install_requires=[],
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.6",
+)
				`@ -0,0 +1 @@`
				{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"
our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i 
ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}