forked from mrq/DL-Art-School
more cleanup, pip-ifying won't work, got an alternative
This commit is contained in:
parent
6afa2c299e
commit
a09cf98c7f
0
codes/models/lucidrains/__init__.py
Normal file
0
codes/models/lucidrains/__init__.py
Normal file
0
experiments/.gitkeep
Normal file
0
experiments/.gitkeep
Normal file
|
@@ -1,144 +0,0 @@
|
||||||
name: CHANGEME_your_experiment_name
|
|
||||||
model: extensibletrainer
|
|
||||||
scale: 1
|
|
||||||
gpu_ids: [0] # <-- unless you have multiple gpus, use this
|
|
||||||
start_step: -1
|
|
||||||
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
|
|
||||||
fp16: false # might want to check this out
|
|
||||||
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
|
|
||||||
use_tb_logger: true
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
train:
|
|
||||||
name: CHANGEME_training_dataset_name
|
|
||||||
n_workers: 8 # idk what this does
|
|
||||||
batch_size: 128 # This leads to ~16GB of vram usage on my 3090.
|
|
||||||
mode: paired_voice_audio
|
|
||||||
path: CHANGEME_path_to_training_dataset
|
|
||||||
fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
|
|
||||||
phase: train
|
|
||||||
max_wav_length: 255995
|
|
||||||
max_text_length: 200
|
|
||||||
sample_rate: 22050
|
|
||||||
load_conditioning: True
|
|
||||||
num_conditioning_candidates: 2
|
|
||||||
conditioning_length: 44000
|
|
||||||
use_bpe_tokenizer: True
|
|
||||||
load_aligned_codes: False
|
|
||||||
val:
|
|
||||||
name: CHANGEME_validation_dataset_name
|
|
||||||
n_workers: 1
|
|
||||||
batch_size: 32 # this could probably be higher
|
|
||||||
mode: paired_voice_audio
|
|
||||||
path: CHANGEME_path_to_validation_dataset
|
|
||||||
fetcher_mode: ['lj']
|
|
||||||
phase: val # might be broken idk
|
|
||||||
max_wav_length: 255995
|
|
||||||
max_text_length: 200
|
|
||||||
sample_rate: 22050
|
|
||||||
load_conditioning: True
|
|
||||||
num_conditioning_candidates: 2
|
|
||||||
conditioning_length: 44000
|
|
||||||
use_bpe_tokenizer: True
|
|
||||||
load_aligned_codes: False
|
|
||||||
|
|
||||||
steps:
|
|
||||||
gpt_train:
|
|
||||||
training: gpt
|
|
||||||
loss_log_buffer: 500 # no idea what this does
|
|
||||||
|
|
||||||
# Generally follows the recipe from the DALLE paper.
|
|
||||||
optimizer: adamw # this should be adamw_zero if you're using distributed training
|
|
||||||
optimizer_params:
|
|
||||||
lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
|
|
||||||
weight_decay: !!float 1e-2
|
|
||||||
beta1: 0.9
|
|
||||||
beta2: 0.96
|
|
||||||
clip_grad_eps: 4
|
|
||||||
|
|
||||||
injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
|
|
||||||
paired_to_mel:
|
|
||||||
type: torch_mel_spectrogram
|
|
||||||
mel_norm_file: ../experiments/clips_mel_norms.pth
|
|
||||||
in: wav
|
|
||||||
out: paired_mel
|
|
||||||
paired_cond_to_mel:
|
|
||||||
type: for_each
|
|
||||||
subtype: torch_mel_spectrogram
|
|
||||||
mel_norm_file: ../experiments/clips_mel_norms.pth
|
|
||||||
in: conditioning
|
|
||||||
out: paired_conditioning_mel
|
|
||||||
to_codes:
|
|
||||||
type: discrete_token
|
|
||||||
in: paired_mel
|
|
||||||
out: paired_mel_codes
|
|
||||||
dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
|
|
||||||
paired_fwd_text:
|
|
||||||
type: generator
|
|
||||||
generator: gpt
|
|
||||||
in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
|
|
||||||
out: [loss_text_ce, loss_mel_ce, logits]
|
|
||||||
losses:
|
|
||||||
text_ce:
|
|
||||||
type: direct
|
|
||||||
weight: .01
|
|
||||||
key: loss_text_ce
|
|
||||||
mel_ce:
|
|
||||||
type: direct
|
|
||||||
weight: 1
|
|
||||||
key: loss_mel_ce
|
|
||||||
|
|
||||||
networks:
|
|
||||||
gpt:
|
|
||||||
type: generator
|
|
||||||
which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
|
|
||||||
kwargs:
|
|
||||||
layers: 30 # WAS 8
|
|
||||||
model_dim: 1024 # WAS 512
|
|
||||||
heads: 16 # WAS 8
|
|
||||||
max_text_tokens: 402 # WAS 120
|
|
||||||
max_mel_tokens: 604 # WAS 250
|
|
||||||
max_conditioning_inputs: 2 # WAS 1
|
|
||||||
mel_length_compression: 1024
|
|
||||||
number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
|
|
||||||
number_mel_codes: 8194
|
|
||||||
start_mel_token: 8192
|
|
||||||
stop_mel_token: 8193
|
|
||||||
start_text_token: 255
|
|
||||||
train_solo_embeddings: False # missing in uv3/4
|
|
||||||
use_mel_codes_as_input: True # ditto
|
|
||||||
checkpointing: True
|
|
||||||
#types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
|
|
||||||
#only_alignment_head: False # uv3/4
|
|
||||||
|
|
||||||
path:
|
|
||||||
pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
|
|
||||||
strict_load: true
|
|
||||||
#resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
|
|
||||||
|
|
||||||
# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
|
|
||||||
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
|
|
||||||
niter: 50000
|
|
||||||
warmup_iter: -1
|
|
||||||
mega_batch_factor: 4 # <-- Gradient accumulation factor. If you are running out of memory (OOM), increase this (e.g. 2, 4, 8).
|
|
||||||
val_freq: 500
|
|
||||||
|
|
||||||
default_lr_scheme: MultiStepLR
|
|
||||||
gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000]
|
|
||||||
lr_gamma: 0.5
|
|
||||||
|
|
||||||
eval:
|
|
||||||
output_state: gen
|
|
||||||
injectors:
|
|
||||||
gen_inj_eval:
|
|
||||||
type: generator
|
|
||||||
generator: generator
|
|
||||||
in: hq
|
|
||||||
out: [gen, codebook_commitment_loss]
|
|
||||||
|
|
||||||
logger:
|
|
||||||
print_freq: 100
|
|
||||||
save_checkpoint_freq: 500 # CHANGEME: you should especially increase this; it's really slow
|
|
||||||
visuals: [gen, mel]
|
|
||||||
visual_debug_rate: 500
|
|
||||||
is_mel_spectrogram: true
|
|
|
@@ -1 +0,0 @@
|
||||||
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our"
:160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c 
k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
|
|
|
@@ -1,18 +0,0 @@
|
||||||
path:
|
|
||||||
pretrain_model_dvae: '../experiments/dvae.pth'
|
|
||||||
strict_load: true
|
|
||||||
#resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
|
|
||||||
networks:
|
|
||||||
dvae:
|
|
||||||
type: generator
|
|
||||||
which_model_G: lucidrains_dvae
|
|
||||||
kwargs:
|
|
||||||
channels: 80
|
|
||||||
codebook_dim: 512
|
|
||||||
hidden_dim: 512
|
|
||||||
kernel_size: 3
|
|
||||||
num_layers: 2
|
|
||||||
num_resnet_blocks: 3
|
|
||||||
num_tokens: 8192
|
|
||||||
positional_dims: 1
|
|
||||||
use_transposed_convs: false
|
|
51
setup.py
51
setup.py
|
@@ -18,7 +18,56 @@ setuptools.setup(
|
||||||
project_urls={},
|
project_urls={},
|
||||||
scripts=[],
|
scripts=[],
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
install_requires=parse_requirements('requirements.txt', session='hack'),
|
install_requires=[
|
||||||
|
# Fundamentals
|
||||||
|
"numpy",
|
||||||
|
"pyyaml",
|
||||||
|
"tb-nightly",
|
||||||
|
"future",
|
||||||
|
"scp",
|
||||||
|
"tqdm",
|
||||||
|
"matplotlib",
|
||||||
|
"scipy",
|
||||||
|
"munch",
|
||||||
|
"tqdm",
|
||||||
|
"scp",
|
||||||
|
"tensorboard",
|
||||||
|
"orjson",
|
||||||
|
"einops",
|
||||||
|
"lambda-networks",
|
||||||
|
"mup",
|
||||||
|
|
||||||
|
# For image generation stuff
|
||||||
|
"opencv-python",
|
||||||
|
"kornia",
|
||||||
|
"pytorch_ssim",
|
||||||
|
"gsa-pytorch",
|
||||||
|
"pytorch_fid",
|
||||||
|
|
||||||
|
# For audio generation stuff
|
||||||
|
"inflect",
|
||||||
|
"librosa",
|
||||||
|
"Unidecode",
|
||||||
|
"tgt",
|
||||||
|
"pyworld",
|
||||||
|
"audio2numpy",
|
||||||
|
"SoundFile",
|
||||||
|
|
||||||
|
# For text stuff
|
||||||
|
"transformers",
|
||||||
|
"tokenizers",
|
||||||
|
"jiwer", # calculating WER
|
||||||
|
"omegaconf",
|
||||||
|
|
||||||
|
# lucidrains stuff
|
||||||
|
"vector_quantize_pytorch",
|
||||||
|
"linear_attention_transformer",
|
||||||
|
"rotary-embedding-torch",
|
||||||
|
"axial_positional_embedding",
|
||||||
|
"g-mlp-pytorch",
|
||||||
|
"x-clip",
|
||||||
|
"x_transformers==1.0.4",
|
||||||
|
],
|
||||||
classifiers=[
|
classifiers=[
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
"License :: OSI Approved :: Apache Software License",
|
"License :: OSI Approved :: Apache Software License",
|
||||||
|
|
Loading…
Reference in New Issue
Block a user