mrq 2023-02-17 00:33:50 +00:00
parent f31a333c4f
commit 49e23b226b
9 changed files with 252 additions and 5 deletions

.gitignore vendored Normal file → Executable file (11 changes)

@@ -1,12 +1,13 @@
-experiments/*
+dlas/experiments/*
+dlas/codes/*.txt
+dlas/codes/wandb/*
+dlas/codes/pretrained_models/*
+dlas/codes/scripts/audio/pretrained_models/*
 results/*
 tb_logger/*
 datasets/*
 options/*
-codes/*.txt
-codes/wandb/*
-codes/pretrained_models/*
-codes/scripts/audio/pretrained_models/*
 data/*
 .vscode

MANIFEST.in Executable file (1 addition)

@@ -0,0 +1 @@
recursive-include codes/*

README.md → README.old.md Normal file → Executable file (renamed, 0 changes)

codes/__init__.py Executable file (empty, 0 changes)

experiments/EXAMPLE_gpt.yml Executable file (144 additions)

@@ -0,0 +1,144 @@
name: CHANGEME_your_experiment_name
model: extensibletrainer
scale: 1
gpu_ids: [0] # <-- leave as [0] unless you have multiple GPUs
start_step: -1
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
fp16: false # mixed precision; consider enabling to reduce VRAM usage, though it can affect training stability
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
use_tb_logger: true

datasets:
  train:
    name: CHANGEME_training_dataset_name
    n_workers: 8 # number of DataLoader worker processes
    batch_size: 128 # This leads to ~16GB of VRAM usage on my 3090.
    mode: paired_voice_audio
    path: CHANGEME_path_to_training_dataset
    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
    phase: train
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False
  val:
    name: CHANGEME_validation_dataset_name
    n_workers: 1
    batch_size: 32 # this could probably be higher
    mode: paired_voice_audio
    path: CHANGEME_path_to_validation_dataset
    fetcher_mode: ['lj']
    phase: val # validation may be broken here
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500 # presumably the number of recent loss values buffered for log smoothing

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]
    losses:
      text_ce:
        type: direct
        weight: .01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2 # none of the unified_voice*.py files exactly match the tortoise inference code: versions 3 and 4 add an "alignment_head" parameter (purpose unclear), and version 2 lacks the types=1 parameter.
    kwargs:
      layers: 30 # WAS 8
      model_dim: 1024 # WAS 512
      heads: 16 # WAS 8
      max_text_tokens: 402 # WAS 120
      max_mel_tokens: 604 # WAS 250
      max_conditioning_inputs: 2 # WAS 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposedly 255 for the newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # also missing in uv3/4
      checkpointing: True
      #types: 1 # missing from unified_voice2, but in my analysis types=1 is equivalent to omitting it
      #only_alignment_head: False # uv3/4

path:
  pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from the tortoise model cache
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.

# As far as I can tell, all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
  niter: 50000
  warmup_iter: -1
  mega_batch_factor: 4 # <-- Gradient accumulation factor. If you run out of VRAM, increase this (try 8).
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000]
  lr_gamma: 0.5 # the LR is halved at each step above: 1e-5 -> 5e-6 -> 2.5e-6 -> 1.25e-6 -> 6.25e-7

eval:
  output_state: gen
  injectors:
    gen_inj_eval:
      type: generator
      generator: generator
      in: hq
      out: [gen, codebook_commitment_loss]

logger:
  print_freq: 100
  save_checkpoint_freq: 500 # CHANGEME: you should probably increase this; saving checkpoints is slow
  visuals: [gen, mel]
  visual_debug_rate: 500
  is_mel_spectrogram: true
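
This config leaves several CHANGEME placeholders and relative paths that have to be filled in before training will run. Below is a minimal pre-flight sketch in Python, assuming PyYAML is installed; the config path is a hypothetical stand-in for wherever you copied this file.

import yaml  # PyYAML

CONFIG = "experiments/EXAMPLE_gpt.yml"  # hypothetical path to your copy of this config

with open(CONFIG, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

def find_placeholders(node, prefix=""):
    """Recursively yield dotted keys whose string values still contain 'CHANGEME'."""
    if isinstance(node, dict):
        for key, value in node.items():
            yield from find_placeholders(value, f"{prefix}{key}.")
    elif isinstance(node, str) and "CHANGEME" in node:
        yield prefix.rstrip(".")

for key in find_placeholders(cfg):
    print(f"still needs a value: {key}")

As far as I know, upstream DL-Art-School launches training with python codes/train.py -opt <path to this yml>; treat that as an assumption and check this fork's README for the exact entry point.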

@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}

@@ -0,0 +1,18 @@
path:
  pretrain_model_dvae: '../experiments/dvae.pth'
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
networks:
  dvae:
    type: generator
    which_model_G: lucidrains_dvae
    kwargs:
      channels: 80
      codebook_dim: 512
      hidden_dim: 512
      kernel_size: 3
      num_layers: 2
      num_resnet_blocks: 3
      num_tokens: 8192
      positional_dims: 1
      use_transposed_convs: false
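
This appears to be the discrete VAE definition that the to_codes injector in the GPT config points at through its dvae_config path, i.e. the model that turns mel spectrograms into the discrete mel codes the GPT is trained on. A small cross-check sketch, assuming PyYAML and using hypothetical paths (the dvae filename echoes the dvae_config value in the GPT config):

import yaml

# Hypothetical paths; adjust to wherever these configs live in your checkout.
GPT_CFG = "experiments/EXAMPLE_gpt.yml"
DVAE_CFG = "experiments/train_diffusion_vocoder_22k_level.yml"

with open(GPT_CFG, encoding="utf-8") as f:
    gpt_cfg = yaml.safe_load(f)
with open(DVAE_CFG, encoding="utf-8") as f:
    dvae_cfg = yaml.safe_load(f)

print(gpt_cfg["steps"]["gpt_train"]["injectors"]["to_codes"]["dvae_config"])
print(dvae_cfg["networks"]["dvae"]["kwargs"]["num_tokens"])      # 8192 dvae codebook entries
print(gpt_cfg["networks"]["gpt"]["kwargs"]["number_mel_codes"])  # 8194 = 8192 + start + stop tokens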

requirements.txt Executable file (48 additions)

@@ -0,0 +1,48 @@
# Fundamentals
numpy
pyyaml
tb-nightly
future
scp
tqdm
matplotlib
scipy
munch
tensorboard
orjson
einops
lambda-networks
mup
# For image generation stuff
opencv-python
kornia
pytorch_ssim
gsa-pytorch
pytorch_fid
# For audio generation stuff
inflect
librosa
Unidecode
tgt
pyworld
audio2numpy
SoundFile
# For text stuff
transformers
tokenizers
jiwer # calculating WER
omegaconf
# lucidrains stuff
vector_quantize_pytorch
linear_attention_transformer
rotary-embedding-torch
axial_positional_embedding
g-mlp-pytorch
x-clip
x_transformers==1.0.4
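
After installing these requirements (for example with pip install -r requirements.txt), the one pinned dependency can be sanity-checked from the standard library; the distribution name below assumes the package is published on PyPI as x-transformers.

from importlib.metadata import version  # Python 3.8+

# Confirm the pin in requirements.txt actually resolved.
print(version("x-transformers"))  # expected: 1.0.4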

setup.py Executable file (34 additions)

@@ -0,0 +1,34 @@
import setuptools

with open("README.old.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# kludge: find_packages() discovers everything under "codes"; rename those packages to "dlas"
packages = setuptools.find_packages()
for i in range(len(packages)):
    packages[i] = packages[i].replace("codes", "dlas")

setuptools.setup(
    name="DL-Art-School",
    packages=packages,
    package_dir={
        "dlas": "./codes"
    },
    version="0.0.1",
    author="James Betker",
    author_email="james@adamant.ai",
    description="",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.ecker.tech/mrq/DL-Art-School",
    project_urls={},
    scripts=[],
    include_package_data=True,
    install_requires=[],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
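
The kludge block above renames whatever setuptools.find_packages() discovers under codes to dlas, so the installed tree imports under the new name. A self-contained illustration of that rename, using a hypothetical package list in place of the real find_packages() output:

# Hypothetical stand-ins for what setuptools.find_packages() would return here.
sample_packages = ["codes", "codes.data", "codes.models", "codes.trainer"]

renamed = [p.replace("codes", "dlas") for p in sample_packages]
print(renamed)  # ['dlas', 'dlas.data', 'dlas.models', 'dlas.trainer']

After pip install -e . from the repository root, the package should then be importable as dlas, assuming the package_dir mapping resolves as intended.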