forked from mrq/DL-Art-School
Commit 49e23b226b: pip-ify
Parent: f31a333c4f

This commit packages the repository for pip installation: it adds setup.py, MANIFEST.in, and requirements.txt, and installs the codes/ tree under the dlas import name.
.gitignore (vendored, Normal file → Executable file, 11 lines changed)
@@ -1,12 +1,13 @@
-experiments/*
+dlas/experiments/*
+dlas/codes/*.txt
+dlas/codes/wandb/*
+dlas/codes/pretrained_models/*
+dlas/codes/scripts/audio/pretrained_models/*
+
 results/*
 tb_logger/*
 datasets/*
 options/*
-codes/*.txt
-codes/wandb/*
-codes/pretrained_models/*
-codes/scripts/audio/pretrained_models/*
 data/*
 .vscode
MANIFEST.in (new Executable file, 1 line)
@@ -0,0 +1 @@
+recursive-include codes/*
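This rule only affects source distributions: it carries the codes/ tree into the sdist, which include_package_data=True in setup.py below then installs. A quick way to verify a built archive picked the tree up (a minimal sketch; assumes an sdist has already been built, and the archive name varies by setuptools version):

# check_sdist.py - minimal sketch: confirm `recursive-include codes/*`
# carried the codes/ tree into a built source distribution.
# Assumes `python setup.py sdist` (or `python -m build`) already ran.
import glob
import tarfile

sdist = sorted(glob.glob("dist/*.tar.gz"))[-1]  # latest built archive
with tarfile.open(sdist) as tf:
    assert any("/codes/" in name for name in tf.getnames()), \
        "codes/ missing from sdist"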
README.md → README.old.md (renamed, Normal file → Executable file, 0 content changes)
codes/__init__.py (new empty Executable file; the empty __init__.py marks codes/ as a package so find_packages() in setup.py below can discover it)
experiments/EXAMPLE_gpt.yml (new Executable file, 144 lines)
@@ -0,0 +1,144 @@
name: CHANGEME_your_experiment_name
model: extensibletrainer
scale: 1
gpu_ids: [0] # <-- unless you have multiple gpus, use this
start_step: -1
checkpointing_enabled: true # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
fp16: false # might want to check this out
wandb: false # <-- enable to log to wandb. tensorboard logging is always enabled.
use_tb_logger: true

datasets:
  train:
    name: CHANGEME_training_dataset_name
    n_workers: 8 # idk what this does
    batch_size: 128 # This leads to ~16GB of vram usage on my 3090.
    mode: paired_voice_audio
    path: CHANGEME_path_to_training_dataset
    fetcher_mode: ['lj'] # CHANGEME if your dataset isn't in LJSpeech format
    phase: train
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False
  val:
    name: CHANGEME_validation_dataset_name
    n_workers: 1
    batch_size: 32 # this could be higher probably
    mode: paired_voice_audio
    path: CHANGEME_path_to_validation_dataset
    fetcher_mode: ['lj']
    phase: val # might be broken idk
    max_wav_length: 255995
    max_text_length: 200
    sample_rate: 22050
    load_conditioning: True
    num_conditioning_candidates: 2
    conditioning_length: 44000
    use_bpe_tokenizer: True
    load_aligned_codes: False

steps:
  gpt_train:
    training: gpt
    loss_log_buffer: 500 # no idea what this does

    # Generally follows the recipe from the DALLE paper.
    optimizer: adamw # this should be adamw_zero if you're using distributed training
    optimizer_params:
      lr: !!float 1e-5 # CHANGEME: this was originally 1e-4; I reduced it to 1e-5 because it's fine-tuning, but **you should experiment with this value**
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.96
    clip_grad_eps: 4

    injectors: # TODO: replace this entire sequence with the GptVoiceLatentInjector
      paired_to_mel:
        type: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: wav
        out: paired_mel
      paired_cond_to_mel:
        type: for_each
        subtype: torch_mel_spectrogram
        mel_norm_file: ../experiments/clips_mel_norms.pth
        in: conditioning
        out: paired_conditioning_mel
      to_codes:
        type: discrete_token
        in: paired_mel
        out: paired_mel_codes
        dvae_config: "../experiments/train_diffusion_vocoder_22k_level.yml" # EXTREMELY IMPORTANT
      paired_fwd_text:
        type: generator
        generator: gpt
        in: [paired_conditioning_mel, padded_text, text_lengths, paired_mel_codes, wav_lengths]
        out: [loss_text_ce, loss_mel_ce, logits]
    losses:
      text_ce:
        type: direct
        weight: .01
        key: loss_text_ce
      mel_ce:
        type: direct
        weight: 1
        key: loss_mel_ce

networks:
  gpt:
    type: generator
    which_model_G: unified_voice2 # none of the unified_voice*.py files actually match the tortoise inference code... 4 and 3 have "alignment_head" (wtf is that?), 2 lacks the types=1 parameter.
    kwargs:
      layers: 30 # WAS 8
      model_dim: 1024 # WAS 512
      heads: 16 # WAS 8
      max_text_tokens: 402 # WAS 120
      max_mel_tokens: 604 # WAS 250
      max_conditioning_inputs: 2 # WAS 1
      mel_length_compression: 1024
      number_text_tokens: 256 # supposed to be 255 for newer unified_voice files
      number_mel_codes: 8194
      start_mel_token: 8192
      stop_mel_token: 8193
      start_text_token: 255
      train_solo_embeddings: False # missing in uv3/4
      use_mel_codes_as_input: True # ditto
      checkpointing: True
      #types: 1 # this is MISSING, but in my analysis 1 is equivalent to not having it.
      #only_alignment_head: False # uv3/4

path:
  pretrain_model_gpt: '../experiments/autoregressive.pth' # CHANGEME: copy this from tortoise cache
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.

# afaik all units here are measured in **steps** (i.e. one batch of batch_size is 1 unit)
train: # CHANGEME: ALL OF THESE PARAMETERS SHOULD BE EXPERIMENTED WITH
  niter: 50000
  warmup_iter: -1
  mega_batch_factor: 4 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [500, 1000, 1400, 1800] #[50000, 100000, 140000, 180000]
  lr_gamma: 0.5

eval:
  output_state: gen
  injectors:
    gen_inj_eval:
      type: generator
      generator: generator
      in: hq
      out: [gen, codebook_commitment_loss]

logger:
  print_freq: 100
  save_checkpoint_freq: 500 # CHANGEME: especially you should increase this it's really slow
  visuals: [gen, mel]
  visual_debug_rate: 500
  is_mel_spectrogram: true
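Every CHANGEME field above must be edited before the config is usable, so a small sanity check can catch leftovers. A minimal sketch (the path is this example file's; PyYAML's safe_load resolves the !!float tags above):

# check_config.py - minimal sketch: flag unedited CHANGEME placeholders
# in a copy of the example config before launching training.
import yaml

with open("experiments/EXAMPLE_gpt.yml", "r", encoding="utf-8") as f:
    opt = yaml.safe_load(f)  # !!float tags resolve to Python floats

leftovers = []

def scan(node, path=""):
    # walk nested dicts/lists and record paths still holding placeholders
    if isinstance(node, dict):
        for k, v in node.items():
            scan(v, f"{path}/{k}")
    elif isinstance(node, list):
        for i, v in enumerate(node):
            scan(v, f"{path}[{i}]")
    elif isinstance(node, str) and "CHANGEME" in node:
        leftovers.append(path)

scan(opt)
assert not leftovers, f"still unedited: {leftovers}"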
experiments/bpe_lowercase_asr_256.json (new Executable file, 1 line)
@@ -0,0 +1 @@
{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":0,"special":true,"content":"[STOP]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":1,"special":true,"content":"[UNK]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false},{"id":2,"special":true,"content":"[SPACE]","single_word":false,"lstrip":false,"rstrip":false,"normalized":false}],"normalizer":null,"pre_tokenizer":{"type":"Whitespace"},"post_processor":null,"decoder":null,"model":{"type":"BPE","dropout":null,"unk_token":"[UNK]","continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"vocab":{"[STOP]":0,"[UNK]":1,"[SPACE]":2,"!":3,"'":4,"(":5,")":6,",":7,"-":8,".":9,"/":10,":":11,";":12,"?":13,"a":14,"b":15,"c":16,"d":17,"e":18,"f":19,"g":20,"h":21,"i":22,"j":23,"k":24,"l":25,"m":26,"n":27,"o":28,"p":29,"q":30,"r":31,"s":32,"t":33,"u":34,"v":35,"w":36,"x":37,"y":38,"z":39,"th":40,"in":41,"the":42,"an":43,"er":44,"ou":45,"re":46,"on":47,"at":48,"ed":49,"en":50,"to":51,"ing":52,"and":53,"is":54,"as":55,"al":56,"or":57,"of":58,"ar":59,"it":60,"es":61,"he":62,"st":63,"le":64,"om":65,"se":66,"be":67,"ad":68,"ow":69,"ly":70,"ch":71,"wh":72,"that":73,"you":74,"li":75,"ve":76,"ac":77,"ti":78,"ld":79,"me":80,"was":81,"gh":82,"id":83,"ll":84,"wi":85,"ent":86,"for":87,"ay":88,"ro":89,"ver":90,"ic":91,"her":92,"ke":93,"his":94,"no":95,"ut":96,"un":97,"ir":98,"lo":99,"we":100,"ri":101,"ha":102,"with":103,"ght":104,"out":105,"im":106,"ion":107,"all":108,"ab":109,"one":110,"ne":111,"ge":112,"ould":113,"ter":114,"mo":115,"had":116,"ce":117,"she":118,"go":119,"sh":120,"ur":121,"am":122,"so":123,"pe":124,"my":125,"de":126,"are":127,"but":128,"ome":129,"fr":130,"ther":131,"fe":132,"su":133,"do":134,"con":135,"te":136,"ain":137,"ere":138,"po":139,"if":140,"they":141,"us":142,"ag":143,"tr":144,"now":145,"oun":146,"this":147,"have":148,"not":149,"sa":150,"il":151,"up":152,"thing":153,"from":154,"ap":155,"him":156,"ack":157,"ation":158,"ant":159,"our":160,"op":161,"like":162,"ust":163,"ess":164,"bo":165,"ok":166,"ul":167,"ind":168,"ex":169,"com":170,"some":171,"there":172,"ers":173,"co":174,"res":175,"man":176,"ard":177,"pl":178,"wor":179,"way":180,"tion":181,"fo":182,"ca":183,"were":184,"by":185,"ate":186,"pro":187,"ted":188,"ound":189,"own":190,"would":191,"ts":192,"what":193,"qu":194,"ally":195,"ight":196,"ck":197,"gr":198,"when":199,"ven":200,"can":201,"ough":202,"ine":203,"end":204,"per":205,"ous":206,"od":207,"ide":208,"know":209,"ty":210,"very":211,"si":212,"ak":213,"who":214,"about":215,"ill":216,"them":217,"est":218,"red":219,"ye":220,"could":221,"ong":222,"your":223,"their":224,"em":225,"just":226,"other":227,"into":228,"any":229,"whi":230,"um":231,"tw":232,"ast":233,"der":234,"did":235,"ie":236,"been":237,"ace":238,"ink":239,"ity":240,"back":241,"ting":242,"br":243,"more":244,"ake":245,"pp":246,"then":247,"sp":248,"el":249,"use":250,"bl":251,"said":252,"over":253,"get":254},"merges":["t h","i n","th e","a n","e r","o u","r e","o n","a t","e d","e n","t o","in g","an d","i s","a s","a l","o r","o f","a r","i t","e s","h e","s t","l e","o m","s e","b e","a d","o w","l y","c h","w h","th at","y ou","l i","v e","a c","t i","l d","m e","w as","g h","i d","l l","w i","en t","f or","a y","r o","v er","i c","h er","k e","h is","n o","u t","u n","i r","l o","w e","r i","h a","wi th","gh t","ou t","i m","i on","al l","a b","on e","n e","g e","ou ld","t er","m o","h ad","c e","s he","g o","s h","u r","a m","s o","p e","m y","d e","a re","b ut","om e","f r","the r","f e","s 
u","d o","c on","t e","a in","er e","p o","i f","the y","u s","a g","t r","n ow","ou n","th is","ha ve","no t","s a","i l","u p","th ing","fr om","a p","h im","ac k","at ion","an t","ou r","o p","li ke","u st","es s","b o","o k","u l","in d","e x","c om","s ome","the re","er s","c o","re s","m an","ar d","p l","w or","w ay","ti on","f o","c a","w ere","b y","at e","p ro","t ed","oun d","ow n","w ould","t s","wh at","q u","al ly","i ght","c k","g r","wh en","v en","c an","ou gh","in e","en d","p er","ou s","o d","id e","k now","t y","ver y","s i","a k","wh o","ab out","i ll","the m","es t","re d","y e","c ould","on g","you r","the ir","e m","j ust","o ther","in to","an y","wh i","u m","t w","as t","d er","d id","i e","be en","ac e","in k","it y","b ack","t ing","b r","mo re","a ke","p p","the n","s p","e l","u se","b l","sa id","o ver","ge t"]}}
|
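The JSON above is a HuggingFace `tokenizers` serialization (BPE model, Whitespace pre-tokenizer), so it can be loaded and inspected directly. A minimal sketch; input is assumed lowercase since the vocab carries no capital letters:

# inspect_tokenizer.py - minimal sketch: load and poke at the BPE tokenizer.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("experiments/bpe_lowercase_asr_256.json")
print(tok.get_vocab_size())   # 255: ids 0..254, including [STOP]/[UNK]/[SPACE]
enc = tok.encode("this is a test")
print(enc.tokens)             # merged BPE pieces, e.g. "this", "is", ...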
experiments/train_diffusion_vocoder_22k_level.yml (new Executable file, 18 lines)
@@ -0,0 +1,18 @@
path:
  pretrain_model_dvae: '../experiments/dvae.pth'
  strict_load: true
  #resume_state: ../experiments/train_imgnet_vqvae_stage1/training_state/0.state # <-- Set this to resume from a previous training state.
networks:
  dvae:
    type: generator
    which_model_G: lucidrains_dvae
    kwargs:
      channels: 80
      codebook_dim: 512
      hidden_dim: 512
      kernel_size: 3
      num_layers: 2
      num_resnet_blocks: 3
      num_tokens: 8192
      positional_dims: 1
      use_transposed_convs: false
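The dVAE codebook size here lines up with the GPT config above: 8192 discrete tokens plus the start and stop mel tokens account for the 8194 number_mel_codes. A worked check, with values copied from the two files:

# token_count_check.py - illustrative arithmetic tying the two configs together.
dvae_num_tokens = 8192    # networks.dvae.kwargs.num_tokens (this file)
start_mel_token = 8192    # networks.gpt.kwargs.start_mel_token (EXAMPLE_gpt.yml)
stop_mel_token = 8193     # networks.gpt.kwargs.stop_mel_token
number_mel_codes = 8194   # networks.gpt.kwargs.number_mel_codes

# the two extra codes sit just past the dVAE codebook range [0, 8191]
assert number_mel_codes == dvae_num_tokens + 2
assert (start_mel_token, stop_mel_token) == (dvae_num_tokens, dvae_num_tokens + 1)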
requirements.txt (new Executable file, 48 lines)
@@ -0,0 +1,48 @@
# Fundamentals
numpy
pyyaml
tb-nightly
future
scp
tqdm
matplotlib
scipy
munch
tqdm
scp
tensorboard
orjson
einops
lambda-networks
mup

# For image generation stuff
opencv-python
kornia
pytorch_ssim
gsa-pytorch
pytorch_fid

# For audio generation stuff
inflect
librosa
Unidecode
tgt
pyworld
audio2numpy
SoundFile

# For text stuff
transformers
tokenizers
jiwer # calculating WER
omegaconf

# lucidrains stuff
vector_quantize_pytorch
linear_attention_transformer
rotary-embedding-torch
axial_positional_embedding
g-mlp-pytorch
x-clip
x_transformers==1.0.4
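Note that tqdm and scp are each listed twice under # Fundamentals; pip tolerates duplicate requirement lines, so installation is the usual flow. A minimal sketch of installing into the current interpreter:

# install_requirements.py - minimal sketch; equivalent to running
# `pip install -r requirements.txt` from the shell.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
)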
setup.py (new Executable file, 34 lines)
@@ -0,0 +1,34 @@
import setuptools

with open("README.old.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# kludge
packages = setuptools.find_packages()
for i in range(len(packages)):
    packages[i] = packages[i].replace("codes", "dlas")

setuptools.setup(
    name="DL-Art-School",
    packages=packages,
    package_dir={
        "dlas": "./codes"
    },
    version="0.0.1",
    author="James Betker",
    author_email="james@adamant.ai",
    description="",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://git.ecker.tech/mrq/DL-Art-School",
    project_urls={},
    scripts=[],
    include_package_data=True,
    install_requires=[],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.6",
)
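The kludge pays off at import time: find_packages() discovers codes and its subpackages, the rename maps those names onto dlas, and package_dir points dlas back at ./codes, so the installed package imports as dlas even though the sources live in codes/. A usage sketch (the submodule name is a hypothetical illustration of the layout, not a verified module):

# after `pip install .`, the on-disk codes/ tree is importable
# under the dlas name established by setup.py:
import dlas                    # resolves to ./codes via package_dir

# subpackages mirror the codes/ layout, e.g. (hypothetical name):
# from dlas import models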