# attn_window_size = 128, # local attention receptive field at bottleneck
# attn_depth = 2 # 2 local attention transformer blocks - the SoundStream folks were not experts with attention, so I took the liberty of adding some. EnCodec went with LSTMs, but attention should be better
attn_window_size=128,# local attention receptive field at bottleneck
attn_depth=2# 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better
)
soundstream_trainer=SoundStreamTrainer(
soundstream,
folder=dataset_folder,
lr=3e-4,
batch_size=4,
grad_accum_every=8,# effective batch size of batch_size * grad_accum_every = 32
data_max_length_seconds=2,# train on 2 second audio