@@ -63,6 +63,7 @@ from audiolm_pytorch import SoundStream, SoundStreamTrainer
soundstream=SoundStream(
codebook_size=1024,
rq_num_quantizers=8,
rq_groups=2,# this paper proposes using multi-headed residual vector quantization - https://arxiv.org/abs/2305.02765
attn_window_size=128,# local attention receptive field at bottleneck
attn_depth=2# 2 local attention transformer blocks - the soundstream folks were not experts with attention, so i took the liberty to add some. encodec went with lstms, but attention should be better