attn_window_size=128,  # local attention receptive field at the bottleneck
# Two local attention transformer blocks. The SoundStream authors were not
# attention specialists, so attention was added here as a deliberate
# deviation from their design. EnCodec went with LSTMs instead, but
# attention is expected to work better.
attn_depth=2