Loading audiolm_pytorch/encodec.py +13 −2 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ class EncodecWrapper(nn.Module): def seq_len_multiple_of(self): return reduce(lambda x, y: x * y, self.strides) def forward(self, x, **kwargs): def forward(self, x, return_encoded = False, **kwargs): # kwargs for stuff like return_encoded=True, which SoundStream uses but Encodec doesn't assert not self.model.training, "Encodec is pretrained and should never be called outside eval mode." # Unlike in the Encodec sample code in its README, x has already been resampled so we don't need to call Loading @@ -60,7 +60,14 @@ class EncodecWrapper(nn.Module): # transformer code that uses codec expects codes to be [batch, timesteps, num_quantizers] codes = rearrange(codes, 'b q n -> b n q') # result: [batch, timesteps, num_quantizers] # in original soundstream, is x, indices, commit_loss. But we only use indices in eval mode, so just keep that. return None, codes, None # allow for returning of sum of quantized embeddings emb = None if return_encoded: emb = self.get_emb_from_indices(codes) return emb, codes, None def decode_from_codebook_indices(self, quantized_indices): # Input: batch x num tokens x num quantizers Loading @@ -81,6 +88,10 @@ class EncodecWrapper(nn.Module): # back to b n anyways, but we'll keep this as a temporary hack just to make things work for now return rearrange(result, 'b n -> b 1 n') def get_emb_from_indices(self, indices): codes = rearrange(indices, 'b t q -> q b t') return self.model.quantizer.decode(codes) def decode(self, emb): return self.model.decoder(emb) Loading audiolm_pytorch/version.py +1 −1 Original line number Diff line number Diff line __version__ = '0.27.1' __version__ = '0.27.2' Loading
audiolm_pytorch/encodec.py +13 −2 Original line number Diff line number Diff line Loading @@ -41,7 +41,7 @@ class EncodecWrapper(nn.Module): def seq_len_multiple_of(self): return reduce(lambda x, y: x * y, self.strides) def forward(self, x, **kwargs): def forward(self, x, return_encoded = False, **kwargs): # kwargs for stuff like return_encoded=True, which SoundStream uses but Encodec doesn't assert not self.model.training, "Encodec is pretrained and should never be called outside eval mode." # Unlike in the Encodec sample code in its README, x has already been resampled so we don't need to call Loading @@ -60,7 +60,14 @@ class EncodecWrapper(nn.Module): # transformer code that uses codec expects codes to be [batch, timesteps, num_quantizers] codes = rearrange(codes, 'b q n -> b n q') # result: [batch, timesteps, num_quantizers] # in original soundstream, is x, indices, commit_loss. But we only use indices in eval mode, so just keep that. return None, codes, None # allow for returning of sum of quantized embeddings emb = None if return_encoded: emb = self.get_emb_from_indices(codes) return emb, codes, None def decode_from_codebook_indices(self, quantized_indices): # Input: batch x num tokens x num quantizers Loading @@ -81,6 +88,10 @@ class EncodecWrapper(nn.Module): # back to b n anyways, but we'll keep this as a temporary hack just to make things work for now return rearrange(result, 'b n -> b 1 n') def get_emb_from_indices(self, indices): codes = rearrange(indices, 'b t q -> q b t') return self.model.quantizer.decode(codes) def decode(self, emb): return self.model.decoder(emb) Loading
audiolm_pytorch/version.py +1 −1 Original line number Diff line number Diff line __version__ = '0.27.1' __version__ = '0.27.2'