Loading README.md +5 −0 Original line number Diff line number Diff line Loading @@ -62,6 +62,11 @@ trainer = SoundStreamTrainer( ).cuda() trainer.train() # after a lot of training, you can test the autoencoding as so audio = torch.randn(10080).cuda() recons = soundstream(audio, return_recons_only = True) # (1, 10080) - 1 channel ``` Then three separate transformers (`SemanticTransformer`, `CoarseTransformer`, `FineTransformer`) need to be trained Loading audiolm_pytorch/soundstream.py +4 −2 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ from torch.linalg import vector_norm import torchaudio.transforms as T from einops import rearrange, reduce from einops import rearrange, reduce, pack, unpack from vector_quantize_pytorch import ResidualVQ Loading Loading @@ -518,7 +518,6 @@ class SoundStream(nn.Module): assert path.exists() obj = torch.load(str(path)) self.load_state_dict(obj['model']) exit() def non_discr_parameters(self): return [ Loading @@ -543,6 +542,8 @@ class SoundStream(nn.Module): input_sample_hz = None, apply_grad_penalty = False ): x, ps = pack([x], '* n') if exists(input_sample_hz): x = resample(x, input_sample_hz, self.target_sample_hz) Loading Loading @@ -573,6 +574,7 @@ class SoundStream(nn.Module): recon_x = self.decoder(x) if return_recons_only: recon_x, = unpack(recon_x, ps, '* c n') return recon_x # multi-scale discriminator loss Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.15.1', version = '0.15.2', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading Loading
README.md +5 −0 Original line number Diff line number Diff line Loading @@ -62,6 +62,11 @@ trainer = SoundStreamTrainer( ).cuda() trainer.train() # after a lot of training, you can test the autoencoding as so audio = torch.randn(10080).cuda() recons = soundstream(audio, return_recons_only = True) # (1, 10080) - 1 channel ``` Then three separate transformers (`SemanticTransformer`, `CoarseTransformer`, `FineTransformer`) need to be trained Loading
audiolm_pytorch/soundstream.py +4 −2 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ from torch.linalg import vector_norm import torchaudio.transforms as T from einops import rearrange, reduce from einops import rearrange, reduce, pack, unpack from vector_quantize_pytorch import ResidualVQ Loading Loading @@ -518,7 +518,6 @@ class SoundStream(nn.Module): assert path.exists() obj = torch.load(str(path)) self.load_state_dict(obj['model']) exit() def non_discr_parameters(self): return [ Loading @@ -543,6 +542,8 @@ class SoundStream(nn.Module): input_sample_hz = None, apply_grad_penalty = False ): x, ps = pack([x], '* n') if exists(input_sample_hz): x = resample(x, input_sample_hz, self.target_sample_hz) Loading Loading @@ -573,6 +574,7 @@ class SoundStream(nn.Module): recon_x = self.decoder(x) if return_recons_only: recon_x, = unpack(recon_x, ps, '* c n') return recon_x # multi-scale discriminator loss Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.15.1', version = '0.15.2', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading