Loading README.md +1 −1 Original line number Diff line number Diff line Loading @@ -57,7 +57,7 @@ trainer = SoundStreamTrainer( folder = '/path/to/audio/files', batch_size = 4, grad_accum_every = 8, # effective batch size of 32 data_max_length = 320 * 32, data_max_length_seconds = 2, # train on 2 second audio num_train_steps = 10000 ).cuda() Loading audiolm_pytorch/trainer.py +24 −0 Original line number Diff line number Diff line Loading @@ -117,6 +117,7 @@ class SoundStreamTrainer(nn.Module): num_train_steps, batch_size, data_max_length = None, data_max_length_seconds = None, folder, lr = 2e-4, grad_accum_every = 4, Loading Loading @@ -173,6 +174,11 @@ class SoundStreamTrainer(nn.Module): # create dataset assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * soundstream.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -482,6 +488,7 @@ class SemanticTransformerTrainer(nn.Module): audio_conditioner: Optional[AudioConditionerBase] = None, dataset: Optional[Dataset] = None, data_max_length = None, data_max_length_seconds = None, folder = None, lr = 3e-4, grad_accum_every = 1, Loading Loading @@ -528,6 +535,11 @@ class SemanticTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * wav2vec.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -711,6 +723,7 @@ class CoarseTransformerTrainer(nn.Module): dataset: Optional[Dataset] = None, ds_fields: Tuple[str, ...] = ('raw_wave', 'raw_wave_for_soundstream', 'text'), data_max_length = None, data_max_length_seconds = None, folder = None, lr = 3e-4, grad_accum_every = 1, Loading Loading @@ -760,6 +773,11 @@ class CoarseTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = tuple(data_max_length_seconds * hz for hz in (wav2vec.target_sample_hz, soundstream.target_sample_hz)) self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -947,6 +965,7 @@ class FineTransformerTrainer(nn.Module): audio_conditioner: Optional[AudioConditionerBase] = None, dataset: Optional[Dataset] = None, data_max_length = None, data_max_length_seconds = None, dataset_normalize = False, folder = None, lr = 3e-4, Loading Loading @@ -995,6 +1014,11 @@ class FineTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * soundstream.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.14.2', version = '0.14.3', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading Loading
README.md +1 −1 Original line number Diff line number Diff line Loading @@ -57,7 +57,7 @@ trainer = SoundStreamTrainer( folder = '/path/to/audio/files', batch_size = 4, grad_accum_every = 8, # effective batch size of 32 data_max_length = 320 * 32, data_max_length_seconds = 2, # train on 2 second audio num_train_steps = 10000 ).cuda() Loading
audiolm_pytorch/trainer.py +24 −0 Original line number Diff line number Diff line Loading @@ -117,6 +117,7 @@ class SoundStreamTrainer(nn.Module): num_train_steps, batch_size, data_max_length = None, data_max_length_seconds = None, folder, lr = 2e-4, grad_accum_every = 4, Loading Loading @@ -173,6 +174,11 @@ class SoundStreamTrainer(nn.Module): # create dataset assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * soundstream.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -482,6 +488,7 @@ class SemanticTransformerTrainer(nn.Module): audio_conditioner: Optional[AudioConditionerBase] = None, dataset: Optional[Dataset] = None, data_max_length = None, data_max_length_seconds = None, folder = None, lr = 3e-4, grad_accum_every = 1, Loading Loading @@ -528,6 +535,11 @@ class SemanticTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * wav2vec.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -711,6 +723,7 @@ class CoarseTransformerTrainer(nn.Module): dataset: Optional[Dataset] = None, ds_fields: Tuple[str, ...] = ('raw_wave', 'raw_wave_for_soundstream', 'text'), data_max_length = None, data_max_length_seconds = None, folder = None, lr = 3e-4, grad_accum_every = 1, Loading Loading @@ -760,6 +773,11 @@ class CoarseTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = tuple(data_max_length_seconds * hz for hz in (wav2vec.target_sample_hz, soundstream.target_sample_hz)) self.ds = SoundDataset( folder, max_length = data_max_length, Loading Loading @@ -947,6 +965,7 @@ class FineTransformerTrainer(nn.Module): audio_conditioner: Optional[AudioConditionerBase] = None, dataset: Optional[Dataset] = None, data_max_length = None, data_max_length_seconds = None, dataset_normalize = False, folder = None, lr = 3e-4, Loading Loading @@ -995,6 +1014,11 @@ class FineTransformerTrainer(nn.Module): if not exists(self.ds): assert exists(folder), 'folder must be passed in, if not passing in a custom dataset for text conditioned audio synthesis training' assert not (exists(data_max_length) and exists(data_max_length_seconds)) if exists(data_max_length_seconds): data_max_length = data_max_length_seconds * soundstream.target_sample_hz self.ds = SoundDataset( folder, max_length = data_max_length, Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.14.2', version = '0.14.3', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading