Loading README.md +1 −1 Original line number Diff line number Diff line Loading @@ -201,11 +201,11 @@ generated_wav_with_text_condition = audiolm(text = ['chirping of birds and the d - [x] refactor so semantic transformer has a wrapper that handles unique consecutives as well as wav to hubert or vq-wav2vec - [x] simply not self attend to eos token on the prompting side (semantic for coarse transformer, coarse for fine transformer) - [x] add structured dropout from forgetful causal masking, far better than traditional dropouts - [x] figure out how to suppress logging in fairseq - [ ] figure out how to do the normalization across each dimension mentioned in the paper, but ignore it for v1 of the framework - [ ] offer option to weight tie coarse, fine, and semantic embeddings across the 3 hierarchical transformers - [ ] DRY a little at the end - [ ] figure out how to suppress logging in fairseq - [ ] test with speech synthesis for starters - [ ] abstract out conditioning + classifier free guidance into external module or potentially a package - [ ] add option to use flash attention Loading audiolm_pytorch/hubert_kmeans.py +4 −0 Original line number Diff line number Diff line Loading @@ -5,12 +5,16 @@ from torch import nn from einops import rearrange, pack, unpack import joblib import fairseq from torchaudio.functional import resample from audiolm_pytorch.utils import curtail_to_multiple import logging logging.root.setLevel(logging.ERROR) def exists(val): return val is not None Loading audiolm_pytorch/vq_wav2vec.py +3 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,9 @@ from torchaudio.functional import resample from audiolm_pytorch.utils import curtail_to_multiple import logging logging.root.setLevel(logging.ERROR) def exists(val): return val is not None Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = 
find_packages(exclude=[]), version = '0.1.5', version = '0.1.6', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading Loading
README.md +1 −1 Original line number Diff line number Diff line Loading @@ -201,11 +201,11 @@ generated_wav_with_text_condition = audiolm(text = ['chirping of birds and the d - [x] refactor so semantic transformer has a wrapper that handles unique consecutives as well as wav to hubert or vq-wav2vec - [x] simply not self attend to eos token on the prompting side (semantic for coarse transformer, coarse for fine transformer) - [x] add structured dropout from forgetful causal masking, far better than traditional dropouts - [x] figure out how to suppress logging in fairseq - [ ] figure out how to do the normalization across each dimension mentioned in the paper, but ignore it for v1 of the framework - [ ] offer option to weight tie coarse, fine, and semantic embeddings across the 3 hierarchical transformers - [ ] DRY a little at the end - [ ] figure out how to suppress logging in fairseq - [ ] test with speech synthesis for starters - [ ] abstract out conditioning + classifier free guidance into external module or potentially a package - [ ] add option to use flash attention Loading
audiolm_pytorch/hubert_kmeans.py +4 −0 Original line number Diff line number Diff line Loading @@ -5,12 +5,16 @@ from torch import nn from einops import rearrange, pack, unpack import joblib import fairseq from torchaudio.functional import resample from audiolm_pytorch.utils import curtail_to_multiple import logging logging.root.setLevel(logging.ERROR) def exists(val): return val is not None Loading
audiolm_pytorch/vq_wav2vec.py +3 −0 Original line number Diff line number Diff line Loading @@ -10,6 +10,9 @@ from torchaudio.functional import resample from audiolm_pytorch.utils import curtail_to_multiple import logging logging.root.setLevel(logging.ERROR) def exists(val): return val is not None Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.1.5', version = '0.1.6', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading