Loading README.md +11 −0 Original line number Diff line number Diff line Loading @@ -79,6 +79,17 @@ wavs = torch.randn(2, 1024) conds = quantizer(wavs = wavs, namespace = 'semantic') # (2, 8, 1024) - 8 is number of quantizers ``` After much training, you will pass your finetuned or trained-from-scratch `AudioLM` and `MuLaN` wrapped in `MuLaNEmbedQuantizer` to the `MusicLM` ```python musiclm = MusicLM( audio_lm = audio_lm, mulan_embed_quantizer = mulan_embed_quantizer ) music = musiclm(['the crystalline sounds of the piano in a ballroom']) # torch.Tensor ``` ## Todo - [x] mulan seems to be using decoupled contrastive learning, offer that as an option Loading musiclm_pytorch/musiclm_pytorch.py +3 −3 Original line number Diff line number Diff line Loading @@ -549,7 +549,7 @@ class MusicLM(nn.Module): self.eval() texts = tokenizer.tokenize(raw_texts) cond_tokens = self.mulan_embed_quantizer(texts = texts) wavs = self.audio_lm.generate(cond_tokens = cond_tokens, **audio_lm_kwargs) return wavs text_embeds = self.mulan_embed_quantizer(texts = texts) return self.audio_lm(text_embeds = text_embeds, **audio_lm_kwargs) setup.py +2 −2 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'musiclm-pytorch', packages = find_packages(exclude=[]), version = '0.0.3', version = '0.0.4', license='MIT', description = 'MusicLM - AudioLM + Audio CLIP to text to music synthesis', author = 'Phil Wang', Loading @@ -19,7 +19,7 @@ setup( 'contrastive learning' ], install_requires=[ 'audiolm-pytorch', 'audiolm-pytorch>=0.8.1', 'beartype', 'einops>=0.4', 'vector-quantize-pytorch>=1.0.0', Loading Loading
README.md +11 −0

@@ -79,6 +79,17 @@ wavs = torch.randn(2, 1024)
 conds = quantizer(wavs = wavs, namespace = 'semantic') # (2, 8, 1024) - 8 is number of quantizers
 ```
 
+After much training, you will pass your finetuned or trained-from-scratch `AudioLM` and `MuLaN` wrapped in `MuLaNEmbedQuantizer` to the `MusicLM`
+
+```python
+musiclm = MusicLM(
+    audio_lm = audio_lm,
+    mulan_embed_quantizer = mulan_embed_quantizer
+)
+
+music = musiclm(['the crystalline sounds of the piano in a ballroom']) # torch.Tensor
+```
+
 ## Todo
 
 - [x] mulan seems to be using decoupled contrastive learning, offer that as an option
musiclm_pytorch/musiclm_pytorch.py +3 −3

@@ -549,7 +549,7 @@ class MusicLM(nn.Module):
         self.eval()
 
         texts = tokenizer.tokenize(raw_texts)
 
-        cond_tokens = self.mulan_embed_quantizer(texts = texts)
-        wavs = self.audio_lm.generate(cond_tokens = cond_tokens, **audio_lm_kwargs)
-        return wavs
+        text_embeds = self.mulan_embed_quantizer(texts = texts)
+
+        return self.audio_lm(text_embeds = text_embeds, **audio_lm_kwargs)
setup.py +2 −2

@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
 setup(
   name = 'musiclm-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.0.3',
+  version = '0.0.4',
   license='MIT',
   description = 'MusicLM - AudioLM + Audio CLIP to text to music synthesis',
   author = 'Phil Wang',

@@ -19,7 +19,7 @@ setup(
     'contrastive learning'
   ],
   install_requires=[
-    'audiolm-pytorch',
+    'audiolm-pytorch>=0.8.1',
     'beartype',
     'einops>=0.4',
     'vector-quantize-pytorch>=1.0.0',