Loading README.md +11 −0 Original line number Diff line number Diff line Loading @@ -133,3 +133,14 @@ loss.backward() url = {https://twitter.com/rivershavewings} } ``` ```bibtex @misc{ding2021cogview, title = {CogView: Mastering Text-to-Image Generation via Transformers}, author = {Ming Ding and Zhuoyi Yang and Wenyi Hong and Wendi Zheng and Chang Zhou and Da Yin and Junyang Lin and Xu Zou and Zhou Shao and Hongxia Yang and Jie Tang}, year = {2021}, eprint = {2105.13290}, archivePrefix = {arXiv}, primaryClass = {cs.CV} } ``` audiolm_pytorch/audiolm_pytorch.py +16 −3 Original line number Diff line number Diff line Loading @@ -64,6 +64,11 @@ def gradient_penalty(images, output, weight = 10): gradients = rearrange(gradients, 'b ... -> b (...)') return weight * ((gradients.norm(2, dim = 1) - 1) ** 2).mean() # attention related utils def grad_shrink(t, alpha = 0.1): return t * alpha + t.detach() * (1 - alpha) # classifier free guidance functions def uniform(shape, device): Loading Loading @@ -632,9 +637,12 @@ class Transformer(nn.Module): depth, dim_context = None, cross_attend = False, grad_shrink_alpha = 0.1, **kwargs ): super().__init__() self.grad_shrink = partial(grad_shrink, alpha = grad_shrink_alpha) self.layers = nn.ModuleList([]) self.rel_pos_bias = RelativePositionBias() Loading @@ -657,6 +665,8 @@ class Transformer(nn.Module): ): n, device = x.shape[1], x.device x = self.grad_shrink(x) # from cogview paper, adopted by GLM 130B LLM, decreases likelihood of attention net instability rel_pos_bias = self.rel_pos_bias(n, n, device = device) for attn, cross_attn, ff in self.layers: Loading Loading @@ -684,6 +694,7 @@ class SemanticTransformer(nn.Module): cond_drop_prob = 0.5, wav2vec: Optional[Union[FairseqVQWav2Vec, HubertWithKmeans]] = None, unique_consecutive = True, grad_shrink_alpha = 0.1, pad_id = -1, **kwargs ): Loading @@ -701,7 +712,7 @@ class SemanticTransformer(nn.Module): self.pad_id = pad_id self.wav2vec = wav2vec self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.to_logits = nn.Linear(dim, num_semantic_tokens + 1) def forward( Loading Loading @@ -778,6 +789,7 @@ class CoarseTransformer(nn.Module): t5_name = DEFAULT_T5_NAME, has_condition = False, cond_drop_prob = 0.5, grad_shrink_alpha = 0.1, wav2vec: Optional[Union[FairseqVQWav2Vec, HubertWithKmeans]] = None, **kwargs ): Loading @@ -796,7 +808,7 @@ class CoarseTransformer(nn.Module): self.coarse_embedding = nn.Embedding(num_coarse_quantizers * codebook_size_with_eos, dim) self.wav2vec = wav2vec self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.codebook_size = codebook_size self.num_coarse_quantizers = num_coarse_quantizers Loading Loading @@ -890,6 +902,7 @@ class FineTransformer(nn.Module): t5_name = DEFAULT_T5_NAME, has_condition = False, cond_drop_prob = 0.5, grad_shrink_alpha = 0.1, **kwargs ): super().__init__() Loading @@ -906,7 +919,7 @@ class FineTransformer(nn.Module): self.eos_id = codebook_size self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.codebook_size = codebook_size self.num_coarse_quantizers = num_coarse_quantizers Loading setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.0.22', version = '0.0.23', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading Loading
README.md +11 −0 Original line number Diff line number Diff line Loading @@ -133,3 +133,14 @@ loss.backward() url = {https://twitter.com/rivershavewings} } ``` ```bibtex @misc{ding2021cogview, title = {CogView: Mastering Text-to-Image Generation via Transformers}, author = {Ming Ding and Zhuoyi Yang and Wenyi Hong and Wendi Zheng and Chang Zhou and Da Yin and Junyang Lin and Xu Zou and Zhou Shao and Hongxia Yang and Jie Tang}, year = {2021}, eprint = {2105.13290}, archivePrefix = {arXiv}, primaryClass = {cs.CV} } ```
audiolm_pytorch/audiolm_pytorch.py +16 −3 Original line number Diff line number Diff line Loading @@ -64,6 +64,11 @@ def gradient_penalty(images, output, weight = 10): gradients = rearrange(gradients, 'b ... -> b (...)') return weight * ((gradients.norm(2, dim = 1) - 1) ** 2).mean() # attention related utils def grad_shrink(t, alpha = 0.1): return t * alpha + t.detach() * (1 - alpha) # classifier free guidance functions def uniform(shape, device): Loading Loading @@ -632,9 +637,12 @@ class Transformer(nn.Module): depth, dim_context = None, cross_attend = False, grad_shrink_alpha = 0.1, **kwargs ): super().__init__() self.grad_shrink = partial(grad_shrink, alpha = grad_shrink_alpha) self.layers = nn.ModuleList([]) self.rel_pos_bias = RelativePositionBias() Loading @@ -657,6 +665,8 @@ class Transformer(nn.Module): ): n, device = x.shape[1], x.device x = self.grad_shrink(x) # from cogview paper, adopted by GLM 130B LLM, decreases likelihood of attention net instability rel_pos_bias = self.rel_pos_bias(n, n, device = device) for attn, cross_attn, ff in self.layers: Loading Loading @@ -684,6 +694,7 @@ class SemanticTransformer(nn.Module): cond_drop_prob = 0.5, wav2vec: Optional[Union[FairseqVQWav2Vec, HubertWithKmeans]] = None, unique_consecutive = True, grad_shrink_alpha = 0.1, pad_id = -1, **kwargs ): Loading @@ -701,7 +712,7 @@ class SemanticTransformer(nn.Module): self.pad_id = pad_id self.wav2vec = wav2vec self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.to_logits = nn.Linear(dim, num_semantic_tokens + 1) def forward( Loading Loading @@ -778,6 +789,7 @@ class CoarseTransformer(nn.Module): t5_name = DEFAULT_T5_NAME, has_condition = False, cond_drop_prob = 0.5, grad_shrink_alpha = 0.1, wav2vec: Optional[Union[FairseqVQWav2Vec, HubertWithKmeans]] = None, **kwargs ): Loading @@ -796,7 +808,7 @@ class CoarseTransformer(nn.Module): self.coarse_embedding = nn.Embedding(num_coarse_quantizers * codebook_size_with_eos, dim) self.wav2vec = wav2vec self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.codebook_size = codebook_size self.num_coarse_quantizers = num_coarse_quantizers Loading Loading @@ -890,6 +902,7 @@ class FineTransformer(nn.Module): t5_name = DEFAULT_T5_NAME, has_condition = False, cond_drop_prob = 0.5, grad_shrink_alpha = 0.1, **kwargs ): super().__init__() Loading @@ -906,7 +919,7 @@ class FineTransformer(nn.Module): self.eos_id = codebook_size self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, **kwargs) self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs) self.codebook_size = codebook_size self.num_coarse_quantizers = num_coarse_quantizers Loading
setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -3,7 +3,7 @@ from setuptools import setup, find_packages setup( name = 'audiolm-pytorch', packages = find_packages(exclude=[]), version = '0.0.22', version = '0.0.23', license='MIT', description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch', author = 'Phil Wang', Loading