Commit d2932c75 authored by Phil Wang
Browse files

cleanup coarse transformer

parent 94cef623
Loading
Loading
Loading
Loading
+1 −9
Original line number Diff line number Diff line
@@ -406,21 +406,14 @@ class CoarseTransformer(nn.Module):
        codebook_size,
        num_coarse_quantizers,
        dim,
        num_semantic_tokens = None,
        num_semantic_tokens,
        t5_name = DEFAULT_T5_NAME,
        has_condition = False,
        cond_drop_prob = 0.5,
        grad_shrink_alpha = 0.1,
        wav2vec: Optional[Union[FairseqVQWav2Vec, HubertWithKmeans]] = None,
        **kwargs
    ):
        super().__init__()
        assert exists(wav2vec) or exists(num_semantic_tokens)

        if exists(wav2vec):
            num_semantic_tokens = default(num_semantic_tokens, wav2vec.codebook_size)
            assert num_semantic_tokens == wav2vec.codebook_size

        self.has_condition = has_condition
        self.embed_text = partial(t5_encode_text, name = t5_name)
        self.cond_drop_prob = cond_drop_prob
@@ -435,7 +428,6 @@ class CoarseTransformer(nn.Module):
        codebook_size_with_eos = codebook_size + 1
        self.coarse_embedding = nn.Embedding(num_coarse_quantizers * codebook_size_with_eos, dim)

        self.wav2vec = wav2vec
        self.transformer = Transformer(dim = dim, dim_context = get_encoded_dim(t5_name), cross_attend = has_condition, grad_shrink_alpha = grad_shrink_alpha, **kwargs)

        self.codebook_size = codebook_size
+1 −1
Original line number Diff line number Diff line
@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
setup(
  name = 'audiolm-pytorch',
  packages = find_packages(exclude=[]),
  version = '0.0.55',
  version = '0.0.56',
  license='MIT',
  description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch',
  author = 'Phil Wang',