go for single-headed key / values for all decoding attention networks, given... (fca12286) · Commits · school / Capstone Design / 01 / AudioLM

README.md

+10 −0

Original line number	Diff line number	Diff line
		@@ -97,3 +97,13 @@ loss.backward()
		url = {https://arxiv.org/abs/2002.05202}
		}
		```

		```bibtex
		@article{Shazeer2019FastTD,
		title = {Fast Transformer Decoding: One Write-Head is All You Need},
		author = {Noam M. Shazeer},
		journal = {ArXiv},
		year = {2019},
		volume = {abs/1911.02150}
		}
		```

+4 −4

Original line number	Diff line number	Diff line
		@@ -482,7 +482,7 @@ class Attention(nn.Module):
		self.norm = nn.LayerNorm(dim)

		self.to_q = nn.Linear(dim, inner_dim, bias = False)
		self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
		self.to_kv = nn.Linear(dim, dim_head * 2, bias = False)
		self.to_out = nn.Linear(inner_dim, dim, bias = False)

		def forward(self, x, attn_bias = None):
		@@ -490,11 +490,11 @@ class Attention(nn.Module):

		q, k, v = self.to_q(x), *self.to_kv(x).chunk(2, dim = -1)

		q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), (q, k, v))
		q = rearrange(q, 'b n (h d) -> b h n d', h = self.heads)

		q = q * self.scale

		sim = einsum('b h i d, b h j d -> b h i j', q, k)
		sim = einsum('b h i d, b j d -> b h i j', q, k)

		if exists(attn_bias):
		sim = sim + attn_bias
		@@ -505,7 +505,7 @@ class Attention(nn.Module):

		attn = sim.softmax(dim = -1)

		out = einsum('b h i j, b h j d -> b h i d', attn, v)
		out = einsum('b h i j, b j d -> b h i d', attn, v)

		out = rearrange(out, 'b h n d -> b n (h d)')
		return self.to_out(out)

+1 −1

Original line number	Diff line number	Diff line
		@@ -3,7 +3,7 @@ from setuptools import setup, find_packages
		setup(
		name = 'audiolm-pytorch',
		packages = find_packages(exclude=[]),
		version = '0.0.6',
		version = '0.0.7',
		license='MIT',
		description = 'AudioLM - Language Modeling Approach to Audio Generation from Google Research - Pytorch',
		author = 'Phil Wang',