encodec test (d91ed4cb) · Commits · school / Capstone Design / 01 / audiolm-pytorch-training

encodec_test.ipynb

0 → 100644

+222 −0

Original line number	Diff line number	Diff line
		%% Cell type:code id: tags:

		```
		!pip install encodec torchaudio torch
		```

		%% Output

		Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
		Collecting encodec
		Downloading encodec-0.1.1.tar.gz (3.7 MB)
		[2K ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.7/3.7 MB 30.0 MB/s eta 0:00:00
		[?25h Preparing metadata (setup.py) ... [?25l[?25hdone
		Requirement already satisfied: torchaudio in /usr/local/lib/python3.9/dist-packages (0.13.1+cu116)
		Requirement already satisfied: torch in /usr/local/lib/python3.9/dist-packages (1.13.1+cu116)
		Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from encodec) (1.22.4)
		Collecting einops
		Downloading einops-0.6.0-py3-none-any.whl (41 kB)
		[2K ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.6/41.6 KB 1.5 MB/s eta 0:00:00
		[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch) (4.5.0)
		Building wheels for collected packages: encodec
		Building wheel for encodec (setup.py) ... [?25l[?25hdone
		Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45775 sha256=a5ea932e765562c2c05d33eb974ddf223ed8fafddf99e45a8c8a4b3a5f81dcf8
		Stored in directory: /root/.cache/pip/wheels/1d/9d/20/489d6aafffb505e18fcfcfbe722562f91c26af0a8a6da7d00b
		Successfully built encodec
		Installing collected packages: einops, encodec
		Successfully installed einops-0.6.0 encodec-0.1.1

		%% Cell type:code id: tags:

		```
		from encodec import EncodecModel
		from encodec.utils import convert_audio

		import torchaudio
		import torch

		# Instantiate a pretrained EnCodec model (48 kHz) and set the target bandwidth in kbps.
		# NOTE(review): the outputs recorded in this notebook show 4 codebooks per frame, which
		# appears to match a 6.0 kbps target rather than 12.0 -- re-run to refresh them.
		model = EncodecModel.encodec_model_48khz()
		model.set_target_bandwidth(12.0)

		# Load and pre-process the audio waveform
		wav, sr = torchaudio.load("test.wav")
		print(f"channels {model.channels} and sampel rate {model.sample_rate} and wav_shape {wav.shape} and sr {sr}")
		# convert_audio up-samples if necessary, e.g. if wav has n samples at 16 kHz and the model
		# is 48 kHz, the resulting wav has 3n samples (n * 48 / 16). It also converts the channel count.
		wav = convert_audio(wav, sr, model.sample_rate, model.channels)
		# Add the batch dimension: [channels, samples] -> [1, channels, samples]
		wav = wav.unsqueeze(0)
		print(f"wav.shape {wav.shape}")

		print(f"model.quantizer.bins {model.quantizer.bins}") # 1024 codebook size
		print(f"model.quantizer.dimension {model.quantizer.dimension}") # 128-dimension vecs in codebook
		print(f"model.quantizer.n_q {model.quantizer.n_q}") # 16 quantizers

		# How many of the n_q quantizers are actually used at the target bandwidth.
		# The method takes (rate, bandwidth) -- not the waveform -- and the model itself passes
		# model.frame_rate as the rate when encoding, so do the same here.
		n_q_used = model.quantizer.get_num_quantizers_for_bandwidth(model.frame_rate, model.bandwidth)
		print(f"quantizers used at bandwidth {model.bandwidth}: {n_q_used}")

		# Extract discrete codes from EnCodec
		with torch.no_grad():
		    encoded_frames = model.encode(wav)
		# Note that the 48 kHz model processes the audio by chunks of 1 second,
		# with an overlap of 1%, and renormalizes the audio to have unit scale.
		# For this model, the output of model.encode(wav) is a list
		# (one entry per 1-second frame) of tuples (codes, scale), with scale a scalar tensor.

		print(f"len encoded_frames {len(encoded_frames)}")
		print(f"encoded[0] shapes: {[encoded[0].shape for encoded in encoded_frames]}")
		codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q_used, T]
		len(model.quantizer.vq.layers) # should be n_q
		# codes.shape[1] is n_q_used, not model.quantizer.n_q: the target bandwidth caps how many
		# quantizers are applied, so fewer than the 16 available codebooks show up in the codes.

		# # print(codes[0][0])

		codes.shape
		```

		%% Output

		channels 2 and sampel rate 48000 and wav_shape torch.Size([1, 77040]) and sr 16000
		wav.shape torch.Size([1, 2, 231120])
		model.quantizer.bins 1024
		model.quantizer.dimension 128
		model.quantizer.n_q 16
		len encoded_frames 5
		encoded[0] shapes: [torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 129])]

		torch.Size([1, 4, 729])

		%% Cell type:code id: tags:

		```
		# Estimate how many code frames the encoder should emit for `wav`,
		# to compare against the last dimension of `codes`.
		num_samples = wav.shape[-1]
		num_seconds = num_samples / model.sample_rate
		num_frames = num_seconds * model.frame_rate
		# Dividing by 0.99 accounts for the 1% overlap between consecutive segments. The recorded
		# result (~729.5) is close to the 729 frames seen in codes.shape, so the codes' last
		# dimension looks like the total frame count summed over all segments -- TODO confirm.
		num_frames / 0.99 # so i guess codes length is the number of segments
		# num_frames * model.segment_length
		```

		%% Output

		729.5454545454546

		%% Cell type:code id: tags:

		```
		# Walk through what model.encode does for a single segment of the 48 kHz model.
		# `??` is the IPython magic that shows the source of model.encode.
		??model.encode
		# 48000 == segment_length. for pretrained 48kHz model, model.segment is 1.0 seconds. so 48k samples
		model.segment_length
		# 47520 with overlap of 1%, i.e. segment_length * 0.99
		model.segment_stride

		# wav.shape has length 231120
		# 231120 / 47520 # 4.8636363... (the recorded run produced 5 encoded frames)

		# the first batch x channel x 48000 samples, i.e. exactly one segment
		x = wav[:, :, :model.segment_length]
		# length = x.shape[-1]
		# duration = length / model.sample_rate
		# duration
		# _encode_frame returns a (codes, scale) pair for the segment
		encoded_frame = model._encode_frame(x)
		len(encoded_frame)
		encoded_frame[0].shape # batch x 4 x 150
		encoded_frame[1] # scale = 0.1256

		# inside _encode_frame: reproduce its steps by hand
		model.normalize
		emb = model.encoder(x) # 1 x 128 x 150. downsampling ratio == 320, 150 * 320 == 48k
		# emb.shape
		model.frame_rate # frame_rate 150 == ceil(sampling rate / stride product). model.encoder.ratios are 2,4,5,8 -> 320
		# NOTE(review): the setup cell of this notebook calls set_target_bandwidth(12.0), but the
		# recorded shapes (4 codebooks) agree with the 6.0 noted here -- confirm which value was active.
		model.bandwidth # 6.0 set by hand earlier
		# model.set_target_bandwidth(6.0) # bandwidth directly proportional to number of results that quantizer returns
		# bandwidth for 48kHz pretrained = 1.5 * the number of codes per timestep.
		# Presumably each codebook costs log2(1024) = 10 bits per frame, and 10 bits * 150 frames/s
		# = 1.5 kbps -- TODO confirm against the quantizer's get_bandwidth_per_quantizer.

		# The quantizer is given the frame rate (not the audio sample rate) and the target bandwidth
		codes = model.quantizer.encode(emb, model.frame_rate, model.bandwidth)
		codes.shape # 4 x 1 x 150
		# then the code just transposes to 1 x 4 x 150
		# NOTE: this call passes model.sample_rate, whereas the encode call above passes
		# model.frame_rate; the recorded result for this cell is 1 rather than the 4 codebooks observed.
		model.quantizer.get_num_quantizers_for_bandwidth(model.sample_rate, model.bandwidth)

		```

		%% Output

		1

		%% Cell type:code id: tags:

		```
		# Display the quantizer's codebook size (1024 in the recorded output).
		model.quantizer.bins
		```

		%% Output

		1024

		%% Cell type:code id: tags:

		```
		# Shapes noted from running the encode cell with each pretrained model on the same test.wav.
		# For each model, in order: wav after convert_audio, wav after unsqueeze(0), concatenated codes.

		# 24 kHz
		# torch.Size([1, 115560])
		# torch.Size([1, 1, 115560])
		# torch.Size([1, 8, 362])

		# 48 kHz
		# torch.Size([2, 231120])
		# torch.Size([1, 2, 231120]) # the 2 is the two channels of the 48 kHz model, not a doubling of the batch
		# torch.Size([1, 4, 729])

		# Shape of the codes currently held in the notebook state
		codes.shape
		```

		%% Output

		torch.Size([1, 4, 729])

		%% Cell type:code id: tags:

		```
		# Load the pretrained 24 kHz model next to the 48 kHz `model` and display its frame rate.
		# The original cell displayed `model.frame_rate`, i.e. the 48 kHz model's value (150),
		# leaving `model24` unused; query the model that was just created instead.
		model24 = EncodecModel.encodec_model_24khz()
		model24.frame_rate
		```

		%% Output

		150

		%% Cell type:code id: tags:

		```
		from encodec import EncodecModel
		from encodec.utils import convert_audio

		import torch

		# Instantiate a pretrained EnCodec model as described in README
		model = EncodecModel.encodec_model_24khz()
		model.set_target_bandwidth(6.0)
		x = torch.rand(1, 1, 24000) # batch x channel x samples. 1 second's worth
		with torch.no_grad():
		    encoded_frames = model.encode(x)
		codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # [B, n_q, T]
		codes.shape # 1 x 8 x 75, 75 being the frame-rate
		print(f"model's n_q is {model.quantizer.n_q} but the result had {codes.shape[1]}") # 32, 8

		# Zoom in on code in model.py _encode_frame method
		# https://github.com/facebookresearch/encodec/blob/f6a9f768373ba351d0cd18b928769df40da1aeb5/encodec/model.py#L147
		y = torch.rand(1, 1, 24000)
		emb = model.encoder(y) # 1 x 128 x 75. 75 is frames per second
		# then it calls model.quantizer.encode, so we jump into there next
		# codes = model.quantizer.encode(emb, model.frame_rate, model.bandwidth)
		# note the discrepancy here -- the caller passes model.frame_rate, not model.sample_rate,
		# for the argument that the quantizer names as a sample rate
		# https://github.com/facebookresearch/encodec/blob/f6a9f768373ba351d0cd18b928769df40da1aeb5/encodec/quantization/vq.py#L100
		sample_rate_thats_secretly_frame_rate = model.frame_rate
		# Pass the named value so the call mirrors what _encode_frame hands to the quantizer
		n_q = model.quantizer.get_num_quantizers_for_bandwidth(sample_rate_thats_secretly_frame_rate, model.bandwidth)
		# frame_rate = 75, bandwidth = 6.
		# With the frame rate passed in, get_bandwidth_per_quantizer comes out at 0.75,
		# and n_q ends up being bandwidth / bandwidth_per_quantizer = 6 / 0.75 = 8,
		# which matches the 8 codebooks seen in codes.shape[1] above.


		```

		%% Output

		torch.Size([1, 128, 75])