Commit d91ed4cb authored by Leon Wu's avatar Leon Wu
Browse files

encodec test

parent 321c4f4a
Loading
Loading
Loading
Loading

encodec_test.ipynb

0 → 100644
+222 −0
Original line number Diff line number Diff line
%% Cell type:code id: tags:

``` 
# Install the packages imported by the cells below (encodec, torchaudio, torch).
!pip install encodec torchaudio torch
```

%% Output

    Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
    Collecting encodec
      Downloading encodec-0.1.1.tar.gz (3.7 MB)
    [2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.7/3.7 MB 30.0 MB/s eta 0:00:00
    [?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
    Requirement already satisfied: torchaudio in /usr/local/lib/python3.9/dist-packages (0.13.1+cu116)
    Requirement already satisfied: torch in /usr/local/lib/python3.9/dist-packages (1.13.1+cu116)
    Requirement already satisfied: numpy in /usr/local/lib/python3.9/dist-packages (from encodec) (1.22.4)
    Collecting einops
      Downloading einops-0.6.0-py3-none-any.whl (41 kB)
    [2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.6/41.6 KB 1.5 MB/s eta 0:00:00
    [?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.9/dist-packages (from torch) (4.5.0)
    Building wheels for collected packages: encodec
      Building wheel for encodec (setup.py) ... [?25l[?25hdone
      Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45775 sha256=a5ea932e765562c2c05d33eb974ddf223ed8fafddf99e45a8c8a4b3a5f81dcf8
      Stored in directory: /root/.cache/pip/wheels/1d/9d/20/489d6aafffb505e18fcfcfbe722562f91c26af0a8a6da7d00b
    Successfully built encodec
    Installing collected packages: einops, encodec
    Successfully installed einops-0.6.0 encodec-0.1.1

%% Cell type:code id: tags:

``` 
from encodec import EncodecModel
from encodec.utils import convert_audio

import torchaudio
import torch

# Instantiate a pretrained EnCodec model
model = EncodecModel.encodec_model_48khz()
model.set_target_bandwidth(12.0)

# Load and pre-process the audio waveform
wav, sr = torchaudio.load("test.wav")
print(f"channels {model.channels} and sampel rate {model.sample_rate} and wav_shape {wav.shape} and sr {sr}")
# convert_audio resamples to the model's sample rate and channel count, e.g. if
# wav has n samples at 16 kHz and the model is 48 kHz, the result has 3n samples
# (n * 48 / 16).
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
wav = wav.unsqueeze(0)  # add the batch dimension -> [B, C, T]
print(f"wav.shape {wav.shape}")

print(f"model.quantizer.bins {model.quantizer.bins}") # 1024 codebook size
print(f"model.quantizer.dimension {model.quantizer.dimension}") # 128-dimension vecs in codebook
print(f"model.quantizer.n_q {model.quantizer.n_q}") # 16 quantizers

# The number of quantizers actually used depends on the target bandwidth.
# The helper expects the model's frame rate plus the bandwidth (this is what
# the model itself hands to the quantizer), not the waveform tensor.
active_n_q = model.quantizer.get_num_quantizers_for_bandwidth(model.frame_rate, model.bandwidth)
print(f"active quantizers for bandwidth {model.bandwidth}: {active_n_q}")

# Extract discrete codes from EnCodec
with torch.no_grad():
    encoded_frames = model.encode(wav)
    # Note that the 48 kHz model processes the audio in chunks of 1 second,
    # with an overlap of 1%, and renormalizes the audio to have unit scale.
    # For this model, the output of model.encode(wav) is a list (one entry
    # per 1-second frame) of (codes, scale) tuples, with scale a scalar tensor.

print(f"len encoded_frames {len(encoded_frames)}")
print(f"encoded[0] shapes: {[encoded[0].shape for encoded in encoded_frames]}")
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]
print(f"len(model.quantizer.vq.layers) {len(model.quantizer.vq.layers)}")  # should be n_q

# Why codes.shape[1] is smaller than model.quantizer.n_q (16): only the first
# `active_n_q` quantizers are used for the configured bandwidth. Each quantizer
# costs log2(bins) bits per frame, i.e. 10 bits * 150 frames/s = 1.5 kbps for
# this model, so 12 kbps -> 8 codebooks and 6 kbps -> 4 codebooks.

codes.shape
```

%% Output

    channels 2 and sampel rate 48000 and wav_shape torch.Size([1, 77040]) and sr 16000
    wav.shape torch.Size([1, 2, 231120])
    model.quantizer.bins 1024
    model.quantizer.dimension 128
    model.quantizer.n_q 16
    len encoded_frames 5
    encoded[0] shapes: [torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 150]), torch.Size([1, 4, 129])]

    torch.Size([1, 4, 729])

%% Cell type:code id: tags:

``` 
# Estimate how many code frames the whole clip should yield, to compare against
# the time dimension of `codes`.
num_samples = wav.shape[-1]
num_seconds = num_samples / model.sample_rate
num_frames = num_seconds * model.frame_rate
# Consecutive segments overlap by 1% (stride = 0.99 * segment length), so the
# clip is covered by ~1/0.99 times more frames than its raw duration suggests.
# Presumably this is why the value lands next to codes.shape[-1] (729).
num_frames / 0.99 # approximate total number of code frames across all segments
# num_frames * model.segment_length
```

%% Output

    729.5454545454546

%% Cell type:code id: tags:

``` 
??model.encode
 # 48000 == segment_length. for pretrained 48kHz model, model.segment is 1.0 seconds. so 48k samples
model.segment_length
# 47520 with overlap of 1%, i.e. segment_length * 0.99
model.segment_stride
# NOTE: the bare expressions in this cell are inspection probes; only the last
# expression of a cell is echoed, so presumably they were evaluated one by one.

# wav.shape has length 231120
# 231120 / 47520 # 4.8636363... strides, i.e. 5 segments (4 full + 1 partial)

# the first batch x channel x 48000 samples (exactly one segment)
x = wav[:, :, :model.segment_length]
# length = x.shape[-1]
# duration = length / model.sample_rate
# duration
# _encode_frame handles a single segment and returns a (codes, scale) pair.
encoded_frame = model._encode_frame(x)
len(encoded_frame)
encoded_frame[0].shape # codes: batch x 4 x 150
encoded_frame[1] # scale = 0.1256

# Re-trace by hand what happens inside _encode_frame.
# NOTE(review): x goes into the encoder un-normalized here; if model.normalize
# is set, _encode_frame presumably rescales the audio first, so the codes
# computed below may differ from encoded_frame[0] -- confirm.
model.normalize
emb = model.encoder(x) # 1 x 128 x 150. downsampling ratio == 320, 150 * 320 == 48k
# emb.shape
model.frame_rate # frame_rate 150 == ceil(sampling rate / stride product). model.encoder.ratios are 2,4,5,8 -> 320
# NOTE(review): the first cell as written calls set_target_bandwidth(12.0);
# the 6.0 noted here reflects the kernel state when this cell ran -- confirm.
model.bandwidth # 6.0 set by hand earlier
# model.set_target_bandwidth(6.0) # bandwidth directly proportional to number of results that quantizer returns
# bandwidth for 48kHz pretrained = 1.5 * the number of codes per timestep:
# each quantizer emits log2(1024) = 10 bits per frame, and at 150 frames/s that
# is 1500 bps = 1.5 kbps per quantizer (see encodec/quantization/vq.py).

codes = model.quantizer.encode(emb, model.frame_rate, model.bandwidth)
codes.shape # 4 x 1 x 150
# then the code just transposes to 1 x 4 x 150
# NOTE: this passes model.sample_rate (48000) where quantizer.encode above was
# given model.frame_rate (150). With 48000 the per-quantizer bandwidth comes
# out far above the target, so the result is presumably clamped to the minimum
# of 1 instead of the 4 quantizers actually used.
model.quantizer.get_num_quantizers_for_bandwidth(model.sample_rate, model.bandwidth)

```

%% Output

    1

%% Cell type:code id: tags:

``` 
# Codebook size of each quantizer (1024 entries, i.e. 10 bits per code).
model.quantizer.bins
```

%% Output

    1024

%% Cell type:code id: tags:

``` 
# Shapes noted for each model; presumably, in order: wav after convert_audio,
# wav after unsqueeze(0), and the concatenated codes.

# 24 kHz
# torch.Size([1, 115560])
# torch.Size([1, 1, 115560])
# torch.Size([1, 8, 362])

# 48 kHz
# torch.Size([2, 231120])
# torch.Size([1, 2, 231120]) # the 2 is the channel dim (the 48 kHz model has 2 channels), not a doubling
# torch.Size([1, 4, 729])

codes.shape
```

%% Output

    torch.Size([1, 4, 729])

%% Cell type:code id: tags:

``` 
# Load the 24 kHz model next to the already-loaded 48 kHz `model` so their
# frame rates can be compared.
model24 = EncodecModel.encodec_model_24khz()
# Display both rates: 48 kHz model first, then the 24 kHz model. Previously
# only `model.frame_rate` was shown and `model24` was never used.
model.frame_rate, model24.frame_rate
```

%% Output

    150

%% Cell type:code id: tags:

``` 
from encodec import EncodecModel
from encodec.utils import convert_audio

import torch

# Instantiate a pretrained EnCodec model as described in README
model = EncodecModel.encodec_model_24khz()
model.set_target_bandwidth(6.0)
x = torch.rand(1, 1, 24000) # batch x channel x samples. 1 second's worth
with torch.no_grad():
    encoded_frames = model.encode(x)
codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)  # [B, n_q, T]
print(f"codes.shape {codes.shape}") # 1 x 8 x 75, 75 being the frame-rate
print(f"model's n_q is {model.quantizer.n_q} but the result had {codes.shape[1]}") # 32, 8

# Zoom in on code in model.py _encode_frame method
# https://github.com/facebookresearch/encodec/blob/f6a9f768373ba351d0cd18b928769df40da1aeb5/encodec/model.py#L147
y = torch.rand(1, 1, 24000)
with torch.no_grad():  # inference only, same as the encode call above
    emb = model.encoder(y) # 1 x 128 x 75. 75 is frames per second
# _encode_frame then calls model.quantizer.encode, so we jump into there next:
# codes = model.quantizer.encode(emb, model.frame_rate, model.bandwidth)
# Note the naming discrepancy: the model passes model.frame_rate, not
# model.sample_rate, for what the quantizer code treats as a "sample rate".
# https://github.com/facebookresearch/encodec/blob/f6a9f768373ba351d0cd18b928769df40da1aeb5/encodec/quantization/vq.py#L100
sample_rate_thats_secretly_frame_rate = model.frame_rate
n_q = model.quantizer.get_num_quantizers_for_bandwidth(
    sample_rate_thats_secretly_frame_rate, model.bandwidth
)
# frame_rate = 75, bandwidth = 6.
# With the frame rate, the bandwidth per quantizer is 0.75 kbps
# (10 bits per frame * 75 frames/s), so n_q = bandwidth / bandwidth_per_quantizer
# = 6 / 0.75 = 8, which is the codebook count seen in codes.shape[1].
print(f"n_q for bandwidth {model.bandwidth} at frame rate {sample_rate_thats_secretly_frame_rate}: {n_q}")


```

%% Output

    torch.Size([1, 128, 75])