nullptr-0 · nullptr-0 · Aug 18, 2019 · Aug 26, 2019 · Aug 26, 2019 · Sep 6, 2019
diff --git a/layers.py b/layers.py
@@ -1,9 +1,40 @@
 import torch
+import numpy as np
 from librosa.filters import mel as librosa_mel_fn
 from audio_processing import dynamic_range_compression
 from audio_processing import dynamic_range_decompression
 from stft import STFT
 
+def dct(x, norm=None):
+    """
+    Discrete Cosine Transform, Type II (a.k.a. the DCT)
+    For the meaning of the parameter `norm`, see:
+    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
+    :param x: the input signal
+    :param norm: the normalization, None or 'ortho'
+    :return: the DCT-II of the signal over the last dimension
+    """
+    x_shape = x.shape
+    N = x_shape[-1]
+    x = x.contiguous().view(-1, N)
+
+    v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1)
+
+    Vc = torch.rfft(v, 1, onesided=False)
+
+    k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N)
+    W_r = torch.cos(k)
+    W_i = torch.sin(k)
+
+    V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i
+
+    if norm == 'ortho':
+        V[:, 0] /= np.sqrt(N) * 2
+        V[:, 1:] /= np.sqrt(N / 2) * 2
+
+    V = 2 * V.view(*x_shape)
+
+    return V
 
 class LinearNorm(torch.nn.Module):
     def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
@@ -60,6 +91,11 @@ def spectral_de_normalize(self, magnitudes):
         output = dynamic_range_decompression(magnitudes)
         return output
 
+    def cepstrum_from_mel(self, mel):
+        #magnitudes = self.spectral_de_normalize(mel)
+        mcc = dct(mel,'ortho')
+        return mcc
+
     def mel_spectrogram(self, y):
         """Computes mel-spectrograms from a batch of waves
         PARAMS

diff --git a/logger.py b/logger.py
@@ -9,17 +9,25 @@ class Tacotron2Logger(SummaryWriter):
     def __init__(self, logdir):
         super(Tacotron2Logger, self).__init__(logdir)
 
-    def log_training(self, reduced_loss, grad_norm, learning_rate, duration,
+    def log_training(self, reduced_loss, grad_norm, learning_rate, duration, diagonality, avg_prob, avg_MCD, avg_f0,
                      iteration):
-            self.add_scalar("training.loss", reduced_loss, iteration)
-            self.add_scalar("grad.norm", grad_norm, iteration)
-            self.add_scalar("learning.rate", learning_rate, iteration)
-            self.add_scalar("duration", duration, iteration)
+        self.add_scalar("training.loss", reduced_loss, iteration)
+        self.add_scalar("grad.norm", grad_norm, iteration)
+        self.add_scalar("learning.rate", learning_rate, iteration)
+        self.add_scalar("duration", duration, iteration)
+        self.add_scalar("training.attention_alignment_diagonality", diagonality, iteration)
+        self.add_scalar("training.average_max_attention_weight", avg_prob, iteration)
+        self.add_scalar("training.log_MCD", avg_MCD, iteration)
+        self.add_scalar("training.f0(100hz)", avg_f0, iteration)
 
-    def log_validation(self, reduced_loss, model, y, y_pred, iteration):
+    def log_validation(self, reduced_loss, model, y, y_pred, diagonality, avg_prob, avg_MCD, avg_f0, iteration):
         self.add_scalar("validation.loss", reduced_loss, iteration)
         _, mel_outputs, gate_outputs, alignments = y_pred
         mel_targets, gate_targets = y
+        self.add_scalar("validation.attention_alignment_diagonality", diagonality, iteration)
+        self.add_scalar("validation.average_max_attention_weight", avg_prob, iteration)
+        self.add_scalar("validation.log_MCD", avg_MCD, iteration)
+        self.add_scalar("validation.f0(100hz)", avg_f0, iteration)
 
         # plot distribution of parameters
         for tag, value in model.named_parameters():

diff --git a/metric.py b/metric.py
@@ -0,0 +1,115 @@
+import torch
+from torch.autograd import Variable
+import numpy as np
+from utils import load_wav_to_torch
+from hparams import create_hparams
+from layers import TacotronSTFT
+
+def alignment_metric(alignments, input_lengths, output_lengths):
+    # alignments [batch size, x, y]
+    # input_lengths [batch size] for len_x
+    # output_lengths [batch size] for len_y
+
+    batch_size = alignments.size(0)
+    optimums = torch.sqrt(input_lengths.double()**2 + output_lengths.double()**2)
+
+    diagonalitys = torch.zeros(batch_size)
+    val_sum = torch.zeros(1)
+    for i in range(batch_size):
+        dist = torch.zeros(1)
+        for j in range(output_lengths[i]):
+            value, cur_idx = torch.max(alignments[i][:][j], 0)
+            val_sum += value
+            if j==0:
+                prev_idx = cur_idx
+                continue
+            else:
+                dist += (1 + (cur_idx - prev_idx).pow(2)).float().pow(0.5)
+                prev_idx = cur_idx
+        diagonalitys[i] = Variable(dist/optimums[i])
+    avg_prob = Variable(val_sum / torch.sum(output_lengths).float())
+    diagonality = torch.mean(diagonalitys)
+    return diagonality, avg_prob
+
+def evaluation_metrics(stft, source_mels, target_mels):
+    batch_size = source_mels.size(0)
+    MCDs = torch.zeros(batch_size)
+    f0s = None
+    for i in range(batch_size):
+        src_mel = source_mels[i].unsqueeze(0)
+        src_mel = torch.clamp(src_mel, min=-4.0, max=4.0)
+        dst_mel = target_mels[i].unsqueeze(0)
+        dst_mel = torch.clamp(dst_mel, min=-4.0, max=4.0)
+        MCDs[i] = MCD_from_mels(stft, src_mel, dst_mel)
+        f0 = sqDiffF0_from_mels(stft, src_mel, dst_mel)
+        f0s = f0 if f0s is None else torch.cat((f0s, f0), 0)
+
+    avg_MCD = torch.mean(MCDs)
+    avg_f0 = torch.mean(f0s)
+
+    return avg_MCD, avg_f0
+
+def melCepDist(srcMCC, dstMCC):
+    # https://dsp.stackexchange.com/questions/56391/mel-cepstral-distortion
+    diff = dstMCC - srcMCC
+    return torch.sum((torch.sqrt( 2 * (diff**2) ) ))* (10.0/np.log(10)) * 1/diff.size(1)
+
+def f0(MCC):
+    #print(MCC.shape, MCC.max(), MCC.min())
+    _, f0 = MCC.max(0)
+    return f0
+
+def MCD_from_mels(stft, srcMel, dstMel):
+    srcMCC = stft.cepstrum_from_mel(srcMel)[0,:25,:]
+    #print('srcMCC: ', srcMCC.max(), srcMCC.min())
+    dstMCC = stft.cepstrum_from_mel(dstMel)[0,:25,:]
+    #print('dstMCC: ', dstMCC.max(), dstMCC.min())
+    MCD = melCepDist(srcMCC,dstMCC)
+    log_MCD = torch.log10(torch.clamp(MCD,min=1e-5))
+    return log_MCD
+
+def sqDiffF0_from_mels(stft, srcMel, dstMel):
+    srcMCC = stft.cepstrum_from_mel(srcMel).squeeze(0)
+    dstMCC = stft.cepstrum_from_mel(dstMel).squeeze(0)
+    srcF0 = f0(srcMCC)
+    dstF0 = f0(dstMCC)
+    diff = (dstF0 - srcF0).double()
+    return torch.sqrt(diff**2)
+
+def test_MCD_and_f0():
+    hparams = create_hparams()
+    stft = TacotronSTFT(
+        hparams.filter_length, hparams.hop_length, hparams.win_length,
+        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
+        hparams.mel_fmax)
+    audio_path = 'kakao/1/1_0001.wav'
+    mel_path = 'kakao/1/1_0001.mel.npy'
+    srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0)
+    srcMel = torch.clamp(srcMel, -4.0, 4.0)
+    # print(srcMel.shape,  srcMel.max(), srcMel.min())
+    audio, sr = load_wav_to_torch(audio_path)
+    # print(audio.shape, audio.max(), audio.min())
+    audio_norm = audio / hparams.max_wav_value
+    audio_norm = audio_norm.unsqueeze(0)
+    audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
+
+    # print(audio_norm.shape, audio_norm.max(), audio_norm.min())
+    dstMel = stft.mel_spectrogram(audio_norm)
+    # print(dstMel.shape, dstMel.max(), dstMel.min())
+    # mcc = stft.cepstrum_from_audio(audio_norm)
+    # print('mcc', mcc.shape, mcc.max(), mcc.min())
+
+    log_MCD = MCD_from_mels(stft, srcMel, dstMel)
+    print(log_MCD.data, 'log')
+
+    sqrtDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel)
+    print(sqrtDiffF0)
+    meanSqrtDiffF0 = torch.mean(sqrtDiffF0)
+    print(meanSqrtDiffF0.data, '100hz')
+
+#alignment_metric()
+if __name__ == "__main__":
+    test_MCD_and_f0()
+
+
+    #np.save('mel.npy' ,mel)
diff --git a/preprocess_dataset.py b/preprocess_dataset.py
@@ -0,0 +1,87 @@
+"""
+This code was developed with reference to https://github.com/Rayhane-mamah/Tacotron-2.
+"""
+from scipy.io.wavfile import write
+import librosa
+import numpy as np
+import argparse
+
+sr = 22050
+max_wav_value=32768.0
+trim_fft_size = 1024
+trim_hop_size = 256
+
+# These are control parameters for trimming and skipping
+trim_top_db = 23
+skip_len = 14848
+
+def preprocess_audio(file_list, silence_audio_size, pre_emphasis=False):
+    for F in file_list:
+        f = open(F, encoding='utf-8')
+        R = f.readlines()
+        f.close()
+        print('='*5+F+'='*5)
+
+        for i, r in enumerate(R):
+            wav_file = r.split('|')[0]
+            data, sampling_rate = librosa.core.load(wav_file, sr)
+            data = data / np.abs(data).max() *0.999
+            data_= librosa.effects.trim(data, top_db= trim_top_db, frame_length=trim_fft_size, hop_length=trim_hop_size)[0]
+            if (pre_emphasis):
+                data_ = np.append(data_[0], data_[1:] - 0.97 * data_[:-1])
+                data_ = data_ / np.abs(data_).max() * 0.999
+            data_ = data_ * max_wav_value
+            data_ = np.append(data_, [0.]*silence_audio_size)
+            data_ = data_.astype(dtype=np.int16)
+            write(wav_file, sr, data_)
+            #print(len(data),len(data_))
+            if(i%100 == 0):
+                print (i)
+
+def remove_short_audios(file_name):
+    f = open(file_name,'r',encoding='utf-8')
+    R = f.readlines()
+    f.close()
+
+    L = []
+    for i, r in enumerate(R):
+        wav_file = r.split('|')[0]
+        data, sampling_rate = librosa.core.load(wav_file, sr)
+        if(len(data) >= skip_len):
+            L.append(r)
+        if (i % 100 == 0):
+            print(i)
+    tmp = file_name.split('.')
+    tmp.insert(1,'_skipped.')
+    skipped_file_name = "".join(tmp)
+    f = open(skipped_file_name,'w',encoding='utf-8')
+    f.writelines(L)
+    f.close()
+
+if __name__ == "__main__":
+    """
+    usage
+    python preprocess_dataset.py -f=metadata.csv -s=5 -t -p -r
+    python preprocess_dataset.py -f=metadata.csv
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--file_list', type=str,
+                        help='Metadata file list to preprocess')
+    parser.add_argument('-s', '--silence_padding', type=int, default=0,
+                        help='Adding silence padding at the end of each audio, silence audio size is hop_length * silence padding')
+    parser.add_argument('-p', '--pre_emphasis', action='store_true',
+                        help="Doing pre_emphasis")
+    parser.add_argument('-t', '--trimming', action='store_true',
+                        help="Doing trimming audios")
+    parser.add_argument('-r', '--remove_short_audios',action='store_true',
+                        help="Removing short audios in metadata file")
+    args = parser.parse_args()
+    file_list = args.file_list.split(',')
+    silence_audio_size = trim_hop_size * args.silence_padding
+
+
+    preprocess_audio(file_list, silence_audio_size, args.pre_emphasis)
+
+    if(args.remove_short_audios):
+        for f in file_list:
+            remove_short_audios(f)
diff --git a/train.py b/train.py
@@ -15,7 +15,8 @@
 from loss_function import Tacotron2Loss
 from logger import Tacotron2Logger
 from hparams import create_hparams
-
+from metric import alignment_metric, evaluation_metrics
+import layers
 
 def reduce_tensor(tensor, n_gpus):
     rt = tensor.clone()
@@ -119,7 +120,7 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
 
 
 def validate(model, criterion, valset, iteration, batch_size, n_gpus,
-             collate_fn, logger, distributed_run, rank):
+             collate_fn, logger, distributed_run, rank, stft):
     """Handles all the validation scoring and printing"""
     model.eval()
     with torch.no_grad():
@@ -129,21 +130,38 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus,
                                 pin_memory=False, collate_fn=collate_fn)
 
         val_loss = 0.0
+        diagonality = torch.zeros(1)
+        avg_prob = torch.zeros(1)
+        avg_MCD = torch.zeros(1)
+        avg_f0 = torch.zeros(1)
         for i, batch in enumerate(val_loader):
             x, y = model.parse_batch(batch)
             y_pred = model(x)
+            _, input_lengths, mel_padded, _, output_lengths = x
+            _, mel_outputs_postnet, _, alignments = y_pred
             loss = criterion(y_pred, y)
             if distributed_run:
                 reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
             else:
                 reduced_val_loss = loss.item()
             val_loss += reduced_val_loss
+            rate, prob = alignment_metric(alignments, input_lengths, output_lengths)
+            MCD, f0 = evaluation_metrics(stft, mel_padded, mel_outputs_postnet)
+            diagonality += rate
+            avg_prob += prob
+            avg_MCD += MCD
+            avg_f0 += f0
+        diagonality=diagonality / (i + 1)
+        avg_prob = avg_prob / (i + 1)
         val_loss = val_loss / (i + 1)
+        avg_MCD = avg_MCD / (i + 1)
+        avg_f0 = avg_f0 / (i + 1)
 
     model.train()
     if rank == 0:
         print("Validation loss {}: {:9f}  ".format(iteration, val_loss))
-        logger.log_validation(val_loss, model, y, y_pred, iteration)
+        logger.log_validation(val_loss, model, y, y_pred, diagonality, avg_prob, avg_MCD, avg_f0, iteration)
+
 
 
 def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
@@ -159,6 +177,11 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
     rank (int): rank of current gpu
     hparams (object): comma separated list of "name=value" pairs.
     """
+    stft = layers.TacotronSTFT(
+        hparams.filter_length, hparams.hop_length, hparams.win_length,
+        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
+        hparams.mel_fmax)
+
     if hparams.distributed_run:
         init_distributed(hparams, n_gpus, rank, group_name)
 
@@ -214,11 +237,15 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
             x, y = model.parse_batch(batch)
             y_pred = model(x)
 
+            _, input_lengths, mel_padded, _, output_lengths = x
+            _, mel_outputs_postnet, _, alignments = y_pred
+
             loss = criterion(y_pred, y)
             if hparams.distributed_run:
                 reduced_loss = reduce_tensor(loss.data, n_gpus).item()
             else:
                 reduced_loss = loss.item()
+
             if hparams.fp16_run:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
@@ -239,13 +266,17 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
                 duration = time.perf_counter() - start
                 print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                     iteration, reduced_loss, grad_norm, duration))
-                logger.log_training(
-                    reduced_loss, grad_norm, learning_rate, duration, iteration)
+                if (i % (hparams.iters_per_checkpoint / 10) == 0):
+                    with torch.no_grad():
+                        diagonality, avg_prob = alignment_metric(alignments, input_lengths, output_lengths)
+                        avg_MCD, avg_f0 = evaluation_metrics(stft, mel_padded, mel_outputs_postnet)
+                    logger.log_training(
+                        reduced_loss, grad_norm, learning_rate, duration, diagonality, avg_prob, avg_MCD, avg_f0, iteration)
 
             if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
                 validate(model, criterion, valset, iteration,
                          hparams.batch_size, n_gpus, collate_fn, logger,
-                         hparams.distributed_run, rank)
+                         hparams.distributed_run, rank, stft)
                 if rank == 0:
                     checkpoint_path = os.path.join(
                         output_directory, "checkpoint_{}".format(iteration))