From 6adaf9fb2118a976630f15fd3ed0acbb3810a78d Mon Sep 17 00:00:00 2001 From: YeongTae Date: Mon, 26 Aug 2019 14:47:56 +0900 Subject: [PATCH 1/7] Removing numpy==1.13.3 in requirements.txt to solve some error. --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 11eccea44..bcb6a0d56 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ matplotlib==2.1.0 tensorflow -numpy==1.13.3 inflect==0.2.5 librosa==0.6.0 scipy==1.0.0 From d2a3fe77bad385ea272f676e54a12e70268ba8c9 Mon Sep 17 00:00:00 2001 From: Hyemin Cho Date: Mon, 26 Aug 2019 16:56:26 +0900 Subject: [PATCH 2/7] =?UTF-8?q?Applying=20monitoring=20metrics=20to=20Tens?= =?UTF-8?q?orboard.=20=E3=84=B4=20attention=20alignment=20diagonality=20?= =?UTF-8?q?=E3=84=B4=20average=20max=20attention=20weight=20=E3=84=B4=20f0?= =?UTF-8?q?=20RMSE=20=E3=84=B4=20MCD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- logger.py | 17 ++++---- metric.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ train.py | 18 +++++++-- 3 files changed, 139 insertions(+), 10 deletions(-) create mode 100644 metric.py diff --git a/logger.py b/logger.py index 9b999adf6..fd14b6aa7 100644 --- a/logger.py +++ b/logger.py @@ -9,18 +9,21 @@ class Tacotron2Logger(SummaryWriter): def __init__(self, logdir): super(Tacotron2Logger, self).__init__(logdir) - def log_training(self, reduced_loss, grad_norm, learning_rate, duration, + def log_training(self, reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob, iteration): - self.add_scalar("training.loss", reduced_loss, iteration) - self.add_scalar("grad.norm", grad_norm, iteration) - self.add_scalar("learning.rate", learning_rate, iteration) - self.add_scalar("duration", duration, iteration) + self.add_scalar("training.loss", reduced_loss, iteration) + self.add_scalar("grad.norm", grad_norm, iteration) + self.add_scalar("learning.rate", learning_rate, 
iteration) + self.add_scalar("duration", duration, iteration) + self.add_scalar("training attention alignment similarity", alignment_len_rate, iteration) + self.add_scalar("training attention alignment average probability", avg_prob, iteration) - def log_validation(self, reduced_loss, model, y, y_pred, iteration): + def log_validation(self, reduced_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration): self.add_scalar("validation.loss", reduced_loss, iteration) _, mel_outputs, gate_outputs, alignments = y_pred mel_targets, gate_targets = y - + self.add_scalar("validation attention alignment similarity", alignment_len_rate, iteration) + self.add_scalar("validation attention alignment average probability", avg_prob, iteration) # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') diff --git a/metric.py b/metric.py new file mode 100644 index 000000000..08619e5ba --- /dev/null +++ b/metric.py @@ -0,0 +1,114 @@ +import torch +from torch.autograd import Variable +import numpy as np + +import wave +from scipy.io.wavfile import read +from layers import cepstral +from parabolic import parabolic +from scipy.signal import blackmanharris + + + +def alignment_metric(alignments): + #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + x_len = torch.from_numpy(np.array(alignments[0].shape[1])).float() + y_len = torch.from_numpy(np.array(alignments[0].shape[0])).float() + + # Compute the squared distances + optimum = np.array((x_len.pow(2) + y_len.pow(2)).pow(0.5)) + dist = torch.zeros(1) + val_sum = torch.zeros(1) + for i in range(np.int(y_len)): + value, cur_idx = torch.max(alignments[0][i], 0) + val_sum += value + if i==0: + prev_idx = cur_idx + continue + else: + dist += (1 + (cur_idx - prev_idx).pow(2)).float().pow(0.5) + prev_idx = cur_idx + + avg_prob = Variable(val_sum /y_len).float() + optimum = torch.from_numpy(optimum) + rate = Variable(dist/optimum) + + return rate, avg_prob + + +def 
MCD(source_sound, syn_sound): + sourc_cep = source_sound.cepstral() + syn_cep = syn_sound.cepstral() + + mcd = 10 * ((2*torch.sum(sourc_cep-syn_cep).pow(2)).pow(0.5))/torch.log(10) + + return mcd + + +def freq_from_fft(sig, fs): + """ + Estimate frequency from peak of FFT + """ + # Compute Fourier transform of windowed signal + windowed = sig * blackmanharris(len(sig)) + f = np.fft.rfft(windowed) + + # Find the peak and interpolate to get a more accurate peak + i = np.argmax(abs(f)) # Just use this for less-accurate, naive version + true_i = parabolic(np.log(abs(f)), i)[0] + + # Convert to equivalent frequency + return torch.from_numpy(np.array(fs * true_i / len(windowed))).float() + + +def f0(wav): + nchannels, sampwidth, framerate, nframes, comptype, compname = wav.getparams() + + # Inititalize a fundamental frequency + freqs = torch.tensor([]) + up = framerate // 80 + down = framerate // 270 + d = framerate / 270.0 + + # Number of frames per window + window_size = 1024 + + # Create a window function + window = np.hamming(window_size) + + # Iterate over the wave file frames + for i in range(nframes // window_size): + # Reading n=window_size frames from the wave file + content = wav.readframes(window_size) + + # Converting array of bytes to array of integers according to sampwidth. 
If stereo only the first channel is picked + samples = np.fromstring(content, dtype=types[sampwidth])[0::nchannels] + + # Applying window function for our samples + samples = torch.from_numpy(window * samples) + + # Calculating spectrum of a signal frame as fft with n=window_size + #spectrum = np.fft.fft(samples, n=window_size) + + # Calculating cepstrum as ifft(log(abs(spectrum)))) + #cepstrum = np.fft.ifft(np.log(np.abs(spectrum))).real + + cepstrum = cepstral(samples) + + _, idx = torch.max(cepstrum[down:up]) + + # Calculating fundamental frequency by finding peak + fund_freq = torch.from_numpy(np.array(framerate)).float() * cepstrum.shape[0] / (idx + d) / cepstrum.shape[0] + freqs = torch.cat(freqs, fund_freq) + + return torch.from_numpy(np.array(freqs)) + + +def cal_fft(src_sound, syn_sound): + src_f0 = f0(src_sound) + syn_f0 = f0(syn_sound) + return Variable(torch.sum(((src_f0 - syn_f0).pow(2))/src_f0.shape[0]).pow(0.5)) + +#src_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_wiener_.wav", mode='r') +#syn_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_.wav", mode='r') +#print(cal_fft(src_sound, syn_sound)) #, MCD(source_sound, syn_sound)) diff --git a/train.py b/train.py index e93917bbc..91744f00b 100644 --- a/train.py +++ b/train.py @@ -15,7 +15,7 @@ from loss_function import Tacotron2Loss from logger import Tacotron2Logger from hparams import create_hparams - +from metric import alignment_metric def reduce_tensor(tensor, n_gpus): rt = tensor.clone() @@ -129,21 +129,30 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus, pin_memory=False, collate_fn=collate_fn) val_loss = 0.0 + alignment_len_rate = torch.zeros(1) + avg_prob = torch.zeros(1) for i, batch in enumerate(val_loader): x, y = model.parse_batch(batch) y_pred = model(x) + _, _, _, alignments = y_pred loss = criterion(y_pred, y) if distributed_run: reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_val_loss = 
loss.item() val_loss += reduced_val_loss + + rate, prob = alignment_metric(alignments) + alignment_len_rate += rate + avg_prob += prob + alignment_len_rate = alignment_len_rate / (i + 1) + avg_prob = avg_prob / (i + 1) val_loss = val_loss / (i + 1) model.train() if rank == 0: print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss)) - logger.log_validation(reduced_val_loss, model, y, y_pred, iteration) + logger.log_validation(reduced_val_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration) def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, @@ -233,6 +242,9 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) + _, _, _, alignments = y_pred + alignment_len_rate, avg_prob = alignment_metric(alignments) + optimizer.step() if not is_overflow and rank == 0: @@ -240,7 +252,7 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) logger.log_training( - reduced_loss, grad_norm, learning_rate, duration, iteration) + reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, From ce4ef6c64dc9c014fcd52e28f5b34bf4279282ba Mon Sep 17 00:00:00 2001 From: YeongTae Date: Fri, 6 Sep 2019 14:25:16 +0900 Subject: [PATCH 3/7] =?UTF-8?q?Applying=20monitoring=20metrics=20to=20Tens?= =?UTF-8?q?orboard.=20=E3=84=B4=20re-implementing=20and=20solving=20errors?= =?UTF-8?q?.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- layers.py | 36 +++++++++ logger.py | 17 +++-- metric.py | 215 +++++++++++++++++++++++++++--------------------------- train.py | 50 +++++++++---- 4 files changed, 189 
insertions(+), 129 deletions(-) diff --git a/layers.py b/layers.py index 615a64a43..546e3f4a2 100644 --- a/layers.py +++ b/layers.py @@ -1,9 +1,40 @@ import torch +import numpy as np from librosa.filters import mel as librosa_mel_fn from audio_processing import dynamic_range_compression from audio_processing import dynamic_range_decompression from stft import STFT +def dct(x, norm=None): + """ + Discrete Cosine Transform, Type II (a.k.a. the DCT) + For the meaning of the parameter `norm`, see: + https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html + :param x: the input signal + :param norm: the normalization, None or 'ortho' + :return: the DCT-II of the signal over the last dimension + """ + x_shape = x.shape + N = x_shape[-1] + x = x.contiguous().view(-1, N) + + v = torch.cat([x[:, ::2], x[:, 1::2].flip([1])], dim=1) + + Vc = torch.rfft(v, 1, onesided=False) + + k = - torch.arange(N, dtype=x.dtype, device=x.device)[None, :] * np.pi / (2 * N) + W_r = torch.cos(k) + W_i = torch.sin(k) + + V = Vc[:, :, 0] * W_r - Vc[:, :, 1] * W_i + + if norm == 'ortho': + V[:, 0] /= np.sqrt(N) * 2 + V[:, 1:] /= np.sqrt(N / 2) * 2 + + V = 2 * V.view(*x_shape) + + return V class LinearNorm(torch.nn.Module): def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): @@ -60,6 +91,11 @@ def spectral_de_normalize(self, magnitudes): output = dynamic_range_decompression(magnitudes) return output + def cepstrum_from_mel(self, mel): + #magnitudes = self.spectral_de_normalize(mel) + mcc = dct(mel,'ortho') + return mcc + def mel_spectrogram(self, y): """Computes mel-spectrograms from a batch of waves PARAMS diff --git a/logger.py b/logger.py index fd14b6aa7..2355f19fb 100644 --- a/logger.py +++ b/logger.py @@ -9,21 +9,26 @@ class Tacotron2Logger(SummaryWriter): def __init__(self, logdir): super(Tacotron2Logger, self).__init__(logdir) - def log_training(self, reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob, + def 
log_training(self, reduced_loss, grad_norm, learning_rate, duration, diagonality, avg_prob, avg_MCD, avg_f0, iteration): self.add_scalar("training.loss", reduced_loss, iteration) self.add_scalar("grad.norm", grad_norm, iteration) self.add_scalar("learning.rate", learning_rate, iteration) self.add_scalar("duration", duration, iteration) - self.add_scalar("training attention alignment similarity", alignment_len_rate, iteration) - self.add_scalar("training attention alignment average probability", avg_prob, iteration) + self.add_scalar("training.attention_alignment_diagonality", diagonality, iteration) + self.add_scalar("training.average_max_attention_weight", avg_prob, iteration) + self.add_scalar("training.MCD", avg_MCD, iteration) + self.add_scalar("training.f0(100hz)", avg_f0, iteration) - def log_validation(self, reduced_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration): + def log_validation(self, reduced_loss, model, y, y_pred, diagonality, avg_prob, avg_MCD, avg_f0, iteration): self.add_scalar("validation.loss", reduced_loss, iteration) _, mel_outputs, gate_outputs, alignments = y_pred mel_targets, gate_targets = y - self.add_scalar("validation attention alignment similarity", alignment_len_rate, iteration) - self.add_scalar("validation attention alignment average probability", avg_prob, iteration) + self.add_scalar("validation.attention_alignment_diagonality", diagonality, iteration) + self.add_scalar("validation.average_max_attention_weight", avg_prob, iteration) + self.add_scalar("validation.MCD", avg_MCD, iteration) + self.add_scalar("validation.f0(100hz)", avg_f0, iteration) + # plot distribution of parameters for tag, value in model.named_parameters(): tag = tag.replace('.', '/') diff --git a/metric.py b/metric.py index 08619e5ba..c8b9a15a4 100644 --- a/metric.py +++ b/metric.py @@ -1,114 +1,115 @@ import torch from torch.autograd import Variable import numpy as np +from utils import load_wav_to_torch +from hparams import create_hparams 
+from layers import TacotronSTFT -import wave -from scipy.io.wavfile import read -from layers import cepstral -from parabolic import parabolic -from scipy.signal import blackmanharris +def alignment_metric(alignments, input_lengths, output_lengths): + # alignments [batch size, x, y] + # input_lengths [batch size] for len_x + # output_lengths [batch size] for len_y + batch_size = alignments.size(0) + optimums = torch.sqrt(input_lengths.double()**2 + output_lengths.double()**2) - -def alignment_metric(alignments): - #device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") - x_len = torch.from_numpy(np.array(alignments[0].shape[1])).float() - y_len = torch.from_numpy(np.array(alignments[0].shape[0])).float() - - # Compute the squared distances - optimum = np.array((x_len.pow(2) + y_len.pow(2)).pow(0.5)) - dist = torch.zeros(1) + diagonalitys = torch.zeros(batch_size) val_sum = torch.zeros(1) - for i in range(np.int(y_len)): - value, cur_idx = torch.max(alignments[0][i], 0) - val_sum += value - if i==0: - prev_idx = cur_idx - continue - else: - dist += (1 + (cur_idx - prev_idx).pow(2)).float().pow(0.5) - prev_idx = cur_idx - - avg_prob = Variable(val_sum /y_len).float() - optimum = torch.from_numpy(optimum) - rate = Variable(dist/optimum) - - return rate, avg_prob - - -def MCD(source_sound, syn_sound): - sourc_cep = source_sound.cepstral() - syn_cep = syn_sound.cepstral() - - mcd = 10 * ((2*torch.sum(sourc_cep-syn_cep).pow(2)).pow(0.5))/torch.log(10) - - return mcd - - -def freq_from_fft(sig, fs): - """ - Estimate frequency from peak of FFT - """ - # Compute Fourier transform of windowed signal - windowed = sig * blackmanharris(len(sig)) - f = np.fft.rfft(windowed) - - # Find the peak and interpolate to get a more accurate peak - i = np.argmax(abs(f)) # Just use this for less-accurate, naive version - true_i = parabolic(np.log(abs(f)), i)[0] - - # Convert to equivalent frequency - return torch.from_numpy(np.array(fs * true_i / len(windowed))).float() - 
- -def f0(wav): - nchannels, sampwidth, framerate, nframes, comptype, compname = wav.getparams() - - # Inititalize a fundamental frequency - freqs = torch.tensor([]) - up = framerate // 80 - down = framerate // 270 - d = framerate / 270.0 - - # Number of frames per window - window_size = 1024 - - # Create a window function - window = np.hamming(window_size) - - # Iterate over the wave file frames - for i in range(nframes // window_size): - # Reading n=window_size frames from the wave file - content = wav.readframes(window_size) - - # Converting array of bytes to array of integers according to sampwidth. If stereo only the first channel is picked - samples = np.fromstring(content, dtype=types[sampwidth])[0::nchannels] - - # Applying window function for our samples - samples = torch.from_numpy(window * samples) - - # Calculating spectrum of a signal frame as fft with n=window_size - #spectrum = np.fft.fft(samples, n=window_size) - - # Calculating cepstrum as ifft(log(abs(spectrum)))) - #cepstrum = np.fft.ifft(np.log(np.abs(spectrum))).real - - cepstrum = cepstral(samples) - - _, idx = torch.max(cepstrum[down:up]) - - # Calculating fundamental frequency by finding peak - fund_freq = torch.from_numpy(np.array(framerate)).float() * cepstrum.shape[0] / (idx + d) / cepstrum.shape[0] - freqs = torch.cat(freqs, fund_freq) - - return torch.from_numpy(np.array(freqs)) - - -def cal_fft(src_sound, syn_sound): - src_f0 = f0(src_sound) - syn_f0 = f0(syn_sound) - return Variable(torch.sum(((src_f0 - syn_f0).pow(2))/src_f0.shape[0]).pow(0.5)) - -#src_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_wiener_.wav", mode='r') -#syn_sound = wave.open("C:/Users/chme/Desktop/Voice_AI/wavenet-audio-mel_.wav", mode='r') -#print(cal_fft(src_sound, syn_sound)) #, MCD(source_sound, syn_sound)) + for i in range(batch_size): + dist = torch.zeros(1) + for j in range(output_lengths[i]): + value, cur_idx = torch.max(alignments[i][:][j], 0) + val_sum += value + if j==0: + prev_idx 
= cur_idx + continue + else: + dist += (1 + (cur_idx - prev_idx).pow(2)).float().pow(0.5) + prev_idx = cur_idx + diagonalitys[i] = Variable(dist/optimums[i]) + avg_prob = Variable(val_sum / torch.sum(output_lengths).float()) + diagonality = torch.mean(diagonalitys) + return diagonality, avg_prob + +def evaluation_metrics(stft, source_mels, target_mels): + batch_size = source_mels.size(0) + MCDs = torch.zeros(batch_size) + f0s = None + for i in range(batch_size): + src_mel = source_mels[i].unsqueeze(0) + src_mel = torch.clamp(src_mel, min=-4.0, max=4.0) + dst_mel = target_mels[i].unsqueeze(0) + dst_mel = torch.clamp(dst_mel, min=-4.0, max=4.0) + MCDs[i] = MCD_from_mels(stft, src_mel, dst_mel) + f0 = sqDiffF0_from_mels(stft, src_mel, dst_mel) + f0s = f0 if f0s is None else torch.cat((f0s, f0), 0) + + avg_MCD = torch.mean(MCDs) + avg_f0 = torch.mean(f0s) + + return avg_MCD, avg_f0 + +def melCepDist(srcMCC, dstMCC): + # https://dsp.stackexchange.com/questions/56391/mel-cepstral-distortion + diff = dstMCC - srcMCC + return torch.sum((torch.sqrt( 2 * (diff**2) ) ))* (10.0/np.log(10)) * 1/diff.size(1) + +def f0(MCC): + #print(MCC.shape, MCC.max(), MCC.min()) + _, f0 = MCC.max(0) + return f0 + +def MCD_from_mels(stft, srcMel, dstMel): + srcMCC = stft.cepstrum_from_mel(srcMel)[0,:25,:] + #print('srcMCC: ', srcMCC.max(), srcMCC.min()) + dstMCC = stft.cepstrum_from_mel(dstMel)[0,:25,:] + #print('dstMCC: ', dstMCC.max(), dstMCC.min()) + MCD = melCepDist(srcMCC,dstMCC) + log_MCD = torch.log10(torch.clamp(MCD,min=1e-5)) + return log_MCD + +def sqDiffF0_from_mels(stft, srcMel, dstMel): + srcMCC = stft.cepstrum_from_mel(srcMel).squeeze(0) + dstMCC = stft.cepstrum_from_mel(dstMel).squeeze(0) + srcF0 = f0(srcMCC) + dstF0 = f0(dstMCC) + diff = (dstF0 - srcF0).double() + return torch.sqrt(diff**2) + +def test_MCD_and_f0(): + hparams = create_hparams() + stft = TacotronSTFT( + hparams.filter_length, hparams.hop_length, hparams.win_length, + hparams.n_mel_channels, 
hparams.sampling_rate, hparams.mel_fmin, + hparams.mel_fmax) + audio_path = 'kakao/1/1_0001.wav' + mel_path = 'kakao/1/1_0001.mel.npy' + srcMel = torch.from_numpy(np.load(mel_path)).unsqueeze(0) + srcMel = torch.clamp(srcMel, -4.0, 4.0) + # print(srcMel.shape, srcMel.max(), srcMel.min()) + audio, sr = load_wav_to_torch(audio_path) + # print(audio.shape, audio.max(), audio.min()) + audio_norm = audio / hparams.max_wav_value + audio_norm = audio_norm.unsqueeze(0) + audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False) + + # print(audio_norm.shape, audio_norm.max(), audio_norm.min()) + dstMel = stft.mel_spectrogram(audio_norm) + # print(dstMel.shape, dstMel.max(), dstMel.min()) + # mcc = stft.cepstrum_from_audio(audio_norm) + # print('mcc', mcc.shape, mcc.max(), mcc.min()) + + log_MCD = MCD_from_mels(stft, srcMel, dstMel) + print(log_MCD.data, 'log') + + sqrtDiffF0 = sqDiffF0_from_mels(stft, srcMel, dstMel) + print(sqrtDiffF0) + meanSqrtDiffF0 = torch.mean(sqrtDiffF0) + print(meanSqrtDiffF0.data, '100hz') + +#alignment_metric() +if __name__ == "__main__": + test_MCD_and_f0() + + + #np.save('mel.npy' ,mel) \ No newline at end of file diff --git a/train.py b/train.py index 91744f00b..cf3eaa0b8 100644 --- a/train.py +++ b/train.py @@ -15,7 +15,8 @@ from loss_function import Tacotron2Loss from logger import Tacotron2Logger from hparams import create_hparams -from metric import alignment_metric +from metric import alignment_metric, evaluation_metrics +import layers def reduce_tensor(tensor, n_gpus): rt = tensor.clone() @@ -119,7 +120,7 @@ def save_checkpoint(model, optimizer, learning_rate, iteration, filepath): def validate(model, criterion, valset, iteration, batch_size, n_gpus, - collate_fn, logger, distributed_run, rank): + collate_fn, logger, distributed_run, rank, stft): """Handles all the validation scoring and printing""" model.eval() with torch.no_grad(): @@ -129,30 +130,37 @@ def validate(model, criterion, valset, iteration, batch_size, n_gpus, 
pin_memory=False, collate_fn=collate_fn) val_loss = 0.0 - alignment_len_rate = torch.zeros(1) + diagonality = torch.zeros(1) avg_prob = torch.zeros(1) + avg_MCD = torch.zeros(1) + avg_f0 = torch.zeros(1) for i, batch in enumerate(val_loader): x, y = model.parse_batch(batch) y_pred = model(x) - _, _, _, alignments = y_pred + _, input_lengths, mel_padded, _, output_lengths = x + _, mel_outputs_postnet, _, alignments = y_pred loss = criterion(y_pred, y) if distributed_run: reduced_val_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_val_loss = loss.item() val_loss += reduced_val_loss - - rate, prob = alignment_metric(alignments) - alignment_len_rate += rate + rate, prob = alignment_metric(alignments, input_lengths, output_lengths) + MCD, f0 = evaluation_metrics(stft, mel_padded, mel_outputs_postnet) + diagonality += rate avg_prob += prob - alignment_len_rate = alignment_len_rate / (i + 1) + avg_MCD += MCD + avg_f0 += f0 + diagonality=diagonality / (i + 1) avg_prob = avg_prob / (i + 1) val_loss = val_loss / (i + 1) + avg_MCD = avg_MCD / (i + 1) + avg_f0 = avg_f0 / (i + 1) model.train() if rank == 0: - print("Validation loss {}: {:9f} ".format(iteration, reduced_val_loss)) - logger.log_validation(reduced_val_loss, model, y, y_pred, alignment_len_rate, avg_prob, iteration) + print("Validation loss {}: {:9f} ".format(iteration, val_loss)) + logger.log_validation(val_loss, model, y, y_pred, diagonality, avg_prob, avg_MCD, avg_f0, iteration) def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, @@ -168,6 +176,11 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, rank (int): rank of current gpu hparams (object): comma separated list of "name=value" pairs. 
""" + stft = layers.TacotronSTFT( + hparams.filter_length, hparams.hop_length, hparams.win_length, + hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin, + hparams.mel_fmax) + if hparams.distributed_run: init_distributed(hparams, n_gpus, rank, group_name) @@ -223,11 +236,15 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, x, y = model.parse_batch(batch) y_pred = model(x) + _, input_lengths, mel_padded, _, output_lengths = x + _, mel_outputs_postnet, _, alignments = y_pred + loss = criterion(y_pred, y) if hparams.distributed_run: reduced_loss = reduce_tensor(loss.data, n_gpus).item() else: reduced_loss = loss.item() + if hparams.fp16_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() @@ -242,22 +259,23 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus, grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), hparams.grad_clip_thresh) - _, _, _, alignments = y_pred - alignment_len_rate, avg_prob = alignment_metric(alignments) - optimizer.step() if not is_overflow and rank == 0: duration = time.perf_counter() - start print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format( iteration, reduced_loss, grad_norm, duration)) - logger.log_training( - reduced_loss, grad_norm, learning_rate, duration, alignment_len_rate, avg_prob, iteration) + if (i % (hparams.iters_per_checkpoint / 10) == 0): + with torch.no_grad(): + diagonality, avg_prob = alignment_metric(alignments, input_lengths, output_lengths) + avg_MCD, avg_f0 = evaluation_metrics(stft, mel_padded, mel_outputs_postnet) + logger.log_training( + reduced_loss, grad_norm, learning_rate, duration, diagonality, avg_prob, avg_MCD, avg_f0, iteration) if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0): validate(model, criterion, valset, iteration, hparams.batch_size, n_gpus, collate_fn, logger, - hparams.distributed_run, rank) + hparams.distributed_run, rank, stft) if rank == 0: 
checkpoint_path = os.path.join( output_directory, "checkpoint_{}".format(iteration)) From 26954c8364bf8f212f7268069cf3ce0c3a36d309 Mon Sep 17 00:00:00 2001 From: YeongTae Date: Fri, 6 Sep 2019 14:59:25 +0900 Subject: [PATCH 4/7] =?UTF-8?q?Dataset=20preprocessing=20=E3=84=B4=20Audio?= =?UTF-8?q?=20processing:=20trimming=20silence(if=20it=20<=2023=20db),=20p?= =?UTF-8?q?reemphasis,=20amplitude=20normalization=20=E3=84=B4=20Remove=20?= =?UTF-8?q?short=20clip(if=20it=20<=2014847=20samples,=20It=20maybe=20perc?= =?UTF-8?q?entile=200.10=20my=20own=20dataset)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- preprocess_dataset.py | 81 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 preprocess_dataset.py diff --git a/preprocess_dataset.py b/preprocess_dataset.py new file mode 100644 index 000000000..e9e897ffd --- /dev/null +++ b/preprocess_dataset.py @@ -0,0 +1,81 @@ +""" +This code was developed with reference to https://github.com/Rayhane-mamah/Tacotron-2. 
+""" +from scipy.io.wavfile import write +import librosa +import numpy as np +import argparse + +sr = 22050 +max_wav_value=32768.0 +trim_fft_size = 1024 +trim_hop_size = 256 + +# These are control parameters for trimming and skipping +trim_top_db = 23 +skip_len = 14848 + +def preprocess_audio(file_list, silence_audio_size, pre_emphasis=True): + for F in file_list: + f = open(F, encoding='utf-8') + R = f.readlines() + f.close() + print('='*5+F+'='*5) + + for i, r in enumerate(R): + wav_file = r.split('|')[0] + data, sampling_rate = librosa.core.load(wav_file, sr) + data = data / np.abs(data).max() *0.999 + if(pre_emphasis): + data = np.append(data[0], data[1:] - 0.97 * data[:-1]) + data = data / np.abs(data).max() * 0.999 + data_= librosa.effects.trim(data, top_db= trim_top_db, frame_length=trim_fft_size, hop_length=trim_hop_size)[0] + data_ = data_*max_wav_value + data_ = np.append(data_, [0.]*silence_audio_size) + data_ = data_.astype(dtype=np.int16) + write(wav_file, sr, data_) + #print(len(data),len(data_)) + if(i%100 == 0): + print (i) + +def remove_short_audios(file_name): + f = open(file_name,'r',encoding='utf-8') + R = f.readlines() + f.close() + + L = [] + for i, r in enumerate(R): + wav_file = r.split('|')[0] + data, sampling_rate = librosa.core.load(wav_file, sr) + if(len(data) >= skip_len): + L.append(r) + + skiped_file_name = file_name.split('.')[0]+'_skiped.txt' + f = open(skiped_file_name,'w',encoding='utf-8') + f.writelines(L) + f.close() + +if __name__ == "__main__": + """ + usage + python preprocess_audio.py -f=filelists/ljs_audio_text_test_filelist.txt,filelists/ljs_audio_text_train_filelist.txt,filelists/ljs_audio_text_val_filelist.txt -s=5 -p -r + """ + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--file_list', type=str, + help='file list to preprocess') + parser.add_argument('-s', '--silence_padding', type=int, default=0, + help='Adding silence padding at the end of each audio, silence audio size is hop_length * silence 
padding') + parser.add_argument('-p', '--pre_emphasis', action='store_true', + help="do or don't do pre_emphasis") + parser.add_argument('-r', '--remove_short_audios',action='store_true', + help="do or don't remove short audios") + args = parser.parse_args() + file_list = args.file_list.split(',') + silence_audio_size = trim_hop_size * args.silence_padding + remove_short_audios = args.remove_short_audios + + preprocess_audio(file_list, silence_audio_size) + + if(remove_short_audios): + for f in file_list: + remove_short_audios(remove_short_audios) \ No newline at end of file From b590337d4c1f88a1c514ebf36ed64e17207c20ca Mon Sep 17 00:00:00 2001 From: YeongTae Date: Mon, 9 Sep 2019 15:30:42 +0900 Subject: [PATCH 5/7] =?UTF-8?q?Being=20fixed=20errors=20of=20metric=20name?= =?UTF-8?q?=20on=20Tensorboard,=20=E3=84=B4=20Replacing=20MCD(metric=20nam?= =?UTF-8?q?e)=20to=20log=5FMCD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- logger.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logger.py b/logger.py index 2355f19fb..c84841962 100644 --- a/logger.py +++ b/logger.py @@ -17,7 +17,7 @@ def log_training(self, reduced_loss, grad_norm, learning_rate, duration, diagona self.add_scalar("duration", duration, iteration) self.add_scalar("training.attention_alignment_diagonality", diagonality, iteration) self.add_scalar("training.average_max_attention_weight", avg_prob, iteration) - self.add_scalar("training.MCD", avg_MCD, iteration) + self.add_scalar("training.log_MCD", avg_MCD, iteration) self.add_scalar("training.f0(100hz)", avg_f0, iteration) def log_validation(self, reduced_loss, model, y, y_pred, diagonality, avg_prob, avg_MCD, avg_f0, iteration): @@ -26,7 +26,7 @@ def log_validation(self, reduced_loss, model, y, y_pred, diagonality, avg_prob, mel_targets, gate_targets = y self.add_scalar("validation.attention_alignment_diagonality", diagonality, iteration) 
self.add_scalar("validation.average_max_attention_weight", avg_prob, iteration) - self.add_scalar("validation.MCD", avg_MCD, iteration) + self.add_scalar("validation.log_MCD", avg_MCD, iteration) self.add_scalar("validation.f0(100hz)", avg_f0, iteration) # plot distribution of parameters From cdf64f8689508a8d66194d8a2eaf0cb86aba1ac4 Mon Sep 17 00:00:00 2001 From: YeongTae Date: Mon, 4 Nov 2019 14:22:09 +0900 Subject: [PATCH 6/7] Moving the order in which 'pre_emphasis' is applied after 'trimming'. --- preprocess_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/preprocess_dataset.py b/preprocess_dataset.py index e9e897ffd..5a4a0db6f 100644 --- a/preprocess_dataset.py +++ b/preprocess_dataset.py @@ -26,11 +26,11 @@ def preprocess_audio(file_list, silence_audio_size, pre_emphasis=True): wav_file = r.split('|')[0] data, sampling_rate = librosa.core.load(wav_file, sr) data = data / np.abs(data).max() *0.999 - if(pre_emphasis): - data = np.append(data[0], data[1:] - 0.97 * data[:-1]) - data = data / np.abs(data).max() * 0.999 data_= librosa.effects.trim(data, top_db= trim_top_db, frame_length=trim_fft_size, hop_length=trim_hop_size)[0] data_ = data_*max_wav_value + if (pre_emphasis): + data_ = np.append(data_[0], data_[1:] - 0.97 * data_[:-1]) + data_ = data_ / np.abs(data_).max() * 0.999 data_ = np.append(data_, [0.]*silence_audio_size) data_ = data_.astype(dtype=np.int16) write(wav_file, sr, data_) From 8fad6457d298108f52939430927dfb0ac3a0e479 Mon Sep 17 00:00:00 2001 From: YeongTae Date: Thu, 16 Jan 2020 15:17:49 +0900 Subject: [PATCH 7/7] =?UTF-8?q?Dataset=20preprocessing=20=E3=84=B4=20debug?= =?UTF-8?q?ging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- preprocess_dataset.py | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/preprocess_dataset.py b/preprocess_dataset.py index 5a4a0db6f..aa250dce3 100644 --- a/preprocess_dataset.py +++ 
b/preprocess_dataset.py @@ -15,7 +15,7 @@ trim_top_db = 23 skip_len = 14848 -def preprocess_audio(file_list, silence_audio_size, pre_emphasis=True): +def preprocess_audio(file_list, silence_audio_size, pre_emphasis=False): for F in file_list: f = open(F, encoding='utf-8') R = f.readlines() @@ -27,10 +27,10 @@ def preprocess_audio(file_list, silence_audio_size, pre_emphasis=True): data, sampling_rate = librosa.core.load(wav_file, sr) data = data / np.abs(data).max() *0.999 data_= librosa.effects.trim(data, top_db= trim_top_db, frame_length=trim_fft_size, hop_length=trim_hop_size)[0] - data_ = data_*max_wav_value if (pre_emphasis): data_ = np.append(data_[0], data_[1:] - 0.97 * data_[:-1]) data_ = data_ / np.abs(data_).max() * 0.999 + data_ = data_ * max_wav_value data_ = np.append(data_, [0.]*silence_audio_size) data_ = data_.astype(dtype=np.int16) write(wav_file, sr, data_) @@ -49,33 +49,39 @@ def remove_short_audios(file_name): data, sampling_rate = librosa.core.load(wav_file, sr) if(len(data) >= skip_len): L.append(r) - - skiped_file_name = file_name.split('.')[0]+'_skiped.txt' - f = open(skiped_file_name,'w',encoding='utf-8') + if (i % 100 == 0): + print(i) + tmp = file_name.split('.') + tmp.insert(1,'_skipped.') + skipped_file_name = "".join(tmp) + f = open(skipped_file_name,'w',encoding='utf-8') f.writelines(L) f.close() if __name__ == "__main__": """ usage - python preprocess_audio.py -f=filelists/ljs_audio_text_test_filelist.txt,filelists/ljs_audio_text_train_filelist.txt,filelists/ljs_audio_text_val_filelist.txt -s=5 -p -r + python preprocess_dataset.py -f=metadata.csv -s=5 -t -p -r + python preprocess_dataset.py -f=metadata.csv """ parser = argparse.ArgumentParser() parser.add_argument('-f', '--file_list', type=str, - help='file list to preprocess') + help='Metadata file list to preprocess') parser.add_argument('-s', '--silence_padding', type=int, default=0, help='Adding silence padding at the end of each audio, silence audio size is hop_length * silence 
padding') parser.add_argument('-p', '--pre_emphasis', action='store_true', - help="do or don't do pre_emphasis") + help="Doing pre_emphasis") + parser.add_argument('-t', '--trimming', action='store_true', + help="Doing trimming audios") parser.add_argument('-r', '--remove_short_audios',action='store_true', - help="do or don't remove short audios") + help="Removing short audios in metadata file") args = parser.parse_args() file_list = args.file_list.split(',') silence_audio_size = trim_hop_size * args.silence_padding - remove_short_audios = args.remove_short_audios - preprocess_audio(file_list, silence_audio_size) - if(remove_short_audios): + preprocess_audio(file_list, silence_audio_size, args.pre_emphasis) + + if(args.remove_short_audios): for f in file_list: - remove_short_audios(remove_short_audios) \ No newline at end of file + remove_short_audios(f) \ No newline at end of file