Mirror of https://github.com/mozilla/TTS.git
disabling multispeaker with num_speakers=0
Parent: 04e452d8cb
Commit: ba8cc8054b
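The commit threads a single pattern through the configs, the three model classes, train.py, and the synthesis utilities: with num_speakers=0 no speaker embedding is created at all, and every speaker_ids argument becomes an optional keyword defaulting to None. A minimal sketch of the pattern (illustrative class and sizes, not the repository code verbatim):

    import torch.nn as nn

    class Model(nn.Module):
        def __init__(self, num_speakers):
            super().__init__()
            # allocate the table only in multi-speaker mode; a single-speaker
            # model then carries no speaker parameters at all
            if num_speakers > 0:
                self.speaker_embedding = nn.Embedding(num_speakers, 256)

        def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
            # hasattr doubles as the num_speakers > 0 check: the attribute
            # exists only when __init__ created the embedding
            if hasattr(self, "speaker_embedding") and speaker_ids is not None:
                spk = self.speaker_embedding(speaker_ids).unsqueeze(1)
                encoder_outputs = encoder_outputs + spk.expand_as(encoder_outputs)
            return encoder_outputs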
@@ -76,6 +76,6 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "num_speakers": 10 // should just be bigger than the actual number of speakers
+    "num_speakers": 10 // should just be bigger than the actual number of speakers, 0 disables speaker embeddings
 }

@@ -77,6 +77,6 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "num_speakers": 10 // should just be bigger than the actual number of speakers
+    "num_speakers": 10 // should just be bigger than the actual number of speakers, 0 disables speaker embeddings
 }

@@ -79,6 +79,6 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "num_speakers": 10 // should just be bigger than the actual number of speakers
+    "num_speakers": 10 // should just be bigger than the actual number of speakers, 0 disables speaker embeddings
 }

@@ -77,6 +77,6 @@
     "use_phonemes": false, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "de", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "num_speakers": 10 // should just be bigger than the actual number of speakers
+    "num_speakers": 10 // should just be bigger than the actual number of speakers, 0 disables speaker embeddings
 }

@@ -77,6 +77,6 @@
     "use_phonemes": true, // use phonemes instead of raw characters. It is suggested for better pronounciation.
     "phoneme_language": "en-us", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages
     "text_cleaner": "phoneme_cleaners",
-    "num_speakers": 10 // should just be bigger than the actual number of speakers
+    "num_speakers": 10 // should just be bigger than the actual number of speakers, 0 disables speaker embeddings
 }

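Each of the five hunks above makes the same one-line comment change in a different config file; only the num_speakers documentation is touched. The toggle in practice (abbreviated config excerpt, values illustrative):

    "num_speakers": 10 // multi-speaker: the model allocates a 10-row embedding table
    "num_speakers": 0  // single-speaker: no speaker embedding module is created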
@@ -29,8 +29,8 @@ class Tacotron(nn.Module):
         self.linear_dim = linear_dim
         self.embedding = nn.Embedding(num_chars, 256)
         self.embedding.weight.data.normal_(0, 0.3)
-        self.speaker_embedding = nn.Embedding(num_speakers,
-                                              256)
-        self.speaker_embedding.weight.data.normal_(0, 0.3)
+        if num_speakers > 0:
+            self.speaker_embedding = nn.Embedding(num_speakers, 256)
+            self.speaker_embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(256)
         self.decoder = Decoder(256, mel_dim, r, memory_size, attn_win,

@@ -42,18 +42,13 @@ class Tacotron(nn.Module):
             nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
             nn.Sigmoid())

-    def forward(self, characters, speaker_ids, text_lengths, mel_specs):
+    def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
         B = characters.size(0)
         mask = sequence_mask(text_lengths).to(characters.device)
         inputs = self.embedding(characters)
         encoder_outputs = self.encoder(inputs)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
-        encoder_outputs += speaker_embeddings
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
         mel_outputs, alignments, stop_tokens = self.decoder(
             encoder_outputs, mel_specs, mask)
         mel_outputs = mel_outputs.view(B, -1, self.mel_dim)

@@ -61,10 +56,21 @@ class Tacotron(nn.Module):
         linear_outputs = self.last_linear(linear_outputs)
         return mel_outputs, linear_outputs, alignments, stop_tokens

-    def inference(self, characters, speaker_ids):
+    def inference(self, characters, speaker_ids=None):
         B = characters.size(0)
         inputs = self.embedding(characters)
         encoder_outputs = self.encoder(inputs)
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
+        mel_outputs, alignments, stop_tokens = self.decoder.inference(
+            encoder_outputs)
+        mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
+        linear_outputs = self.postnet(mel_outputs)
+        linear_outputs = self.last_linear(linear_outputs)
+        return mel_outputs, linear_outputs, alignments, stop_tokens
+
+    def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
+        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
             speaker_embeddings = self.speaker_embedding(speaker_ids)
             speaker_embeddings.unsqueeze_(1)

@@ -72,9 +78,4 @@ class Tacotron(nn.Module):
                                                            encoder_outputs.size(1),
                                                            -1)
             encoder_outputs += speaker_embeddings
-        mel_outputs, alignments, stop_tokens = self.decoder.inference(
-            encoder_outputs)
-        mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
-        linear_outputs = self.postnet(mel_outputs)
-        linear_outputs = self.last_linear(linear_outputs)
-        return mel_outputs, linear_outputs, alignments, stop_tokens
+        return encoder_outputs

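With speaker_ids moved behind the required arguments, single-speaker call sites simply omit it. A call sketch against the new Tacotron signatures, assuming model is an already-constructed Tacotron and all tensor shapes are merely illustrative:

    import torch

    chars = torch.randint(0, 60, (2, 50))   # (batch, character ids)
    lengths = torch.tensor([50, 45])
    mels = torch.randn(2, 100, 80)          # (batch, frames, mel_dim)

    # multi-speaker: one id per batch item
    outputs = model(chars, lengths, mels, speaker_ids=torch.tensor([3, 7]))

    # single-speaker (num_speakers=0): no speaker argument at all
    outputs = model(chars, lengths, mels)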
@@ -29,6 +29,7 @@ class Tacotron2(nn.Module):
         std = sqrt(2.0 / (num_chars + 512))
         val = sqrt(3.0) * std  # uniform bounds for std
         self.embedding.weight.data.uniform_(-val, val)
+        if num_speakers > 0:
             self.speaker_embedding = nn.Embedding(num_speakers, 512)
             self.speaker_embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(512)

@@ -43,19 +44,13 @@ class Tacotron2(nn.Module):
         mel_outputs_postnet = mel_outputs_postnet.transpose(1, 2)
         return mel_outputs, mel_outputs_postnet, alignments

-    def forward(self, text, speaker_ids, text_lengths, mel_specs=None):
+    def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None):
         # compute mask for padding
         mask = sequence_mask(text_lengths).to(text.device)
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder(embedded_inputs, text_lengths)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
-
-        encoder_outputs += speaker_embeddings
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
         mel_outputs, stop_tokens, alignments = self.decoder(
             encoder_outputs, mel_specs, mask)
         mel_outputs_postnet = self.postnet(mel_outputs)

@@ -64,16 +59,11 @@ class Tacotron2(nn.Module):
             mel_outputs, mel_outputs_postnet, alignments)
         return mel_outputs, mel_outputs_postnet, alignments, stop_tokens

-    def inference(self, text, speaker_ids):
+    def inference(self, text, speaker_ids=None):
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference(embedded_inputs)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
-        encoder_outputs += speaker_embeddings
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
         mel_outputs, stop_tokens, alignments = self.decoder.inference(
             encoder_outputs)
         mel_outputs_postnet = self.postnet(mel_outputs)

@@ -82,19 +72,14 @@ class Tacotron2(nn.Module):
             mel_outputs, mel_outputs_postnet, alignments)
         return mel_outputs, mel_outputs_postnet, alignments, stop_tokens

-    def inference_truncated(self, text, speaker_ids):
+    def inference_truncated(self, text, speaker_ids=None):
         """
         Preserve model states for continuous inference
         """
         embedded_inputs = self.embedding(text).transpose(1, 2)
         encoder_outputs = self.encoder.inference_truncated(embedded_inputs)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
-        encoder_outputs += speaker_embeddings
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
         mel_outputs, stop_tokens, alignments = self.decoder.inference_truncated(
             encoder_outputs)
         mel_outputs_postnet = self.postnet(mel_outputs)

@@ -102,3 +87,14 @@ class Tacotron2(nn.Module):
         mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
             mel_outputs, mel_outputs_postnet, alignments)
         return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
+
+    def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
+        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+            speaker_embeddings = self.speaker_embedding(speaker_ids)
+            speaker_embeddings.unsqueeze_(1)
+            speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
+                                                           encoder_outputs.size(1),
+                                                           -1)
+            encoder_outputs += speaker_embeddings
+        return encoder_outputs

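The _add_speaker_embedding helper appended to each class centralizes the broadcast that was previously inlined at every call site. The shape arithmetic as a standalone sketch (sizes are examples matching Tacotron2's 512-dim encoder):

    import torch
    import torch.nn as nn

    embedding = nn.Embedding(10, 512)           # num_speakers x embedding dim
    speaker_ids = torch.tensor([3, 7])          # (batch,)
    encoder_outputs = torch.randn(2, 120, 512)  # (batch, time, dim)

    spk = embedding(speaker_ids)                # (2, 512)
    spk = spk.unsqueeze(1)                      # (2, 1, 512)
    spk = spk.expand(2, 120, -1)                # repeated across all time steps
    encoder_outputs = encoder_outputs + spk     # shape unchanged: (2, 120, 512)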
@@ -30,6 +30,7 @@ class TacotronGST(nn.Module):
         self.linear_dim = linear_dim
         self.embedding = nn.Embedding(num_chars, 256)
         self.embedding.weight.data.normal_(0, 0.3)
+        if num_speakers > 0:
             self.speaker_embedding = nn.Embedding(num_speakers, 256)
             self.speaker_embedding.weight.data.normal_(0, 0.3)
         self.encoder = Encoder(256)

@@ -43,22 +44,16 @@ class TacotronGST(nn.Module):
             nn.Linear(self.postnet.cbhg.gru_features * 2, linear_dim),
             nn.Sigmoid())

-    def forward(self, characters, speaker_ids, text_lengths, mel_specs):
+    def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
         B = characters.size(0)
         mask = sequence_mask(text_lengths).to(characters.device)
         inputs = self.embedding(characters)
         encoder_outputs = self.encoder(inputs)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)

         gst_outputs = self.gst(mel_specs)
         gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
-        encoder_outputs = encoder_outputs + gst_outputs + speaker_embeddings
+        encoder_outputs = encoder_outputs + gst_outputs
         mel_outputs, alignments, stop_tokens = self.decoder(
             encoder_outputs, mel_specs, mask)
         mel_outputs = mel_outputs.view(B, -1, self.mel_dim)

@@ -66,24 +61,30 @@ class TacotronGST(nn.Module):
         linear_outputs = self.last_linear(linear_outputs)
         return mel_outputs, linear_outputs, alignments, stop_tokens

-    def inference(self, characters, speaker_ids, style_mel=None):
+    def inference(self, characters, speaker_ids=None, style_mel=None):
         B = characters.size(0)
         inputs = self.embedding(characters)
         encoder_outputs = self.encoder(inputs)
-        speaker_embeddings = self.speaker_embedding(speaker_ids)
-        speaker_embeddings.unsqueeze_(1)
-        speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
-                                                       encoder_outputs.size(1),
-                                                       -1)
+        encoder_outputs = self._add_speaker_embedding(encoder_outputs,
+                                                      speaker_ids)
         if style_mel is not None:
             gst_outputs = self.gst(style_mel)
             gst_outputs = gst_outputs.expand(-1, encoder_outputs.size(1), -1)
             encoder_outputs = encoder_outputs + gst_outputs
-        encoder_outputs += speaker_embeddings
         mel_outputs, alignments, stop_tokens = self.decoder.inference(
             encoder_outputs)
         mel_outputs = mel_outputs.view(B, -1, self.mel_dim)
         linear_outputs = self.postnet(mel_outputs)
         linear_outputs = self.last_linear(linear_outputs)
         return mel_outputs, linear_outputs, alignments, stop_tokens
+
+    def _add_speaker_embedding(self, encoder_outputs, speaker_ids):
+        if hasattr(self, "speaker_embedding") and speaker_ids is not None:
+            speaker_embeddings = self.speaker_embedding(speaker_ids)
+            speaker_embeddings.unsqueeze_(1)
+            speaker_embeddings = speaker_embeddings.expand(encoder_outputs.size(0),
+                                                           encoder_outputs.size(1),
+                                                           -1)
+            encoder_outputs += speaker_embeddings
+        return encoder_outputs

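In TacotronGST the speaker embedding was previously summed in the same expression as the style tokens (encoder_outputs + gst_outputs + speaker_embeddings); it is now folded in by _add_speaker_embedding before the GST addition. Both are elementwise additions on the encoder outputs, so the result is unchanged when both are present. Every extra input to inference stays optional (a sketch; model is assumed to be an already-constructed TacotronGST):

    import torch

    chars = torch.randint(0, 60, (1, 40))
    style_mel = torch.randn(1, 120, 80)  # reference mel for style; shape illustrative

    out = model.inference(chars)                                 # plain single-speaker
    out = model.inference(chars, speaker_ids=torch.tensor([2]))  # pick a speaker
    out = model.inference(chars, style_mel=style_mel)            # style transfer only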
Some file diffs are hidden because one or more lines are too long.

train.py (28 changed lines)
@@ -78,6 +78,7 @@ def setup_loader(is_val=False, verbose=False):
 def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
           ap, epoch):
     data_loader = setup_loader(is_val=False, verbose=(epoch==0))
+    if c.num_speakers > 0:
         speaker_mapping = load_speaker_mapping(OUT_PATH)
     model.train()
     epoch_time = 0

@@ -101,6 +102,7 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
         avg_text_length = torch.mean(text_lengths.float())
         avg_spec_length = torch.mean(mel_lengths.float())

+        if c.num_speakers > 0:
             speaker_ids = []
             for speaker_name in speaker_names:
                 if speaker_name not in speaker_mapping:

@@ -110,10 +112,13 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,

                 if len(speaker_mapping) > c.num_speakers:
                     raise ValueError("It seems there are at least {} speakers in "
-                                     "your dataset, while 'num_speakers' is set to {}. "
-                                     "Found the following speakers: {}".format(len(speaker_mapping),
+                                     "your dataset, while 'num_speakers' is set to "
+                                     "{}. Found the following speakers: {}".format(
+                                         len(speaker_mapping),
                                          c.num_speakers,
                                          list(speaker_mapping)))
+        else:
+            speaker_ids = None

         # set stop targets view, we predict a single stop token per r frames prediction
         stop_targets = stop_targets.view(text_input.shape[0],

@@ -137,11 +142,12 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
             mel_lengths = mel_lengths.cuda(non_blocking=True)
             linear_input = linear_input.cuda(non_blocking=True) if c.model in ["Tacotron", "TacotronGST"] else None
             stop_targets = stop_targets.cuda(non_blocking=True)
+            if speaker_ids is not None:
                 speaker_ids = speaker_ids.cuda(non_blocking=True)

         # forward pass model
         decoder_output, postnet_output, alignments, stop_tokens = model(
-            text_input, speaker_ids, text_lengths, mel_input)
+            text_input, text_lengths, mel_input, speaker_ids=speaker_ids)

         # loss computation
         stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)

@@ -266,12 +272,14 @@ def train(model, criterion, criterion_st, optimizer, optimizer_st, scheduler,
             tb_logger.tb_model_weights(model, current_step)

     # save speaker mapping
+    if c.num_speakers > 0:
         save_speaker_mapping(OUT_PATH, speaker_mapping)
     return avg_postnet_loss, current_step


 def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
     data_loader = setup_loader(is_val=True)
+    if c.num_speakers > 0:
         speaker_mapping = load_speaker_mapping(OUT_PATH)
     model.eval()
     epoch_time = 0

@@ -303,9 +311,12 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
             mel_lengths = data[5]
             stop_targets = data[6]

+            if c.num_speakers > 0:
                 speaker_ids = [speaker_mapping[speaker_name]
                                for speaker_name in speaker_names]
                 speaker_ids = torch.LongTensor(speaker_ids)
+            else:
+                speaker_ids = None

             # set stop targets view, we predict a single stop token per r frames prediction
             stop_targets = stop_targets.view(text_input.shape[0],

@@ -320,12 +331,13 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
                 mel_lengths = mel_lengths.cuda()
                 linear_input = linear_input.cuda() if c.model in ["Tacotron", "TacotronGST"] else None
                 stop_targets = stop_targets.cuda()
+                if speaker_ids is not None:
                     speaker_ids = speaker_ids.cuda()

             # forward pass
             decoder_output, postnet_output, alignments, stop_tokens =\
-                model.forward(text_input, speaker_ids,
-                              text_lengths, mel_input)
+                model.forward(text_input, text_lengths, mel_input,
+                              speaker_ids=speaker_ids)

             # loss computation
             stop_loss = criterion_st(stop_tokens, stop_targets) if c.stopnet else torch.zeros(1)

@@ -403,11 +415,12 @@ def evaluate(model, criterion, criterion_st, ap, current_step, epoch):
     test_audios = {}
     test_figures = {}
     print(" | > Synthesizing test sentences")
-    speaker_id = 0
+    speaker_id = 0 if c.num_speakers > 0 else None
     for idx, test_sentence in enumerate(test_sentences):
         try:
             wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
-                model, test_sentence, speaker_id, c, use_cuda, ap)
+                model, test_sentence, c, use_cuda, ap,
+                speaker_id=speaker_id)
             file_path = os.path.join(AUDIO_PATH, str(current_step))
             os.makedirs(file_path, exist_ok=True)
             file_path = os.path.join(file_path,

@@ -471,6 +484,7 @@ def main(args):
         args.restore_step = checkpoint['step']
         # copying speakers.json
         prev_out_path = os.path.dirname(args.restore_path)
+        if c.num_speakers > 0:
             copy_speaker_mapping(prev_out_path, OUT_PATH)
     else:
         args.restore_step = 0

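train.py now touches speaker bookkeeping only in multi-speaker mode: the name-to-id mapping is loaded at the start of train() and evaluate(), grown as unseen speaker names appear, and saved back each epoch. A simplified sketch of how names become the LongTensor the model receives; the assignment of fresh ids is assumed from context, since this diff shows only the surrounding guard:

    import torch

    speaker_mapping = {}                    # in train.py: load_speaker_mapping(OUT_PATH)
    speaker_names = ["mary", "bob", "mary"]
    for name in speaker_names:
        if name not in speaker_mapping:
            speaker_mapping[name] = len(speaker_mapping)
    speaker_ids = torch.LongTensor([speaker_mapping[n] for n in speaker_names])
    # tensor([0, 1, 0]) -- one embedding row per utterance's speaker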
@@ -35,17 +35,17 @@ def compute_style_mel(style_wav, ap, use_cuda):
     return style_mel


-def run_model(model, inputs, speaker_id, CONFIG, truncated, style_mel=None):
+def run_model(model, inputs, CONFIG, truncated, speaker_id=None, style_mel=None):
     if CONFIG.model == "TacotronGST" and style_mel is not None:
         decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-            inputs, style_mel, speaker_id)
+            inputs, style_mel=style_mel, speaker_ids=speaker_id)
     else:
         if truncated:
             decoder_output, postnet_output, alignments, stop_tokens = model.inference_truncated(
-                inputs, speaker_id)
+                inputs, speaker_ids=speaker_id)
         else:
             decoder_output, postnet_output, alignments, stop_tokens = model.inference(
-                inputs, speaker_id)
+                inputs, speaker_ids=speaker_id)
     return decoder_output, postnet_output, alignments, stop_tokens

@@ -70,10 +70,10 @@ def inv_spectrogram(postnet_output, ap, CONFIG):

 def synthesis(model,
               text,
-              speaker_id,
               CONFIG,
               use_cuda,
               ap,
+              speaker_id=None,
               style_wav=None,
               truncated=False,
               enable_eos_bos_chars=False,

@@ -83,11 +83,11 @@ def synthesis(model,
     Args:
         model (TTS.models): model to synthesize.
         text (str): target text
-        speaker_id (int): id of speaker
         CONFIG (dict): config dictionary to be loaded from config.json.
         use_cuda (bool): enable cuda.
         ap (TTS.utils.audio.AudioProcessor): audio processor to process
            model outputs.
+        speaker_id (int): id of speaker
         style_wav (str): Uses for style embedding of GST.
         truncated (bool): keep model states after inference. It can be used
            for continuous inference at long texts.

@@ -100,13 +100,14 @@ def synthesis(model,
         style_mel = compute_style_mel(style_wav, ap, use_cuda)
     # preprocess the given text
     inputs = text_to_seqvec(text, CONFIG, use_cuda)
+    if speaker_id is not None:
         speaker_id = np.asarray(speaker_id)
         speaker_id = torch.from_numpy(speaker_id).unsqueeze(0)
         if use_cuda:
             speaker_id.cuda()
     # synthesize voice
     decoder_output, postnet_output, alignments, stop_tokens = run_model(
-        model, inputs, speaker_id, CONFIG, truncated, style_mel)
+        model, inputs, CONFIG, truncated, speaker_id, style_mel)
     # convert outputs to numpy
     postnet_output, decoder_output, alignment = parse_outputs(
         postnet_output, decoder_output, alignments)

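On the synthesis side speaker_id moves behind the required arguments, so existing single-speaker callers drop it with no other change. A call sketch against the new signature (model, CONFIG, use_cuda, and ap assumed already set up):

    # single-speaker model: no id needed
    wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
        model, "Hello world.", CONFIG, use_cuda, ap)

    # multi-speaker model: select a row of the speaker embedding table
    wav, alignment, decoder_output, postnet_output, stop_tokens = synthesis(
        model, "Hello world.", CONFIG, use_cuda, ap, speaker_id=0)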