Merge pull request #650 from lifeiteng/master

add read/write cache opt to train_rnn.py and fix cache read bug
Daniel Povey 2016-04-04 23:02:23 -04:00
Parent 60a5e712e6 76f26e733f
Commit 2d3b910faf
2 changed files with 23 additions and 7 deletions
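In outline: train_rnn.py now passes a --read-cache / --write-cache pair to nnet3-train so that the compiled computation cache from one iteration is reused by the next instead of being recompiled, and the compiler's ReadCache() is fixed to read the stored cache size with the type it was written with. The sketch below is a simplified illustration of how the two option strings are put together, not code from the script; GetCacheOpts and the adding_layer_this_iter flag are stand-ins for the logic spread across TrainNewModels and TrainOneIteration in the diff.

def GetCacheOpts(dir, iter, job, adding_layer_this_iter):
    # Read the cache written on the previous iteration, except on iteration 0
    # and on iterations that add a new hidden layer (those cached computations
    # were compiled for the previous network structure).
    cache_read_opt = ""
    if iter > 0 and not adding_layer_this_iter:
        cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter)
    # Only the first parallel job writes a cache, for the next iteration to read.
    cache_write_opt = ""
    if job == 1:
        cache_write_opt = "--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter + 1)
    return cache_read_opt, cache_write_opt

For example, with iter=5 and a (made-up) dir of exp/nnet3/lstm, job 1's nnet3-train command would carry --read-cache=exp/nnet3/lstm/cache.5 --write-cache=exp/nnet3/lstm/cache.6.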

View file

@@ -338,7 +338,7 @@ def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives,
left_context, right_context, min_deriv_time,
momentum, max_param_change,
shuffle_buffer_size, num_chunk_per_minibatch,
run_opts):
cache_read_opt, run_opts):
# We cannot easily use a single parallel SGE job to do the main training,
# because the computation of which archive and which --frame option
# to use for each job is a little complex, so we spawn each one separately.
@@ -353,9 +353,15 @@ def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives,
# the other indexes from.
archive_index = (k % num_archives) + 1 # work out the 1-based archive index.
cache_write_opt = ""
if job == 1:
# an option for writing cache (storing pairs of nnet-computations and
# computation-requests) during training.
cache_write_opt="--write-cache={dir}/cache.{iter}".format(dir=dir, iter=iter+1)
process_handle = RunKaldiCommand("""
{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \
nnet3-train {parallel_train_opts} \
nnet3-train {parallel_train_opts} {cache_read_opt} {cache_write_opt} \
--print-interval=10 --momentum={momentum} \
--max-param-change={max_param_change} \
--optimization.min-deriv-time={min_deriv_time} "{raw_model}" \
@@ -365,6 +371,7 @@ def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives,
train_queue_opt = run_opts.train_queue_opt,
dir = dir, iter = iter, next_iter = iter + 1, job = job,
parallel_train_opts = run_opts.parallel_train_opts,
cache_read_opt = cache_read_opt, cache_write_opt = cache_write_opt,
momentum = momentum, max_param_change = max_param_change,
min_deriv_time = min_deriv_time,
raw_model = raw_model_string, context_opts = context_opts,
@@ -387,7 +394,6 @@ def TrainNewModels(dir, iter, num_jobs, num_archives_processed, num_archives,
open('{0}/.error'.format(dir), 'w').close()
raise Exception("There was error during training iteration {0}".format(iter))
def TrainOneIteration(dir, iter, egs_dir,
num_jobs, num_archives_processed, num_archives,
learning_rate, shrinkage_value, num_chunk_per_minibatch,
@@ -404,17 +410,21 @@ def TrainOneIteration(dir, iter, egs_dir,
if iter > 0:
ComputeProgress(dir, iter, egs_dir, run_opts)
# an option for writing cache (storing pairs of nnet-computations
# and computation-requests) during training.
cache_read_opt = ""
if iter > 0 and (iter <= (num_hidden_layers-1) * add_layers_period) and (iter % add_layers_period == 0):
do_average = False # if we've just mixed up, don't do averaging but take the
# best.
cur_num_hidden_layers = 1 + iter / add_layers_period
config_file = "{0}/configs/layer{1}.config".format(dir, cur_num_hidden_layers)
raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file )
raw_model_string = "nnet3-am-copy --raw=true --learning-rate={lr} {dir}/{iter}.mdl - | nnet3-init --srand={iter} - {config} - |".format(lr=learning_rate, dir=dir, iter=iter, config=config_file)
else:
do_average = True
if iter == 0:
do_average = False # on iteration 0, pick the best, don't average.
else:
cache_read_opt = "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter)
raw_model_string = "nnet3-am-copy --raw=true --learning-rate={0} {1}/{2}.mdl - |".format(learning_rate, dir, iter)
if do_average:
@@ -437,7 +447,7 @@ def TrainOneIteration(dir, iter, egs_dir,
left_context, right_context, min_deriv_time,
momentum, max_param_change,
shuffle_buffer_size, cur_num_chunk_per_minibatch,
run_opts)
cache_read_opt, run_opts)
[models_to_average, best_model] = GetSuccessfulModels(num_jobs, '{0}/log/train.{1}.%.log'.format(dir,iter))
nnets_list = []
for n in models_to_average:
@@ -477,6 +487,12 @@ nnet3-am-copy --scale={shrink} --set-raw-nnet=- {dir}/{iter}.mdl {dir}/{new_iter
raise Exception("Could not find {0}, at the end of iteration {1}".format(new_model, iter))
elif os.stat(new_model).st_size == 0:
raise Exception("{0} has size 0. Something went wrong in iteration {1}".format(new_model, iter))
try:
if cache_read_opt:
os.remove("{dir}/cache.{iter}".format(dir=dir, iter=iter))
except OSError:
raise Exception("Error while trying to delete the cache file")
# args is a Namespace with the required parameters
def Train(args, run_opts):
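Two things worth noting about the cache handling in TrainOneIteration above: the file read via --read-cache on iteration iter is {dir}/cache.{iter}, which is what job 1 of the previous iteration wrote (its --write-cache target is named with iter+1), and no cache is read on iterations where a new hidden layer is spliced in, presumably because the cached computations no longer match the network. After the new model has been produced and validated, the cache file that was read is removed, so stale cache files do not accumulate in the experiment directory.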

View file

@@ -515,7 +515,7 @@ void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) {
bool read_cache = (opt_config_ == opt_config_cached);
if (read_cache) {
size_t computation_cache_size;
int32 computation_cache_size;
ExpectToken(is, binary, "<ComputationCacheSize>");
ReadBasicType(is, binary, &computation_cache_size);
KALDI_ASSERT(computation_cache_size >= 0);
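The "cache read bug" from the commit title is this last hunk: computation_cache_size was declared as size_t, but Kaldi's binary WriteBasicType records the width (and signedness) of the integer it writes and ReadBasicType checks it, so the value must be read back with the same type it was written with; the write side evidently stores it as an int32, and reading it into a size_t would therefore fail. Using int32 also gives the KALDI_ASSERT(computation_cache_size >= 0) check some teeth, since a size_t can never be negative anyway.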