Add encoding cache and lazy-train mechanism (#50)

* Add new config about knowledge distillation for query binary classifier

* Remove inference results in knowledge distillation for query binary classifier

* Add AUC.py in tools folder

* Add test_data_path into conf_kdqbc_bilstmattn_cnn.json

* Modify AUC.py

* Rename AUC.py to calculate_AUC.py

* Modify the test & calculate-AUC commands for Knowledge Distillation for Query Binary Classifier

* Add cpu_thread_num parameter in conf.training_params

* Rename cpu_thread_num to cpu_num_workers

* update comments in ModelConf.py

* Add cpu_num_workers in model_zoo/advanced/conf.json

* Add the description of cpu_num_workers in Tutorial.md

* Update inference speed of compressed model

* Add ProcessorsScheduler Class

* Add license in ProcessorsScheduler.py

* use lazy loading instead of one-off loading

* Remove Debug Info in problem.py

* use open instead of codecs.open

* Update the logic of dictionary building for classification

* add md5 function in common_utils.py

* add merge_encode_* function

* Fix typo

* Fix typo

* Reorganize the logical flow in train.py

* remove dummy comments in problem.py

* add encoding cache mechanism

* add lazy-load mechanism for training phase

* enumerate problem types in problem.py

* remove data_encoding.py

* add lazy load train logic

* Modify comment and remove debug code

* Check whether test_path exists

* Fix missing parameter when using char embedding

* merge master

* add file_column_num in problem.py

* merge add_encoding_cache branch

* add SST-2 in .gitignore

* merge master

* use steps_per_validation instead of valid_times_per_epoch

* Fix Learning Rate decay logic bug

* Add logging for calculating the md5 of training data

* Fix multi-GPU char_emb OOM problem & add char-level fixed_lengths

* Modify batch_num_to_show_results in multi-gpu

* Modify batch_num_to_show_results

* delete deepcopy in get_batches

* add new parameters chunk_size and max_building_lines in conf and update tutorials
Flyer Cheng 2019-08-02 20:59:00 +08:00 committed by L.J. SHOU
Parent db26940fb6
Commit 58ad563a23
15 changed files with 590 additions and 362 deletions

.gitignore

@@ -5,4 +5,5 @@
*.vs*
dataset/GloVe/
dataset/20_newsgroups/
models/
dataset/SST-2/
models/


@@ -13,7 +13,7 @@ import codecs
import pickle as pkl
from utils.common_utils import dump_to_pkl, load_from_pkl, get_param_num, get_trainable_param_num, \
transfer_to_gpu, transform_params2tensors, get_layer_class
transfer_to_gpu, transform_params2tensors, get_layer_class, load_from_json, dump_to_json
from utils.philly_utils import HDFSDirectTransferer, open_and_move, convert_to_tmppath, \
convert_to_hdfspath, move_from_local_to_hdfs
from Model import Model
@@ -22,7 +22,7 @@ from metrics.Evaluator import Evaluator
from utils.corpus_utils import get_batches
from core.StreamingRecorder import StreamingRecorder
from core.LRScheduler import LRScheduler
from settings import ProblemTypes
from settings import ProblemTypes, Setting as st
from block_zoo import Linear
from block_zoo import CRF
from losses.CRFLoss import CRFLoss
@@ -93,33 +93,17 @@ class LearningMachine(object):
def train(self, optimizer, loss_fn):
self.model.train()
logging.info("="*100 + '\n' + "*"*15 + 'Prepare data for training' + "*"*15)
if not self.conf.train_data_path.endswith('.pkl'):
train_data, train_length, train_target = self.problem.encode(self.conf.train_data_path, self.conf.file_columns,
self.conf.input_types, self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature=self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths, file_format='tsv',
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers)
else:
train_pkl_data = load_from_pkl(self.conf.train_data_path)
train_data, train_length, train_target = train_pkl_data['data'], train_pkl_data['length'], train_pkl_data['target']
if not self.conf.valid_data_path.endswith('.pkl'):
valid_data, valid_length, valid_target = self.problem.encode(self.conf.valid_data_path, self.conf.file_columns,
self.conf.input_types, self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature = self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths, file_format='tsv',
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers)
else:
valid_pkl_data = load_from_pkl(self.conf.valid_data_path)
valid_data, valid_length, valid_target = valid_pkl_data['data'], valid_pkl_data['length'], valid_pkl_data['target']
valid_data, valid_length, valid_target = self.problem.encode(self.conf.valid_data_path, self.conf.file_columns,
self.conf.input_types, self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature = self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths, file_format='tsv',
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers, chunk_size=self.conf.chunk_size)
if self.conf.test_data_path is not None:
if not self.conf.test_data_path.endswith('.pkl'):
test_data, test_length, test_target = self.problem.encode(self.conf.test_data_path, self.conf.file_columns, self.conf.input_types,
self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature = self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths,
file_format='tsv', show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers)
else:
test_pkl_data = load_from_pkl(self.conf.test_data_path)
test_data, test_length, test_target = test_pkl_data['data'], test_pkl_data['length'], test_pkl_data['target']
test_data, test_length, test_target = self.problem.encode(self.conf.test_data_path, self.conf.file_columns,
self.conf.input_types, self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature = self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths, file_format='tsv',
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers, chunk_size=self.conf.chunk_size)
stop_training = False
epoch = 1
@@ -139,212 +123,216 @@ class LearningMachine(object):
logging.info("=" * 100 + '\n' + "*" * 15 + 'Start training' + "*" * 15)
while not stop_training and epoch <= self.conf.max_epoch:
logging.info('Training: Epoch ' + str(epoch))
train_data_generator = self._get_training_data_generator()
part_index = 1
for train_data, train_length, train_target in train_data_generator:
logging.debug('Training: Epoch %s Part %s'%(epoch, part_index))
part_index += 1
data_batches, length_batches, target_batches = \
get_batches(self.problem, train_data, train_length, train_target, self.conf.batch_size_total,
self.conf.input_types, None, permutate=True, transform_tensor=True)
data_batches, length_batches, target_batches = \
get_batches(self.problem, train_data, train_length, train_target, self.conf.batch_size_total,
self.conf.input_types, None, permutate=True, transform_tensor=True)
whole_batch_num = len(target_batches)
valid_batch_num = max(len(target_batches) // self.conf.valid_times_per_epoch, 1)
if torch.cuda.device_count() > 1:
small_batch_num = whole_batch_num * torch.cuda.device_count() # total batch num over all the gpus
valid_batch_num_show = valid_batch_num * torch.cuda.device_count() # total batch num over all the gpus to do validation
else:
whole_batch_num = len(target_batches)
valid_batch_num = min(self.conf.steps_per_validation, whole_batch_num)
small_batch_num = whole_batch_num
valid_batch_num_show = valid_batch_num
batch_num_to_show_results = self.conf.batch_num_to_show_results
if torch.cuda.device_count() > 1:
batch_num_to_show_results *= torch.cuda.device_count() # total batch num overall all the gpus to log
small_batch_num *= torch.cuda.device_count() # total batch num over all the gpus
valid_batch_num_show *= torch.cuda.device_count() # total batch num over all the gpus to do validation
streaming_recoder.clear_records()
all_costs = []
streaming_recoder.clear_records()
all_costs = []
logging.info('There are %d batches during current period; validation are conducted every %d batch' % (small_batch_num, valid_batch_num_show))
logging.info('There are %d batches during an epoch; validation are conducted every %d batch' % (small_batch_num, valid_batch_num_show))
if self.conf.mode == 'normal':
progress = tqdm(range(len(target_batches)))
elif self.conf.mode == 'philly':
progress = range(len(target_batches))
for i in progress:
# the result shape: for classification: [batch_size, # of classes]; for sequence tagging: [batch_size, seq_len, # of tags]
param_list, inputs_desc, length_desc = transform_params2tensors(data_batches[i], length_batches[i])
logits = self.model(inputs_desc, length_desc, *param_list)
if self.conf.mode == 'normal':
progress = tqdm(range(len(target_batches)))
elif self.conf.mode == 'philly':
progress = range(len(target_batches))
for i in progress:
# the result shape: for classification: [batch_size, # of classes]; for sequence tagging: [batch_size, seq_len, # of tags]
param_list, inputs_desc, length_desc = transform_params2tensors(data_batches[i], length_batches[i])
logits = self.model(inputs_desc, length_desc, *param_list)
logits_softmax = {}
if isinstance(self.model, nn.DataParallel):
for tmp_output_layer_id in self.model.module.output_layer_id:
if isinstance(self.model.module.layers[tmp_output_layer_id], Linear) and \
(not self.model.module.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
logits[tmp_output_layer_id], dim=-1)
elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
pass
else:
logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
else:
for tmp_output_layer_id in self.model.output_layer_id:
if isinstance(self.model.layers[tmp_output_layer_id], Linear) and \
(not self.model.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
logits[tmp_output_layer_id], dim=-1)
elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
pass
else:
logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
# check the output
if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
logits = list(logits.values())[0]
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 2, 'The dimension of your output is %s, but we need [batch_size*GPUs, class num]' % (str(list(logits_softmax.shape)))
assert logits_softmax.shape[1] == self.problem.output_target_num(), 'The dimension of your output layer %d is inconsistent with your type number %d!' % (logits_softmax.shape[1], self.problem.output_target_num())
# for auc metric
prediction_scores = logits_softmax[:, self.conf.pos_label].cpu().data.numpy()
if self.evaluator.has_auc_type_specific:
prediction_scores_all = logits_softmax.cpu().data.numpy()
else:
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
logits = list(logits.values())[0]
if not isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence length, representation dim]' % (str(list(logits_softmax.shape)), )
prediction_scores = None
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
logits = list(logits.values())[0]
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 2 and logits_softmax.shape[1] == 1, 'The dimension of your output is %s, but we need [batch_size*GPUs, 1]' % (str(list(logits_softmax.shape)))
prediction_scores = None
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
for single_value in logits_softmax.values():
assert len(single_value.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence_len, 1]' % (str(list(single_value.shape)))
prediction_scores = None
prediction_scores_all = None
logits_flat = dict()
if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
# Transform output shapes for metric evaluation
# for seq_tag_f1 metric
if isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
forward_score, scores, masks, tag_seq, transitions, layer_conf = logits
prediction_indices = tag_seq.cpu().numpy()
streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
prediction_scores, self.problem.decode(
target_batches[i][self.conf.answer_column_name[0]],
length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
else:
prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy() # [batch_size, seq_len]
# pytorch's CrossEntropyLoss only support this
logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2)) # [batch_size * seq_len, # of tags]
streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
prediction_scores, self.problem.decode(
target_batches[i][self.conf.answer_column_name[0]],
length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][
self.conf.answer_column_name[0]].reshape(-1)
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
prediction_indices = logits_softmax.detach().max(1)[1].cpu().numpy()
# Should not decode!
streaming_recoder.record_one_row([prediction_indices, prediction_scores, prediction_scores_all, target_batches[i][self.conf.answer_column_name[0]].numpy()])
logits_flat[self.conf.output_layer_id[0]] = logits
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
temp_logits_flat = logits.squeeze(1)
prediction_scores = temp_logits_flat.detach().cpu().numpy()
streaming_recoder.record_one_row([prediction_scores, target_batches[i][self.conf.answer_column_name[0]].numpy()])
logits_flat[self.conf.output_layer_id[0]] = temp_logits_flat
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
for key, value in logits.items():
logits[key] = value.squeeze()
for key, value in logits_softmax.items():
logits_softmax[key] = value.squeeze()
passage_identify = None
for type_key in data_batches[i].keys():
if 'p' in type_key.lower():
passage_identify = type_key
break
if not passage_identify:
raise Exception('MRC task need passage information.')
prediction = self.problem.decode(logits_softmax, lengths=length_batches[i][passage_identify],
batch_data=data_batches[i][passage_identify])
logits_flat = logits
mrc_answer_target = None
for single_target in target_batches[i]:
if isinstance(target_batches[i][single_target][0], str):
mrc_answer_target = target_batches[i][single_target]
streaming_recoder.record_one_row([prediction, mrc_answer_target])
if self.use_gpu:
for single_target in self.conf.answer_column_name:
if isinstance(target_batches[i][single_target], torch.Tensor):
target_batches[i][single_target] = transfer_to_gpu(target_batches[i][single_target])
if isinstance(loss_fn.loss_fn[0], CRFLoss):
loss = loss_fn.loss_fn[0](forward_score, scores, masks, list(target_batches[i].values())[0], transitions, layer_conf)
else:
loss = loss_fn(logits_flat, target_batches[i])
all_costs.append(loss.item())
optimizer.zero_grad()
loss.backward()
if self.conf.clip_grad_norm_max_norm != -1:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.conf.clip_grad_norm_max_norm)
logits_softmax = {}
if isinstance(self.model, nn.DataParallel):
torch.nn.utils.clip_grad_norm_(self.model.module.layers['embedding'].get_parameters(), self.conf.clip_grad_norm_max_norm)
for tmp_output_layer_id in self.model.module.output_layer_id:
if isinstance(self.model.module.layers[tmp_output_layer_id], Linear) and \
(not self.model.module.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
logits[tmp_output_layer_id], dim=-1)
elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
pass
else:
logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
else:
torch.nn.utils.clip_grad_norm_(self.model.layers['embedding'].get_parameters(), self.conf.clip_grad_norm_max_norm)
optimizer.step()
for tmp_output_layer_id in self.model.output_layer_id:
if isinstance(self.model.layers[tmp_output_layer_id], Linear) and \
(not self.model.layers[tmp_output_layer_id].layer_conf.last_hidden_softmax):
logits_softmax[tmp_output_layer_id] = nn.functional.softmax(
logits[tmp_output_layer_id], dim=-1)
elif isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
pass
else:
logits_softmax[tmp_output_layer_id] = logits[tmp_output_layer_id]
del loss, logits, logits_softmax, logits_flat
del prediction_scores
if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging \
or ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
del prediction_indices
if show_result_cnt == self.conf.batch_num_to_show_results:
# check the output
if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=streaming_recoder.get('pred_scores'),
y_pred_scores_all=streaming_recoder.get('pred_scores_all'), formatting=True)
logits = list(logits.values())[0]
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 2, 'The dimension of your output is %s, but we need [batch_size*GPUs, class num]' % (str(list(logits_softmax.shape)))
assert logits_softmax.shape[1] == self.problem.output_target_num(), 'The dimension of your output layer %d is inconsistent with your type number %d!' % (logits_softmax.shape[1], self.problem.output_target_num())
# for auc metric
prediction_scores = logits_softmax[:, self.conf.pos_label].cpu().data.numpy()
if self.evaluator.has_auc_type_specific:
prediction_scores_all = logits_softmax.cpu().data.numpy()
else:
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=streaming_recoder.get('pred_scores'),
formatting=True)
logits = list(logits.values())[0]
if not isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence length, representation dim]' % (str(list(logits_softmax.shape)), )
prediction_scores = None
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=None, y_pred_scores_all=None, formatting=True)
logits = list(logits.values())[0]
logits_softmax = list(logits_softmax.values())[0]
assert len(logits_softmax.shape) == 2 and logits_softmax.shape[1] == 1, 'The dimension of your output is %s, but we need [batch_size*GPUs, 1]' % (str(list(logits_softmax.shape)))
prediction_scores = None
prediction_scores_all = None
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
result = self.evaluator.evaluate(streaming_recoder.get('answer_text'), streaming_recoder.get('prediction'),
y_pred_pos_score=None, y_pred_scores_all=None, formatting=True)
for single_value in logits_softmax.values():
assert len(single_value.shape) == 3, 'The dimension of your output is %s, but we need [batch_size*GPUs, sequence_len, 1]' % (str(list(single_value.shape)))
prediction_scores = None
prediction_scores_all = None
if torch.cuda.device_count() > 1:
logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
(epoch, i * torch.cuda.device_count(), lr_scheduler.get_lr(), np.sum(all_costs), result))
logits_flat = dict()
if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
# Transform output shapes for metric evaluation
# for seq_tag_f1 metric
if isinstance(get_layer_class(self.model, tmp_output_layer_id), CRF):
forward_score, scores, masks, tag_seq, transitions, layer_conf = logits
prediction_indices = tag_seq.cpu().numpy()
streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
prediction_scores, self.problem.decode(
target_batches[i][self.conf.answer_column_name[0]],
length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
else:
prediction_indices = logits_softmax.data.max(2)[1].cpu().numpy() # [batch_size, seq_len]
# pytorch's CrossEntropyLoss only support this
logits_flat[self.conf.output_layer_id[0]] = logits.view(-1, logits.size(2)) # [batch_size * seq_len, # of tags]
streaming_recoder.record_one_row([self.problem.decode(prediction_indices, length_batches[i]['target'][self.conf.answer_column_name[0]].numpy()),
prediction_scores, self.problem.decode(
target_batches[i][self.conf.answer_column_name[0]],
length_batches[i]['target'][self.conf.answer_column_name[0]].numpy())], keep_dim=False)
target_batches[i][self.conf.answer_column_name[0]] = target_batches[i][
self.conf.answer_column_name[0]].reshape(-1)
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
prediction_indices = logits_softmax.detach().max(1)[1].cpu().numpy()
# Should not decode!
streaming_recoder.record_one_row([prediction_indices, prediction_scores, prediction_scores_all, target_batches[i][self.conf.answer_column_name[0]].numpy()])
logits_flat[self.conf.output_layer_id[0]] = logits
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
temp_logits_flat = logits.squeeze(1)
prediction_scores = temp_logits_flat.detach().cpu().numpy()
streaming_recoder.record_one_row([prediction_scores, target_batches[i][self.conf.answer_column_name[0]].numpy()])
logits_flat[self.conf.output_layer_id[0]] = temp_logits_flat
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
for key, value in logits.items():
logits[key] = value.squeeze()
for key, value in logits_softmax.items():
logits_softmax[key] = value.squeeze()
passage_identify = None
for type_key in data_batches[i].keys():
if 'p' in type_key.lower():
passage_identify = type_key
break
if not passage_identify:
raise Exception('MRC task need passage information.')
prediction = self.problem.decode(logits_softmax, lengths=length_batches[i][passage_identify],
batch_data=data_batches[i][passage_identify])
logits_flat = logits
mrc_answer_target = None
for single_target in target_batches[i]:
if isinstance(target_batches[i][single_target][0], str):
mrc_answer_target = target_batches[i][single_target]
streaming_recoder.record_one_row([prediction, mrc_answer_target])
if self.use_gpu:
for single_target in self.conf.answer_column_name:
if isinstance(target_batches[i][single_target], torch.Tensor):
target_batches[i][single_target] = transfer_to_gpu(target_batches[i][single_target])
if isinstance(loss_fn.loss_fn[0], CRFLoss):
loss = loss_fn.loss_fn[0](forward_score, scores, masks, list(target_batches[i].values())[0], transitions, layer_conf)
else:
logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
(epoch, i, lr_scheduler.get_lr(), np.mean(all_costs), result))
loss = loss_fn(logits_flat, target_batches[i])
show_result_cnt = 0
# The loss and other metrics printed during a training epoch are just the result of part of the training data.
all_costs = []
streaming_recoder.clear_records()
all_costs.append(loss.item())
optimizer.zero_grad()
loss.backward()
if self.conf.clip_grad_norm_max_norm != -1:
torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.conf.clip_grad_norm_max_norm)
if isinstance(self.model, nn.DataParallel):
torch.nn.utils.clip_grad_norm_(self.model.module.layers['embedding'].get_parameters(), self.conf.clip_grad_norm_max_norm)
else:
torch.nn.utils.clip_grad_norm_(self.model.layers['embedding'].get_parameters(), self.conf.clip_grad_norm_max_norm)
optimizer.step()
if (i != 0 and i % valid_batch_num == 0) or i == len(target_batches) - 1:
torch.cuda.empty_cache() # actually useless
logging.info('Valid & Test : Epoch ' + str(epoch))
new_result = self.evaluate(valid_data, valid_length, valid_target,
self.conf.input_types, self.evaluator, loss_fn, pad_ids=None, cur_best_result=best_result,
model_save_path=self.conf.model_save_path, phase="valid", epoch=epoch)
renew_flag = best_result != new_result
best_result = new_result
del loss, logits, logits_softmax, logits_flat
del prediction_scores
if ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging \
or ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
del prediction_indices
if renew_flag and self.conf.test_data_path is not None:
self.evaluate(test_data, test_length, test_target,
self.conf.input_types, self.evaluator, loss_fn, pad_ids=None, phase="test", epoch=epoch)
self.model.train()
show_result_cnt += 1
if show_result_cnt == batch_num_to_show_results:
if ProblemTypes[self.problem.problem_type] == ProblemTypes.classification:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=streaming_recoder.get('pred_scores'),
y_pred_scores_all=streaming_recoder.get('pred_scores_all'), formatting=True)
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.sequence_tagging:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=streaming_recoder.get('pred_scores'),
formatting=True)
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.regression:
result = self.evaluator.evaluate(streaming_recoder.get('target'),
streaming_recoder.get('prediction'), y_pred_pos_score=None, y_pred_scores_all=None, formatting=True)
elif ProblemTypes[self.problem.problem_type] == ProblemTypes.mrc:
result = self.evaluator.evaluate(streaming_recoder.get('answer_text'), streaming_recoder.get('prediction'),
y_pred_pos_score=None, y_pred_scores_all=None, formatting=True)
del data_batches, length_batches, target_batches
if torch.cuda.device_count() > 1:
logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
(epoch, i * torch.cuda.device_count(), lr_scheduler.get_lr(), np.mean(all_costs), result))
else:
logging.info("Epoch %d batch idx: %d; lr: %f; since last log, loss=%f; %s" % \
(epoch, i, lr_scheduler.get_lr(), np.mean(all_costs), result))
show_result_cnt = 0
# The loss and other metrics printed during a training epoch are just the result of part of the training data.
all_costs = []
streaming_recoder.clear_records()
if (i != 0 and i % valid_batch_num == 0) or i == len(target_batches) - 1:
torch.cuda.empty_cache() # actually useless
logging.info('Valid & Test : Epoch ' + str(epoch))
new_result = self.evaluate(valid_data, valid_length, valid_target,
self.conf.input_types, self.evaluator, loss_fn, pad_ids=None, cur_best_result=best_result,
model_save_path=self.conf.model_save_path, phase="valid", epoch=epoch)
renew_flag = best_result != new_result
best_result = new_result
if renew_flag and self.conf.test_data_path is not None:
self.evaluate(test_data, test_length, test_target,
self.conf.input_types, self.evaluator, loss_fn, pad_ids=None, phase="test", epoch=epoch)
self.model.train()
show_result_cnt += 1
del data_batches, length_batches, target_batches
lr_scheduler.step()
epoch += 1
@@ -357,7 +345,7 @@ class LearningMachine(object):
test_data, test_length, test_target = self.problem.encode(test_data_path, self.conf.file_columns, self.conf.input_types,
self.conf.file_with_col_header, self.conf.object_inputs, self.conf.answer_column_name, max_lengths=self.conf.max_lengths,
min_sentence_len = self.conf.min_sentence_len, extra_feature = self.conf.extra_feature,fixed_lengths=self.conf.fixed_lengths, file_format='tsv',
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers)
show_progress=True if self.conf.mode == 'normal' else False, cpu_num_workers=self.conf.cpu_num_workers, chunk_size=self.conf.chunk_size)
else:
test_pkl_data = load_from_pkl(test_data_path)
test_data, test_length, test_target = test_pkl_data['data'], test_pkl_data['length'], test_pkl_data['target']
@@ -681,7 +669,7 @@ class LearningMachine(object):
self.conf.file_with_col_header,self.conf.object_inputs, None, min_sentence_len=self.conf.min_sentence_len,
extra_feature=self.conf.extra_feature,max_lengths=self.conf.max_lengths, fixed_lengths=self.conf.fixed_lengths,
file_format='tsv', show_progress=True if self.conf.mode == 'normal' else False,
cpu_num_workers=self.conf.cpu_num_workers)
cpu_num_workers=self.conf.cpu_num_workers, chunk_size=self.conf.chunk_size)
logging.info("Starting predict ...")
self.model.eval()
@@ -910,5 +898,19 @@ class LearningMachine(object):
logging.info("Model %s loaded!" % model_path)
logging.info("Total trainable parameters: %d" % (get_trainable_param_num(self.model)))
def _get_training_data_generator(self):
if not self.conf.use_cache:
return self.problem.get_encode_generator(self.conf, build_cache=False)
if not self.conf.encoding_file_index:
return self._get_save_encode_generator()
assert self.conf.load_encoding_cache_generator, 'function conf.load_encoding_cache_generator is not defined'
return self.conf.load_encoding_cache_generator(self.conf.encoding_cache_dir, self.conf.encoding_file_index)
def _get_save_encode_generator(self):
load_save_encode_generator = self.problem.get_encode_generator(self.conf, build_cache=True)
for data, lengths, target in load_save_encode_generator:
yield data, lengths, target
cache_index = load_from_json(self.conf.encoding_cache_index_file_path)
self.conf.encoding_file_index = cache_index[st.cencoding_key_index]
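Note: `conf.load_encoding_cache_generator` is only asserted above, not defined in this diff; `_load_cache_config_from_conf` in ModelConf.py below initializes it to `None`. A minimal sketch of a compatible loader, assuming the `[file_name, md5]` index entries and the `dump_to_pkl((data, lengths, target), file_path)` layout produced by `get_encode_generator` in problem.py:

```python
import os

from utils.common_utils import load_from_pkl, md5

def load_encoding_cache_generator(encoding_cache_dir, encoding_file_index):
    """Sketch: stream cached (data, lengths, target) parts, verifying each file's md5."""
    for file_name, file_md5 in encoding_file_index:
        file_path = os.path.join(encoding_cache_dir, file_name)
        # refuse to reuse a cache part that changed since the index was built
        assert md5([file_path]) == file_md5, 'encoding cache %s is corrupted' % file_name
        yield load_from_pkl(file_path)
```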


@@ -14,8 +14,8 @@ import shutil
from losses.BaseLossConf import BaseLossConf
#import traceback
from settings import LanguageTypes, ProblemTypes, TaggingSchemes, SupportedMetrics, PredictionTypes, DefaultPredictionFields
from utils.common_utils import log_set, prepare_dir
from settings import LanguageTypes, ProblemTypes, TaggingSchemes, SupportedMetrics, PredictionTypes, DefaultPredictionFields, ConstantStatic
from utils.common_utils import log_set, prepare_dir, md5
from utils.exceptions import ConfigurationError
import numpy as np
@@ -219,6 +219,10 @@ class ModelConf(object):
# vocabulary setting
self.max_vocabulary = self.get_item(['training_params', 'vocabulary', 'max_vocabulary'], default=800000, use_default=True)
self.min_word_frequency = self.get_item(['training_params', 'vocabulary', 'min_word_frequency'], default=3, use_default=True)
self.max_building_lines = self.get_item(['training_params', 'vocabulary', 'max_building_lines'], default=1000 * 1000, use_default=True)
# chunk_size
self.chunk_size = self.get_item(['training_params', 'chunk_size'], default=1000 * 1000, use_default=True)
# file column header setting
self.file_with_col_header = self.get_item(['inputs', 'file_with_col_header'], default=False, use_default=True)
@@ -280,6 +284,9 @@
tmp_problem_path = os.path.join(self.save_base_dir, '.necessary_cache', 'problem.pkl')
self.problem_path = tmp_problem_path if os.path.isfile(tmp_problem_path) else os.path.join(self.save_base_dir, 'necessary_cache', 'problem.pkl')
# cache configuration
self._load_cache_config_from_conf()
# training params
self.training_params = self.get_item(['training_params'])
@@ -303,7 +310,9 @@
self.max_epoch = self.params.max_epoch
else:
self.max_epoch = self.get_item(['training_params', 'max_epoch'], default=float('inf'))
self.valid_times_per_epoch = self.get_item(['training_params', 'valid_times_per_epoch'], default=1)
if 'valid_times_per_epoch' in self.conf['training_params']:
logging.info("configuration[training_params][valid_times_per_epoch] is deprecated, please use configuration[training_params][steps_per_validation] instead")
self.steps_per_validation = self.get_item(['training_params', 'steps_per_validation'], default=10)
self.batch_num_to_show_results = self.get_item(['training_params', 'batch_num_to_show_results'], default=10)
self.max_lengths = self.get_item(['training_params', 'max_lengths'], default=None, use_default=True)
self.fixed_lengths = self.get_item(['training_params', 'fixed_lengths'], default=None, use_default=True)
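The switch from `valid_times_per_epoch` to `steps_per_validation` matters for lazy loading: validation now fires every fixed number of batches rather than a number of times per epoch. In the training loop above, the interval is clamped per lazy-loaded part (`valid_batch_num = min(self.conf.steps_per_validation, whole_batch_num)`), so each part validates at least at its last batch. A toy illustration with made-up numbers:

```python
steps_per_validation = 10
whole_batch_num = 7  # batches in the current lazy-loaded part
valid_batch_num = min(steps_per_validation, whole_batch_num)

for i in range(whole_batch_num):
    # same trigger condition as in LearningMachine.train
    if (i != 0 and i % valid_batch_num == 0) or i == whole_batch_num - 1:
        print('validate at batch', i)  # prints: validate at batch 6
```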
@@ -529,3 +538,23 @@
shutil.copy(params.conf_path, self.save_base_dir)
logging.info('Configuration file is backed up to %s' % (self.save_base_dir))
def _load_cache_config_from_conf(self):
# training data
self.train_data_md5 = None
if self.phase == 'train' and self.train_data_path:
logging.info("Calculating the md5 of traing data ...")
self.train_data_md5 = md5([self.train_data_path])
logging.info("the md5 of traing data is %s"%(self.train_data_md5))
# problem
self.problem_md5 = None
# encoding
self.encoding_cache_dir = None
self.encoding_cache_index_file_path = None
self.encoding_cache_index_file_md5_path = None
self.encoding_file_index = None
self.encoding_cache_legal_line_cnt = 0
self.encoding_cache_illegal_line_cnt = 0
self.load_encoding_cache_generator = None
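The `md5` helper comes from the "add md5 function in common_utils.py" commit above, but utils/common_utils.py is not among the files shown here. Judging from its call sites, which pass a list of file paths and use the single returned hex digest, a plausible sketch:

```python
import hashlib

def md5(file_path_list, chunk_size=1024 * 1024):
    # Assumed implementation: hash the concatenated contents of the given files,
    # reading in chunks so large training files do not have to fit in memory.
    hasher = hashlib.md5()
    for file_path in file_path_list:
        with open(file_path, 'rb') as fin:
            for chunk in iter(lambda: fin.read(chunk_size), b''):
                hasher.update(chunk)
    return hasher.hexdigest()
```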


@@ -147,10 +147,12 @@ The architecture of the configuration file is:
CUDA_VISIBLE_DEVICES= python train.py
```
- ***cpu_num_workers***. [default: -1] Define the number of processes used to preprocess the dataset. If the value is negative or 0, the number of processes equals the number of logical CPU cores; otherwise it equals *cpu_num_workers*.
- ***chunk_size***. [default: 1000000] Define how many lines NB reads from a file at a time; this enables lazy loading and avoids running out of memory (see the configuration sketch after this list).
- ***batch_size***. Define the batch size here. If there are multiple GPUs, *batch_size* is the batch size of each GPU.
- ***batch_num_to_show_results***. [necessary for training] During the training process, show the results every batch_num_to_show_results batches.
- ***max_epoch***. [necessary for training] The maximum number of epochs to train.
- ***valid_times_per_epoch***. [optional for training, default: 1] Define how many times to conduct validation per epoch. Usually, we conduct validation after each epoch, but for a very large corpus, we'd better validate multiple times in case to miss the best state of our model. The default value is 1.
- ~~***valid_times_per_epoch***~~. [**deprecated**] Please use steps_per_validation instead.
- ***steps_per_validation***. [default: 10] Define how many training steps pass between two validations.
- ***tokenizer***. [optional] Define tokenizer here. Currently, we support 'nltk' and 'jieba'. By default, 'nltk' for English and 'jieba' for Chinese.
- **architecture**. Define the model architecture. The node is a list of layers (blocks) in block_zoo to represent a model. The supported layers of this toolkit are given in [block_zoo overview](https://microsoft.github.io/NeuronBlocks).
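For orientation, a minimal `training_params` fragment showing where the new keys sit (key names appear in this commit's ModelConf.py and conf.json changes; the values shown are the documented defaults):

```json
{
  "training_params": {
    "vocabulary": {
      "max_vocabulary": 800000,
      "max_building_lines": 1000000
    },
    "chunk_size": 1000000,
    "cpu_num_workers": -1,
    "steps_per_validation": 10,
    "batch_num_to_show_results": 10
  }
}
```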
@@ -729,5 +731,7 @@ To solve the above problems, NeuronBlocks supports *fixing embedding weight* (em
***training_params/vocabulary/max_vocabulary***. [int, optional for training, default: 800,000] The max size of corpus vocabulary. If corpus vocabulary size is larger than *max_vocabulary*, it will be cut according to word frequency.
***training_params/vocabulary/max_building_lines***. [int, optional for training, default: 1,000,000] The maximum number of lines NB reads from each file when building the vocabulary.
## <span id="faq">Frequently Asked Questions</span>


@@ -137,10 +137,12 @@ python predict.py --conf_path=model_zoo/demo/conf.json
CUDA_VISIBLE_DEVICES= python train.py
```
- ***cpu_num_workers***. [default: -1] Define the number of processes used to preprocess the dataset. If the value is negative or 0, the number of processes equals the number of logical CPU cores; otherwise it equals *cpu_num_workers*.
- ***chunk_size***. [default: 1000000] Define how many lines NB reads from a file at a time; this enables lazy loading and avoids running out of memory.
- ***batch_size***. Define the batch size here. If there are multiple GPUs, *batch_size* is the batch size of each GPU.
- ***batch_num_to_show_results***. [necessary for training] During the training process, show the results every batch_num_to_show_results batches.
- ***max_epoch***. [necessary for training] The maximum number of epochs to train.
- ***valid_times_per_epoch***. [optional for training, default: 1] Define how many times to conduct validation per epoch. Usually, we conduct validation after each epoch, but for a very large corpus, we'd better validate multiple times in case to miss the best state of our model. The default value is 1.
- ~~***valid_times_per_epoch***~~. [**deprecated**] Please use steps_per_validation instead.
- ***steps_per_validation***. [default: 10] Define how many training steps pass between two validations.
- ***tokenizer***. [optional] Define tokenizer here. Currently, we support 'nltk' and 'jieba'. By default, 'nltk' for English and 'jieba' for Chinese.
- **architecture**. Define the model architecture. The node is a list of layers (blocks) in block_zoo to represent a model. The supported layers of this toolkit are given in [block_zoo overview](https://microsoft.github.io/NeuronBlocks).
@@ -719,4 +721,6 @@ To solve the above problems, NeuronBlocks supports *fixing embedding weight* (em
***training_params/vocabulary/max_vocabulary***. [int, optional for training, default: 800,000] The max size of corpus vocabulary. If corpus vocabulary size is larger than *max_vocabulary*, it will be cut according to word frequency.
***training_params/vocabulary/max_building_lines***. [int, optional for training, default: 1,000,000] The maximum number of lines NB reads from each file when building the vocabulary.
## <span id="faq">常见问题与答案</span>


@@ -66,7 +66,10 @@ class EmbeddingConf(BaseConf):
for emb_type in self.conf:
if emb_type == 'position':
continue
self.output_dim[2] += self.conf[emb_type]['dim']
if isinstance(self.conf[emb_type]['dim'], list):
self.output_dim[2] += sum(self.conf[emb_type]['dim'])
else:
self.output_dim[2] += self.conf[emb_type]['dim']
super(EmbeddingConf, self).inference()
@@ -113,6 +116,7 @@ class Embedding(BaseLayer):
self.layer_conf = layer_conf
self.embeddings = nn.ModuleDict() if layer_conf.weight_on_gpu else dict()
self.char_embeddings = nn.ModuleDict()
for input_cluster in layer_conf.conf:
if 'type' in layer_conf.conf[input_cluster]:
# char embedding
@@ -122,7 +126,7 @@
char_emb_conf = eval(layer_conf.conf[input_cluster]['type'] + "Conf")(** char_emb_conf_dict)
char_emb_conf.inference()
char_emb_conf.verify()
self.embeddings[input_cluster] = eval(layer_conf.conf[input_cluster]['type'])(char_emb_conf)
self.char_embeddings[input_cluster] = eval(layer_conf.conf[input_cluster]['type'])(char_emb_conf)
else:
# word embedding, postag embedding, and so on
self.embeddings[input_cluster] = nn.Embedding(layer_conf.conf[input_cluster]['vocab_size'], layer_conf.conf[input_cluster]['dim'], padding_idx=0)
@@ -155,14 +159,13 @@
if 'extra' in input_cluster:
continue
input = inputs[input_cluster]
# if 'type' in self.layer_conf.conf[input_cluster]:
# emb = self.embeddings[input_cluster](input, lengths[input]).float()
# else:
# emb = self.embeddings[input_cluster](input).float()
if list(self.embeddings[input_cluster].parameters())[0].device.type == 'cpu':
emb = self.embeddings[input_cluster](input.cpu()).float()
if input_cluster == 'char':
emb = self.char_embeddings[input_cluster](input).float()
else:
emb = self.embeddings[input_cluster](input).float()
if list(self.embeddings[input_cluster].parameters())[0].device.type == 'cpu':
emb = self.embeddings[input_cluster](input.cpu()).float()
else:
emb = self.embeddings[input_cluster](input).float()
if use_gpu is True:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
emb = emb.to(device)


@@ -29,7 +29,7 @@ class Pooling2DConf(BaseConf):
self.pool_type = 'max' # Supported: ['max', 'mean']
self.stride = 1
self.padding = 0
self.window_size = 3
# self.window_size = [self.input_dims[0][1], self.input_dims[0][2]]
@DocInherit
def declare(self):
@@ -38,7 +38,7 @@
def check_size(self, value, attr):
res = value
if isinstance(value,int):
if isinstance(value, int):
res = [value, value]
elif (isinstance(self.window_size, tuple) or isinstance(self.window_size, list)) and len(value)==2:
res = list(value)
@@ -48,6 +48,9 @@
@DocInherit
def inference(self):
if not hasattr(self, "window_size"):
self.window_size = [self.input_dims[0][1], self.input_dims[0][2]]
self.window_size = self.check_size(self.window_size, "window_size")
self.stride = self.check_size(self.stride, "stride")


@@ -28,11 +28,11 @@ class CNNCharEmbeddingConf(BaseConf):
@DocInherit
def default(self):
self.dim = 30 # cnn's output channel dim
self.dim = [30] # cnn's output channel dim
self.embedding_matrix_dim = 30 #
self.stride = 1
self.stride = [1]
self.padding = 0
self.window_size = 3
self.window_size = [3]
self.activation = 'ReLU'
@DocInherit
@@ -41,8 +41,14 @@
self.num_of_inputs = 1
self.input_ranks = [3]
def change_to_list(self, attribute):
for single in attribute:
if not isinstance(getattr(self, single), list):
setattr(self, single, [getattr(self, single)])
@DocInherit
def inference(self):
self.change_to_list(['dim', 'stride', 'window_size'])
self.output_channel_num = self.dim
self.output_rank = 3
@@ -65,20 +71,24 @@
super(CNNCharEmbedding, self).__init__(layer_conf)
self.layer_conf = layer_conf
assert len(layer_conf.dim) == len(layer_conf.window_size) == len(layer_conf.stride), "The attribute dim/window_size/stride must have the same length."
self.char_embeddings = nn.Embedding(layer_conf.vocab_size, layer_conf.embedding_matrix_dim, padding_idx=self.layer_conf.padding)
nn.init.uniform_(self.char_embeddings.weight, -0.001, 0.001)
self.char_cnn = nn.Conv2d(1, layer_conf.output_channel_num, (layer_conf.window_size, layer_conf.embedding_matrix_dim),
stride=self.layer_conf.stride, padding=self.layer_conf.padding)
self.char_cnn = nn.ModuleList()
for i in range(len(layer_conf.output_channel_num)):
self.char_cnn.append(nn.Conv2d(1, layer_conf.output_channel_num[i], (layer_conf.window_size[i], layer_conf.embedding_matrix_dim),
stride=self.layer_conf.stride[i], padding=self.layer_conf.padding))
if layer_conf.activation:
self.activation = eval("nn." + self.layer_conf.activation)()
else:
self.activation = None
if self.is_cuda():
self.char_embeddings = self.char_embeddings.cuda()
self.char_cnn = self.char_cnn.cuda()
if self.activation and hasattr(self.activation, 'weight'):
self.activation.weight = torch.nn.Parameter(self.activation.weight.cuda())
# if self.is_cuda():
# self.char_embeddings = self.char_embeddings.cuda()
# self.char_cnn = self.char_cnn.cuda()
# if self.activation and hasattr(self.activation, 'weight'):
# self.activation.weight = torch.nn.Parameter(self.activation.weight.cuda())
def forward(self, string):
"""
@@ -102,14 +112,24 @@
char_embs_lookup = char_embs_lookup.view(-1, string.size()[2], self.layer_conf.embedding_matrix_dim) #[batch_size * seq_len, char num in words, embedding_dim]
string_input = torch.unsqueeze(char_embs_lookup, 1) # [batch_size * seq_len, input_channel_num=1, char num in words, embedding_dim]
string_conv = self.char_cnn(string_input).squeeze()
if self.activation:
string_conv = self.activation(string_conv)
string_maxpooling = F.max_pool1d(string_conv, string_conv.size(2)).squeeze()
string_out = string_maxpooling.view(string.size()[0], -1, self.layer_conf.output_channel_num)
outputs = []
for index, single_cnn in enumerate(self.char_cnn):
string_conv = single_cnn(string_input).squeeze(3)
if self.activation:
string_conv = self.activation(string_conv)
return string_out
string_maxpooling = F.max_pool1d(string_conv, string_conv.size(2)).squeeze()
string_out = string_maxpooling.view(string.size()[0], -1, self.layer_conf.output_channel_num[index])
outputs.append(string_out)
if len(outputs) > 1:
string_output = torch.cat(outputs, 2)
else:
string_output = outputs[0]
return string_output
if __name__ == '__main__':
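Since the forward pass now concatenates the pooled outputs along the channel axis (`torch.cat(outputs, 2)`), the layer contributes `sum(dim)` output channels, which is what the `sum(self.conf[emb_type]['dim'])` change in EmbeddingConf.inference above accounts for. A quick sanity check with the values from model_zoo/advanced/conf.json later in this commit:

```python
dim = [30, 20, 100]      # output_channel_num of each CNN kernel
window_size = [3, 3, 5]
stride = [1, 2, 3]

# the constructor asserts these three lists have the same length
assert len(dim) == len(window_size) == len(stride)
print(sum(dim))  # 150 channels after concatenation
```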


@@ -49,7 +49,8 @@
"training_params": {
"vocabulary": {
"min_word_frequency": 1,
"max_vocabulary": 100000
"max_vocabulary": 100000,
"max_building_lines": 1000000
},
"optimizer": {
"name": "Adam",
@@ -57,6 +58,7 @@
"lr": 0.001
}
},
"chunk_size": 1000000,
"lr_decay": 0.95,
"minimum_lr": 0.0001,
"epoch_start_lr_decay": 1,
@@ -65,7 +67,7 @@
"batch_size": 30,
"batch_num_to_show_results": 10,
"max_epoch": 3,
"valid_times_per_epoch": 1,
"steps_per_validation": 10,
"text_preprocessing": ["DBC2SBC"],
"max_lengths":{
"question": 30,
@@ -90,10 +92,10 @@
"cols": ["question_char", "answer_char"],
"type": "CNNCharEmbedding",
"dropout": 0.2,
"dim": 30,
"embedding_matrix_dim": 8,
"stride":1,
"window_size": 5,
"dim": [30, 20, 100],
"embedding_matrix_dim": 50,
"stride":[1, 2, 3],
"window_size": [3,3,5],
"activation": "ReLU"
}
}


@@ -53,7 +53,7 @@
"batch_size": 256,
"batch_num_to_show_results": 10,
"max_epoch": 30,
"valid_times_per_epoch": 10,
"steps_per_validation": 10,
"fixed_lengths":{
"query": 30
}


@@ -12,9 +12,9 @@ nltk.download('stopwords', quiet=True)
from utils.BPEEncoder import BPEEncoder
import os
import pickle as pkl
from utils.common_utils import load_from_pkl, dump_to_pkl
from utils.common_utils import load_from_pkl, dump_to_pkl, load_from_json, dump_to_json, prepare_dir, md5
from settings import ProblemTypes
from settings import ProblemTypes, Setting as st
import math
from utils.ProcessorsScheduler import ProcessorsScheduler
@@ -112,24 +112,21 @@ class Problem():
else:
return None
def get_data_generator_from_file(self, data_path_list, file_with_col_header, chunk_size=1000000):
# NOTE: file_path is a list type
for single_path in data_path_list:
data_list = list()
if single_path is not None:
with open(single_path, "r", encoding='utf-8') as f:
if file_with_col_header:
f.readline()
for index, line in enumerate(f):
line = line.rstrip()
if not line:
break
data_list.append(line)
if (index + 1) % chunk_size == 0:
yield data_list
data_list = list()
if len(data_list) > 0:
yield data_list
def get_data_generator_from_file(self, data_path, file_with_col_header, chunk_size=1000000):
data_list = list()
with open(data_path, "r", encoding='utf-8') as f:
if file_with_col_header:
f.readline()
for index, line in enumerate(f):
line = line.rstrip()
if not line:
break
data_list.append(line)
if (index + 1) % chunk_size == 0:
yield data_list
data_list = list()
if len(data_list) > 0:
yield data_list
def build_training_data_list(self, training_data_list, file_columns, input_types, answer_column_name, bpe_encoder=None):
docs = dict() # docs of each type of input
@@ -226,11 +223,11 @@
def build(self, data_path_list, file_columns, input_types, file_with_col_header, answer_column_name, word2vec_path=None, word_emb_dim=None,
format=None, file_type=None, involve_all_words=None, file_format="tsv", show_progress=True,
cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3):
cpu_num_workers=-1, max_vocabulary=800000, word_frequency=3, max_building_lines=1000*1000):
"""
Args:
training_data_path:
data_path_list:
file_columns: {
"word1": 0,
"word2": 1,
@@ -268,39 +265,29 @@
"""
# parameter check
if not word2vec_path:
word_emb_dim, format, file_type, involve_all_words = None, None, None, None
if 'bpe' in input_types:
try:
bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
except KeyError:
raise Exception('Please define a bpe path at the embedding layer.')
else:
bpe_encoder = None
bpe_encoder = self._check_bpe_encoder(input_types)
self.file_column_num = len(file_columns)
progress = self.get_data_generator_from_file(data_path_list, file_with_col_header)
preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
# update symbol universe
total_cnt_legal, total_cnt_illegal = 0, 0
for docs, target_docs, cnt_legal, cnt_illegal in tqdm(preprocessed_data_generator):
total_cnt_legal += cnt_legal
total_cnt_illegal += cnt_illegal
# input_type
for input_type in input_types:
self.input_dicts[input_type].update(docs[input_type])
# problem_type
if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
self.output_dict.update(list(target_docs.values())[0])
elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
ProblemTypes[self.problem_type] == ProblemTypes.mrc:
pass
logging.info("Corpus imported: %d legal lines, %d illegal lines." % (total_cnt_legal, total_cnt_illegal))
for data_path in data_path_list:
if data_path:
progress = self.get_data_generator_from_file(data_path, file_with_col_header, chunk_size=max_building_lines)
preprocessed_data_generator = self.build_training_multi_processor(progress, cpu_num_workers, file_columns, input_types, answer_column_name, bpe_encoder=bpe_encoder)
# update symbol universe
docs, target_docs, cnt_legal, cnt_illegal = next(preprocessed_data_generator)
# input_type
for input_type in input_types:
self.input_dicts[input_type].update(docs[input_type])
# problem_type
if ProblemTypes[self.problem_type] == ProblemTypes.classification or \
ProblemTypes[self.problem_type] == ProblemTypes.sequence_tagging:
self.output_dict.update(list(target_docs.values())[0])
elif ProblemTypes[self.problem_type] == ProblemTypes.regression or \
ProblemTypes[self.problem_type] == ProblemTypes.mrc:
pass
logging.info("[Building Dictionary] in %s at most %d lines imported: %d legal lines, %d illegal lines." % (data_path, max_building_lines, cnt_legal, cnt_illegal))
# build dictionary
for input_type in input_types:
@@ -404,8 +391,6 @@
def encode_data_multi_processor(self, data_generator, cpu_num_workers, file_columns, input_types, object_inputs,
answer_column_name, min_sentence_len, extra_feature, max_lengths=None, fixed_lengths=None, file_format="tsv", bpe_encoder=None):
for data in data_generator:
scheduler = ProcessorsScheduler(cpu_num_workers)
func_args = (data, file_columns, input_types, object_inputs,
@@ -445,6 +430,9 @@
type_branches = dict() # branch of input type, e.g. type_branches['query_index'] = 'query'
# for char: don't split these word
word_no_split = ['<start>', '<pad>', '<eos>', '<unk>']
for branch in object_inputs:
data[branch] = dict()
lengths[branch] = dict()
@@ -594,8 +582,18 @@
temp_word_char = []
temp_word_length = []
for single_token in tokens:
temp_word_char.append(self.input_dicts[type2cluster[single_input_type]].lookup(single_token))
temp_word_length.append(len(single_token))
if single_token in word_no_split:
# temp_word_length.append(1)
temp_id = [self.input_dicts[type2cluster[single_input_type]].id(single_token)]
else:
temp_id = self.input_dicts[type2cluster[single_input_type]].lookup(single_token)
if fixed_lengths and 'word' in fixed_lengths:
if len(temp_id) >= fixed_lengths['word']:
temp_id = temp_id[:fixed_lengths['word']]
else:
temp_id = temp_id + [self.input_dicts[type2cluster[single_input_type]].id('<pad>')] * (fixed_lengths['word'] - len(temp_id))
temp_word_char.append(temp_id)
temp_word_length.append(len(temp_id))
data[branch][single_input_type].append(temp_word_char)
lengths[branch]['word_length'].append(temp_word_length)
else:
@@ -675,7 +673,7 @@
def encode(self, data_path, file_columns, input_types, file_with_col_header, object_inputs, answer_column_name,
min_sentence_len, extra_feature, max_lengths=None, fixed_lengths=None, file_format="tsv", show_progress=True,
cpu_num_workers = -1):
cpu_num_workers=-1, chunk_size=1000*1000):
"""
Args:
@@ -751,22 +749,16 @@
target: [...]
"""
if 'bpe' in input_types:
try:
bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
except KeyError:
raise Exception('Please define a bpe path at the embedding layer.')
else:
bpe_encoder = None
bpe_encoder = self._check_bpe_encoder(input_types)
progress = self.get_data_generator_from_file([data_path], file_with_col_header)
encoder_generator = self.encode_data_multi_processor(progress, cpu_num_workers,
progress = self.get_data_generator_from_file(data_path, file_with_col_header, chunk_size=chunk_size)
encode_generator = self.encode_data_multi_processor(progress, cpu_num_workers,
file_columns, input_types, object_inputs, answer_column_name, min_sentence_len, extra_feature, max_lengths,
fixed_lengths, file_format, bpe_encoder=bpe_encoder)
data, lengths, target = dict(), dict(), dict()
cnt_legal, cnt_illegal = 0, 0
for temp_data, temp_lengths, temp_target, temp_cnt_legal, temp_cnt_illegal in tqdm(encoder_generator):
for temp_data, temp_lengths, temp_target, temp_cnt_legal, temp_cnt_illegal in tqdm(encode_generator):
data = self._merge_encode_data(data, temp_data)
lengths = self._merge_encode_lengths(lengths, temp_lengths)
target = self._merge_target(target, temp_target)
@@ -776,6 +768,59 @@
logging.info("%s: %d legal samples, %d illegal samples" % (data_path, cnt_legal, cnt_illegal))
return data, lengths, target
def build_encode_cache(self, conf, file_format="tsv"):
logging.info("[Cache] building encoding cache")
build_encode_cache_generator = self.get_encode_generator(conf, build_cache=True, file_format=file_format)
for _ in build_encode_cache_generator:
continue
logging.info("[Cache] encoding is saved to %s" % conf.encoding_cache_dir)
def get_encode_generator(self, conf, build_cache=True, file_format="tsv"):
# parameter check
if build_cache:
assert conf.encoding_cache_dir, 'There is no property encoding_cache_dir in object conf'
assert conf.encoding_cache_index_file_path, 'There is no property encoding_cache_index_file_path in object conf'
assert conf.encoding_cache_index_file_md5_path, 'There is no property encoding_cache_index_file_md5_path in object conf'
bpe_encoder = self._check_bpe_encoder(conf.input_types)
data_generator = self.get_data_generator_from_file(conf.train_data_path, conf.file_with_col_header, chunk_size=conf.chunk_size)
encode_generator = self.encode_data_multi_processor(data_generator, conf.cpu_num_workers,
conf.file_columns, conf.input_types, conf.object_inputs, conf.answer_column_name,
conf.min_sentence_len, conf.extra_feature, conf.max_lengths,
conf.fixed_lengths, file_format, bpe_encoder=bpe_encoder)
file_index = []
total_cnt_legal, total_cnt_illegal = 0, 0
for part_number, encode_data in enumerate(encode_generator):
data, lengths, target, cnt_legal, cnt_illegal = encode_data
if build_cache:
total_cnt_legal = total_cnt_legal + cnt_legal
total_cnt_illegal = total_cnt_illegal + cnt_illegal
file_name = st.cencoding_file_name_pattern % (part_number)
file_path = os.path.join(conf.encoding_cache_dir, file_name)
dump_to_pkl((data, lengths, target), file_path)
file_index.append([file_name, md5([file_path])])
logging.info("Up to now, in %s: %d legal samples, %d illegal samples" % (conf.train_data_path, total_cnt_legal, total_cnt_illegal))
yield data, lengths, target
if build_cache:
cache_index = dict()
cache_index[st.cencoding_key_index] = file_index
cache_index[st.cencoding_key_legal_cnt] = total_cnt_legal
cache_index[st.cencoding_key_illegal_cnt] = total_cnt_illegal
dump_to_json(cache_index, conf.encoding_cache_index_file_path)
dump_to_json(md5([conf.encoding_cache_index_file_path]), conf.encoding_cache_index_file_md5_path)
@staticmethod
def _check_bpe_encoder(input_types):
bpe_encoder = None
if 'bpe' in input_types:
try:
bpe_encoder = BPEEncoder(input_types['bpe']['bpe_path'])
except KeyError:
raise Exception('Please define a bpe path at the embedding layer.')
return bpe_encoder
def decode(self, model_output, lengths=None, batch_data=None):
""" decode the model output, either a batch of output or a single output


@@ -53,3 +53,27 @@ DefaultPredictionFields = {
# nltk's models
nltk.data.path.append(os.path.join(os.getcwd(), 'dataset', 'nltk_data'))
class Constant(type):
    def __setattr__(self, name, value):
        raise AttributeError("Class %s cannot be modified" % self.__name__)

class ConstantStatic(metaclass=Constant):
    def __init__(self, *args, **kwargs):
        raise Exception("Class %s cannot be instantiated" % self.__class__.__name__)
class Setting(ConstantStatic):
# cache
## cencoding (cache_encoding)
cencoding_index_file_name = 'index.json'
cencoding_index_md5_file_name = 'index_md5.json'
cencoding_file_name_pattern = 'encoding_cache_%s.pkl'
cencoding_key_finish = 'finish'
cencoding_key_index = 'index'
cencoding_key_legal_cnt = 'legal_line_cnt'
cencoding_key_illegal_cnt = 'illegal_line_cnt'
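The two helper classes make every attribute of Setting effectively read-only. A quick illustration of the intended behaviour (assumed usage, not in the commit):

    from settings import Setting as st

    print(st.cencoding_file_name_pattern % 0)   # -> encoding_cache_0.pkl

    # Both of the following raise, because the Constant metaclass intercepts
    # attribute assignment and ConstantStatic refuses instantiation:
    #   st.cencoding_key_index = 'other'        # AttributeError
    #   st()                                    # Exception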

train.py

@@ -1,7 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.
from settings import ProblemTypes, version, Setting as st
import os
import argparse
@@ -15,7 +15,7 @@ import torch
import torch.nn as nn
from ModelConf import ModelConf
from problem import Problem
from utils.common_utils import dump_to_pkl, load_from_pkl, load_from_json, dump_to_json, prepare_dir, md5
from utils.philly_utils import HDFSDirectTransferer
from losses import *
from optimizers import *
@@ -33,33 +33,76 @@ class Cache:
        self.dictionary_invalid = True
        self.embedding_invalid = True

        if not conf.pretrained_model_path:
            # cache_conf
            cache_conf = None
            cache_conf_path = os.path.join(conf.cache_dir, 'conf_cache.json')
            if os.path.isfile(cache_conf_path):
                params_cache = copy.deepcopy(params)
                try:
                    cache_conf = ModelConf('cache', cache_conf_path, version, params_cache)
                except Exception as e:
                    cache_conf = None
            if cache_conf is None or not self._verify_conf(cache_conf, conf):
                return False

            # problem
            if not os.path.isfile(conf.problem_path):
                return False

            # embedding
            if conf.emb_pkl_path:
                if not os.path.isfile(conf.emb_pkl_path):
                    return False
                self.embedding_invalid = False

        self.dictionary_invalid = False
        logging.info('[Cache] dictionary found')
        return True
def _check_encoding(self, conf):
self.encoding_invalid = True
if not conf.pretrained_model_path and self.dictionary_invalid:
return False
# Calculate the MD5 of problem
problem_path = conf.problem_path if not conf.pretrained_model_path else conf.saved_problem_path
try:
conf.problem_md5 = md5([problem_path])
except Exception as e:
conf.problem_md5 = None
logging.info('Cannot calculate md5 of problem.pkl from %s' % (problem_path))
return False
# check the validity of the encoding cache
## encoding cache dir
conf.encoding_cache_dir = os.path.join(conf.cache_dir, conf.train_data_md5 + conf.problem_md5)
logging.debug('[Cache] conf.encoding_cache_dir %s' % (conf.encoding_cache_dir))
if not os.path.exists(conf.encoding_cache_dir):
return False
## encoding cache index
conf.encoding_cache_index_file_path = os.path.join(conf.encoding_cache_dir, st.cencoding_index_file_name)
conf.encoding_cache_index_file_md5_path = os.path.join(conf.encoding_cache_dir, st.cencoding_index_md5_file_name)
if not os.path.exists(conf.encoding_cache_index_file_path) or not os.path.exists(conf.encoding_cache_index_file_md5_path):
return False
if md5([conf.encoding_cache_index_file_path]) != load_from_json(conf.encoding_cache_index_file_md5_path):
return False
cache_index = load_from_json(conf.encoding_cache_index_file_path)
## encoding cache content
for index in cache_index[st.cencoding_key_index]:
file_name, file_md5 = index[0], index[1]
if file_md5 != md5([os.path.join(conf.encoding_cache_dir, file_name)]):
return False
if (st.cencoding_key_legal_cnt in cache_index) and (st.cencoding_key_illegal_cnt in cache_index):
conf.encoding_cache_legal_line_cnt = cache_index[st.cencoding_key_legal_cnt]
conf.encoding_cache_illegal_line_cnt = cache_index[st.cencoding_key_illegal_cnt]
self.encoding_invalid = False
logging.info('[Cache] encoding found')
logging.info('%s: %d legal samples, %d illegal samples' % (conf.train_data_path, conf.encoding_cache_legal_line_cnt, conf.encoding_cache_illegal_line_cnt))
return True
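The check is a two-level md5 chain: index_md5.json guards index.json, and index.json guards every chunk file. A stand-alone sketch of the same logic, assuming the file names from Setting and the helpers from utils.common_utils:

    import os
    from utils.common_utils import md5, load_from_json

    def encoding_cache_is_valid(cache_dir):
        # Illustrative restatement of _check_encoding's cache validation.
        index_path = os.path.join(cache_dir, 'index.json')
        index_md5_path = os.path.join(cache_dir, 'index_md5.json')
        if not (os.path.exists(index_path) and os.path.exists(index_md5_path)):
            return False
        if md5([index_path]) != load_from_json(index_md5_path):
            return False
        return all(md5([os.path.join(cache_dir, name)]) == file_md5
                   for name, file_md5 in load_from_json(index_path)['index'])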
def check(self, conf, params):
@@ -69,7 +112,7 @@ class Cache:
return
# encoding
if not self._check_encoding(conf):
self._renew_cache(params, conf.encoding_cache_dir)
def load(self, conf, problem, emb_matrix):
# load dictionary when (not finetune) and (cache valid)
@@ -80,13 +123,17 @@ class Cache:
logging.info('[Cache] loading dictionary successfully')
if not self.encoding_invalid:
self._prepare_encoding_cache(conf, problem, build=False)
logging.info('[Cache] preparing encoding successfully')
return problem, emb_matrix
def save(self, conf, params, problem, emb_matrix):
# make cache dir
if not os.path.exists(conf.cache_dir):
os.makedirs(conf.cache_dir)
shutil.copy(params.conf_path, os.path.join(conf.cache_dir, 'conf_cache.json'))
# dictionary
if self.dictionary_invalid:
if conf.mode == 'philly' and conf.emb_pkl_path.startswith('/hdfs/'):
with HDFSDirectTransferer(conf.problem_path, with_hdfs_command=True) as transferer:
@@ -100,10 +147,11 @@ class Cache:
transferer.pkl_dump(emb_matrix)
else:
dump_to_pkl(emb_matrix, conf.emb_pkl_path)
logging.info("Embedding matrix saved to %s" % conf.emb_pkl_path)
logging.info("[Cache] Embedding matrix saved to %s" % conf.emb_pkl_path)
# encoding
if self.encoding_invalid:
self._prepare_encoding_cache(conf, problem, build=params.make_cache_only)
def back_up(self, conf, problem):
cache_bakup_path = os.path.join(conf.save_base_dir, 'necessary_cache/')
@@ -149,6 +197,34 @@ class Cache:
flag = False
return flag
def _prepare_encoding_cache(self, conf, problem, build=False):
# encoding cache dir
problem_path = conf.problem_path if not conf.pretrained_model_path else conf.saved_problem_path
conf.problem_md5 = md5([problem_path])
conf.encoding_cache_dir = os.path.join(conf.cache_dir, conf.train_data_md5 + conf.problem_md5)
if not os.path.exists(conf.encoding_cache_dir):
os.makedirs(conf.encoding_cache_dir)
# encoding cache files
conf.encoding_cache_index_file_path = os.path.join(conf.encoding_cache_dir, st.cencoding_index_file_name)
conf.encoding_cache_index_file_md5_path = os.path.join(conf.encoding_cache_dir, st.cencoding_index_md5_file_name)
conf.load_encoding_cache_generator = self._load_encoding_cache_generator
if build:
prepare_dir(conf.encoding_cache_dir, True, allow_overwrite=True, clear_dir_if_exist=True)
problem.build_encode_cache(conf)
self.encoding_invalid = False
if not self.encoding_invalid:
cache_index = load_from_json(conf.encoding_cache_index_file_path)
conf.encoding_file_index = cache_index[st.cencoding_key_index]
@staticmethod
def _load_encoding_cache_generator(cache_dir, file_index):
for index in file_index:
file_path = os.path.join(cache_dir, index[0])
yield load_from_pkl(file_path)
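_prepare_encoding_cache stores this generator on the conf object so the training phase can stream chunks instead of holding the whole encoded corpus in memory. A sketch of the assumed consumption pattern (train_on_chunk is a hypothetical stand-in for the real batch loop in train.py):

    def run_lazy_training(conf, train_on_chunk):
        # Assumed wiring: conf.load_encoding_cache_generator and
        # conf.encoding_file_index are set by _prepare_encoding_cache above;
        # train_on_chunk is a caller-supplied function that batches and
        # trains on one decoded (data, lengths, target) chunk.
        chunks = conf.load_encoding_cache_generator(conf.encoding_cache_dir,
                                                    conf.encoding_file_index)
        for data, lengths, target in chunks:
            train_on_chunk(data, lengths, target)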
def main(params):
# init
conf = ModelConf("train", params.conf_path, version, params, mode=params.mode)
@@ -181,11 +257,7 @@ def main(params):
word_emb_dim=conf.pretrained_emb_dim, format=conf.pretrained_emb_type,
file_type=conf.pretrained_emb_binary_or_text, involve_all_words=conf.involve_all_words_in_pretrained_emb,
show_progress=True if params.mode == 'normal' else False, cpu_num_workers = conf.cpu_num_workers,
max_vocabulary=conf.max_vocabulary, word_frequency=conf.min_word_frequency, max_building_lines=conf.max_building_lines)
# environment preparing
## cache save

utils/common_utils.py

@@ -3,6 +3,7 @@
import logging
import pickle as pkl
import json
import torch
import torch.nn as nn
import os
@@ -49,6 +50,17 @@ def dump_to_pkl(obj, pkl_path):
pkl.dump(obj, fout, protocol=pkl.HIGHEST_PROTOCOL)
logging.debug("Obj dumped to %s!" % pkl_path)
def load_from_json(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    logging.debug("%s loaded!" % json_path)
    return data

def dump_to_json(obj, json_path):
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(obj, f)
    logging.debug("Obj dumped to %s!" % json_path)
def get_trainable_param_num(model):
""" get the number of trainable parameters
@@ -60,9 +72,15 @@ def get_trainable_param_num(model):
"""
if isinstance(model, nn.DataParallel):
    if isinstance(model.module.layers['embedding'].embeddings, dict):
        model_param = list(model.parameters()) + list(model.module.layers['embedding'].get_parameters())
    else:
        model_param = list(model.parameters())
else:
    if isinstance(model.layers['embedding'].embeddings, dict):
        model_param = list(model.parameters()) + list(model.layers['embedding'].get_parameters())
    else:
        model_param = list(model.parameters())
return sum(p.numel() for p in model_param if p.requires_grad)
@@ -228,7 +246,7 @@ def md5(file_paths, chunk_size=1024*1024*1024):
""" Calculate a md5 of lists of files.
Args:
file_paths: an iterable object contains files. Files will be concatenated orderly if there are more than one file
file_paths: an iterable object contains file paths. Files will be concatenated orderly if there are more than one file
chunk_size: unit is byte, default value is 1GB
Returns:
md5
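The function body is not shown in this hunk; one plausible implementation of the documented contract, hashing the concatenation chunk by chunk so large files never have to fit in memory, would be:

    import hashlib

    def md5(file_paths, chunk_size=1024*1024*1024):
        # Sketch only: feed every file, in order, through one md5 object
        # in chunk_size pieces and return the hex digest of the whole stream.
        md5_obj = hashlib.md5()
        for file_path in file_paths:
            with open(file_path, 'rb') as fin:
                for chunk in iter(lambda: fin.read(chunk_size), b''):
                    md5_obj.update(chunk)
        return md5_obj.hexdigest()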

utils/corpus_utils.py

@@ -16,6 +16,7 @@ import codecs
import copy
from settings import ProblemTypes
import torch
import time
if sys.version_info < (3,):
@@ -236,10 +237,10 @@ def get_batches(problem, data, length, target, batch_size, input_types, pad_ids=
logging.info("Start making batches")
if permutate is True:
# CAUTION! data and length will be modified in place (deepcopy removed to save memory)
# data = copy.deepcopy(data)
# length = copy.deepcopy(length)
# if target is not None:
#     target = copy.deepcopy(target)
# shuffle the data
permutation = np.random.permutation(len(list(target.values())[0]))
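With the deepcopy gone, shuffling works by drawing one shared permutation and reindexing every parallel structure with it, so no second copy of the encoded corpus is ever materialized. A toy sketch of that idea (the real data/length/target values are nested dicts of lists, simplified here to numpy arrays):

    import numpy as np

    data = {'query_text': np.array([[1, 2], [3, 4], [5, 6]])}
    target = {'label': np.array([0, 1, 0])}

    # One permutation, applied to every aligned structure.
    permutation = np.random.permutation(len(list(target.values())[0]))
    data = {key: value[permutation] for key, value in data.items()}
    target = {key: value[permutation] for key, value in target.items()}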