Saved models to keep directory
This commit is contained in:
Родитель
6e146c933c
Коммит
6329684b58
|
@@ -56,15 +56,15 @@ fi
|
|||
|
||||
|
||||
# create the model folder
|
||||
mkdir -p model
|
||||
export MODEL=`pwd`/../../../keep
|
||||
|
||||
# train model
|
||||
$MARIAN/build/marian \
|
||||
--devices $GPUS \
|
||||
--type s2s \
|
||||
--model model/model.npz \
|
||||
--model $MODEL/model.npz \
|
||||
--train-sets data/corpus.ro data/corpus.en \
|
||||
--vocabs model/vocab.roen.spm model/vocab.roen.spm \
|
||||
--vocabs $MODEL/vocab.roen.spm $MODEL/vocab.roen.spm \
|
||||
--sentencepiece-options '--normalization_rule_tsv=data/norm_romanian.tsv' \
|
||||
--dim-vocabs 32000 32000 \
|
||||
--mini-batch-fit -w 20000 \
|
||||
|
@@ -74,19 +74,19 @@ $MARIAN/build/marian \
|
|||
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
|
||||
--cost-type ce-mean-words --valid-metrics ce-mean-words bleu-detok \
|
||||
--valid-sets data/newsdev2016.ro data/newsdev2016.en \
|
||||
--log model/train.log --valid-log model/valid.log --tempdir model \
|
||||
--log $MODEL/train.log --valid-log $MODEL/valid.log --tempdir $MODEL \
|
||||
--overwrite --keep-best \
|
||||
--seed 1111 --exponential-smoothing \
|
||||
--normalize=0.6 --beam-size=6 --quiet-translation
|
||||
|
||||
# translate dev set
|
||||
cat data/newsdev2016.ro \
|
||||
| $MARIAN/build/marian-decoder -c model/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
|
||||
| $MARIAN/build/marian-decoder -c $MODEL/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
|
||||
--mini-batch 64 --maxi-batch 100 --maxi-batch-sort src > data/newsdev2016.ro.output
|
||||
|
||||
# translate test set
|
||||
cat data/newstest2016.ro \
|
||||
| $MARIAN/build/marian-decoder -c model/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
|
||||
| $MARIAN/build/marian-decoder -c $MODEL/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
|
||||
--mini-batch 64 --maxi-batch 100 --maxi-batch-sort src > data/newstest2016.ro.output
|
||||
|
||||
# calculate bleu scores on dev and test set
|
||||
|
|
|
@@ -38,7 +38,7 @@ then
|
|||
./scripts/download-files.sh
|
||||
fi
|
||||
|
||||
mkdir -p model
|
||||
export MODEL=`pwd`/../../../keep
|
||||
|
||||
# preprocess data
|
||||
if [ ! -e "data/corpus.bpe.en" ]
|
||||
|
@@ -47,14 +47,14 @@ then
|
|||
fi
|
||||
|
||||
# train model
|
||||
if [ ! -e "model/model.npz.best-translation.npz" ]
|
||||
if [ ! -e "$MODEL/model.npz.best-translation.npz" ]
|
||||
then
|
||||
$MARIAN_TRAIN \
|
||||
--devices $GPUS \
|
||||
--type amun \
|
||||
--model model/model.npz \
|
||||
--model $MODEL/model.npz \
|
||||
--train-sets data/corpus.bpe.ro data/corpus.bpe.en \
|
||||
--vocabs model/vocab.ro.yml model/vocab.en.yml \
|
||||
--vocabs $MODEL/vocab.ro.yml $MODEL/vocab.en.yml \
|
||||
--dim-vocabs 66000 50000 \
|
||||
--mini-batch-fit -w 19000 \
|
||||
--layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
|
||||
|
@@ -63,7 +63,7 @@ then
|
|||
--valid-metrics cross-entropy translation \
|
||||
--valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
|
||||
--valid-script-path "bash ./scripts/validate.sh" \
|
||||
--log model/train.log --valid-log model/valid.log \
|
||||
--log $MODEL/train.log --valid-log $MODEL/valid.log \
|
||||
--overwrite --keep-best \
|
||||
--seed 1111 --exponential-smoothing \
|
||||
--normalize=1 --beam-size=12 --quiet-translation
|
||||
|
@@ -71,7 +71,7 @@ fi
|
|||
|
||||
# translate dev set
|
||||
cat data/newsdev2016.bpe.ro \
|
||||
| $MARIAN_DECODER -c model/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
|
||||
| $MARIAN_DECODER -c $MODEL/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
|
||||
--mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
|
||||
| sed 's/\@\@ //g' \
|
||||
| ../tools/moses-scripts/scripts/recaser/detruecase.perl \
|
||||
|
@@ -80,7 +80,7 @@ cat data/newsdev2016.bpe.ro \
|
|||
|
||||
# translate test set
|
||||
cat data/newstest2016.bpe.ro \
|
||||
| $MARIAN_DECODER -c model/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
|
||||
| $MARIAN_DECODER -c $MODEL/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
|
||||
--mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
|
||||
| sed 's/\@\@ //g' \
|
||||
| ../tools/moses-scripts/scripts/recaser/detruecase.perl \
|
||||
|
|
|
@@ -47,29 +47,29 @@ done
|
|||
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80
|
||||
|
||||
# train truecaser
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model $MODEL/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model $MODEL/tc.$TRG
|
||||
|
||||
# apply truecaser (cleaned training corpus)
|
||||
for prefix in corpus
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# apply truecaser (dev/test files)
|
||||
for prefix in newsdev2016 newstest2016
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# train BPE
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
|
||||
|
||||
# apply BPE
|
||||
for prefix in corpus newsdev2016 newstest2016
|
||||
do
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
done
|
||||
|
|
|
@@ -38,7 +38,7 @@ then
|
|||
./scripts/download-files.sh
|
||||
fi
|
||||
|
||||
mkdir -p model
|
||||
export MODEL=`pwd`/../../../keep
|
||||
|
||||
# preprocess data
|
||||
if [ ! -e "data/corpus.bpe.en" ]
|
||||
|
@@ -54,19 +54,19 @@ then
|
|||
fi
|
||||
|
||||
# create common vocabulary
|
||||
if [ ! -e "model/vocab.ende.yml" ]
|
||||
if [ ! -e "$MODEL/vocab.ende.yml" ]
|
||||
then
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
|
||||
fi
|
||||
|
||||
# train model
|
||||
if [ ! -e "model/model.npz" ]
|
||||
if [ ! -e "$MODEL/model.npz" ]
|
||||
then
|
||||
$MARIAN_TRAIN \
|
||||
--model model/model.npz --type transformer \
|
||||
--model $MODEL/model.npz --type transformer \
|
||||
--train-sets data/corpus.bpe.en data/corpus.bpe.de \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w 22000 --maxi-batch 1000 \
|
||||
--early-stopping 10 --cost-type=ce-mean-words \
|
||||
--valid-freq 5000 --save-freq 5000 --disp-freq 500 \
|
||||
|
@@ -76,7 +76,7 @@ then
|
|||
--valid-translation-output data/valid.bpe.en.output --quiet-translation \
|
||||
--valid-mini-batch 64 \
|
||||
--beam-size 6 --normalize 0.6 \
|
||||
--log model/train.log --valid-log model/valid.log \
|
||||
--log $MODEL/train.log --valid-log $MODEL/valid.log \
|
||||
--enc-depth 6 --dec-depth 6 \
|
||||
--transformer-heads 8 \
|
||||
--transformer-postprocess-emb d \
|
||||
|
@@ -90,13 +90,13 @@ then
|
|||
fi
|
||||
|
||||
# find best model on dev set
|
||||
ITER=`cat model/valid.log | grep translation | sort -rg -k12,12 -t' ' | cut -f8 -d' ' | head -n1`
|
||||
ITER=`cat $MODEL/valid.log | grep translation | sort -rg -k12,12 -t' ' | cut -f8 -d' ' | head -n1`
|
||||
|
||||
# translate test sets
|
||||
for prefix in test2014 test2015 test2016
|
||||
do
|
||||
cat data/$prefix.bpe.en \
|
||||
| $MARIAN_DECODER -c model/model.npz.decoder.yml -m model/model.iter$ITER.npz -d $GPUS -b 12 -n -w 6000 \
|
||||
| $MARIAN_DECODER -c $MODEL/model.npz.decoder.yml -m $MODEL/model.iter$ITER.npz -d $GPUS -b 12 -n -w 6000 \
|
||||
| sed 's/\@\@ //g' \
|
||||
| ../tools/moses-scripts/scripts/recaser/detruecase.perl \
|
||||
| ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de \
|
||||
|
|
|
@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
|
|||
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
|
||||
|
||||
# train truecaser
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
|
||||
|
||||
# apply truecaser (cleaned training corpus)
|
||||
for prefix in corpus valid test2014 test2015 test2016
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
test -f data/$prefix.tok.$TRG || continue
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# train BPE
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
|
||||
|
||||
# apply BPE
|
||||
for prefix in corpus valid test2014 test2015 test2016
|
||||
do
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
test -f data/$prefix.tc.$TRG || continue
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
done
|
||||
|
|
|
@@ -44,7 +44,7 @@ then
|
|||
./scripts/download-files.sh
|
||||
fi
|
||||
|
||||
mkdir -p model
|
||||
export MODEL=`pwd`/../../../keep
|
||||
|
||||
# preprocess data
|
||||
if [ ! -e "data/corpus.bpe.en" ]
|
||||
|
@@ -71,20 +71,20 @@ then
|
|||
fi
|
||||
|
||||
# create common vocabulary
|
||||
if [ ! -e "model/vocab.ende.yml" ]
|
||||
if [ ! -e "$MODEL/vocab.ende.yml" ]
|
||||
then
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
|
||||
fi
|
||||
|
||||
# train model
|
||||
mkdir -p model.back
|
||||
if [ ! -e "model.back/model.npz.best-translation.npz" ]
|
||||
mkdir -p $MODEL/back
|
||||
if [ ! -e "$MODEL/back/model.npz.best-translation.npz" ]
|
||||
then
|
||||
$MARIAN_TRAIN \
|
||||
--model model.back/model.npz --type s2s \
|
||||
--model $MODEL/back/model.npz --type s2s \
|
||||
--train-sets data/corpus.bpe.de data/corpus.bpe.en \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w 3500 --maxi-batch 1000 \
|
||||
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -94,7 +94,7 @@ then
|
|||
--valid-mini-batch 64 --beam-size 12 --normalize=1 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs 10 --cost-type=ce-mean-words \
|
||||
--log model.back/train.log --valid-log model.back/valid.log \
|
||||
--log $MODEL/back/train.log --valid-log $MODEL/back/valid.log \
|
||||
--tied-embeddings-all --layer-normalization \
|
||||
--devices $GPUS --seed 1111 \
|
||||
--exponential-smoothing
|
||||
|
@@ -103,7 +103,7 @@ fi
|
|||
if [ ! -e "data/news.2016.bpe.en" ]
|
||||
then
|
||||
$MARIAN_DECODER \
|
||||
-c model.back/model.npz.best-translation.npz.decoder.yml \
|
||||
-c $MODEL/back/model.npz.best-translation.npz.decoder.yml \
|
||||
-i data/news.2016.bpe.de \
|
||||
-b 6 --normalize=1 -w 2500 -d $GPUS \
|
||||
--mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
|
||||
|
@@ -119,13 +119,13 @@ fi
|
|||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
mkdir -p model/ens$i
|
||||
mkdir -p $MODEL/ens$i
|
||||
# train model
|
||||
$MARIAN_TRAIN \
|
||||
--model model/ens$i/model.npz --type transformer \
|
||||
--model $MODEL/ens$i/model.npz --type transformer \
|
||||
--train-sets data/all.bpe.en data/all.bpe.de \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
|
||||
--valid-freq 5000 --save-freq 5000 --disp-freq 500 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -136,7 +136,7 @@ do
|
|||
--valid-mini-batch 64 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
|
||||
--log model/ens$i/train.log --valid-log model/ens$i/valid.log \
|
||||
--log $MODEL/ens$i/train.log --valid-log $MODEL/ens$i/valid.log \
|
||||
--enc-depth 6 --dec-depth 6 \
|
||||
--tied-embeddings-all \
|
||||
--transformer-dropout 0.1 --label-smoothing 0.1 \
|
||||
|
@@ -148,13 +148,13 @@ done
|
|||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
mkdir -p model/ens-rtl$i
|
||||
mkdir -p $MODEL/ens-rtl$i
|
||||
# train model
|
||||
$MARIAN_TRAIN \
|
||||
--model model/ens-rtl$i/model.npz --type transformer \
|
||||
--model $MODEL/ens-rtl$i/model.npz --type transformer \
|
||||
--train-sets data/all.bpe.en data/all.bpe.de \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
|
||||
--valid-freq 5000 --save-freq 5000 --disp-freq 500 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -165,7 +165,7 @@ do
|
|||
--valid-mini-batch 64 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
|
||||
--log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \
|
||||
--log $MODEL/ens-rtl$i/train.log --valid-log $MODEL/ens-rtl$i/valid.log \
|
||||
--enc-depth 6 --dec-depth 6 \
|
||||
--tied-embeddings-all \
|
||||
--transformer-dropout 0.1 --label-smoothing 0.1 \
|
||||
|
@@ -179,15 +179,15 @@ done
|
|||
for prefix in valid test2014 test2015 test2017
|
||||
do
|
||||
cat data/$prefix.bpe.en \
|
||||
| $MARIAN_DECODER -c model/ens1/model.npz.best-translation.npz.decoder.yml \
|
||||
-m model/ens?/model.npz.best-translation.npz -d $GPUS \
|
||||
| $MARIAN_DECODER -c $MODEL/ens1/model.npz.best-translation.npz.decoder.yml \
|
||||
-m $MODEL/ens?/model.npz.best-translation.npz -d $GPUS \
|
||||
--mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \
|
||||
> data/$prefix.bpe.en.output.nbest.0
|
||||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
$MARIAN_SCORER -m model/ens-rtl$i/model.npz.best-perplexity.npz \
|
||||
-v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \
|
||||
$MARIAN_SCORER -m $MODEL/ens-rtl$i/model.npz.best-perplexity.npz \
|
||||
-v $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml -d $GPUS \
|
||||
--mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \
|
||||
-t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i
|
||||
done
|
||||
|
|
|
@@ -18,6 +18,6 @@ cat data/$prefix.$TRG \
|
|||
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
|
||||
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
|
||||
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
|
|
|
@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
|
|||
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
|
||||
|
||||
# train truecaser
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
|
||||
|
||||
# apply truecaser (cleaned training corpus)
|
||||
for prefix in corpus valid test2014 test2015 test2016 test2017
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
test -f data/$prefix.tok.$TRG || continue
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# train BPE
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
|
||||
|
||||
# apply BPE
|
||||
for prefix in corpus valid test2014 test2015 test2016 test2017
|
||||
do
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
test -f data/$prefix.tc.$TRG || continue
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
done
|
||||
|
|
|
@@ -43,7 +43,7 @@ then
|
|||
./scripts/download-files.sh
|
||||
fi
|
||||
|
||||
mkdir -p model
|
||||
export MODEL=`pwd`/../../../keep
|
||||
|
||||
# preprocess data
|
||||
if [ ! -e "data/corpus.bpe.en" ]
|
||||
|
@@ -70,20 +70,20 @@ then
|
|||
fi
|
||||
|
||||
# create common vocabulary
|
||||
if [ ! -e "model/vocab.ende.yml" ]
|
||||
if [ ! -e "$MODEL/vocab.ende.yml" ]
|
||||
then
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
|
||||
cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
|
||||
fi
|
||||
|
||||
# train model
|
||||
mkdir -p model.back
|
||||
if [ ! -e "model.back/model.npz.best-translation.npz" ]
|
||||
mkdir -p $MODEL/back
|
||||
if [ ! -e "$MODEL/back/model.npz.best-translation.npz" ]
|
||||
then
|
||||
$MARIAN_TRAIN \
|
||||
--model model.back/model.npz --type s2s \
|
||||
--model $MODEL/back/model.npz --type s2s \
|
||||
--train-sets data/corpus.bpe.de data/corpus.bpe.en \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w 3500 --maxi-batch 1000 \
|
||||
--valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -93,7 +93,7 @@ then
|
|||
--valid-mini-batch 64 --beam-size 12 --normalize=1 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs 10 --cost-type=ce-mean-words \
|
||||
--log model.back/train.log --valid-log model.back/valid.log \
|
||||
--log $MODEL/back/train.log --valid-log $MODEL/back/valid.log \
|
||||
--tied-embeddings-all --layer-normalization \
|
||||
--devices $GPUS --seed 1111 \
|
||||
--exponential-smoothing
|
||||
|
@@ -102,7 +102,7 @@ fi
|
|||
if [ ! -e "data/news.2016.bpe.en" ]
|
||||
then
|
||||
$MARIAN_DECODER \
|
||||
-c model.back/model.npz.best-translation.npz.decoder.yml \
|
||||
-c $MODEL/back/model.npz.best-translation.npz.decoder.yml \
|
||||
-i data/news.2016.bpe.de \
|
||||
-b 6 --normalize=1 -w 2500 -d $GPUS \
|
||||
--mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
|
||||
|
@@ -118,13 +118,13 @@ fi
|
|||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
mkdir -p model/ens$i
|
||||
mkdir -p $MODEL/ens$i
|
||||
# train model
|
||||
$MARIAN_TRAIN \
|
||||
--model model/ens$i/model.npz --type s2s \
|
||||
--model $MODEL/ens$i/model.npz --type s2s \
|
||||
--train-sets data/all.bpe.en data/all.bpe.de \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
|
||||
--valid-freq 5000 --save-freq 5000 --disp-freq 500 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -135,7 +135,7 @@ do
|
|||
--valid-mini-batch 64 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
|
||||
--log model/ens$i/train.log --valid-log model/ens$i/valid.log \
|
||||
--log $MODEL/ens$i/train.log --valid-log $MODEL/ens$i/valid.log \
|
||||
--enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
|
||||
--dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
|
||||
--tied-embeddings-all --layer-normalization \
|
||||
|
@@ -148,13 +148,13 @@ done
|
|||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
mkdir -p model/ens-rtl$i
|
||||
mkdir -p $MODEL/ens-rtl$i
|
||||
# train model
|
||||
$MARIAN_TRAIN \
|
||||
--model model/ens-rtl$i/model.npz --type s2s \
|
||||
--model $MODEL/ens-rtl$i/model.npz --type s2s \
|
||||
--train-sets data/all.bpe.en data/all.bpe.de \
|
||||
--max-length 100 \
|
||||
--vocabs model/vocab.ende.yml model/vocab.ende.yml \
|
||||
--vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
|
||||
--mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
|
||||
--valid-freq 5000 --save-freq 5000 --disp-freq 500 \
|
||||
--valid-metrics ce-mean-words perplexity translation \
|
||||
|
@@ -165,7 +165,7 @@ do
|
|||
--valid-mini-batch 64 \
|
||||
--overwrite --keep-best \
|
||||
--early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
|
||||
--log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \
|
||||
--log $MODEL/ens-rtl$i/train.log --valid-log $MODEL/ens-rtl$i/valid.log \
|
||||
--enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
|
||||
--dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
|
||||
--tied-embeddings-all --layer-normalization \
|
||||
|
@@ -181,15 +181,15 @@ done
|
|||
for prefix in valid test2014 test2015 test2017
|
||||
do
|
||||
cat data/$prefix.bpe.en \
|
||||
| $MARIAN_DECODER -c model/ens1/model.npz.best-translation.npz.decoder.yml \
|
||||
-m model/ens?/model.npz.best-translation.npz -d $GPUS \
|
||||
| $MARIAN_DECODER -c $MODEL/ens1/model.npz.best-translation.npz.decoder.yml \
|
||||
-m $MODEL/ens?/model.npz.best-translation.npz -d $GPUS \
|
||||
--mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \
|
||||
> data/$prefix.bpe.en.output.nbest.0
|
||||
|
||||
for i in $(seq 1 $N)
|
||||
do
|
||||
$MARIAN_SCORER -m model/ens-rtl$i/model.npz.best-perplexity.npz \
|
||||
-v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \
|
||||
$MARIAN_SCORER -m $MODEL/ens-rtl$i/model.npz.best-perplexity.npz \
|
||||
-v $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml -d $GPUS \
|
||||
--mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \
|
||||
-t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i
|
||||
done
|
||||
|
|
|
@@ -18,6 +18,6 @@ cat data/$prefix.$TRG \
|
|||
| $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
|
||||
| $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
|
||||
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
|
|
|
@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
|
|||
$mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
|
||||
|
||||
# train truecaser
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
|
||||
|
||||
# apply truecaser (cleaned training corpus)
|
||||
for prefix in corpus valid test2014 test2015 test2016 test2017
|
||||
do
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
|
||||
test -f data/$prefix.tok.$TRG || continue
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
|
||||
done
|
||||
|
||||
# train BPE
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
|
||||
cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
|
||||
|
||||
# apply BPE
|
||||
for prefix in corpus valid test2014 test2015 test2016 test2017
|
||||
do
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
|
||||
test -f data/$prefix.tc.$TRG || continue
|
||||
$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
|
||||
done
|
||||
|
|
Загрузка…
Ссылка в новой задаче