Saved models to keep directory

kdavis-mozilla 2020-04-28 19:59:02 +02:00
Parent 6e146c933c
Commit 6329684b58
11 changed files with 98 additions and 98 deletions
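Every file below applies the same substitution: artifacts that the scripts used to write into a per-example model/ directory (checkpoints, vocabularies, truecasing models, BPE codes, training and validation logs) are redirected to a shared keep/ directory addressed through a $MODEL variable. A minimal before/after sketch of the pattern; note that $MODEL resolves relative to each script's working directory, and the top-level keep directory is assumed to already exist, since the scripts no longer create their output directory themselves:

# before: outputs land in a local model/ directory
mkdir -p model
$MARIAN_TRAIN --model model/model.npz --log model/train.log ...

# after: outputs land in a persistent keep/ directory three levels up
export MODEL=`pwd`/../../../keep
$MARIAN_TRAIN --model $MODEL/model.npz --log $MODEL/train.log ...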

View file

@@ -56,15 +56,15 @@ fi
 # create the model folder
-mkdir -p model
+export MODEL=`pwd`/../../../keep
 # train model
 $MARIAN/build/marian \
     --devices $GPUS \
     --type s2s \
-    --model model/model.npz \
+    --model $MODEL/model.npz \
     --train-sets data/corpus.ro data/corpus.en \
-    --vocabs model/vocab.roen.spm model/vocab.roen.spm \
+    --vocabs $MODEL/vocab.roen.spm $MODEL/vocab.roen.spm \
     --sentencepiece-options '--normalization_rule_tsv=data/norm_romanian.tsv' \
     --dim-vocabs 32000 32000 \
     --mini-batch-fit -w 20000 \
@@ -74,19 +74,19 @@ $MARIAN/build/marian \
     --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
     --cost-type ce-mean-words --valid-metrics ce-mean-words bleu-detok \
     --valid-sets data/newsdev2016.ro data/newsdev2016.en \
-    --log model/train.log --valid-log model/valid.log --tempdir model \
+    --log $MODEL/train.log --valid-log $MODEL/valid.log --tempdir $MODEL \
     --overwrite --keep-best \
     --seed 1111 --exponential-smoothing \
     --normalize=0.6 --beam-size=6 --quiet-translation
 # translate dev set
 cat data/newsdev2016.ro \
-    | $MARIAN/build/marian-decoder -c model/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
+    | $MARIAN/build/marian-decoder -c $MODEL/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
     --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src > data/newsdev2016.ro.output
 # translate test set
 cat data/newstest2016.ro \
-    | $MARIAN/build/marian-decoder -c model/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
+    | $MARIAN/build/marian-decoder -c $MODEL/model.npz.best-bleu-detok.npz.decoder.yml -d $GPUS -b 6 -n0.6 \
     --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src > data/newstest2016.ro.output
 # calculate bleu scores on dev and test set

View file

@@ -38,7 +38,7 @@ then
     ./scripts/download-files.sh
 fi
-mkdir -p model
+export MODEL=`pwd`/../../../keep
 # preprocess data
 if [ ! -e "data/corpus.bpe.en" ]
@@ -47,14 +47,14 @@ then
 fi
 # train model
-if [ ! -e "model/model.npz.best-translation.npz" ]
+if [ ! -e "$MODEL/model.npz.best-translation.npz" ]
 then
     $MARIAN_TRAIN \
         --devices $GPUS \
         --type amun \
-        --model model/model.npz \
+        --model $MODEL/model.npz \
         --train-sets data/corpus.bpe.ro data/corpus.bpe.en \
-        --vocabs model/vocab.ro.yml model/vocab.en.yml \
+        --vocabs $MODEL/vocab.ro.yml $MODEL/vocab.en.yml \
         --dim-vocabs 66000 50000 \
         --mini-batch-fit -w 19000 \
         --layer-normalization --dropout-rnn 0.2 --dropout-src 0.1 --dropout-trg 0.1 \
@@ -63,7 +63,7 @@ then
         --valid-metrics cross-entropy translation \
         --valid-sets data/newsdev2016.bpe.ro data/newsdev2016.bpe.en \
         --valid-script-path "bash ./scripts/validate.sh" \
-        --log model/train.log --valid-log model/valid.log \
+        --log $MODEL/train.log --valid-log $MODEL/valid.log \
         --overwrite --keep-best \
         --seed 1111 --exponential-smoothing \
         --normalize=1 --beam-size=12 --quiet-translation
@@ -71,7 +71,7 @@ fi
 # translate dev set
 cat data/newsdev2016.bpe.ro \
-    | $MARIAN_DECODER -c model/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
+    | $MARIAN_DECODER -c $MODEL/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
     --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
     | sed 's/\@\@ //g' \
     | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
@@ -80,7 +80,7 @@ cat data/newsdev2016.bpe.ro \
 # translate test set
 cat data/newstest2016.bpe.ro \
-    | $MARIAN_DECODER -c model/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
+    | $MARIAN_DECODER -c $MODEL/model.npz.best-translation.npz.decoder.yml -d $GPUS -b 12 -n1 \
     --mini-batch 64 --maxi-batch 10 --maxi-batch-sort src \
     | sed 's/\@\@ //g' \
     | ../tools/moses-scripts/scripts/recaser/detruecase.perl \

View file

@@ -47,29 +47,29 @@ done
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok $SRC $TRG data/corpus.tok.clean 1 80
 # train truecaser
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model model/tc.$SRC
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model model/tc.$TRG
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$SRC -model $MODEL/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.clean.$TRG -model $MODEL/tc.$TRG
 # apply truecaser (cleaned training corpus)
 for prefix in corpus
 do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.clean.$SRC > data/$prefix.tc.$SRC
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.clean.$TRG > data/$prefix.tc.$TRG
 done
 # apply truecaser (dev/test files)
 for prefix in newsdev2016 newstest2016
 do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 # train BPE
-cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
 # apply BPE
 for prefix in corpus newsdev2016 newstest2016
 do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
 done

View file

@@ -38,7 +38,7 @@ then
     ./scripts/download-files.sh
 fi
-mkdir -p model
+export MODEL=`pwd`/../../../keep
 # preprocess data
 if [ ! -e "data/corpus.bpe.en" ]
@@ -54,19 +54,19 @@ then
 fi
 # create common vocabulary
-if [ ! -e "model/vocab.ende.yml" ]
+if [ ! -e "$MODEL/vocab.ende.yml" ]
 then
-    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
+    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
 fi
 # train model
-if [ ! -e "model/model.npz" ]
+if [ ! -e "$MODEL/model.npz" ]
 then
     $MARIAN_TRAIN \
-        --model model/model.npz --type transformer \
+        --model $MODEL/model.npz --type transformer \
         --train-sets data/corpus.bpe.en data/corpus.bpe.de \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w 22000 --maxi-batch 1000 \
         --early-stopping 10 --cost-type=ce-mean-words \
         --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
@@ -76,7 +76,7 @@ then
         --valid-translation-output data/valid.bpe.en.output --quiet-translation \
         --valid-mini-batch 64 \
         --beam-size 6 --normalize 0.6 \
-        --log model/train.log --valid-log model/valid.log \
+        --log $MODEL/train.log --valid-log $MODEL/valid.log \
         --enc-depth 6 --dec-depth 6 \
         --transformer-heads 8 \
         --transformer-postprocess-emb d \
@@ -90,13 +90,13 @@ then
 fi
 # find best model on dev set
-ITER=`cat model/valid.log | grep translation | sort -rg -k12,12 -t' ' | cut -f8 -d' ' | head -n1`
+ITER=`cat $MODEL/valid.log | grep translation | sort -rg -k12,12 -t' ' | cut -f8 -d' ' | head -n1`
 # translate test sets
 for prefix in test2014 test2015 test2016
 do
     cat data/$prefix.bpe.en \
-        | $MARIAN_DECODER -c model/model.npz.decoder.yml -m model/model.iter$ITER.npz -d $GPUS -b 12 -n -w 6000 \
+        | $MARIAN_DECODER -c $MODEL/model.npz.decoder.yml -m $MODEL/model.iter$ITER.npz -d $GPUS -b 12 -n -w 6000 \
         | sed 's/\@\@ //g' \
         | ../tools/moses-scripts/scripts/recaser/detruecase.perl \
         | ../tools/moses-scripts/scripts/tokenizer/detokenizer.perl -l de \
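The checkpoint lookup above is plain field surgery on the validation log. A commented sketch of the same pipeline, assuming Marian's valid.log layout (not shown in this diff) puts the update count in field 8 and the metric score in field 12:

# keep only the lines for the translation validation metric,
# sort them in descending numeric order of the score (field 12),
# then take the update count (field 8) of the best-scoring line
ITER=`cat $MODEL/valid.log | grep translation | sort -rg -k12,12 -t' ' | cut -f8 -d' ' | head -n1`
# decoding then loads that checkpoint as model.iter$ITER.npz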

View file

@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
 # train truecaser
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
 # apply truecaser (cleaned training corpus)
 for prefix in corpus valid test2014 test2015 test2016
 do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 # train BPE
-cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
 # apply BPE
 for prefix in corpus valid test2014 test2015 test2016
 do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
 done

View file

@@ -44,7 +44,7 @@ then
     ./scripts/download-files.sh
 fi
-mkdir -p model
+export MODEL=`pwd`/../../../keep
 # preprocess data
 if [ ! -e "data/corpus.bpe.en" ]
@@ -71,20 +71,20 @@ then
 fi
 # create common vocabulary
-if [ ! -e "model/vocab.ende.yml" ]
+if [ ! -e "$MODEL/vocab.ende.yml" ]
 then
-    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
+    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
 fi
 # train model
-mkdir -p model.back
-if [ ! -e "model.back/model.npz.best-translation.npz" ]
+mkdir -p $MODEL/back
+if [ ! -e "$MODEL/back/model.npz.best-translation.npz" ]
 then
     $MARIAN_TRAIN \
-        --model model.back/model.npz --type s2s \
+        --model $MODEL/back/model.npz --type s2s \
         --train-sets data/corpus.bpe.de data/corpus.bpe.en \
        --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w 3500 --maxi-batch 1000 \
         --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -94,7 +94,7 @@ then
         --valid-mini-batch 64 --beam-size 12 --normalize=1 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs 10 --cost-type=ce-mean-words \
-        --log model.back/train.log --valid-log model.back/valid.log \
+        --log $MODEL/back/train.log --valid-log $MODEL/back/valid.log \
         --tied-embeddings-all --layer-normalization \
         --devices $GPUS --seed 1111 \
         --exponential-smoothing
@@ -103,7 +103,7 @@ fi
 if [ ! -e "data/news.2016.bpe.en" ]
 then
     $MARIAN_DECODER \
-        -c model.back/model.npz.best-translation.npz.decoder.yml \
+        -c $MODEL/back/model.npz.best-translation.npz.decoder.yml \
         -i data/news.2016.bpe.de \
         -b 6 --normalize=1 -w 2500 -d $GPUS \
         --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
@@ -119,13 +119,13 @@ fi
 for i in $(seq 1 $N)
 do
-    mkdir -p model/ens$i
+    mkdir -p $MODEL/ens$i
     # train model
     $MARIAN_TRAIN \
-        --model model/ens$i/model.npz --type transformer \
+        --model $MODEL/ens$i/model.npz --type transformer \
         --train-sets data/all.bpe.en data/all.bpe.de \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
         --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -136,7 +136,7 @@ do
         --valid-mini-batch 64 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
-        --log model/ens$i/train.log --valid-log model/ens$i/valid.log \
+        --log $MODEL/ens$i/train.log --valid-log $MODEL/ens$i/valid.log \
         --enc-depth 6 --dec-depth 6 \
         --tied-embeddings-all \
         --transformer-dropout 0.1 --label-smoothing 0.1 \
@@ -148,13 +148,13 @@ done
 for i in $(seq 1 $N)
 do
-    mkdir -p model/ens-rtl$i
+    mkdir -p $MODEL/ens-rtl$i
     # train model
     $MARIAN_TRAIN \
-        --model model/ens-rtl$i/model.npz --type transformer \
+        --model $MODEL/ens-rtl$i/model.npz --type transformer \
         --train-sets data/all.bpe.en data/all.bpe.de \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
         --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -165,7 +165,7 @@ do
         --valid-mini-batch 64 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
-        --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \
+        --log $MODEL/ens-rtl$i/train.log --valid-log $MODEL/ens-rtl$i/valid.log \
         --enc-depth 6 --dec-depth 6 \
         --tied-embeddings-all \
         --transformer-dropout 0.1 --label-smoothing 0.1 \
@@ -179,15 +179,15 @@ done
 for prefix in valid test2014 test2015 test2017
 do
     cat data/$prefix.bpe.en \
-        | $MARIAN_DECODER -c model/ens1/model.npz.best-translation.npz.decoder.yml \
-          -m model/ens?/model.npz.best-translation.npz -d $GPUS \
+        | $MARIAN_DECODER -c $MODEL/ens1/model.npz.best-translation.npz.decoder.yml \
+          -m $MODEL/ens?/model.npz.best-translation.npz -d $GPUS \
          --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \
          > data/$prefix.bpe.en.output.nbest.0
     for i in $(seq 1 $N)
     do
-        $MARIAN_SCORER -m model/ens-rtl$i/model.npz.best-perplexity.npz \
-          -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \
+        $MARIAN_SCORER -m $MODEL/ens-rtl$i/model.npz.best-perplexity.npz \
+          -v $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml -d $GPUS \
          --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \
          -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i
     done

View file

@@ -18,6 +18,6 @@ cat data/$prefix.$TRG \
     | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
     | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
-$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
-$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG

View file

@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
 # train truecaser
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
 # apply truecaser (cleaned training corpus)
 for prefix in corpus valid test2014 test2015 test2016 test2017
 do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 # train BPE
-cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
 # apply BPE
 for prefix in corpus valid test2014 test2015 test2016 test2017
 do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
 done

View file

@@ -43,7 +43,7 @@ then
     ./scripts/download-files.sh
 fi
-mkdir -p model
+export MODEL=`pwd`/../../../keep
 # preprocess data
 if [ ! -e "data/corpus.bpe.en" ]
@@ -70,20 +70,20 @@ then
 fi
 # create common vocabulary
-if [ ! -e "model/vocab.ende.yml" ]
+if [ ! -e "$MODEL/vocab.ende.yml" ]
 then
-    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > model/vocab.ende.yml
+    cat data/corpus.bpe.en data/corpus.bpe.de | $MARIAN_VOCAB --max-size 36000 > $MODEL/vocab.ende.yml
 fi
 # train model
-mkdir -p model.back
-if [ ! -e "model.back/model.npz.best-translation.npz" ]
+mkdir -p $MODEL/back
+if [ ! -e "$MODEL/back/model.npz.best-translation.npz" ]
 then
     $MARIAN_TRAIN \
-        --model model.back/model.npz --type s2s \
+        --model $MODEL/back/model.npz --type s2s \
         --train-sets data/corpus.bpe.de data/corpus.bpe.en \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w 3500 --maxi-batch 1000 \
         --valid-freq 10000 --save-freq 10000 --disp-freq 1000 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -93,7 +93,7 @@ then
         --valid-mini-batch 64 --beam-size 12 --normalize=1 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs 10 --cost-type=ce-mean-words \
-        --log model.back/train.log --valid-log model.back/valid.log \
+        --log $MODEL/back/train.log --valid-log $MODEL/back/valid.log \
         --tied-embeddings-all --layer-normalization \
         --devices $GPUS --seed 1111 \
         --exponential-smoothing
@@ -102,7 +102,7 @@ fi
 if [ ! -e "data/news.2016.bpe.en" ]
 then
     $MARIAN_DECODER \
-        -c model.back/model.npz.best-translation.npz.decoder.yml \
+        -c $MODEL/back/model.npz.best-translation.npz.decoder.yml \
         -i data/news.2016.bpe.de \
         -b 6 --normalize=1 -w 2500 -d $GPUS \
         --mini-batch 64 --maxi-batch 100 --maxi-batch-sort src \
@@ -118,13 +118,13 @@ fi
 for i in $(seq 1 $N)
 do
-    mkdir -p model/ens$i
+    mkdir -p $MODEL/ens$i
     # train model
     $MARIAN_TRAIN \
-        --model model/ens$i/model.npz --type s2s \
+        --model $MODEL/ens$i/model.npz --type s2s \
         --train-sets data/all.bpe.en data/all.bpe.de \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
         --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -135,7 +135,7 @@ do
         --valid-mini-batch 64 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
-        --log model/ens$i/train.log --valid-log model/ens$i/valid.log \
+        --log $MODEL/ens$i/train.log --valid-log $MODEL/ens$i/valid.log \
         --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
         --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
         --tied-embeddings-all --layer-normalization \
@@ -148,13 +148,13 @@ done
 for i in $(seq 1 $N)
 do
-    mkdir -p model/ens-rtl$i
+    mkdir -p $MODEL/ens-rtl$i
     # train model
     $MARIAN_TRAIN \
-        --model model/ens-rtl$i/model.npz --type s2s \
+        --model $MODEL/ens-rtl$i/model.npz --type s2s \
         --train-sets data/all.bpe.en data/all.bpe.de \
         --max-length 100 \
-        --vocabs model/vocab.ende.yml model/vocab.ende.yml \
+        --vocabs $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml \
         --mini-batch-fit -w $WORKSPACE --mini-batch 1000 --maxi-batch 1000 \
         --valid-freq 5000 --save-freq 5000 --disp-freq 500 \
         --valid-metrics ce-mean-words perplexity translation \
@@ -165,7 +165,7 @@ do
         --valid-mini-batch 64 \
         --overwrite --keep-best \
         --early-stopping 5 --after-epochs $EPOCHS --cost-type=ce-mean-words \
-        --log model/ens-rtl$i/train.log --valid-log model/ens-rtl$i/valid.log \
+        --log $MODEL/ens-rtl$i/train.log --valid-log $MODEL/ens-rtl$i/valid.log \
         --enc-type bidirectional --enc-depth 1 --enc-cell-depth 4 \
         --dec-depth 1 --dec-cell-base-depth 8 --dec-cell-high-depth 1 \
         --tied-embeddings-all --layer-normalization \
@@ -181,15 +181,15 @@ done
 for prefix in valid test2014 test2015 test2017
 do
     cat data/$prefix.bpe.en \
-        | $MARIAN_DECODER -c model/ens1/model.npz.best-translation.npz.decoder.yml \
-          -m model/ens?/model.npz.best-translation.npz -d $GPUS \
+        | $MARIAN_DECODER -c $MODEL/ens1/model.npz.best-translation.npz.decoder.yml \
+          -m $MODEL/ens?/model.npz.best-translation.npz -d $GPUS \
          --mini-batch 16 --maxi-batch 100 --maxi-batch-sort src -w 5000 --n-best --beam-size $B \
         > data/$prefix.bpe.en.output.nbest.0
     for i in $(seq 1 $N)
     do
-        $MARIAN_SCORER -m model/ens-rtl$i/model.npz.best-perplexity.npz \
-          -v model/vocab.ende.yml model/vocab.ende.yml -d $GPUS \
+        $MARIAN_SCORER -m $MODEL/ens-rtl$i/model.npz.best-perplexity.npz \
+          -v $MODEL/vocab.ende.yml $MODEL/vocab.ende.yml -d $GPUS \
          --mini-batch 16 --maxi-batch 100 --maxi-batch-sort trg --n-best --n-best-feature R2L$(expr $i - 1) \
          -t data/$prefix.bpe.en data/$prefix.bpe.en.output.nbest.$(expr $i - 1) > data/$prefix.bpe.en.output.nbest.$i
     done

View file

@@ -18,6 +18,6 @@ cat data/$prefix.$TRG \
     | $mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l $TRG \
     | $mosesdecoder/scripts/tokenizer/tokenizer.perl -a -l $TRG > data/$prefix.tok.$TRG
-$mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+$mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
-$subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+$subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG

View file

@@ -35,24 +35,24 @@ mv data/corpus.tok.$TRG data/corpus.tok.uncleaned.$TRG
 $mosesdecoder/scripts/training/clean-corpus-n.perl data/corpus.tok.uncleaned $SRC $TRG data/corpus.tok 1 100
 # train truecaser
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model model/tc.$SRC
-$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model model/tc.$TRG
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$SRC -model $MODEL/tc.$SRC
+$mosesdecoder/scripts/recaser/train-truecaser.perl -corpus data/corpus.tok.$TRG -model $MODEL/tc.$TRG
 # apply truecaser (cleaned training corpus)
 for prefix in corpus valid test2014 test2015 test2016 test2017
 do
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$SRC < data/$prefix.tok.$SRC > data/$prefix.tc.$SRC
     test -f data/$prefix.tok.$TRG || continue
-    $mosesdecoder/scripts/recaser/truecase.perl -model model/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
+    $mosesdecoder/scripts/recaser/truecase.perl -model $MODEL/tc.$TRG < data/$prefix.tok.$TRG > data/$prefix.tc.$TRG
 done
 # train BPE
-cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > model/$SRC$TRG.bpe
+cat data/corpus.tc.$SRC data/corpus.tc.$TRG | $subword_nmt/learn_bpe.py -s $bpe_operations > $MODEL/$SRC$TRG.bpe
 # apply BPE
 for prefix in corpus valid test2014 test2015 test2016 test2017
 do
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$SRC > data/$prefix.bpe.$SRC
     test -f data/$prefix.tc.$TRG || continue
-    $subword_nmt/apply_bpe.py -c model/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
+    $subword_nmt/apply_bpe.py -c $MODEL/$SRC$TRG.bpe < data/$prefix.tc.$TRG > data/$prefix.bpe.$TRG
 done