зеркало из https://github.com/microsoft/inmt.git
source side bpe
This commit is contained in:
Родитель
1a9a9df0f3
Коммит
0586ffef2e
|
@ -4,13 +4,15 @@
|
|||
"src": "ta",
|
||||
"tgt": "en",
|
||||
"model": "taen_final_step_100000.pt",
|
||||
"src_bpe": "bpe/bpe_codes_taen",
|
||||
"provide_help": false,
|
||||
"active": false
|
||||
"active": true
|
||||
},
|
||||
"en-ta": {
|
||||
"src": "en",
|
||||
"tgt": "ta",
|
||||
"model": "enta_final_step_100000.pt",
|
||||
"src_bpe": null,
|
||||
"provide_help": true,
|
||||
"active": false
|
||||
},
|
||||
|
@ -18,6 +20,7 @@
|
|||
"src": "hi",
|
||||
"tgt": "en",
|
||||
"model": "onmt-hien.pt",
|
||||
"src_bpe": null,
|
||||
"provide_help": false,
|
||||
"active": true
|
||||
},
|
||||
|
@ -25,6 +28,7 @@
|
|||
"src": "en",
|
||||
"tgt": "hi",
|
||||
"model": "full_iitb_enhi_50v.pt",
|
||||
"src_bpe": null,
|
||||
"provide_help": true,
|
||||
"active": true
|
||||
}
|
||||
|
|
|
@ -8,6 +8,7 @@ import pickle
|
|||
import json
|
||||
|
||||
from indic_transliteration import sanscript
|
||||
from subword_nmt import apply_bpe
|
||||
|
||||
|
||||
from onmt.translate.infertranslator import build_translator
|
||||
|
@ -39,6 +40,31 @@ for key, value in langspecs.items():
|
|||
engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
|
||||
# translatorbigram builds the best translations of length two
|
||||
|
||||
# Optional source-side BPE: when the language spec names a merge-codes file,
# build a segmenter for this engine; otherwise store None so callers can skip it.
if value['src_bpe']:
    print("BPE in SRC side")
    bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
    # The codes file is only needed while the BPE object is constructed, so a
    # context manager closes it right away. UTF-8 is requested explicitly
    # because the merge table contains non-ASCII subwords.
    with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
        bpe = apply_bpe.BPE(codes=merge_file)
    # Bind the model as a default argument: a plain closure would look `bpe`
    # up at call time and therefore see whichever model was assigned last if
    # this statement is executed for several language pairs.
    engines[key]["src_segmenter"] = lambda x, bpe=bpe: bpe.process_line(x.strip())
else:
    engines[key]["src_segmenter"] = None
|
||||
|
||||
def preprocess_src(s, preprocess):
    """Normalize a source sentence and run optional extra steps over it.

    The text is lower-cased, curly double/single quotes are replaced by their
    ASCII counterparts, the Devanagari visarga is replaced by ":", every run
    of punctuation is padded with a space on both sides, and whitespace runs
    are collapsed to a single space. The result is not stripped, so text that
    ends in punctuation keeps one trailing space.

    Args:
        s: Source sentence to normalize.
        preprocess: Iterable of one-argument callables applied, in order, to
            the normalized text. Falsy entries (e.g. ``None``) are skipped.
            ``None`` itself is accepted and means "no extra steps".

    Returns:
        The normalized text after all supplied callables have been applied.
    """
    s = s.lower()
    s = re.sub(r"([\“\”])", r'"', s)
    s = re.sub(r"([\‘\’])", r"'", s)
    s = re.sub(r"([\ः])", r":", s)
    # Pad punctuation runs so they are separated from neighbouring words.
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    # Quotes stay as plain ASCII characters; no re-escaping is applied here.
    s = re.sub(r"(\s+)", r" ", s)

    # `or ()` keeps a missing step list from raising TypeError.
    for p in preprocess or ():
        if p:
            s = p(s)

    return s
|
||||
|
||||
def quotaposto(s, lang="en"):
|
||||
s = re.sub(r""", r'"', s)
|
||||
s = re.sub(r"'", r"'", s)
|
||||
|
@ -71,7 +97,9 @@ def translate_new(request):
|
|||
translatorbest = engines[langspec]["translatorbest"]
|
||||
translatorbigram = engines[langspec]["translatorbigram"]
|
||||
|
||||
L1 = toquotapos(sentence.strip())
|
||||
src_segmenter = engines[request.session["langspec"]]["src_segmenter"]
|
||||
|
||||
L1 = preprocess_src(sentence.strip(), [src_segmenter])
|
||||
L2 = partial_trans
|
||||
L2split = L2.split()
|
||||
|
||||
|
|
|
@ -34,6 +34,23 @@ langspec = None
|
|||
|
||||
global translatorbest, translatorbigram
|
||||
|
||||
def preprocess_src(s, preprocess):
    """Normalize a source sentence, then apply any extra processing steps.

    Normalization lower-cases the text, maps curly double/single quotes to
    ASCII quotes, maps the Devanagari visarga to ":", surrounds each run of
    punctuation with spaces, and squeezes whitespace runs to one space. No
    final strip is performed, so a trailing space can remain after
    sentence-final punctuation.

    Args:
        s: Source sentence to normalize.
        preprocess: Iterable of one-argument callables run in order on the
            normalized text; falsy entries (e.g. ``None``) are ignored.
            Passing ``None`` instead of an iterable applies no extra steps.

    Returns:
        The normalized text after every supplied callable has run.
    """
    s = s.lower()
    s = re.sub(r"([\“\”])", r'"', s)
    s = re.sub(r"([\‘\’])", r"'", s)
    s = re.sub(r"([\ः])", r":", s)
    # Separate punctuation runs from the words around them.
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    # Quotes are kept as plain ASCII characters; nothing re-escapes them here.
    s = re.sub(r"(\s+)", r" ", s)

    # Tolerate a missing step list instead of raising TypeError.
    for p in preprocess or ():
        if p:
            s = p(s)

    return s
|
||||
|
||||
|
||||
def quotaposto(s, lang="en"):
|
||||
s = re.sub(r""", r'"', s)
|
||||
s = re.sub(r"'", r"'", s)
|
||||
|
@ -77,6 +94,15 @@ def toquotapos(s, lang="en"):
|
|||
# engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
|
||||
# # translatorbigram builds the 5 best translations of length two
|
||||
|
||||
# Optional source-side BPE: a merge-codes file named in the language spec
# yields a segmenter for this engine; without one the slot is set to None.
if value['src_bpe']:
    print("BPE in SRC side")
    bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
    # Open the codes file only for the duration of BPE construction and close
    # it afterwards. UTF-8 is explicit because the merge table holds
    # non-ASCII subwords and the platform default may differ.
    with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
        bpe = apply_bpe.BPE(codes=merge_file)
    # Capture the model via a default argument so the callable keeps the model
    # built here even if `bpe` is reassigned later (closures resolve free
    # variables at call time, not at definition time).
    engines[key]["src_segmenter"] = lambda x, bpe=bpe: bpe.process_line(x.strip())
else:
    engines[key]["src_segmenter"] = None
|
||||
|
||||
global corpusops
|
||||
corpusops = []
|
||||
|
||||
|
|
|
@ -8,4 +8,5 @@ djangorestframework
|
|||
torch
|
||||
torchtext
|
||||
configargparse
|
||||
nltk
|
||||
nltk
|
||||
subword-nmt
|
||||
|
|
|
@ -42,6 +42,9 @@
|
|||
if (lang == "en-hi") {
|
||||
text = "The weather is pleasant today. Let us go out to in the evening."
|
||||
}
|
||||
if (lang == "en-ti") {
|
||||
text = "The weather is pleasant today. Let us go out to in the evening."
|
||||
}
|
||||
if (lang == "hi-en") {
|
||||
text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
|
||||
}
|
||||
|
@ -90,6 +93,7 @@
|
|||
<div class="row py-2">
|
||||
<label for="targetlang">Target Language</label>
|
||||
<select class="form-control" id="tgt">
|
||||
<option value="ti">Tigrinya</option value="ti">
|
||||
<option value="en">English</option value="en">
|
||||
<option value="hi">Hindi</option value="en">
|
||||
<option value="ta">Tamil</option>
|
||||
|
|
Загрузка…
Ссылка в новой задаче