This commit is contained in:
Alp 2020-07-10 11:30:13 +02:00 коммит произвёл Sebastin Santy
Родитель 1a9a9df0f3
Коммит 0586ffef2e
5 изменённых файлов: 66 добавлений и 3 удалений

Просмотреть файл

@ -4,13 +4,15 @@
"src": "ta",
"tgt": "en",
"model": "taen_final_step_100000.pt",
"src_bpe": "bpe/bpe_codes_taen",
"provide_help": false,
"active": false
"active": true
},
"en-ta": {
"src": "en",
"tgt": "ta",
"model": "enta_final_step_100000.pt",
"src_bpe": null,
"provide_help": true,
"active": false
},
@ -18,6 +20,7 @@
"src": "hi",
"tgt": "en",
"model": "onmt-hien.pt",
"src_bpe": null,
"provide_help": false,
"active": true
},
@ -25,6 +28,7 @@
"src": "en",
"tgt": "hi",
"model": "full_iitb_enhi_50v.pt",
"src_bpe": null,
"provide_help": true,
"active": true
}

Просмотреть файл

@ -8,6 +8,7 @@ import pickle
import json
from indic_transliteration import sanscript
from subword_nmt import apply_bpe
from onmt.translate.infertranslator import build_translator
@ -39,6 +40,31 @@ for key, value in langspecs.items():
engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
# translatorbigram builds best translations of length two

# Optional source-side BPE: when the language spec names a merge-codes file
# under "src_bpe", register a callable that segments one source line;
# otherwise register None so callers can skip segmentation.
if value['src_bpe']:
    print("BPE in SRC side")
    bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
    # BPE reads the whole merge table in its constructor, so the file can be
    # closed right away. The encoding is stated explicitly because merge
    # codes for non-Latin scripts are not decodable under an ASCII locale.
    with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
        bpe = apply_bpe.BPE(codes=merge_file)
    # Bind this iteration's BPE as a default argument. A plain closure would
    # look `bpe` up at call time and every language pair would end up using
    # whichever BPE object was built last in the loop.
    engines[key]["src_segmenter"] = (
        lambda x, bpe=bpe: bpe.process_line(x.strip())
    )
else:
    engines[key]["src_segmenter"] = None
def preprocess_src(s, preprocess):
    """Normalize a source sentence before it is handed to the translator.

    The sentence is lowercased, typographic quotes are mapped to their ASCII
    forms, runs of punctuation are padded with spaces, and whitespace runs
    are collapsed to a single space. Each callable in ``preprocess`` is then
    applied in order.

    Args:
        s: Raw source sentence.
        preprocess: Iterable of callables taking and returning a string.
            Falsy entries (for example ``None``) are skipped.

    Returns:
        The normalized string. Leading/trailing spaces introduced by the
        punctuation padding are not stripped here.
    """
    s = s.lower()
    # Non-ASCII members are written as \u escapes so the patterns stay valid
    # even if this file is re-encoded.
    s = re.sub(r"[\u201c\u201d]", '"', s)  # curly double quotes -> "
    s = re.sub(r"[\u2018\u2019]", "'", s)  # curly single quotes -> '
    # NOTE(review): presumably the Devanagari visarga (U+0903), which looks
    # like a colon -- confirm against the training-data preprocessing.
    s = re.sub(r"\u0903", ":", s)
    # Surround each run of punctuation (including the danda) with spaces.
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    s = re.sub(r"(\s+)", r" ", s)
    for p in preprocess:
        if p:
            s = p(s)
    return s
def quotaposto(s, lang="en"):
s = re.sub(r"&quot;", r'"', s)
s = re.sub(r"&apos;", r"'", s)
@ -71,7 +97,9 @@ def translate_new(request):
# Fetch the two translators registered for the selected language pair.
translatorbest = engines[langspec]["translatorbest"]
translatorbigram = engines[langspec]["translatorbigram"]
# NOTE(review): this L1 is never read; it is reassigned two statements below.
L1 = toquotapos(sentence.strip())
# Source-side segmenter for the pair; it is None when the pair has no
# "src_bpe" configured, and preprocess_src skips falsy entries.
# NOTE(review): this lookup keys on request.session["langspec"] while the
# lines above key on `langspec` -- presumably the same value; confirm.
src_segmenter = engines[request.session["langspec"]]["src_segmenter"]
# L1 is the normalized (and optionally BPE-segmented) source sentence.
L1 = preprocess_src(sentence.strip(), [src_segmenter])
# L2 is the partial translation so far, also split on whitespace.
L2 = partial_trans
L2split = L2.split()

Просмотреть файл

@ -34,6 +34,23 @@ langspec = None
global translatorbest, translatorbigram
def preprocess_src(s, preprocess):
    """Clean up a source sentence prior to translation.

    Steps, in order: lowercase; replace typographic quotes with ASCII
    quotes; pad every run of punctuation with a space on each side; squeeze
    whitespace runs to one space; apply the ``preprocess`` callables.

    Args:
        s: Raw source sentence.
        preprocess: Iterable of string-to-string callables. Entries that are
            falsy (such as ``None``) are ignored.

    Returns:
        The cleaned string. It is not stripped, so punctuation at either end
        leaves a leading or trailing space.
    """
    s = s.lower()
    # Non-ASCII members are spelled as \u escapes so the patterns remain
    # valid regardless of how this file is encoded.
    s = re.sub(r"[\u201c\u201d]", '"', s)  # curly double quotes -> "
    s = re.sub(r"[\u2018\u2019]", "'", s)  # curly single quotes -> '
    # NOTE(review): presumably the Devanagari visarga (U+0903), visually a
    # colon -- confirm against the training-data preprocessing.
    s = re.sub(r"\u0903", ":", s)
    # Put spaces around each run of punctuation (the danda included).
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    s = re.sub(r"(\s+)", r" ", s)
    for p in preprocess:
        if p:
            s = p(s)
    return s
def quotaposto(s, lang="en"):
s = re.sub(r"&quot;", r'"', s)
s = re.sub(r"&apos;", r"'", s)
@ -77,6 +94,15 @@ def toquotapos(s, lang="en"):
# engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
# #translatorbiagram builds 5 best translations of length two
# Register a source-side BPE segmenter when the language spec names a
# merge-codes file under "src_bpe"; otherwise register None.
if value['src_bpe']:
    print("BPE in SRC side")
    bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
    # BPE loads the merge table in its constructor, so the handle can be
    # released immediately. UTF-8 is stated explicitly so non-Latin merge
    # codes do not depend on the process locale.
    with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
        bpe = apply_bpe.BPE(codes=merge_file)
    # Capture the current BPE object as a default argument. If this runs
    # once per language spec (as `key`/`value` suggest -- confirm), a plain
    # closure would make every segmenter use the last BPE that was built.
    engines[key]["src_segmenter"] = (
        lambda x, bpe=bpe: bpe.process_line(x.strip())
    )
else:
    engines[key]["src_segmenter"] = None
global corpusops
corpusops = []

Просмотреть файл

@ -8,4 +8,5 @@ djangorestframework
torch
torchtext
configargparse
nltk
nltk
subword-nmt

Просмотреть файл

@ -42,6 +42,9 @@
// Assign `text` according to the `lang` pair code.
if (lang == "en-hi") {
text = "The weather is pleasant today. Let us go out to in the evening."
}
// "en-ti" uses the same English sentence as "en-hi".
if (lang == "en-ti") {
text = "The weather is pleasant today. Let us go out to in the evening."
}
// Hindi source sentence for the "hi-en" pair.
if (lang == "hi-en") {
text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
}
@ -90,6 +93,7 @@
<div class="row py-2">
<label for="targetlang">Target Language</label>
<select class="form-control" id="tgt">
<option value="ti">Tigrinya</option>
<option value="en">English</option>
<option value="hi">Hindi</option>
<option value="ta">Tamil</option>