From 0586ffef2e2f77de55f933b8561051e6c7918e71 Mon Sep 17 00:00:00 2001
From: Alp
Date: Fri, 10 Jul 2020 11:30:13 +0200
Subject: [PATCH] source side bpe

---
 config.json                 |  6 +++++-
 mtsimple/api/views.py       | 42 +++++++++++++++++++++++++++++++++++++++++-
 mtsimple/views.py           | 38 ++++++++++++++++++++++++++++++++++++++
 requirements.txt            |  3 ++-
 templates/simplecorpus.html |  4 ++++
 5 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/config.json b/config.json
index 7dae0c5..54d5f4b 100644
--- a/config.json
+++ b/config.json
@@ -4,13 +4,15 @@
         "src": "ta",
         "tgt": "en",
         "model": "taen_final_step_100000.pt",
+        "src_bpe": "bpe/bpe_codes_taen",
         "provide_help": false,
-        "active": false
+        "active": true
     },
     "en-ta": {
         "src": "en",
         "tgt": "ta",
         "model": "enta_final_step_100000.pt",
+        "src_bpe": null,
         "provide_help": true,
         "active": false
     },
@@ -18,6 +20,7 @@
         "src": "hi",
         "tgt": "en",
         "model": "onmt-hien.pt",
+        "src_bpe": null,
         "provide_help": false,
         "active": true
     },
@@ -25,6 +28,7 @@
         "src": "en",
         "tgt": "hi",
         "model": "full_iitb_enhi_50v.pt",
+        "src_bpe": null,
         "provide_help": true,
         "active": true
     }
diff --git a/mtsimple/api/views.py b/mtsimple/api/views.py
index 25eb9c7..1bc75d8 100644
--- a/mtsimple/api/views.py
+++ b/mtsimple/api/views.py
@@ -8,6 +8,7 @@ import pickle
 import json
 
 from indic_transliteration import sanscript
+from subword_nmt import apply_bpe
 
 from onmt.translate.infertranslator import build_translator
 
@@ -39,6 +40,43 @@ for key, value in langspecs.items():
     engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
     #translatorbiagram builds best translations of length two
 
+    # Optional source-side BPE: "src_bpe" names a merge-codes file under model/.
+    if value.get('src_bpe'):
+        print("BPE in SRC side")
+        bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
+        # BPE() consumes the codes in its constructor, so close the file right away.
+        with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
+            bpe = apply_bpe.BPE(codes=merge_file)
+        # Default-arg binding: a plain closure would resolve `bpe` at call time and
+        # every engine would end up using the BPE loaded last in this loop.
+        engines[key]["src_segmenter"] = lambda x, bpe=bpe: bpe.process_line(x.strip())
+    else:
+        engines[key]["src_segmenter"] = None
+
+def preprocess_src(s, preprocess):
+    """Normalise a source sentence, then run the optional segmenters.
+
+    Lower-cases ``s``, maps curly quotes to ASCII quotes and the visarga
+    to ``:``, pads each run of punctuation with spaces and collapses
+    whitespace runs. Every truthy callable in ``preprocess`` (e.g. a BPE
+    segmenter) is then applied in order; falsy entries are skipped.
+    Returns the resulting string.
+    """
+    s = s.lower()
+    s = re.sub(r"([\“\”])", r'"', s)
+    s = re.sub(r"([\‘\’])", r"'", s)
+    s = re.sub(r"([\ः])", r":", s)
+    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
+    # s = re.sub(r'"', r'"', s)
+    # s = re.sub(r"'", r"'", s)
+    s = re.sub(r"(\s+)", r" ", s)
+
+    for p in preprocess:
+        if p:
+            s = p(s)
+
+    return s
+
 def quotaposto(s, lang="en"):
     s = re.sub(r"&quot;", r'"', s)
     s = re.sub(r"&apos;", r"'", s)
@@ -71,7 +109,9 @@ def translate_new(request):
     translatorbest = engines[langspec]["translatorbest"]
     translatorbigram = engines[langspec]["translatorbigram"]
 
-    L1 = toquotapos(sentence.strip())
+    src_segmenter = engines[request.session["langspec"]]["src_segmenter"]
+
+    L1 = preprocess_src(sentence.strip(), [src_segmenter])
     L2 = partial_trans
     L2split = L2.split()
 
diff --git a/mtsimple/views.py b/mtsimple/views.py
index b25ad50..a747d0f 100644
--- a/mtsimple/views.py
+++ b/mtsimple/views.py
@@ -34,6 +34,31 @@ langspec = None
 global translatorbest, translatorbigram
 
 
+def preprocess_src(s, preprocess):
+    """Normalise a source sentence, then run the optional segmenters.
+
+    Lower-cases ``s``, maps curly quotes to ASCII quotes and the visarga
+    to ``:``, pads each run of punctuation with spaces and collapses
+    whitespace runs. Every truthy callable in ``preprocess`` (e.g. a BPE
+    segmenter) is then applied in order; falsy entries are skipped.
+    Returns the resulting string.
+    """
+    s = s.lower()
+    s = re.sub(r"([\“\”])", r'"', s)
+    s = re.sub(r"([\‘\’])", r"'", s)
+    s = re.sub(r"([\ः])", r":", s)
+    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
+    # s = re.sub(r'"', r'"', s)
+    # s = re.sub(r"'", r"'", s)
+    s = re.sub(r"(\s+)", r" ", s)
+
+    for p in preprocess:
+        if p:
+            s = p(s)
+
+    return s
+
+
 def quotaposto(s, lang="en"):
     s = re.sub(r"&quot;", r'"', s)
     s = re.sub(r"&apos;", r"'", s)
@@ -77,6 +102,19 @@ def toquotapos(s, lang="en"):
 #     engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
 #     #translatorbiagram builds 5 best translations of length two
 
+    # Optional source-side BPE: "src_bpe" names a merge-codes file under model/.
+    if value.get('src_bpe'):
+        print("BPE in SRC side")
+        bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
+        # BPE() consumes the codes in its constructor, so close the file right away.
+        with open(bpe_src_code, "r", encoding="utf-8") as merge_file:
+            bpe = apply_bpe.BPE(codes=merge_file)
+        # Default-arg binding keeps this BPE object with the segmenter; a plain
+        # closure would resolve `bpe` at call time, after any later rebinding.
+        engines[key]["src_segmenter"] = lambda x, bpe=bpe: bpe.process_line(x.strip())
+    else:
+        engines[key]["src_segmenter"] = None
+
 global corpusops
 corpusops = []
 
diff --git a/requirements.txt b/requirements.txt
index 3a2764a..3c57c14 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ djangorestframework
 torch
 torchtext
 configargparse
-nltk
\ No newline at end of file
+nltk
+subword-nmt
diff --git a/templates/simplecorpus.html b/templates/simplecorpus.html
index 37a7d8f..83e4e24 100644
--- a/templates/simplecorpus.html
+++ b/templates/simplecorpus.html
@@ -42,6 +42,9 @@
             if (lang == "en-hi") {
                 text = "The weather is pleasant today. Let us go out to in the evening."
             }
+            if (lang == "en-ti") {
+                text = "The weather is pleasant today. Let us go out to in the evening."
+            }
             if (lang == "hi-en") {
                 text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
             }
@@ -90,6 +93,7 @@