source side bpe

2020-07-10 11:30:13 +02:00 · 2020-07-10 11:30:13 +02:00 · 0586ffef2e
--- a/config.json
+++ b/config.json
@ -4,13 +4,15 @@
            "src": "ta",
            "tgt": "en",
            "model": "taen_final_step_100000.pt",
+            "src_bpe": "bpe/bpe_codes_taen",
            "provide_help": false,
-            "active": false
+            "active": true
        },
        "en-ta": {
            "src": "en",
            "tgt": "ta",
            "model": "enta_final_step_100000.pt",
+            "src_bpe": null,
            "provide_help": true,
            "active": false
        },
@ -18,6 +20,7 @@
            "src": "hi",
            "tgt": "en",
            "model": "onmt-hien.pt",
+            "src_bpe": null,
            "provide_help": false,
            "active": true
        },
@ -25,6 +28,7 @@
            "src": "en",
            "tgt": "hi",
            "model": "full_iitb_enhi_50v.pt",
+            "src_bpe": null,
            "provide_help": true,
            "active": true
        }
--- a/mtsimple/api/views.py
+++ b/mtsimple/api/views.py
@ -8,6 +8,7 @@ import pickle
 import json

 from indic_transliteration import sanscript
+from subword_nmt import apply_bpe


 from onmt.translate.infertranslator import build_translator
@ -39,6 +40,31 @@ for key, value in langspecs.items():
    engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
    #translatorbiagram builds best translations of length two

+    if value['src_bpe']:
+        print("BPE in SRC side")
+        bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
+        merge_file = open(bpe_src_code, "r")
+        bpe = apply_bpe.BPE(codes=merge_file)
+        engines[key]["src_segmenter"] = lambda x: bpe.process_line(x.strip())
+    else:
+        engines[key]["src_segmenter"] = None
+
+def preprocess_src(s, preprocess):  # normalize source text `s`, then run each truthy callable in `preprocess` over it
+    s = s.lower()  # lowercase the whole input
+    s = re.sub(r"([\“\”])", r'"', s)  # curly double quotes -> straight double quote
+    s = re.sub(r"([\‘\’])", r"'", s)  # curly single quotes -> straight apostrophe
+    s = re.sub(r"([\ः])", r":", s)  # Devanagari visarga -> ASCII colon; must run before the padding step below
+    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)  # put a space on both sides of every run of the listed punctuation
+    # s = re.sub(r'"', r'&quot;', s)
+    # s = re.sub(r"'", r"&apos;", s)
+    s = re.sub(r"(\s+)", r" ", s)  # collapse each whitespace run to one space; leading/trailing space is not stripped
+
+    for p in preprocess:  # `preprocess` must be iterable; steps are applied in order
+        if p:  # skip falsy entries such as None
+            s = p(s)  # each step receives the previous step's output
+    
+    return s  # the normalized string after all steps have run
+
 def quotaposto(s, lang="en"):
    s = re.sub(r"&quot;", r'"', s)
    s = re.sub(r"&apos;", r"'", s)
@ -71,7 +97,9 @@ def translate_new(request):
    translatorbest = engines[langspec]["translatorbest"]
    translatorbigram = engines[langspec]["translatorbigram"]

-    L1 = toquotapos(sentence.strip()) 
+    src_segmenter = engines[request.session["langspec"]]["src_segmenter"]
+
+    L1 = preprocess_src(sentence.strip(), [src_segmenter]) 
    L2 = partial_trans 
    L2split = L2.split()

--- a/mtsimple/views.py
+++ b/mtsimple/views.py
@ -34,6 +34,23 @@ langspec = None

 global translatorbest, translatorbigram

+def preprocess_src(s, preprocess):  # normalize source text `s`, then run each truthy callable in `preprocess` over it
+    s = s.lower()  # lowercase the whole input
+    s = re.sub(r"([\“\”])", r'"', s)  # curly double quotes -> straight double quote
+    s = re.sub(r"([\‘\’])", r"'", s)  # curly single quotes -> straight apostrophe
+    s = re.sub(r"([\ः])", r":", s)  # Devanagari visarga -> ASCII colon; must run before the padding step below
+    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)  # put a space on both sides of every run of the listed punctuation
+    # s = re.sub(r'"', r'&quot;', s)
+    # s = re.sub(r"'", r"&apos;", s)
+    s = re.sub(r"(\s+)", r" ", s)  # collapse each whitespace run to one space; leading/trailing space is not stripped
+
+    for p in preprocess:  # `preprocess` must be iterable; steps are applied in order
+        if p:  # skip falsy entries such as None
+            s = p(s)  # each step receives the previous step's output
+    
+    return s  # the normalized string after all steps have run
+
+
 def quotaposto(s, lang="en"):
    s = re.sub(r"&quot;", r'"', s)
    s = re.sub(r"&apos;", r"'", s)
@ -77,6 +94,15 @@ def toquotapos(s, lang="en"):
 #     engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
 #     #translatorbiagram builds 5 best translations of length two

+    if value['src_bpe']:
+        print("BPE in SRC side")
+        bpe_src_code = os.path.join(dir_path, 'model', value['src_bpe'])
+        merge_file = open(bpe_src_code, "r")
+        bpe = apply_bpe.BPE(codes=merge_file)
+        engines[key]["src_segmenter"] = lambda x: bpe.process_line(x.strip())
+    else:
+        engines[key]["src_segmenter"] = None
+
 global corpusops
 corpusops = []

--- a/requirements.txt
+++ b/requirements.txt
@ -8,4 +8,5 @@ djangorestframework
 torch
 torchtext
 configargparse
-nltk
+nltk
+subword-nmt
--- a/templates/simplecorpus.html
+++ b/templates/simplecorpus.html
@ -42,6 +42,9 @@
                if (lang == "en-hi") {
                  text = "The weather is pleasant today. Let us go out to in the evening."
                }
+                if (lang == "en-ti") {
+                  text = "The weather is pleasant today. Let us go out to in the evening."
+                }
                if (lang == "hi-en") {
                  text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
                }
@ -90,6 +93,7 @@
                  <div class="row py-2">
                    <label for="targetlang">Target Language</label>
                    <select class="form-control" id="tgt">
+                      <option value="ti">Tigrinya</option value="ti">
                      <option value="en">English</option value="en">
                      <option value="hi">Hindi</option value="en">
                      <option value="ta">Tamil</option>