Add Tamil translation models

2020-07-21 09:39:58 +05:30 · 2020-07-21 09:39:58 +05:30 · 988d2cd520
--- a/mt/views.py
+++ b/mt/views.py
@ -70,7 +70,7 @@ langspecs = {
    }
 }
-id_to_spec = {'2': 'hi-en', '1': 'en-hi'}
+id_to_spec = {'2': 'hi-en', '1': 'en-hi', '3': 'ta-en', '4': 'en-ta'}
 translatordict = {}
--- a/mtsimple/api/views.py
+++ b/mtsimple/api/views.py
@ -17,20 +17,34 @@ dir_path = os.path.dirname(os.path.dirname(mtsimple.__file__))
 #TODO: Find a Way to not repeat the below starter code from mtsimple/views.py
 langspecs = {
-    'en-hi' : {
+    # 'en-hi' : {
-        'src' : 'en',
+    #     'src' : 'en',
-        'tgt' : 'hi',
+    #     'tgt' : 'hi',
-        'model': 'full_iitb_enhi_50v.pt',
+    #     'model': 'full_iitb_enhi_50v.pt',
-        'indic_code': sanscript.DEVANAGARI,
+    #     'indic_code': sanscript.DEVANAGARI,
-        'provide_help' : True,
+    #     'provide_help' : True,
-    },
+    # },
-    'hi-en' : {
+    # 'hi-en' : {
-        'src' : 'hi',
+    #     'src' : 'hi',
    #     'tgt' : 'en',
    #     'model': 'onmt-hien.pt',
    #     'indic_code': None,
    #     'provide_help' : False,
    # },
    'ta-en' : {
        'src' : 'ta',
        'tgt' : 'en',
-        'model': 'onmt-hien.pt',
+        'model': 'taen_final_step_100000.pt',
        'indic_code': None,
        'provide_help' : False,
    },
    'en-ta' : {
        'src' : 'en',
        'tgt' : 'ta',
        'model': 'enta_final_step_100000.pt',
        'indic_code': None,
        'provide_help' : True,
    },
 }
 with open(os.path.join(dir_path, 'opt_data'), 'rb') as f:
@ -57,6 +71,7 @@ def quotaposto(s, lang="en"):
    s = re.sub(r"&quot;", r'"', s)
    s = re.sub(r"&apos;", r"'", s)
    s = re.sub(r"(@@ )|(@@ ?$)", r"", s)
    s = re.sub(r"<|unk|>", r"", s)
    #This is work in progress to make writing as natural as possible. taking care of spaces before and after certain characters.
    # s = re.sub(r"(\s+)([!:?,.।\']+)", r"\2", s)
    # s = re.sub(r"([({\[<]+)(\s+)", r"\1", s)
@ -88,11 +103,14 @@ def translate_new(request):
    L2 = partial_trans 
    L2split = L2.split()
-    if langspecs[langspec]['indic_code']:
+    if langspecs[langspec]['provide_help']:
-        if L2 != '' and bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])):
+        if L2 != '' and (bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])) or bool(re.search(r"([^\s\u0B80-\u0BFF])", L2[-1]))):
            params = {}
            params['inString'] = L2split[-1]
-            params['lang'] = 'hindi'
+            if langspecs[langspec]['tgt'] == 'ta':
                params['lang'] = 'tamil'
            if langspecs[langspec]['tgt'] == 'hi':
                params['lang'] = 'hindi'
            data = requests.get('http://xlit.quillpad.in/quillpad_backend2/processWordJSON', params = params).json()
            L2split[-1] = data['twords'][0]['options'][0]
            L2 = ' '.join(L2split)
@ -155,6 +173,7 @@ def translate_new(request):
    perplexity = float(math.exp(-score_total / words_total))
    avg_score = float(score_total / words_total)
-    print("sentence", sentence)
+    print("sentence", quotaposto(sentence))
-    return JsonResponse({'result': sentence, 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score})
+    print(quotaposto("என் <unk> என்னை"))
    return JsonResponse({'result': quotaposto(sentence), 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score})
--- a/mtsimple/views.py
+++ b/mtsimple/views.py
@ -142,8 +142,8 @@ Splits the sentence based on !?।| cleans it and saves the list in session["cor
 def corpusinput(request):
    corpusraw = request.POST.get('translate')
    langselect = request.POST.get('langselect')
-    if langselect not in langspecs:
+    # if langselect not in langspecs:
-        langselect = '*-en'
+        # langselect = '*-en'
    request.session["langspec"] = langselect
    s = corpusraw.strip()
--- a/templates/simplecorpus.html
+++ b/templates/simplecorpus.html
@ -45,6 +45,9 @@
                if (lang == "hi-en") {
                  text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
                }
                if (lang == "en-ta") {
                text = "The weather is pleasant today. Let us go out to in the evening."
                }
                if (lang == "ml-en") {
                  text = "മൺസൂൺ മഴ ഒരു അനുഗ്രഹവും ശാപവുമാണെന്ന് വിശ്വസിക്കപ്പെടുന്നു. ആവശ്യത്തിന് അളവിൽ മഴ പെയ്യുമ്പോൾ, വേനൽക്കാലത്തെ കടുത്ത ചൂടിനുശേഷം ഇത് ഞങ്ങൾക്ക് ഒരു അനുഗ്രഹമാണ്. വിളകൾ ധാരാളമായി വളരുന്നതിനാൽ ഇത് കർഷകർക്കും ഒരു അനുഗ്രഹമാണ്. വരണ്ട വേനൽക്കാലത്ത് നദികൾ നിറയുന്നു."
                }
@ -79,7 +82,8 @@
                    <label for="sourcelang">Source Language</label>
                    <select class="form-control"  id="src">
                      <option value="hi">Hindi</option>
-                      <option value="en">English</option value="en">
+                      <option value="en">English</option>
                      <option value="ta">Tamil</option>
                    </select>
                  </div>
@ -88,7 +92,8 @@
                    <select class="form-control" id="tgt">
                      <option value="en">English</option value="en">
                      <option value="hi">Hindi</option value="en">
-                      <option value="gondi" onselect="limitTgtOptions">Gondi</option value="en">
+                      <option value="ta">Tamil</option>
                      <!-- <option value="gondi" onselect="limitTgtOptions">Gondi</option value="en"> -->
                    </select>
                  </div>