From 988d2cd52092bd5fa249d13a99207735d6e9ba3b Mon Sep 17 00:00:00 2001 From: Sebastin Santy Date: Tue, 21 Jul 2020 09:39:58 +0530 Subject: [PATCH] Add Tamil translation models --- mt/views.py | 2 +- mtsimple/api/views.py | 49 +++++++++++++++++++++++++------------ mtsimple/views.py | 4 +-- templates/simplecorpus.html | 9 +++++-- 4 files changed, 44 insertions(+), 20 deletions(-) diff --git a/mt/views.py b/mt/views.py index 30aded4..914ef25 100644 --- a/mt/views.py +++ b/mt/views.py @@ -70,7 +70,7 @@ langspecs = { } } -id_to_spec = {'2': 'hi-en', '1': 'en-hi'} +id_to_spec = {'2': 'hi-en', '1': 'en-hi', '3': 'ta-en', '4': 'en-ta'} translatordict = {} diff --git a/mtsimple/api/views.py b/mtsimple/api/views.py index 18b3fb6..04427f7 100644 --- a/mtsimple/api/views.py +++ b/mtsimple/api/views.py @@ -17,20 +17,34 @@ dir_path = os.path.dirname(os.path.dirname(mtsimple.__file__)) #TODO: Find a Way to not repeat the below starter code from mtsimple/views.py langspecs = { - 'en-hi' : { - 'src' : 'en', - 'tgt' : 'hi', - 'model': 'full_iitb_enhi_50v.pt', - 'indic_code': sanscript.DEVANAGARI, - 'provide_help' : True, - }, - 'hi-en' : { - 'src' : 'hi', + # 'en-hi' : { + # 'src' : 'en', + # 'tgt' : 'hi', + # 'model': 'full_iitb_enhi_50v.pt', + # 'indic_code': sanscript.DEVANAGARI, + # 'provide_help' : True, + # }d, + # 'hi-en' : { + # 'src' : 'hi', + # 'tgt' : 'en', + # 'model': 'onmt-hien.pt', + # 'indic_code': None, + # 'provide_help' : False, + # }, + 'ta-en' : { + 'src' : 'ta', 'tgt' : 'en', - 'model': 'onmt-hien.pt', + 'model': 'taen_final_step_100000.pt', 'indic_code': None, 'provide_help' : False, }, + 'en-ta' : { + 'src' : 'en', + 'tgt' : 'ta', + 'model': 'enta_final_step_100000.pt', + 'indic_code': None, + 'provide_help' : True, + }, } with open(os.path.join(dir_path, 'opt_data'), 'rb') as f: @@ -57,6 +71,7 @@ def quotaposto(s, lang="en"): s = re.sub(r""", r'"', s) s = re.sub(r"'", r"'", s) s = re.sub(r"(@@ )|(@@ ?$)", r"", s) + s = re.sub(r"<|unk|>", r"", s) #This is work in progress to make writing as natural as possible. taking care of spaces before and after certain characters. # s = re.sub(r"(\s+)([!:?,.।\']+)", r"\2", s) # s = re.sub(r"([({\[<]+)(\s+)", r"\1", s) @@ -88,11 +103,14 @@ def translate_new(request): L2 = partial_trans L2split = L2.split() - if langspecs[langspec]['indic_code']: - if L2 != '' and bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])): + if langspecs[langspec]['provide_help']: + if L2 != '' and (bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])) or bool(re.search(r"([^\s\u0B80-\u0BFF])", L2[-1]))): params = {} params['inString'] = L2split[-1] - params['lang'] = 'hindi' + if langspecs[langspec]['tgt'] == 'ta': + params['lang'] = 'tamil' + if langspecs[langspec]['tgt'] == 'hi': + params['lang'] = 'hindi' data = requests.get('http://xlit.quillpad.in/quillpad_backend2/processWordJSON', params = params).json() L2split[-1] = data['twords'][0]['options'][0] L2 = ' '.join(L2split) @@ -155,6 +173,7 @@ def translate_new(request): perplexity = float(math.exp(-score_total / words_total)) avg_score = float(score_total / words_total) - print("sentence", sentence) - return JsonResponse({'result': sentence, 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score}) + print("sentence", quotaposto(sentence)) + print(quotaposto("என் என்னை")) + return JsonResponse({'result': quotaposto(sentence), 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score}) diff --git a/mtsimple/views.py b/mtsimple/views.py index c2ea99a..3db9d37 100644 --- a/mtsimple/views.py +++ b/mtsimple/views.py @@ -142,8 +142,8 @@ Splits the sentence based on !?।| cleans it and saves the list in session["cor def corpusinput(request): corpusraw = request.POST.get('translate') langselect = request.POST.get('langselect') - if langselect not in langspecs: - langselect = '*-en' + # if langselect not in langspecs: + # langselect = '*-en' request.session["langspec"] = langselect s = corpusraw.strip() diff --git a/templates/simplecorpus.html b/templates/simplecorpus.html index e9be5a1..37a7d8f 100644 --- a/templates/simplecorpus.html +++ b/templates/simplecorpus.html @@ -45,6 +45,9 @@ if (lang == "hi-en") { text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।" } + if (lang == "en-ta") { + text = "The weather is pleasant today. Let us go out to in the evening." + } if (lang == "ml-en") { text = "മൺസൂൺ മഴ ഒരു അനുഗ്രഹവും ശാപവുമാണെന്ന് വിശ്വസിക്കപ്പെടുന്നു. ആവശ്യത്തിന് അളവിൽ മഴ പെയ്യുമ്പോൾ, വേനൽക്കാലത്തെ കടുത്ത ചൂടിനുശേഷം ഇത് ഞങ്ങൾക്ക് ഒരു അനുഗ്രഹമാണ്. വിളകൾ ധാരാളമായി വളരുന്നതിനാൽ ഇത് കർഷകർക്കും ഒരു അനുഗ്രഹമാണ്. വരണ്ട വേനൽക്കാലത്ത് നദികൾ നിറയുന്നു." } @@ -79,7 +82,8 @@ @@ -88,7 +92,8 @@