Add REST APIs for translation queries

This commit is contained in:
Anurag 2020-05-27 16:01:24 +05:30 коммит произвёл Sebastin Santy
Родитель b8c8d36476
Коммит b9bc997f6f
21 изменённых файлов: 776 добавлений и 19 удалений

Просмотреть файл

@ -52,6 +52,7 @@ INSTALLED_APPS = [
'django.contrib.sessions', 'django.contrib.sessions',
'django.contrib.messages', 'django.contrib.messages',
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'rest_framework',
'mt', 'mt',
'mtsimple', 'mtsimple',
'mtpara', 'mtpara',

Просмотреть файл

@ -20,10 +20,16 @@ from django.conf.urls.static import static
from django.contrib.staticfiles.urls import staticfiles_urlpatterns from django.contrib.staticfiles.urls import staticfiles_urlpatterns
urlpatterns = [ urlpatterns = [
path('', include('mt.urls')), path('', include('mt.urls')), #TODO: This redirects to simple/ can we do it directly?
path('simple/', include('mtsimple.urls')), path('simple/', include('mtsimple.urls')),
path('para/', include('mtpara.urls')), path('para/', include('mtpara.urls')),
# path('gpt/', include('gpt.urls')), # path('gpt/', include('gpt.urls')),
path('admin/', admin.site.urls), path('admin/', admin.site.urls),
path('accounts/', include('django.contrib.auth.urls')), path('accounts/', include('django.contrib.auth.urls')),
# REST FRAMEWORK URLS
path('api/simple/', include('mtsimple.api.urls')),
] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT) + staticfiles_urlpatterns() ] + static(settings.STATIC_URL, document_root=settings.STATIC_ROOT) + staticfiles_urlpatterns()

Двоичные данные
docs/html-documentation-generated.zip Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,23 @@
# Swagger Codegen Ignore
# Generated by swagger-codegen https://github.com/swagger-api/swagger-codegen
# Use this file to prevent files from being overwritten by the generator.
# The patterns follow closely to .gitignore or .dockerignore.
# As an example, the C# client generator defines ApiClient.cs.
# You can make changes and tell Swagger Codegen to ignore just this file by uncommenting the following line:
#ApiClient.cs
# You can match any string of characters against a directory, file or extension with a single asterisk (*):
#foo/*/qux
# The above matches foo/bar/qux and foo/baz/qux, but not foo/bar/baz/qux
# You can recursively match patterns against a directory, file or extension with a double asterisk (**):
#foo/**/qux
# This matches foo/bar/qux, foo/baz/qux, and foo/bar/baz/qux
# You can also negate patterns with an exclamation (!).
# For example, you can ignore all files in a docs folder with the file extension .md:
#docs/*.md
# Then explicitly reverse the ignore rule for a single file:
#!docs/README.md

Просмотреть файл

@ -0,0 +1 @@
3.0.19

Просмотреть файл

@ -0,0 +1,282 @@
<!doctype html>
<html>
<head>
<title>SIMPLE TRANSLATION API</title>
<style type="text/css">
body {
font-family: Trebuchet MS, sans-serif;
font-size: 15px;
color: #444;
margin-right: 24px;
}
h1 {
font-size: 25px;
}
h2 {
font-size: 20px;
}
h3 {
font-size: 16px;
font-weight: bold;
}
hr {
height: 1px;
border: 0;
color: #ddd;
background-color: #ddd;
}
.app-desc {
clear: both;
margin-left: 20px;
}
.param-name {
width: 100%;
}
.license-info {
margin-left: 20px;
}
.license-url {
margin-left: 20px;
}
.model {
margin: 0 0 0px 20px;
}
.method {
margin-left: 20px;
}
.method-notes {
margin: 10px 0 20px 0;
font-size: 90%;
color: #555;
}
pre {
padding: 10px;
margin-bottom: 2px;
}
.http-method {
text-transform: uppercase;
}
pre.get {
background-color: #0f6ab4;
}
pre.post {
background-color: #10a54a;
}
pre.put {
background-color: #c5862b;
}
pre.delete {
background-color: #a41e22;
}
.huge {
color: #fff;
}
pre.example {
background-color: #f3f3f3;
padding: 10px;
border: 1px solid #ddd;
}
code {
white-space: pre;
}
.nickname {
font-weight: bold;
}
.method-path {
font-size: 1.5em;
background-color: #0f6ab4;
}
.up {
float:right;
}
.parameter {
width: 500px;
}
.param {
width: 500px;
padding: 10px 0 0 20px;
font-weight: bold;
}
.param-desc {
width: 700px;
padding: 0 0 0 20px;
color: #777;
}
.param-type {
font-style: italic;
}
.param-enum-header {
width: 700px;
padding: 0 0 0 60px;
color: #777;
font-weight: bold;
}
.param-enum {
width: 700px;
padding: 0 0 0 80px;
color: #777;
font-style: italic;
}
.field-label {
padding: 0;
margin: 0;
clear: both;
}
.field-items {
padding: 0 0 15px 0;
margin-bottom: 15px;
}
.return-type {
clear: both;
padding-bottom: 10px;
}
.param-header {
font-weight: bold;
}
.method-tags {
text-align: right;
}
.method-tag {
background: none repeat scroll 0% 0% #24A600;
border-radius: 3px;
padding: 2px 10px;
margin: 2px;
color: #FFF;
display: inline-block;
text-decoration: none;
}
</style>
</head>
<body>
<h1>SIMPLE TRANSLATION API</h1>
<div class="app-desc">API for translation suggestions and attention scores</div>
<div class="app-desc">More information: <a href="https://helloreverb.com">https://helloreverb.com</a></div>
<div class="app-desc">Contact Info: <a href="mailto:hello@helloreverb.com">hello@helloreverb.com</a></div>
<div class="app-desc">Version: 1.0</div>
<div class="app-desc">BasePath:/I927/INMT-SIMPLE/1.0</div>
<div class="license-info">All rights reserved</div>
<div class="license-url">http://apache.org/licenses/LICENSE-2.0.html</div>
<h2>Access</h2>
<h2><a name="__Methods">Methods</a></h2>
[ Jump to <a href="#__Models">Models</a> ]
<h3>Table of Contents </h3>
<div class="method-summary"></div>
<h4><a href="#Simple">Simple</a></h4>
<ul>
<li><a href="#translateNew"><code><span class="http-method">get</span> /api/simple/translate_new</code></a></li>
</ul>
<h1><a name="Simple">Simple</a></h1>
<div class="method"><a name="translateNew"></a>
<div class="method-path">
<a class="up" href="#__Methods">Up</a>
<pre class="get"><code class="huge"><span class="http-method">get</span> /api/simple/translate_new</code></pre></div>
<div class="method-summary">get suggestions, attention scores, perplexity and average score for your partial translation (<span class="nickname">translateNew</span>)</div>
<div class="method-notes"></div>
<h3 class="field-label">Query parameters</h3>
<div class="field-items">
<div class="param">langspec (required)</div>
<div class="param-desc"><span class="param-type">Query Parameter</span> &mdash; The type of translation </div> <div class="param">sentence (required)</div>
<div class="param-desc"><span class="param-type">Query Parameter</span> &mdash; The sentence that is to be translated </div> <div class="param">partial_trans (required)</div>
<div class="param-desc"><span class="param-type">Query Parameter</span> &mdash; Partial translation done so far by the user </div> </div> <!-- field-items -->
<h3 class="field-label">Return type</h3>
<div class="return-type">
<a href="#inline_response_200">inline_response_200</a>
</div>
<!--Todo: process Response Object and its headers, schema, examples -->
<h3 class="field-label">Example data</h3>
<div class="example-data-content-type">Content-Type: application/json</div>
<pre class="example"><code>{
"result" : [ "result", "result" ],
"attn" : [ 0.8008281904610115, 0.8008281904610115 ],
"avg" : 1.4658129805029452,
"partial" : "partial",
"ppl" : 6.027456183070403
}</code></pre>
<h3 class="field-label">Produces</h3>
This API call produces the following media types according to the <span class="header">Accept</span> request header;
the media type will be conveyed by the <span class="header">Content-Type</span> response header.
<ul>
<li><code>application/json</code></li>
</ul>
<h3 class="field-label">Responses</h3>
<h4 class="field-label">200</h4>
An array containing result (the suggestions), attentions, partial translation by the user, perplexity and average score
<a href="#inline_response_200">inline_response_200</a>
<h4 class="field-label">400</h4>
Bad Request
<a href="#"></a>
<h4 class="field-label">500</h4>
Some internal server error
<a href="#"></a>
</div> <!-- method -->
<hr/>
<h2><a name="__Models">Models</a></h2>
[ Jump to <a href="#__Methods">Methods</a> ]
<h3>Table of Contents</h3>
<ol>
<li><a href="#inline_response_200"><code>inline_response_200</code></a></li>
</ol>
<div class="model">
<h3><a name="inline_response_200"><code>inline_response_200</code></a> <a class="up" href="#__Models">Up</a></h3>
<div class="field-items">
<div class="param">result (optional)</div><div class="param-desc"><span class="param-type"><a href="#string">array[String]</a></span> </div>
<div class="param">attn (optional)</div><div class="param-desc"><span class="param-type"><a href="#BigDecimal">array[BigDecimal]</a></span> </div>
<div class="param">partial (optional)</div><div class="param-desc"><span class="param-type"><a href="#string">String</a></span> </div>
<div class="param">ppl (optional)</div><div class="param-desc"><span class="param-type"><a href="#BigDecimal">BigDecimal</a></span> </div>
<div class="param">avg (optional)</div><div class="param-desc"><span class="param-type"><a href="#BigDecimal">BigDecimal</a></span> </div>
</div> <!-- field-items -->
</div>
</body>
</html>

Двоичные данные
docs/openapi-yaml-client-generated.zip Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,23 @@
# Swagger Codegen Ignore
# Generated by swagger-codegen https://github.com/swagger-api/swagger-codegen
# Use this file to prevent files from being overwritten by the generator.
# The patterns follow closely to .gitignore or .dockerignore.
# As an example, the C# client generator defines ApiClient.cs.
# You can make changes and tell Swagger Codegen to ignore just this file by uncommenting the following line:
#ApiClient.cs
# You can match any string of characters against a directory, file or extension with a single asterisk (*):
#foo/*/qux
# The above matches foo/bar/qux and foo/baz/qux, but not foo/bar/baz/qux
# You can recursively match patterns against a directory, file or extension with a double asterisk (**):
#foo/**/qux
# This matches foo/bar/qux, foo/baz/qux, and foo/bar/baz/qux
# You can also negate patterns with an exclamation (!).
# For example, you can ignore all files in a docs folder with the file extension .md:
#docs/*.md
# Then explicitly reverse the ignore rule for a single file:
#!docs/README.md

Просмотреть файл

@ -0,0 +1 @@
3.0.19

Просмотреть файл

Просмотреть файл

@ -0,0 +1,75 @@
openapi: 3.0.0
info:
title: SIMPLE TRANSLATION API
description: API for translation suggestions and attention scores
version: "1.0"
servers:
- url: https://virtserver.swaggerhub.com/I927/INMT-SIMPLE/1.0
description: SwaggerHub API Auto Mocking
paths:
/api/simple/translate_new:
get:
tags:
- simple
summary: get suggestions, attention scores, perplexity and average score for your
partial translation
operationId: translate_new
parameters:
- name: langspec
in: query
description: The type of translation
required: true
style: form
explode: true
schema:
type: string
- name: sentence
in: query
description: The sentence that is to be translated
required: true
style: form
explode: true
schema:
type: string
- name: partial_trans
in: query
description: Partial translation done so far by the user
required: true
style: form
explode: true
schema:
type: string
responses:
"200":
description: An array containing result (the suggestions), attentions, partial
translation by the user, perplexity and average score
content:
application/json:
schema:
$ref: '#/components/schemas/inline_response_200'
"400":
description: Bad Request
"500":
description: Some internal server error
components:
schemas:
inline_response_200:
type: object
properties:
result:
type: array
items:
type: string
attn:
type: array
items:
multipleOf: 0.1
type: number
partial:
type: string
ppl:
multipleOf: 0.1
type: number
avg:
multipleOf: 0.1
type: number

Просмотреть файл

@ -0,0 +1,69 @@
# Generated by Django 3.0.3 on 2020-05-19 16:12
from django.db import migrations, models
import django.db.models.deletion
import jsonfield.fields


class Migration(migrations.Migration):
    # Adds custom keyboard command/layout tables and alters two existing
    # 'mt' fields. Auto-generated: do not hand-edit the operations.

    dependencies = [
        # Must be applied after the previous 'mt' migration.
        ('mt', '0014_auto_20190606_1535'),
    ]

    operations = [
        # New table: a named set of integer key codes binding translator
        # keyboard shortcuts to editor actions.
        migrations.CreateModel(
            name='customKeyboardCommands',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('select_entire_suggestion', models.IntegerField()),
                ('select_single_word_from_suggestion', models.IntegerField()),
                ('navigate_to_next_corpus_fragment', models.IntegerField()),
                ('navigate_to_previous_corpus_fragment', models.IntegerField()),
                ('submit_translation', models.IntegerField()),
                ('select_next_translation_suggestion', models.IntegerField()),
                ('select_previous_translation_suggestion', models.IntegerField()),
                ('custom_layout_name', models.CharField(max_length=30)),
            ],
            options={
                'verbose_name': 'Custom Keyboard Command Set',
                'verbose_name_plural': 'Custom Keyboard Command Sets',
            },
        ),
        # Restrict corpus.helpprovision to the three supported assistance modes.
        migrations.AlterField(
            model_name='corpus',
            name='helpprovision',
            field=models.CharField(choices=[('IT', 'Interactive Translation'), ('PE', 'Post Editing'), ('BL', 'Baseline')], default='IT', max_length=2),
        ),
        # Store the keystroke series as JSON.
        migrations.AlterField(
            model_name='dockeystroke',
            name='keystrokeseries',
            field=jsonfield.fields.JSONField(),
        ),
        # Join table: which keyboard command set each translator uses
        # (unique per translator/command-set pair).
        migrations.CreateModel(
            name='translatorKeyboardLayouts',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('customKeyboardCommands', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='translatorconfigs', to='mt.customKeyboardCommands')),
                ('translator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='translatorconfigs', to='mt.translator')),
            ],
            options={
                'verbose_name': 'Translator Keyboard Layout Specified',
                'verbose_name_plural': 'Translator Keyboard Layout Specified',
                'unique_together': {('translator', 'customKeyboardCommands')},
            },
        ),
        # Join table: which corpora a translator may work on
        # (unique per translator/corpus pair).
        migrations.CreateModel(
            name='translatorcorpus',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('corpus', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='corpustranslators', to='mt.corpus')),
                ('translator', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='translatorcorpus', to='mt.translator')),
            ],
            options={
                'verbose_name': 'Translator Corpus Possible',
                'verbose_name_plural': '6. Translator Corpus Possible',
                'unique_together': {('translator', 'corpus')},
            },
        ),
    ]

Просмотреть файл

@ -43,6 +43,10 @@ from django.conf import settings
with open(os.path.join(dir_path, 'opt_data'), 'rb') as f: with open(os.path.join(dir_path, 'opt_data'), 'rb') as f:
opt = pickle.load(f) opt = pickle.load(f)
print("###########################DEBUG######################################")
print(opt) # This is the file that mentions model details
print(dir_path + "/opt_data")
print("#######################################################################")

0
mtsimple/api/__init__.py Normal file
Просмотреть файл

7
mtsimple/api/urls.py Normal file
Просмотреть файл

@ -0,0 +1,7 @@
from django.urls import path

from . import views

# URL patterns for the mtsimple REST API; the project urlconf mounts this
# module under the 'api/simple/' prefix.
urlpatterns = [
    # GET translate_new?langspec=..&sentence=..&partial_trans=.. -> JSON with
    # suggestions, attention scores, partial translation, perplexity and
    # average score (see views.translate_new).
    path('translate_new', views.translate_new, name='translate_new'),
]

191
mtsimple/api/views.py Normal file
Просмотреть файл

@ -0,0 +1,191 @@
from django.http import HttpResponse, JsonResponse
from rest_framework import status
from rest_framework.response import Response
from rest_framework.decorators import api_view
import re, os, math
import requests
import pickle
from indic_transliteration import sanscript
from onmt.translate.infertranslator import build_translator
from onmt.utils.parse import ArgumentParser
import mtsimple
# Project root: the parent of the directory containing the mtsimple package.
# Model checkpoints ('model/') and the pickled 'opt_data' live under it.
dir_path = os.path.dirname(os.path.dirname(mtsimple.__file__))

# TODO: Find a Way to not repeat the below starter code from mtsimple/views.py
# Configuration for each supported translation direction:
#   src/tgt      -- source/target language codes
#   model        -- checkpoint filename under <dir_path>/model/
#   indic_code   -- target script used to trigger transliteration of the
#                   user's partial input (None = no transliteration)
#   provide_help -- whether the raw partial input is echoed as an extra
#                   suggestion line
langspecs = {
    'en-hi' : {
        'src' : 'en',
        'tgt' : 'hi',
        'model': 'full_iitb_enhi_50v.pt',
        'indic_code': sanscript.DEVANAGARI,
        'provide_help' : True,
    },
    'hi-en' : {
        'src' : 'hi',
        'tgt' : 'en',
        'model': 'full_iitb_bpe_hien.pt',
        'indic_code': None,
        'provide_help' : False,
    },
    'hi-gondi' : {
        'src' : 'hi',
        'tgt' : 'gondi',
        'model': 'hi-gondi.pt',
        'indic_code': sanscript.DEVANAGARI,
        'provide_help' : False,
    },
    # '*-en' : {
    #     'src' : 'hi',
    #     'tgt' : 'en',
    #     'model': 'multiling.pt',
    #     'indic_code': None,
    #     'provide_help' : False,
    # }
}

# 'opt_data' is a pickled OpenNMT options object shared by all engines.
with open(os.path.join(dir_path, 'opt_data'), 'rb') as f:
    opt = pickle.load(f)

engines = {}
# The model engines are initialised here after loading opt (maybe it just specifies of how the model looks like?)
# NOTE(review): the single `opt` object is mutated and reused across loop
# iterations and engine builds — presumably build_translator consumes the
# values eagerly; confirm this in onmt before relying on it.
for key, value in langspecs.items():
    opt.models = [os.path.join(dir_path, 'model', value['model'])]
    opt.n_best = 1
    opt.max_length = 100
    opt.global_attention_function = 'sparsemax'
    ArgumentParser.validate_translate_opts(opt)
    engines[key] = {"translatorbest": build_translator(opt, report_score=True)}
    # translatorbest builds the best complete translation of the sentence
    opt.n_best = 5
    opt.max_length = 2
    opt.global_attention_function = 'sparsemax'
    ArgumentParser.validate_translate_opts(opt)
    engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
    # translatorbigram builds the best translations of length two
def quotaposto(s, lang="en"):
    """Turn model output back into display text.

    Undoes the XML-style quote escaping applied on the way into the model
    and strips BPE continuation markers ("@@ " mid-string, trailing "@@").
    ``lang`` is accepted for interface parity but is currently unused.
    """
    rules = (
        (r"&quot;", '"'),
        (r"&apos;", "'"),
        (r"(@@ )|(@@ ?$)", ""),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    # Work in progress (kept from the original): make spacing around certain
    # punctuation characters look natural.
    # s = re.sub(r"(\s+)([!:?,.।\']+)", r"\2", s)
    # s = re.sub(r"([({\[<]+)(\s+)", r"\1", s)
    # s = re.sub(r"(\s+)([)}\]>]+)", r"\2", s)
    return s
def toquotapos(s, lang="en"):
    # Normalise raw input text into the lower-cased, tokenised, XML-escaped
    # form the translation models expect (roughly the inverse of quotaposto).
    # NOTE(review): the character classes in the first three substitutions
    # look mangled — they likely lost Unicode characters (curly quotes /
    # Indic punctuation) in transit; confirm against the upstream source:
    #   * r"([\\”])" mixes a backslash with a curly double quote,
    #   * r"([\\])" maps a literal backslash to an apostrophe,
    #   * r"([\])" is an unterminated character set and raises re.error
    #     the first time this function runs.
    # if lang=="en":
    s = s.lower()
    s = re.sub(r"([\\”])", r'"', s)
    s = re.sub(r"([\\])", r"'", s)
    s = re.sub(r"([\])", r":", s)
    # Pad punctuation with spaces so each mark becomes its own token.
    s = re.sub(r"([-!$%^&*()_+|~=`{}\[\]:\";<>?,.\/#@।]+)", r" \1 ", s)
    # XML-escape quotes the same way the training data was escaped.
    s = re.sub(r'"', r'&quot;', s)
    s = re.sub(r"'", r"&apos;", s)
    # Collapse whitespace runs into single spaces.
    s = re.sub(r"(\s+)", r" ", s)
    return s
@api_view(['GET',])
def translate_new(request):
    """REST endpoint: GET /api/simple/translate_new

    Query parameters:
        langspec      -- translation direction, a key of ``langspecs``
                         (e.g. 'en-hi'); required.
        sentence      -- full source sentence to translate; required.
        partial_trans -- target-side prefix typed so far; optional ('').

    Returns a JsonResponse with:
        result  -- list of suggestion strings (best full translation first,
                   then deduplicated two-word continuations)
        attn    -- per-source-token flags (0 = covered by the partial
                   translation's attention, 1 = not yet covered)
        partial -- the (possibly transliterated) partial translation
        ppl     -- perplexity of the best translation
        avg     -- average per-word score

    Responds 400 (instead of an unhandled 500) when ``langspec`` is missing
    or unknown, or when ``sentence`` is missing, matching the API spec.
    """
    langspec = request.GET.get('langspec')
    sentence = request.GET.get('sentence')
    partial_trans = request.GET.get('partial_trans', '')

    # Validate inputs up front: previously a missing sentence raised
    # AttributeError and an unknown langspec raised KeyError (HTTP 500).
    if not langspec or langspec not in langspecs:
        return JsonResponse({'error': "unknown or missing 'langspec'"}, status=400)
    if sentence is None or not sentence.strip():
        return JsonResponse({'error': "missing 'sentence'"}, status=400)

    translatorbest = engines[langspec]["translatorbest"]
    translatorbigram = engines[langspec]["translatorbigram"]

    L1 = toquotapos(sentence.strip())  # tokenised full source sentence
    L2 = partial_trans                 # partial translation typed so far
    L2split = L2.split()
    if langspecs[langspec]['indic_code']:
        # If the last character typed is not Devanagari/whitespace, send the
        # trailing word to the Quillpad service for transliteration.
        if L2 != '' and bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])):
            params = {}
            params['inString'] = L2split[-1]
            params['lang'] = 'hindi'
            data = requests.get('http://xlit.quillpad.in/quillpad_backend2/processWordJSON', params=params).json()
            L2split[-1] = data['twords'][0]['options'][0]
            L2 = ' '.join(L2split)
            # L2 = transliterate(L2, sanscript.ITRANS, langspec['indic_code'])

    # Best full-sentence translation, with the attention matrix for coverage.
    something, pred, covatn2d, score_total, words_total = translatorbest.translate(
        src=[L1],
        tgt=None,
        src_dir='',
        batch_size=30,
        attn_debug=True,
        partial=toquotapos(L2)
    )
    # Best two-word continuations of the partial translation.
    scores, predictions, score_total, words_total = translatorbigram.translate(
        src=[L1],
        tgt=None,
        src_dir='',
        batch_size=30,
        attn_debug=False,
        partial=toquotapos(L2),
        dymax_len=2,
    )

    if L2 != '':
        # For each target token find the source token it attends to most,
        # then mark those source tokens as covered (0) by the partial input.
        transpattn = [*zip(*covatn2d)]
        attnind = [attn.index(max(attn)) for attn in transpattn]
        attndist = [[i for i, x in enumerate(attnind) if x == k] for k in range(len(L2.strip().split(" ")))]
        sumattn = [1] * len(L1.split(" "))
        for covered in attndist:
            for k in covered:
                sumattn[k] = 0
    else:
        # Nothing typed yet: every source token is still uncovered.
        sumattn = [1.00] * len(L1.split(" "))

    predictions = predictions[0]
    # Deduplicate the bigram continuations while preserving order.
    seen = set()
    seen_add = seen.add
    sentence = [quotaposto(L2 + x.capitalize()[len(L2):], langspecs[langspec]["tgt"]) + " " for x in predictions if not (x in seen or seen_add(x))]
    sentence = '\n'.join(sentence)
    # Prepend the best full translation (and, when help is enabled, the raw
    # partial input) as the leading suggestion line(s).
    if langspecs[langspec]['provide_help'] and L2:
        sentence = quotaposto(L2 + pred[0][0].capitalize()[len(L2):], langspecs[langspec]["tgt"]) + '\n' + L2 + '\n' + sentence
    else:
        sentence = quotaposto(L2 + pred[0][0].capitalize()[len(L2):], langspecs[langspec]["tgt"]) + '\n' + sentence

    perplexity = float(math.exp(-score_total / words_total))
    avg_score = float(score_total / words_total)
    return JsonResponse({'result': sentence.split('\n'), 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score})

Просмотреть файл

@ -29,6 +29,7 @@ import requests
import math import math
# defines the configuration of the translation type selected by the user
langspecs = { langspecs = {
'en-hi' : { 'en-hi' : {
'src' : 'en', 'src' : 'en',
@ -44,6 +45,14 @@ langspecs = {
'indic_code': None, 'indic_code': None,
'provide_help' : False, 'provide_help' : False,
}, },
'hi-gondi' : {
'src' : 'hi',
'tgt' : 'gondi',
'model': 'hi-gondi.pt',
'indic_code': sanscript.DEVANAGARI,
'provide_help' : False,
},
# '*-en' : { # '*-en' : {
# 'src' : 'hi', # 'src' : 'hi',
# 'tgt' : 'en', # 'tgt' : 'en',
@ -85,6 +94,7 @@ with open(os.path.join(dir_path, 'opt_data'), 'rb') as f:
opt = pickle.load(f) opt = pickle.load(f)
engines = {} engines = {}
# The model engines are initialised here after loading opt (maybe it just specifies of how the model looks like?)
for key, value in langspecs.items(): for key, value in langspecs.items():
opt.models = [os.path.join(dir_path, 'model', value['model'])] opt.models = [os.path.join(dir_path, 'model', value['model'])]
opt.n_best = 1 opt.n_best = 1
@ -92,19 +102,23 @@ for key, value in langspecs.items():
opt.global_attention_function = 'sparsemax' opt.global_attention_function = 'sparsemax'
ArgumentParser.validate_translate_opts(opt) ArgumentParser.validate_translate_opts(opt)
engines[key] = {"translatorbest": build_translator(opt, report_score=True)} engines[key] = {"translatorbest": build_translator(opt, report_score=True)}
#translatorbest builds the best complete translation of the sentence
opt.n_best = 5 opt.n_best = 5
opt.max_length = 2 opt.max_length = 2
opt.global_attention_function = 'sparsemax' opt.global_attention_function = 'sparsemax'
ArgumentParser.validate_translate_opts(opt) ArgumentParser.validate_translate_opts(opt)
engines[key]["translatorbigram"] = build_translator(opt, report_score=True) engines[key]["translatorbigram"] = build_translator(opt, report_score=True)
#translatorbiagram builds 5 best translations of length two
global corpusops global corpusops
corpusops = [] corpusops = []
# The view function for the first page url : simple/
def corpus(request): def corpus(request):
return render(request, 'simplecorpus.html') return render(request, 'simplecorpus.html')
#The view function called after setting languagespecs and getting the input in simple/ (called after corpusinput)
def translate(request): def translate(request):
return render(request, 'simpletranslate.html') return render(request, 'simpletranslate.html')
@ -112,7 +126,7 @@ def end(request):
return render(request, 'simpleend.html') return render(request, 'simpleend.html')
def split_sentences(st): def split_sentences(st):
#Split sentences based #Split sentences based on !?।|.
sentences = re.split(r'[!?।|.](?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', st) sentences = re.split(r'[!?।|.](?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)', st)
if sentences[-1]: if sentences[-1]:
@ -120,6 +134,12 @@ def split_sentences(st):
else: else:
return sentences[:-1] return sentences[:-1]
"""
The view function for getting the input for translation on the first page (simple/)
Splits the sentence based on !?| cleans it and saves the list in session["corpusinps"]
"""
def corpusinput(request): def corpusinput(request):
corpusraw = request.POST.get('translate') corpusraw = request.POST.get('translate')
langselect = request.POST.get('langselect') langselect = request.POST.get('langselect')
@ -128,7 +148,11 @@ def corpusinput(request):
request.session["langspec"] = langselect request.session["langspec"] = langselect
print(request.session["langspec"]) print(request.session["langspec"])
s = corpusraw.strip() s = corpusraw.strip()
print(s, "DEBUG: raw corpus before split_sentences")
spsent = [k.strip() for k in split_sentences(s)] spsent = [k.strip() for k in split_sentences(s)]
print(spsent, "DEBUG: raw corpus after split_sentences")
corpusinps = list(filter(lambda elem: elem.strip(), spsent)) corpusinps = list(filter(lambda elem: elem.strip(), spsent))
request.session["corpusinps"] = [[k, ''] for k in corpusinps] request.session["corpusinps"] = [[k, ''] for k in corpusinps]
print(request.session["corpusinps"]) print(request.session["corpusinps"])
@ -159,11 +183,22 @@ def indic(request):
def translate_new(request): def translate_new(request):
translatorbest = engines[request.session["langspec"]]["translatorbest"] translatorbest = engines[request.session["langspec"]]["translatorbest"]
translatorbigram = engines[request.session["langspec"]]["translatorbigram"] translatorbigram = engines[request.session["langspec"]]["translatorbigram"]
L1 = toquotapos(request.GET.get('a').strip()) print("Before processing")
L2 = request.GET.get('b', "") print("##########################")
print("##########################")
print(request.GET.get('a').strip())
print("##########################")
print("##########################")
print("##########################")
L1 = toquotapos(request.GET.get('a').strip()) # request.GET.get('a') contains the whole sentence to be translated
print("############After Processing########")
print((L1))
L2 = request.GET.get('b', "") # request.GET.get('b') contains the partial sentence to be translated
L2split = L2.split() L2split = L2.split()
if langspecs[request.session["langspec"]]['indic_code']: if langspecs[request.session["langspec"]]['indic_code']:
# print(L2[-1])
if L2 != '' and bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])): if L2 != '' and bool(re.search(r"([^\s\u0900-\u097F])", L2[-1])):
params = {} params = {}
params['inString'] = L2split[-1] params['inString'] = L2split[-1]
@ -173,7 +208,7 @@ def translate_new(request):
L2 = ' '.join(L2split) L2 = ' '.join(L2split)
# L2 = transliterate(L2, sanscript.ITRANS, langspec['indic_code']) # L2 = transliterate(L2, sanscript.ITRANS, langspec['indic_code'])
print(L2) print(L2, u'\u0900-\u097F')
something, pred, covatn2d, score_total, words_total = translatorbest.translate( something, pred, covatn2d, score_total, words_total = translatorbest.translate(
src=[L1], src=[L1],
@ -183,6 +218,8 @@ def translate_new(request):
attn_debug=True, attn_debug=True,
partial = toquotapos(L2) partial = toquotapos(L2)
) )
print("$$$$$$$$$$$$$$$$$$$$$$$$")
scores, predictions, score_total, words_total = translatorbigram.translate( scores, predictions, score_total, words_total = translatorbigram.translate(
src=[L1], src=[L1],
@ -195,11 +232,13 @@ def translate_new(request):
) )
# print(covatn2d) print(covatn2d, 'convatn2d')
if L2 != '': if L2 != '':
transpattn = [*zip(*covatn2d)] transpattn = [*zip(*covatn2d)]
attnind = [attn.index(max(attn)) for attn in transpattn] attnind = [attn.index(max(attn)) for attn in transpattn]
print('attnind', attnind)
attndist = [[ i for i, x in enumerate(attnind) if x==k] for k in range(len(L2.strip().split(" ")))] attndist = [[ i for i, x in enumerate(attnind) if x==k] for k in range(len(L2.strip().split(" ")))]
print('attndist', attndist)
sumattn = [1] * len(L1.split(" ")) sumattn = [1] * len(L1.split(" "))
for i in attndist: for i in attndist:
for k in i: for k in i:
@ -223,6 +262,7 @@ def translate_new(request):
sentence = [quotaposto(L2 + x.capitalize()[len(L2):], langspecs[request.session["langspec"]]["tgt"]) + " " for x in predictions if not (x in seen or seen_add(x))] sentence = [quotaposto(L2 + x.capitalize()[len(L2):], langspecs[request.session["langspec"]]["tgt"]) + " " for x in predictions if not (x in seen or seen_add(x))]
# sentence = [x.replace(L2, "") for x in sentence] # sentence = [x.replace(L2, "") for x in sentence]
sentence = '\n'.join(sentence) sentence = '\n'.join(sentence)
print("pred[0][0]", pred[0][0], pred[0][0][len(L2):])
if langspecs[request.session["langspec"]]['provide_help'] and L2: if langspecs[request.session["langspec"]]['provide_help'] and L2:
sentence = quotaposto(L2 + pred[0][0].capitalize()[len(L2):], langspecs[request.session["langspec"]]["tgt"]) + '\n' + L2 + '\n' + sentence sentence = quotaposto(L2 + pred[0][0].capitalize()[len(L2):], langspecs[request.session["langspec"]]["tgt"]) + '\n' + L2 + '\n' + sentence
else: else:
@ -231,6 +271,6 @@ def translate_new(request):
print(sentence) print(sentence)
perplexity = float(math.exp(-score_total / words_total)) perplexity = float(math.exp(-score_total / words_total))
avg_score = float(score_total / words_total) avg_score = float(score_total / words_total)
# print(scores) print("sentence", sentence)
# print(something, pred) # print(something, pred)
return JsonResponse({'result': sentence, 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score}) return JsonResponse({'result': sentence, 'attn': sumattn, 'partial': L2, 'ppl': perplexity, 'avg': avg_score})

Просмотреть файл

@ -307,6 +307,7 @@ class Translator(object):
* all_scores is a list of `batch_size` lists of `n_best` scores * all_scores is a list of `batch_size` lists of `n_best` scores
* all_predictions is a list of `batch_size` lists * all_predictions is a list of `batch_size` lists
of `n_best` predictions of `n_best` predictions
* attns is a list of attention scores for translation having highest cumilative log likelihood
""" """
self.dymax_len = dymax_len self.dymax_len = dymax_len
self.partialf = None self.partialf = None
@ -322,6 +323,7 @@ class Translator(object):
# Logic for partial and partialf # Logic for partial and partialf
if partial and partial != '': if partial and partial != '':
partials = partial.split() partials = partial.split()
print(partials, '~~~~partials~~~')
vocabdict = dict(self.fields)["tgt"].base_field.vocab vocabdict = dict(self.fields)["tgt"].base_field.vocab
# if vocabdict.stoi[partials[-1]] == 0: # if vocabdict.stoi[partials[-1]] == 0:
if partialfcheck: if partialfcheck:
@ -335,6 +337,9 @@ class Translator(object):
# self.partialf = [20.0] + [i[0] for i in sorted(editarr, key=lambda x: x[1])] # self.partialf = [20.0] + [i[0] for i in sorted(editarr, key=lambda x: x[1])]
self.partial = [vocabdict.stoi[x] for x in partials[:-1]] self.partial = [vocabdict.stoi[x] for x in partials[:-1]]
print("#########vocabdict.stoi########")
print(self.partial)
print("##################################")
self.partialf = [v for k, v in vocabdict.stoi.items() if k.startswith(partials[-1]) and v] self.partialf = [v for k, v in vocabdict.stoi.items() if k.startswith(partials[-1]) and v]
else: else:
@ -384,7 +389,7 @@ class Translator(object):
pred_score_total, pred_words_total = 0, 0 pred_score_total, pred_words_total = 0, 0
gold_score_total, gold_words_total = 0, 0 gold_score_total, gold_words_total = 0, 0
all_scores = [] all_scores = [] # I guess this is the cumilative log likelihood score of each sentence
all_predictions = [] all_predictions = []
start_time = time.time() start_time = time.time()
@ -396,6 +401,8 @@ class Translator(object):
translations = xlation_builder.from_batch(batch_data) translations = xlation_builder.from_batch(batch_data)
for trans in translations: for trans in translations:
print("Loop")
print(trans, trans.pred_sents)
all_scores += [trans.pred_scores[:self.n_best]] all_scores += [trans.pred_scores[:self.n_best]]
pred_score_total += trans.pred_scores[0] pred_score_total += trans.pred_scores[0]
pred_words_total += len(trans.pred_sents[0]) pred_words_total += len(trans.pred_sents[0])
@ -405,6 +412,12 @@ class Translator(object):
n_best_preds = [" ".join(pred) n_best_preds = [" ".join(pred)
for pred in trans.pred_sents[:self.n_best]] for pred in trans.pred_sents[:self.n_best]]
print("############n_best_preds###############")
print(n_best_preds)
print("############n_best_preds###############")
if self.report_align: if self.report_align:
align_pharaohs = [build_align_pharaoh(align) for align align_pharaohs = [build_align_pharaoh(align) for align
in trans.word_aligns[:self.n_best]] in trans.word_aligns[:self.n_best]]
@ -433,7 +446,7 @@ class Translator(object):
srcs = trans.src_raw srcs = trans.src_raw
else: else:
srcs = [str(item) for item in range(len(attns[0]))] srcs = [str(item) for item in range(len(attns[0]))]
output = report_matrix(srcs, preds, attns) output = report_matrix(srcs, preds, attns) # This prints attentions in output for the sentence having highest cumilative log likelihood score
if self.logger: if self.logger:
self.logger.info(output) self.logger.info(output)

Просмотреть файл

@ -387,6 +387,8 @@ function parseProcessedJsonResultsfunction(data, partial) {
var container = $('<div />'); var container = $('<div />');
// Code for adding suggestions//
var countcontainer = 0 var countcontainer = 0
finalresult = [] finalresult = []
for(var i = 0; i < result.length; i++) { for(var i = 0; i < result.length; i++) {

Просмотреть файл

@ -1,4 +1,4 @@
// This page serves as the script for simpletranslate.html
/* /*
************************************************* *************************************************
************************************************* *************************************************
@ -53,6 +53,7 @@ function sharedStart(feed, partial) {
part1text = partial.substring(0, lastspace) part1text = partial.substring(0, lastspace)
part2text = partial.substring(lastspace+1) part2text = partial.substring(lastspace+1)
var count = 0 var count = 0
console.log("DEBUG part1text", part1text, )
if (part1text) { if (part1text) {
newfeed = feed.replace(part1text + " ", '') newfeed = feed.replace(part1text + " ", '')
} else { } else {
@ -504,6 +505,7 @@ function parseProcessedJsonResultsfunction(data, partial) {
finalresult = [] finalresult = []
for(var i = 0; i < result.length; i++) { for(var i = 0; i < result.length; i++) {
var repres = sharedStart(result[i], partialret) var repres = sharedStart(result[i], partialret)
console.log(result[i] + '%%%%%%%%%%%%%%%%%%%%%%%%%%%')
if (repres !== "") { if (repres !== "") {
container.append('<span id="res'+countcontainer+'" class="res'+countcontainer+' spanres p-1"> ' + repres + '</span>'); container.append('<span id="res'+countcontainer+'" class="res'+countcontainer+' spanres p-1"> ' + repres + '</span>');
countcontainer += 1; countcontainer += 1;
@ -516,7 +518,7 @@ function parseProcessedJsonResultsfunction(data, partial) {
// Coloring the drop down box selections // Coloring the drop down box selections
partial.closest('.bmo').find('.dropdown').html(container); partial.closest('.bmo').find('.dropdown').html(container);
resetcolors('.res', $('.spanres').length) resetcolors('.res', $('.spanres').length)
$('.res' + selecte).css("background-color","#eee") $('.res' + selecte).css("background-color","#fff")
if (countcontainer>1) { if (countcontainer>1) {
partial.closest('.bmo').find('.dropdown').css('visibility', 'visible'); partial.closest('.bmo').find('.dropdown').css('visibility', 'visible');
} }
@ -528,7 +530,7 @@ function parseProcessedJsonResultsfunction(data, partial) {
for (m=0; m<attn.length; m++) { for (m=0; m<attn.length; m++) {
if (attn[m] != 0) { if (attn[m] != 0) {
// partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(255,0,0,' + attn[m] + ')') // partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(255,0,0,' + attn[m] + ')')
partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(255,0,0,0.5') partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(255,0,0,0.5)')
} }
else { else {
partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(0,255,0,0.5)') partial.closest('.bmo').find('.hin_inp_part' + m).css('background-color', 'rgba(0,255,0,0.5)')
@ -580,25 +582,28 @@ $(document).ready(function() {
inputs = data.result; inputs = data.result;
langspec = data.langspec langspec = data.langspec
// langtolangid = data.langtolangid; // langtolangid = data.langtolangid;
console.log(inputs) console.log(inputs)
$('#cardscoll').html('') $('#cardscoll').html('')
$('#corpusinput').html('') $('#corpusinput').html('')
for (i=0; i<inputs.length; i++) { for (i=0; i<inputs.length; i++) {
/*To set the source part of the page*/
if (langspec == 'hi-en') { if (langspec == 'hi-en') {
$('#corpusinput').append('<span class="corp_inp">' + inputs[i][0] + '</span>| ') $('#corpusinput').append('<span class="corp_inp">' + inputs[i][0] + '</span>| ') /* 1st index is the text with which the editable division is intitalised */
} else { } else {
$('#corpusinput').append('<span class="corp_inp">' + inputs[i][0] + '</span>. ') $('#corpusinput').append('<span class="corp_inp">' + inputs[i][0] + '</span>. ')
} }
/*--------------------------------*/
$('#cardscoll').append( $('#cardscoll').append(
`<div class="shadow p-3 my-3 rounded bmo cardescoll"> `<div class="shadow p-3 my-3 rounded bmo cardescoll">
<div class="row"> <div class="row">
<div class="col-9"> <div class="col-9">
<div class="hin_inp pb-2" contenteditable="false">`+ inputSpan(inputs[i][0]) + `</div> <div class="hin_inp pb-2" contenteditable="false">`+ inputSpan(inputs[i][0]) /*Wraps each word of sentence around span and returns*/ + `</div>
<div class="dropcontainer"> <div class="dropcontainer">
<div class="partcontainer"> <div class="partcontainer">
<div class="suggest transtext" contenteditable="false"></div> <div class="suggest transtext" contenteditable="false"></div>
<div class=" partial transtext" id="card` + i + `" contenteditable="true" <div class="partial transtext" id="card` + i + `" contenteditable="true"
data-tab=0 data-enter=0 data-up=0 data-down=0 data-others=0 data-pgup=0 data-pgdn=0 data-end=0 data-right=0 data-left=0 data-bkspc=0 data-time=0 data-tab=0 data-enter=0 data-up=0 data-down=0 data-others=0 data-pgup=0 data-pgdn=0 data-end=0 data-right=0 data-left=0 data-bkspc=0 data-time=0
>`+ inputs[i][1] + `</div> >`+ inputs[i][1] + `</div>
</div> </div>
@ -767,10 +772,16 @@ $(document).ready(function() {
var hin_inp = partial.closest('.bmo').find('.hin_inp') var hin_inp = partial.closest('.bmo').find('.hin_inp')
globalPartial = partial; globalPartial = partial;
console.log("#########################################3")
console.log("#########################################3")
console.log(partial.clone().children().remove().end().text())
console.log("#########################################4")
console.log("#########################################3")
if (sockets_use == true) { if (sockets_use == true) {
connectSocket.send(JSON.stringify({ connectSocket.send(JSON.stringify({
'partial_translation': partial.clone().children().remove().end().text(), 'partial_translation': partial.clone().children().remove().end().text(), // The text translated by user so far
'original': hin_inp.text(), 'original': hin_inp.text(), // The full sentence to be translated
'langspec': langspec 'langspec': langspec
})); }));
} }
@ -778,7 +789,7 @@ $(document).ready(function() {
//OLD, JANKY HTTP REQUEST!! //OLD, JANKY HTTP REQUEST!!
searchRequest = $.getJSON(http_translate, { searchRequest = $.getJSON(http_translate, {
a: hin_inp.text(), a: hin_inp.text(), // Maybe use some good names here?
b: partial.clone().children().remove().end().text() b: partial.clone().children().remove().end().text()
}, function(data) { }, function(data) {
// console.log(data) // console.log(data)

Просмотреть файл

@ -23,7 +23,7 @@
if (corpusinput) { if (corpusinput) {
$.ajax({ $.ajax({
type: "POST", type: "POST",
url: '/simple/corpusinput', url: '/simple/corpusinput', //corpusinput function in mtsimple/views.py
data: { data: {
'translate': corpusinput, 'translate': corpusinput,
'langselect': $('#src').find(":selected").val() + "-" + $('#tgt').find(":selected").val(), 'langselect': $('#src').find(":selected").val() + "-" + $('#tgt').find(":selected").val(),
@ -51,11 +51,18 @@
if (lang == "bn-en") { if (lang == "bn-en") {
text = "মৌসুমি বৃষ্টি একটি অভিশাপ দ্বারা আশীর্বাদ করা একটি আশীর্বাদ। যখন পরিমাণগত পরিমাণে বৃষ্টির পরিমাণ কম হয়, তখন এটি একটি আশীর্বাদের জন্য আমাদের পরে গরম তাপের গ্রীষ্ম। ফসলের প্রাচুর্যের কারণে এটি কৃষকদের জন্য একটি আশীর্বাদ। শুষ্ক গ্রীষ্মের পরে নদী ভরাট।" text = "মৌসুমি বৃষ্টি একটি অভিশাপ দ্বারা আশীর্বাদ করা একটি আশীর্বাদ। যখন পরিমাণগত পরিমাণে বৃষ্টির পরিমাণ কম হয়, তখন এটি একটি আশীর্বাদের জন্য আমাদের পরে গরম তাপের গ্রীষ্ম। ফসলের প্রাচুর্যের কারণে এটি কৃষকদের জন্য একটি আশীর্বাদ। শুষ্ক গ্রীষ্মের পরে নদী ভরাট।"
} }
if (lang == "hi-gondi") {
text = "आज मौसम सुहावना है। हमें शाम को बाहर जाना चाहिए।"
}
console.log(text) console.log(text)
$("#corpusinput").val(text); $("#corpusinput").val(text);
$("#corpusinput").focus(); $("#corpusinput").focus();
$("#corpusinput").trigger('autoresize'); $("#corpusinput").trigger('autoresize');
}); });
function limitTgtOptions() {
// TODO: Limit target language to only hindi
}
}); });
</script> </script>
@ -81,6 +88,7 @@
<select class="form-control" id="tgt"> <select class="form-control" id="tgt">
<option value="en">English</option value="en"> <option value="en">English</option value="en">
<option value="hi">Hindi</option value="en"> <option value="hi">Hindi</option value="en">
<option value="gondi" onselect="limitTgtOptions">Gondi</option value="en">
</select> </select>
</div> </div>