diff --git a/README.md b/README.md index c18bb9d..84f6ba5 100644 --- a/README.md +++ b/README.md @@ -92,3 +92,4 @@ Suffix of the model file in the registry: - Ukrainian <-> English - Dutch <-> English - Catalan -> English +- Hungarian -> English diff --git a/evaluation/dev/bleu-results.md b/evaluation/dev/bleu-results.md index e7294fd..7094bcc 100644 --- a/evaluation/dev/bleu-results.md +++ b/evaluation/dev/bleu-results.md @@ -56,15 +56,26 @@ Both absolute and relative differences in BLEU scores between Bergamot and other ## avg -| Translator/Dataset | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 33.69 | 27.30 | 29.44 | 17.30 | 29.65 | 35.93 | 28.70 | 38.00 | 26.30 | 23.40 | -| google | 38.20 (+4.51, +13.38%) | 29.30 (+2.00, +7.33%) | 34.49 (+5.05, +17.15%) | 27.80 (+10.50, +60.69%) | 33.05 (+3.40, +11.47%) | 42.43 (+6.50, +18.09%) | 40.85 (+12.15, +42.33%) | 48.95 (+10.95, +28.82%) | 32.63 (+6.33, +24.08%) | 38.90 (+15.50, +66.24%) | -| microsoft | 38.38 (+4.68, +13.90%) | 28.80 (+1.50, +5.49%) | 33.62 (+4.18, +14.21%) | 20.50 (+3.20, +18.50%) | 32.60 (+2.95, +9.95%) | 42.30 (+6.37, +17.72%) | 36.15 (+7.45, +25.96%) | 46.50 (+8.50, +22.37%) | 32.03 (+5.73, +21.80%) | 38.17 (+14.77, +63.11%) | +| Translator/Dataset | hu-en | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 26.53 | 33.69 | 27.30 | 29.44 | 17.30 | 29.65 | 35.93 | 28.70 | 38.00 | 26.30 | 23.40 | +| google | 31.30 (+4.77, +18.00%) | 38.20 (+4.51, +13.38%) | 29.30 (+2.00, +7.33%) | 34.49 (+5.05, +17.15%) | 27.80 (+10.50, +60.69%) | 33.05 (+3.40, +11.47%) | 42.43 (+6.50, +18.09%) | 40.85 (+12.15, +42.33%) | 48.95 (+10.95, +28.82%) | 32.63 (+6.33, +24.08%) | 38.90 (+15.50, +66.24%) | +| microsoft | 31.03 (+4.50, +16.97%) | 38.38 (+4.68, +13.90%) | 28.80 (+1.50, +5.49%) | 33.62 (+4.18, +14.21%) | 20.50 (+3.20, +18.50%) | 32.60 (+2.95, +9.95%) | 42.30 (+6.37, +17.72%) | 36.15 (+7.45, +25.96%) | 46.50 (+8.50, +22.37%) | 32.03 (+5.73, +21.80%) | 38.17 (+14.77, +63.11%) | ![Results](img/avg-bleu.png) --- +## hu-en + +| Translator/Dataset | wmt08 | flores-dev | flores-test | wmt09 | +| --- | --- | --- | --- | --- | +| bergamot | 20.00 | 32.20 | 31.60 | 22.30 | +| google | 22.40 (+2.40, +12.00%) | 39.40 (+7.20, +22.36%) | 38.00 (+6.40, +20.25%) | 25.40 (+3.10, +13.90%) | +| microsoft | 22.60 (+2.60, +13.00%) | 38.50 (+6.30, +19.57%) | 38.20 (+6.60, +20.89%) | 24.80 (+2.50, +11.21%) | + +![Results](img/hu-en-bleu.png) +--- + ## ru-en | Translator/Dataset | mtedx_test | wmt19 | wmt17 | flores-dev | wmt22 | flores-test | wmt14 | wmt15 | wmt16 | wmt13 | wmt18 | wmt21 | wmt20 | diff --git a/evaluation/dev/comet-results.md b/evaluation/dev/comet-results.md index 1550110..6fe6947 100644 --- a/evaluation/dev/comet-results.md +++ b/evaluation/dev/comet-results.md @@ -48,15 +48,44 @@ We also compare the systems using the `comet-compare` tool that calculates the s ## avg -| Translator/Dataset | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.49 | 0.58 | 0.54 | 0.31 | 0.63 | 0.52 | 0.50 | 0.65 | 0.51 | 0.15 | -| google | 0.59 (+0.10, +20.83%) | 0.67 (+0.08, +14.30%) | 0.76 (+0.21, +39.38%) | 0.70 (+0.39, +126.54%) | 0.70 (+0.07, +10.71%) | 0.67 (+0.15, +28.26%) | 0.74 (+0.24, +48.00%) | 0.82 (+0.16, +24.78%) | 0.79 (+0.27, +53.31%) | 0.70 (+0.55, +370.91%) | -| microsoft | 0.60 (+0.11, +22.13%) | 0.65 (+0.06, +11.05%) | 0.72 (+0.18, +32.36%) | 0.41 (+0.10, +31.65%) | 0.69 (+0.06, +9.12%) | 0.64 (+0.12, +23.16%) | 0.66 (+0.16, +32.78%) | 0.79 (+0.14, +21.22%) | 0.75 (+0.23, +45.60%) | 0.67 (+0.52, +353.71%) | +| Translator/Dataset | hu-en | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.56 | 0.49 | 0.58 | 0.54 | 0.31 | 0.63 | 0.52 | 0.50 | 0.65 | 0.51 | 0.15 | +| google | 0.66 (+0.10, +17.32%) | 0.59 (+0.10, +20.83%) | 0.67 (+0.08, +14.30%) | 0.76 (+0.21, +39.38%) | 0.70 (+0.39, +126.54%) | 0.70 (+0.07, +10.71%) | 0.67 (+0.15, +28.26%) | 0.74 (+0.24, +48.00%) | 0.82 (+0.16, +24.78%) | 0.79 (+0.27, +53.31%) | 0.70 (+0.55, +370.91%) | +| microsoft | 0.66 (+0.10, +17.85%) | 0.60 (+0.11, +22.13%) | 0.65 (+0.06, +11.05%) | 0.72 (+0.18, +32.36%) | 0.41 (+0.10, +31.65%) | 0.69 (+0.06, +9.12%) | 0.64 (+0.12, +23.16%) | 0.66 (+0.16, +32.78%) | 0.79 (+0.14, +21.22%) | 0.75 (+0.23, +45.60%) | 0.67 (+0.52, +353.71%) | ![Results](img/avg-comet.png) --- +## hu-en + +| Translator/Dataset | wmt08 | flores-test | wmt09 | flores-dev | +| --- | --- | --- | --- | --- | +| bergamot | 0.44 | 0.66 | 0.47 | 0.68 | +| google | 0.54 (+0.10, +22.47%) | 0.76 (+0.10, +15.46%) | 0.57 (+0.10, +20.66%) | 0.77 (+0.09, +13.48%) | +| microsoft | 0.55 (+0.11, +23.95%) | 0.76 (+0.10, +15.33%) | 0.57 (+0.10, +21.81%) | 0.77 (+0.09, +13.63%) | + +![Results](img/hu-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [wmt08.hu-en](hu-en/wmt08.hu-en.cometcompare) +- wmt08.microsoft.en outperforms wmt08.bergamot.en. +- wmt08.google.en outperforms wmt08.bergamot.en. + +#### [flores-test.hu-en](hu-en/flores-test.hu-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. + +#### [wmt09.hu-en](hu-en/wmt09.hu-en.cometcompare) +- wmt09.microsoft.en outperforms wmt09.bergamot.en. +- wmt09.google.en outperforms wmt09.bergamot.en. + +#### [flores-dev.hu-en](hu-en/flores-dev.hu-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. + +--- + ## ru-en | Translator/Dataset | wmt17 | wmt22 | flores-test | wmt20 | mtedx_test | wmt15 | wmt18 | wmt14 | wmt16 | wmt19 | wmt21 | flores-dev | wmt13 | diff --git a/evaluation/dev/hu-en/flores-dev.bergamot.en.bleu b/evaluation/dev/hu-en/flores-dev.bergamot.en.bleu new file mode 100644 index 0000000..4d40a2c --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.bergamot.en.bleu @@ -0,0 +1 @@ +32.2 diff --git a/evaluation/dev/hu-en/flores-dev.bergamot.en.comet b/evaluation/dev/hu-en/flores-dev.bergamot.en.comet new file mode 100644 index 0000000..0f5fda6 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.bergamot.en.comet @@ -0,0 +1 @@ +0.6780 diff --git a/evaluation/dev/hu-en/flores-dev.google.en.bleu b/evaluation/dev/hu-en/flores-dev.google.en.bleu new file mode 100644 index 0000000..e1281c9 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.google.en.bleu @@ -0,0 +1 @@ +39.4 diff --git a/evaluation/dev/hu-en/flores-dev.google.en.comet b/evaluation/dev/hu-en/flores-dev.google.en.comet new file mode 100644 index 0000000..fc26070 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.google.en.comet @@ -0,0 +1 @@ +0.7694 diff --git a/evaluation/dev/hu-en/flores-dev.hu-en.cometcompare b/evaluation/dev/hu-en/flores-dev.hu-en.cometcompare new file mode 100644 index 0000000..07db8f3 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.hu-en.cometcompare @@ -0,0 +1,60 @@ +========================== +x_name: flores-dev.bergamot.en +y_name: flores-dev.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.6793 +y-mean: 0.7707 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -17.4244 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +========================== +x_name: flores-dev.bergamot.en +y_name: flores-dev.google.en + +Bootstrap Resampling Results: +x-mean: 0.6793 +y-mean: 0.7692 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -15.9188 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-dev.google.en outperforms flores-dev.bergamot.en. +========================== +x_name: flores-dev.microsoft.en +y_name: flores-dev.google.en + +Bootstrap Resampling Results: +x-mean: 0.7707 +y-mean: 0.7692 +ties (%): 0.1567 +x_wins (%): 0.5467 +y_wins (%): 0.2967 + +Paired T-Test Results: +statistic: 0.2625 +p_value: 0.7930 +Null hypothesis can't be rejected. +Both systems have equal averages. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. +system_x \ system_y flores-dev.bergamot.en flores-dev.microsoft.en flores-dev.google.en +----------------------- ------------------------ ------------------------- ---------------------- +flores-dev.bergamot.en False False +flores-dev.microsoft.en True False +flores-dev.google.en True False diff --git a/evaluation/dev/hu-en/flores-dev.microsoft.en.bleu b/evaluation/dev/hu-en/flores-dev.microsoft.en.bleu new file mode 100644 index 0000000..9056213 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.microsoft.en.bleu @@ -0,0 +1 @@ +38.5 diff --git a/evaluation/dev/hu-en/flores-dev.microsoft.en.comet b/evaluation/dev/hu-en/flores-dev.microsoft.en.comet new file mode 100644 index 0000000..90013b8 --- /dev/null +++ b/evaluation/dev/hu-en/flores-dev.microsoft.en.comet @@ -0,0 +1 @@ +0.7704 diff --git a/evaluation/dev/hu-en/flores-test.bergamot.en.bleu b/evaluation/dev/hu-en/flores-test.bergamot.en.bleu new file mode 100644 index 0000000..f65ad8e --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.bergamot.en.bleu @@ -0,0 +1 @@ +31.6 diff --git a/evaluation/dev/hu-en/flores-test.bergamot.en.comet b/evaluation/dev/hu-en/flores-test.bergamot.en.comet new file mode 100644 index 0000000..f0d1dc8 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.bergamot.en.comet @@ -0,0 +1 @@ +0.6623 diff --git a/evaluation/dev/hu-en/flores-test.google.en.bleu b/evaluation/dev/hu-en/flores-test.google.en.bleu new file mode 100644 index 0000000..1f51558 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.google.en.bleu @@ -0,0 +1 @@ +38.0 diff --git a/evaluation/dev/hu-en/flores-test.google.en.comet b/evaluation/dev/hu-en/flores-test.google.en.comet new file mode 100644 index 0000000..0826bb6 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.google.en.comet @@ -0,0 +1 @@ +0.7647 diff --git a/evaluation/dev/hu-en/flores-test.hu-en.cometcompare b/evaluation/dev/hu-en/flores-test.hu-en.cometcompare new file mode 100644 index 0000000..96d0fa5 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.hu-en.cometcompare @@ -0,0 +1,60 @@ +========================== +x_name: flores-test.bergamot.en +y_name: flores-test.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.6624 +y-mean: 0.7634 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -16.7697 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-test.microsoft.en outperforms flores-test.bergamot.en. +========================== +x_name: flores-test.bergamot.en +y_name: flores-test.google.en + +Bootstrap Resampling Results: +x-mean: 0.6624 +y-mean: 0.7637 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -16.2147 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-test.google.en outperforms flores-test.bergamot.en. +========================== +x_name: flores-test.microsoft.en +y_name: flores-test.google.en + +Bootstrap Resampling Results: +x-mean: 0.7634 +y-mean: 0.7637 +ties (%): 0.1167 +x_wins (%): 0.4033 +y_wins (%): 0.4800 + +Paired T-Test Results: +statistic: -0.2467 +p_value: 0.8052 +Null hypothesis can't be rejected. +Both systems have equal averages. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. +system_x \ system_y flores-test.bergamot.en flores-test.microsoft.en flores-test.google.en +------------------------ ------------------------- -------------------------- ----------------------- +flores-test.bergamot.en False False +flores-test.microsoft.en True False +flores-test.google.en True False diff --git a/evaluation/dev/hu-en/flores-test.microsoft.en.bleu b/evaluation/dev/hu-en/flores-test.microsoft.en.bleu new file mode 100644 index 0000000..135c571 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.microsoft.en.bleu @@ -0,0 +1 @@ +38.2 diff --git a/evaluation/dev/hu-en/flores-test.microsoft.en.comet b/evaluation/dev/hu-en/flores-test.microsoft.en.comet new file mode 100644 index 0000000..f3eaca8 --- /dev/null +++ b/evaluation/dev/hu-en/flores-test.microsoft.en.comet @@ -0,0 +1 @@ +0.7638 diff --git a/evaluation/dev/hu-en/wmt08.bergamot.en.bleu b/evaluation/dev/hu-en/wmt08.bergamot.en.bleu new file mode 100644 index 0000000..9a7c1e5 --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.bergamot.en.bleu @@ -0,0 +1 @@ +20.0 diff --git a/evaluation/dev/hu-en/wmt08.bergamot.en.comet b/evaluation/dev/hu-en/wmt08.bergamot.en.comet new file mode 100644 index 0000000..a5c2402 --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.bergamot.en.comet @@ -0,0 +1 @@ +0.4401 diff --git a/evaluation/dev/hu-en/wmt08.google.en.bleu b/evaluation/dev/hu-en/wmt08.google.en.bleu new file mode 100644 index 0000000..3565ac0 --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.google.en.bleu @@ -0,0 +1 @@ +22.4 diff --git a/evaluation/dev/hu-en/wmt08.google.en.comet b/evaluation/dev/hu-en/wmt08.google.en.comet new file mode 100644 index 0000000..49eb4d4 --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.google.en.comet @@ -0,0 +1 @@ +0.5390 diff --git a/evaluation/dev/hu-en/wmt08.hu-en.cometcompare b/evaluation/dev/hu-en/wmt08.hu-en.cometcompare new file mode 100644 index 0000000..d44291b --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.hu-en.cometcompare @@ -0,0 +1,60 @@ +========================== +x_name: wmt08.bergamot.en +y_name: wmt08.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.4394 +y-mean: 0.5451 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -19.3365 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +wmt08.microsoft.en outperforms wmt08.bergamot.en. +========================== +x_name: wmt08.bergamot.en +y_name: wmt08.google.en + +Bootstrap Resampling Results: +x-mean: 0.4394 +y-mean: 0.5388 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -17.6601 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +wmt08.google.en outperforms wmt08.bergamot.en. +========================== +x_name: wmt08.microsoft.en +y_name: wmt08.google.en + +Bootstrap Resampling Results: +x-mean: 0.5451 +y-mean: 0.5388 +ties (%): 0.0833 +x_wins (%): 0.8133 +y_wins (%): 0.1033 + +Paired T-Test Results: +statistic: 1.7401 +p_value: 0.0820 +Null hypothesis can't be rejected. +Both systems have equal averages. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. +system_x \ system_y wmt08.bergamot.en wmt08.microsoft.en wmt08.google.en +--------------------- ------------------- -------------------- ----------------- +wmt08.bergamot.en False False +wmt08.microsoft.en True False +wmt08.google.en True False diff --git a/evaluation/dev/hu-en/wmt08.microsoft.en.bleu b/evaluation/dev/hu-en/wmt08.microsoft.en.bleu new file mode 100644 index 0000000..0590802 --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.microsoft.en.bleu @@ -0,0 +1 @@ +22.6 diff --git a/evaluation/dev/hu-en/wmt08.microsoft.en.comet b/evaluation/dev/hu-en/wmt08.microsoft.en.comet new file mode 100644 index 0000000..86ef28e --- /dev/null +++ b/evaluation/dev/hu-en/wmt08.microsoft.en.comet @@ -0,0 +1 @@ +0.5455 diff --git a/evaluation/dev/hu-en/wmt09.bergamot.en.bleu b/evaluation/dev/hu-en/wmt09.bergamot.en.bleu new file mode 100644 index 0000000..937387f --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.bergamot.en.bleu @@ -0,0 +1 @@ +22.3 diff --git a/evaluation/dev/hu-en/wmt09.bergamot.en.comet b/evaluation/dev/hu-en/wmt09.bergamot.en.comet new file mode 100644 index 0000000..a8657b6 --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.bergamot.en.comet @@ -0,0 +1 @@ +0.4696 diff --git a/evaluation/dev/hu-en/wmt09.google.en.bleu b/evaluation/dev/hu-en/wmt09.google.en.bleu new file mode 100644 index 0000000..ac61db5 --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.google.en.bleu @@ -0,0 +1 @@ +25.4 diff --git a/evaluation/dev/hu-en/wmt09.google.en.comet b/evaluation/dev/hu-en/wmt09.google.en.comet new file mode 100644 index 0000000..a3d6a4d --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.google.en.comet @@ -0,0 +1 @@ +0.5666 diff --git a/evaluation/dev/hu-en/wmt09.hu-en.cometcompare b/evaluation/dev/hu-en/wmt09.hu-en.cometcompare new file mode 100644 index 0000000..6e44a94 --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.hu-en.cometcompare @@ -0,0 +1,60 @@ +========================== +x_name: wmt09.bergamot.en +y_name: wmt09.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.4693 +y-mean: 0.5718 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -23.9367 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +wmt09.microsoft.en outperforms wmt09.bergamot.en. +========================== +x_name: wmt09.bergamot.en +y_name: wmt09.google.en + +Bootstrap Resampling Results: +x-mean: 0.4693 +y-mean: 0.5661 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -22.2999 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +wmt09.google.en outperforms wmt09.bergamot.en. +========================== +x_name: wmt09.microsoft.en +y_name: wmt09.google.en + +Bootstrap Resampling Results: +x-mean: 0.5718 +y-mean: 0.5661 +ties (%): 0.0633 +x_wins (%): 0.8333 +y_wins (%): 0.1033 + +Paired T-Test Results: +statistic: 1.8459 +p_value: 0.0650 +Null hypothesis can't be rejected. +Both systems have equal averages. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. +system_x \ system_y wmt09.bergamot.en wmt09.microsoft.en wmt09.google.en +--------------------- ------------------- -------------------- ----------------- +wmt09.bergamot.en False False +wmt09.microsoft.en True False +wmt09.google.en True False diff --git a/evaluation/dev/hu-en/wmt09.microsoft.en.bleu b/evaluation/dev/hu-en/wmt09.microsoft.en.bleu new file mode 100644 index 0000000..7092078 --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.microsoft.en.bleu @@ -0,0 +1 @@ +24.8 diff --git a/evaluation/dev/hu-en/wmt09.microsoft.en.comet b/evaluation/dev/hu-en/wmt09.microsoft.en.comet new file mode 100644 index 0000000..4f545ed --- /dev/null +++ b/evaluation/dev/hu-en/wmt09.microsoft.en.comet @@ -0,0 +1 @@ +0.5720 diff --git a/evaluation/dev/img/avg-bleu.png b/evaluation/dev/img/avg-bleu.png index 40b5de3..5a4b330 100644 Binary files a/evaluation/dev/img/avg-bleu.png and b/evaluation/dev/img/avg-bleu.png differ diff --git a/evaluation/dev/img/avg-comet.png b/evaluation/dev/img/avg-comet.png index 8fe4280..46ef191 100644 Binary files a/evaluation/dev/img/avg-comet.png and b/evaluation/dev/img/avg-comet.png differ diff --git a/evaluation/dev/img/hu-en-bleu.png b/evaluation/dev/img/hu-en-bleu.png new file mode 100644 index 0000000..83dab49 Binary files /dev/null and b/evaluation/dev/img/hu-en-bleu.png differ diff --git a/evaluation/dev/img/hu-en-comet.png b/evaluation/dev/img/hu-en-comet.png new file mode 100644 index 0000000..54dcf10 Binary files /dev/null and b/evaluation/dev/img/hu-en-comet.png differ diff --git a/models/dev/huen/lex.50.50.huen.s2t.bin.gz b/models/dev/huen/lex.50.50.huen.s2t.bin.gz new file mode 100644 index 0000000..2da2ac8 --- /dev/null +++ b/models/dev/huen/lex.50.50.huen.s2t.bin.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e062d80c132f8cee111b85adbef9841f502f73be09b1b6596f31385dfcd0add +size 2778202 diff --git a/models/dev/huen/model.huen.intgemm.alphas.bin.gz b/models/dev/huen/model.huen.intgemm.alphas.bin.gz new file mode 100644 index 0000000..c3aa43c --- /dev/null +++ b/models/dev/huen/model.huen.intgemm.alphas.bin.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f66a371ab2ba3316b438ae6430a6616469c5ecea307024a07af1205dfd5d716d +size 13181613 diff --git a/models/dev/huen/vocab.huen.spm.gz b/models/dev/huen/vocab.huen.spm.gz new file mode 100644 index 0000000..a525bf8 --- /dev/null +++ b/models/dev/huen/vocab.huen.spm.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97fca39ca82670f3dbb0d5e28a5476cd36a3556bbcced590e6e5de7687013dce +size 419908 diff --git a/registry.json b/registry.json index 8955085..b52bd13 100644 --- a/registry.json +++ b/registry.json @@ -602,6 +602,29 @@ "modelType": "dev" } }, + "huen": { + "model": { + "name": "model.huen.intgemm.alphas.bin", + "size": 17140899, + "estimatedCompressedSize": 13181613, + "expectedSha256Hash": "518356dbb0c071739318601963a87580fb41732652f52bd3635246330c186d9e", + "modelType": "dev" + }, + "lex": { + "name": "lex.50.50.huen.s2t.bin", + "size": 5162428, + "estimatedCompressedSize": 2778202, + "expectedSha256Hash": "fff56b2501258ec4c46a8fc715caee7aeb15d853f859cdfacd3ef9903ed2fff1", + "modelType": "dev" + }, + "vocab": { + "name": "vocab.huen.spm", + "size": 820746, + "estimatedCompressedSize": 419908, + "expectedSha256Hash": "0db772702235b02d1f29abafb7a49ed77e54c60245b3a46e90716e74263aedd6", + "modelType": "dev" + } + }, "isen": { "model": { "name": "model.isen.intgemm.alphas.bin",