Bug 1592561, adapt Pontoon parser to compare-locales, some basic tests, r=mathjazz

Copy .lang parser from Pontoon. This is taken from 82190d5bdf/pontoon/sync/formats/lang.py. Adapt Pontoon parser to compare-locales, and add some basic tests. The parser isn't really intended to be used in a .lang production environment, so there's not a lot of effort here. The idea is that we can read good .lang files and get translations out. Differential Revision: https://phabricator.services.mozilla.com/D51123
2019-11-06 18:13:59 +01:00 · 2019-11-06 18:13:59 +01:00 · 5be0df0876
--- a/.hgignore
+++ b/.hgignore
@ -7,3 +7,6 @@
 ^.tox$
 ^.coverage$
 ^htmlcov$
+^contrib/lang/build$
+^contrib/lang/dist$
+^contrib/lang/src/cl_ext.lang.egg-info$
--- a/contrib/lang/.gitignore
+++ b/contrib/lang/.gitignore
@ -0,0 +1,5 @@
+*.orig
+__pycache__
+build/
+dist/
+src/cl_ext.lang.egg-info/
--- a/contrib/lang/LICENSE
+++ b/contrib/lang/LICENSE
@ -0,0 +1,25 @@
+Copyright (c) 2012, Mozilla Foundation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+    * Neither the name of the copyright owner nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/contrib/lang/setup.cfg
+++ b/contrib/lang/setup.cfg
@ -0,0 +1,6 @@
+[bdist_wheel]
+universal=1
+
+[options.entry_points]
+compare_locales.parsers =
+    lang=cl_ext.lang.lang:LangParser
--- a/contrib/lang/setup.py
+++ b/contrib/lang/setup.py
@ -0,0 +1,18 @@
+from __future__ import absolute_import
+
+from setuptools import setup
+
+# Distribution metadata for the compare-locales .lang parser extension.
+# The importable packages live under src/.
+requirements = [
+    "parsimonious",
+    "compare_locales",
+]
+
+setup(
+    name="cl_ext.lang",
+    description=".lang parser for compare-locales",
+    author="Axel Hecht",
+    author_email="axel@mozilla.com",
+    platforms=["any"],
+    python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4',
+    packages=['cl_ext', 'cl_ext.lang'],
+    package_dir={"": "src"},
+    install_requires=requirements,
+)
--- a/contrib/lang/src/cl_ext/__init__.py
+++ b/contrib/lang/src/cl_ext/__init__.py
@ -0,0 +1 @@
+__path__ = __import__('pkgutil').extend_path(__path__, __name__)
--- a/contrib/lang/src/cl_ext/lang/__init__.py
+++ b/contrib/lang/src/cl_ext/lang/__init__.py
--- a/contrib/lang/src/cl_ext/lang/lang.py
+++ b/contrib/lang/src/cl_ext/lang/lang.py
@ -0,0 +1,159 @@
+"""
+Parser for the .lang translation format.
+"""
+from __future__ import absolute_import
+
+import re
+
+from parsimonious.grammar import Grammar
+from parsimonious.nodes import NodeVisitor
+
+from compare_locales.parser.base import Comment, LiteralEntity, Junk, Parser
+from compare_locales.paths import File
+
+
+# Marker returned by LangVisitor.visit_blank_line in place of a blank line.
+BLANK_LINE = 'blank_line'
+# Matches the {ok} tag (case-insensitively) on a translation line.
+TAG_REGEX = re.compile(r'\{(ok)\}', re.I)
+
+
+class LangComment(Comment):
+    """
+    A comment line from a .lang file, kept as its three raw pieces.
+
+    ``marker`` is the leading run of ``#`` characters, ``content`` the
+    rest of the line, and ``end`` the line ending (possibly empty).
+    """
+
+    def __init__(self, marker, content, end):
+        self.marker, self.raw_content, self.end = marker, content, end
+
+    @property
+    def content(self):
+        """Comment text with surrounding whitespace removed."""
+        stripped = self.raw_content.strip()
+        return stripped
+
+    @property
+    def raw(self):
+        """The comment as written: marker, text and line ending."""
+        head = self.marker + self.raw_content
+        return head + self.end
+
+
+class LangEntity(LiteralEntity):
+    """
+    One source/translation pair from a .lang file.
+
+    The source string doubles as the entity key; the tags passed in are
+    kept as a set on ``tags``.
+    """
+
+    def __init__(self, source_string, translation_string, all, tags):
+        # There is no separate identifier in .lang, so the source string
+        # is handed to the base class as the key.
+        base_kwargs = {
+            'key': source_string,
+            'val': translation_string,
+            'all': all,
+        }
+        super(LangEntity, self).__init__(**base_kwargs)
+        self.tags = set(tags)
+
+    @property
+    def localized(self):
+        """True if the value differs from the key or carries the 'ok' tag."""
+        if self.key != self.val:
+            return True
+        return 'ok' in self.tags
+
+    @property
+    def extra(self):
+        """Additional metadata: the tags as a list under ``'tags'``."""
+        return dict(tags=list(self.tags))
+
+
+class LangVisitor(NodeVisitor):
+    """
+    Parsimonious visitor that turns .lang text into a flat list of
+    LangComment, LangEntity and Junk objects plus BLANK_LINE markers.
+    """
+
+    grammar = Grammar(r"""
+        lang_file = (comment / entity / blank_line)*
+
+        comment = "#"+ line_content line_ending
+        line_content = ~r".*"
+        line_ending = ~r"$\n?"m # Match at EOL and EOF without newline.
+
+        blank_line = ~r"((?!\n)\s)*" line_ending
+
+        entity = string translation
+        string = ";" line_content line_ending
+        translation = line_content line_ending
+    """)
+
+    def __init__(self, ctx):
+        # Spell out the super() arguments: the package advertises Python 2.7
+        # support, where the zero-argument form raises a TypeError.
+        super(LangVisitor, self).__init__()
+        # Kept so it can be passed along when creating Junk.
+        self.ctx = ctx
+
+    def visit_lang_file(self, node, children):
+        """
+        Find comments that are associated with an entity and add them
+        to the entity's comments list. Also assign order to entities.
+
+        Only an unbroken run of comments directly above an entity is
+        attached; any other child in between discards the run.
+        Returns the list of visited children.
+        """
+        comments = []
+        order = 0
+        for child in children:
+            if isinstance(child, LangComment):
+                comments.append(child)
+                continue
+
+            if isinstance(child, LangEntity):
+                child.comments = [c.content for c in comments]
+                child.order = order
+                order += 1
+
+            # Anything that is not a comment ends the pending comment run.
+            comments = []
+
+        return children
+
+    def visit_comment(self, node, node_info):
+        """Wrap the marker, text and line ending in a LangComment."""
+        marker, content, end = node_info
+        return LangComment(
+            node_text(marker), node_text(content), node_text(end)
+        )
+
+    def visit_blank_line(self, node, _):
+        """Represent a blank line by the BLANK_LINE marker."""
+        return BLANK_LINE
+
+    def visit_entity(self, node, node_info):
+        """
+        Build a LangEntity from a source line and its translation line.
+
+        Tags such as ``{ok}`` are collected (lower-cased) and the
+        translation is cut off at the first tag. An entity whose
+        translation ends up empty is returned as Junk instead.
+        """
+        string, translation = node_info
+
+        # Strip tags out of translation if they exist.
+        tags = []
+        tag_matches = list(TAG_REGEX.finditer(translation))
+        if tag_matches:
+            tags = [m.group(1).lower() for m in tag_matches]
+            translation = translation[:tag_matches[0].start()].strip()
+
+        if translation == '':
+            # Report the junk at the entity's own offsets in the parsed
+            # text rather than at the start of the file.
+            return Junk(self.ctx, (node.start, node.end))
+
+        return LangEntity(string, translation, node.text, tags)
+
+    def visit_string(self, node, node_info):
+        """Return the source string without the ';' marker, stripped."""
+        marker, content, end = node_info
+        return content.text.strip()
+
+    def visit_translation(self, node, node_info):
+        """Return the stripped text of the translation line."""
+        content, end = node_info
+        return content.text.strip()
+
+    def generic_visit(self, node, children):
+        """
+        Fallback for rules without a dedicated visit method.
+
+        A single visited child is passed through as-is; otherwise the
+        list of children is returned, or the node itself if there are
+        none.
+        """
+        if children and len(children) == 1:
+            return children[0]
+        else:
+            return children or node
+
+
+def node_text(node):
+    """
+    Return the text covered by a Parsimonious node.
+
+    Repetition can hand us a list of nodes instead of a single node; in
+    that case the texts are concatenated. ``None`` yields an empty string.
+    """
+    if node is None:
+        return u''
+    if not isinstance(node, list):
+        return node.text
+    return ''.join(piece.text for piece in node)
+
+
+class LangParser(Parser):
+    """compare-locales parser for .lang files, built on LangVisitor."""
+
+    def use(self, path):
+        """Tell whether this parser applies, i.e. the path ends in .lang."""
+        # File objects are reduced to their full path before the check.
+        filename = path.fullpath if isinstance(path, File) else path
+        return filename.endswith('.lang')
+
+    def walk(self, only_localizable=False):
+        """
+        Yield the items LangVisitor produces for the loaded content.
+
+        With ``only_localizable`` set, only LangEntity and Junk items are
+        yielded; otherwise everything the visitor returned is passed on.
+        """
+        ctx = self.ctx
+        if not ctx:
+            # Nothing was loaded, or loading failed: nothing to walk.
+            return
+        contents = ctx.contents
+        for item in LangVisitor(ctx).parse(contents):
+            if only_localizable and not isinstance(item, (LangEntity, Junk)):
+                continue
+            yield item
--- a/contrib/lang/tests/__init__.py
+++ b/contrib/lang/tests/__init__.py
--- a/contrib/lang/tests/test_parser.py
+++ b/contrib/lang/tests/test_parser.py
@ -0,0 +1,44 @@
+from __future__ import absolute_import, unicode_literals
+
+import unittest
+from compare_locales import parser
+from parsimonious.exceptions import ParseError
+
+
+# Basic checks of the .lang parser as obtained through
+# compare_locales.parser.getParser.
+class TestLangParser(unittest.TestCase):
+    def test_good(self):
+        # A well-formed file with zero, one and two comments per entity
+        # parses into exactly three messages.
+        p = parser.getParser('foo.lang')
+        p.readUnicode('''\
# Sample comment
;Source String
Translated String

# First comment
# Second comment
;Multiple Comments
Translated Multiple Comments

;No Comments or Sources
Translated No Comments or Sources
+''')
+        msgs = p.parse()
+        self.assertEqual(len(msgs), 3)
+
+    def test_empty_translation(self):
+        # A source string followed by an empty translation line is
+        # reported as a single Junk entry.
+        p = parser.getParser('foo.lang')
+        p.readUnicode('''\
# Sample comment
;Source String

+''')
+        msgs = p.parse()
+        self.assertEqual(len(msgs), 1)
+        self.assertIsInstance(msgs[0], parser.Junk)
+
+    def test_bad(self):
+        # Content that is neither comment, entity nor blank line makes
+        # parsing raise a ParseError.
+        p = parser.getParser('foo.lang')
+        p.readUnicode('''\
just garbage
+''')
+        with self.assertRaises(ParseError):
+            p.parse()
--- a/tox.ini
+++ b/tox.ini
@ -1,5 +1,5 @@
 [tox]
-envlist = py27, py35, py36, py37, flake8, integration
+envlist = py27, py35, py36, py37, flake8, lang, integration
 skipsdist=True

 [travis]
@ -8,10 +8,19 @@ python =

 [testenv]
 commands=python -B setup.py test
+
 [testenv:flake8]
 deps=flake8 >=3.7, <3.8
 basepython=python3.7
-commands=flake8 compare_locales
+commands=
+  flake8 compare_locales contrib/lang
+
 [testenv:integration]
 deps=six
 commands=python -m unittest discover -s compare_locales/integration_tests
+
+[testenv:lang]
+basepython=python3.7
+deps=
+  --editable=contrib/lang
+commands=python -m unittest discover contrib/lang/tests
				`@ -0,0 +1 @@`
				`__path__ = __import__('pkgutil').extend_path(__path__, __name__)`