# gecko-dev/python/compare-locales/compare_locales/checks.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re
from difflib import SequenceMatcher
from xml import sax
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from compare_locales.parser import DTDParser, PropertiesParser


class Checker(object):
    '''Base class for the checks that are run per file type.

    Subclasses set `pattern` to something offering a `match` method for
    file paths (the checkers in this module use compiled regular
    expressions), and override `check`.
    '''
    # Matcher for the paths a checker is responsible for; None on the base.
    pattern = None

    @classmethod
    def use(cls, file):
        '''Tell whether this checker applies to the given file.

        Returns whatever `cls.pattern.match` returns for `file.file`.
        '''
        path = file.file
        return cls.pattern.match(path)

    def check(self, refEnt, l10nEnt):
        '''Given the reference and localized Entities, performs checks.

        This is a generator yielding tuples of
        - "warning" or "error", depending on what should be reported,
        - position info for the problem within the string (the example
          below uses a (line, column) tuple),
        - description string to be shown in the report,
        - a short name for the category of the check.

        The base implementation raises NotImplementedError as soon as the
        generator is iterated.
        '''
        raise NotImplementedError("Need to subclass")
        # Never reached. The yield is only here to turn this method into a
        # generator, so the error shows up on iteration, not on the call.
        yield ("error", (0, 0), "This is an example error", "example")


class PrintfException(Exception):
    '''Signals a malformed printf-style format string.

    Attributes:
    - msg: description of the problem, suitable for a report
    - pos: offset of the problem within the checked string
    '''
    def __init__(self, msg, pos):
        # Hand both arguments to Exception, so that `args` is populated
        # consistently and the exception can be copied and pickled.
        super(PrintfException, self).__init__(msg, pos)
        self.pos = pos
        self.msg = msg

    def __str__(self):
        # Show just the description instead of the repr of the args tuple.
        return self.msg


class PropertiesChecker(Checker):
    '''Checks for .properties files.

    Covers plural form variables, unknown escape sequences and
    printf-style format specifiers.
    '''
    pattern = re.compile(r'.*\.properties$')
    # A '%', followed by either another '%' or an optional argument number,
    # width and precision plus a conversion character. If nothing valid
    # follows the '%', the `good` group stays unset.
    printf = re.compile(r'%(?P<good>%|'
                        r'(?:(?P<number>[1-9][0-9]*)\$)?'
                        r'(?P<width>\*|[0-9]+)?'
                        r'(?P<prec>\.(?:\*|[0-9]+)?)?'
                        r'(?P<spec>[duxXosScpfg]))?')

    def check(self, refEnt, l10nEnt):
        '''Compare the variables used in reference and localization.

        Generator yielding (severity, position, message, category) tuples.
        Entities whose comment mentions 'Localization_and_Plurals' only get
        their #1-style variables compared; all others are checked for
        unknown escapes and printf specifiers.
        '''
        reference, localized = refEnt.val, l10nEnt.val
        if 'Localization_and_Plurals' in refEnt.pre_comment:
            # PluralForm.jsm strings, the usual variable pattern is #1.
            plural_var = re.compile('#([0-9]+)')
            expected = set(int(n) for n in plural_var.findall(reference))
            if not expected:
                return
            found = set(int(n) for n in plural_var.findall(localized))
            if expected - found:
                yield ('warning', 0, 'not all variables used in l10n',
                       'plural')
            elif found - expected:
                yield ('error', 0, 'unreplaced variables in l10n',
                       'plural')
            return
        # lost escapes, looked up in the unprocessed localized value
        for esc in PropertiesParser.escape.finditer(l10nEnt.raw_val):
            single = esc.group('single')
            if single and single not in PropertiesParser.known_escapes:
                yield ('warning', esc.start(),
                       'unknown escape sequence, \\' + single,
                       'escape')
        try:
            specs = self.getPrintfSpecs(reference)
        except PrintfException:
            # nothing to compare against if the reference itself is broken
            return
        if specs:
            for result in self.checkPrintf(specs, localized):
                yield result

    def checkPrintf(self, refSpecs, l10nValue):
        '''Compare the printf specifiers of l10nValue against refSpecs.

        Generator yielding (severity, position, message, category) tuples.
        A localization that cannot be parsed yields a single error.
        Otherwise, all differences but missing trailing arguments are
        joined into one error; missing trailing arguments give a warning.
        '''
        try:
            l10nSpecs = self.getPrintfSpecs(l10nValue)
        except PrintfException as e:
            yield ('error', e.pos, e.msg, 'printf')
            return
        if refSpecs == l10nSpecs:
            return
        errors = []
        trailing = None
        matcher = SequenceMatcher(None, refSpecs, l10nSpecs)
        for action, i1, i2, j1, j2 in matcher.get_opcodes():
            if action == 'delete':
                # arguments of the reference are missing in l10n
                if i2 == len(refSpecs):
                    # they're missing at the end, that's just a warning
                    trailing = ', '.join(
                        'trailing argument %d `%s` missing' % (n, spec)
                        for n, spec in enumerate(refSpecs[i1:i2], i1 + 1))
                else:
                    errors.extend(
                        'argument %d `%s` missing' % (n, spec)
                        for n, spec in enumerate(refSpecs[i1:i2], i1 + 1))
            elif action == 'insert':
                # arguments in l10n that the reference doesn't have
                errors.extend(
                    'argument %d `%s` obsolete' % (n, spec)
                    for n, spec in enumerate(l10nSpecs[j1:j2], j1 + 1))
            elif action == 'replace':
                # pair up the differing arguments, numbered as in l10n
                pairs = zip(refSpecs[i1:i2], l10nSpecs[j1:j2])
                errors.extend(
                    'argument %d `%s` should be `%s`' % (n, l10n, ref)
                    for n, (ref, l10n) in enumerate(pairs, j1 + 1))
        if errors:
            yield ('error', 0, ', '.join(errors), 'printf')
        if trailing is not None:
            yield ('warning', 0, trailing, 'printf')

    def getPrintfSpecs(self, val):
        '''Return the conversion characters used in val, in argument order.

        Numbered specifiers like %1$S are stored at the index given by
        their number, plain ones in order of appearance. Escaped percent
        signs (%%) are skipped.

        Raises PrintfException for a lone '%', for a mix of numbered and
        plain specifiers, for a number used twice, and for gaps in the
        numbering.
        '''
        ordered = False
        specs = []
        for m in self.printf.finditer(val):
            good = m.group('good')
            if good is None:
                # just a '%' with nothing valid behind it
                raise PrintfException('Found single %', m.start())
            if good == '%':
                # escaped %
                continue
            number = m.group('number')
            numbered = number is not None
            # The style may only differ from the previous specifier if
            # there was no previous specifier at all.
            if numbered != ordered and (ordered or specs):
                raise PrintfException('Mixed ordered and non-ordered args',
                                      m.start())
            ordered = numbered
            spec = m.group('spec')
            if not ordered:
                specs.append(spec)
                continue
            index = int(number) - 1
            if index < len(specs):
                if specs[index] is not None:
                    raise PrintfException('Double ordered argument %d' %
                                          (index + 1),
                                          m.start())
                specs[index] = spec
            else:
                # fill the gap up to the new argument with placeholders
                specs.extend([None] * (index - len(specs)))
                specs.append(spec)
        # placeholders that are still around are missing arguments
        if ordered and not all(specs):
            raise PrintfException('Ordered argument missing', 0)
        return specs


class DTDChecker(Checker):
    """Tests to run on DTD files.

    Uses xml.sax for the heavy lifting of xml parsing.

    Values are parsed inside a dummy document. The DOCTYPE of that document
    declares, as empty <!ENTITY key "">, every entity that the reference
    uses, so that references to them resolve. Entities that only the
    localization references are declared as well, and reported as warnings.

    Also checks for some CSS and number heuristics in the values.
    """
    pattern = re.compile(r'.*\.dtd$')

    # entity references like &foo;, reusing the Name markup from DTDParser
    eref = re.compile('&(%s);' % DTDParser.Name)
    # dummy document, takes the entity declarations and the value to check
    tmpl = '''<!DOCTYPE elem [%s]>
<elem>%s</elem>
'''
    # entities predefined by XML, they never need a declaration
    xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))

    def __init__(self, reference):
        """Create a checker.

        reference is either None, or an iterable of the reference entities;
        the `val` of each of them is searched for entity references.
        """
        self.reference = reference
        self.__known_entities = None

    def known_entities(self, refValue):
        """Return the set of entity names a localization may reference.

        With a reference given to the constructor, that's all entities
        referenced by any value in it, computed on first use and cached.
        Without one, it's just the entities referenced by refValue.
        """
        if self.__known_entities is None and self.reference is not None:
            self.__known_entities = set()
            for ent in self.reference:
                self.__known_entities.update(self.entities_for_value(ent.val))
        return self.__known_entities if self.__known_entities is not None \
            else self.entities_for_value(refValue)

    def entities_for_value(self, value):
        """Return the set of utf-8 encoded entity names referenced in value.

        The entities predefined by XML are left out.
        """
        reflist = set(m.group(1).encode('utf-8')
                      for m in self.eref.finditer(value))
        reflist -= self.xmllist
        return reflist

    # Setup for XML parser, with default and text-only content handler
    class TextContent(sax.handler.ContentHandler):
        """Content handler collecting all character data in textcontent."""
        textcontent = ''

        def characters(self, content):
            self.textcontent += content

    # Both handlers are created once and shared; check() resets the
    # collected text before using texthandler.
    defaulthandler = sax.handler.ContentHandler()
    texthandler = TextContent()

    # plain number, with optional fraction
    numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
    num = re.compile('^%s$' % numPattern)
    # number with a CSS unit
    lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
    length = re.compile('^%s$' % lengthPattern)
    # single CSS declaration for (min-)width or (min-)height;
    # groups are property, number and unit
    spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
                      lengthPattern)
    # one or more of those declarations, separated by semicolons
    style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
                       {'spec': spec.pattern})

    # Hook for subclasses. If set, it's called with the text content of the
    # parsed localized value, and the tuples it yields are passed on.
    processContent = None

    def check(self, refEnt, l10nEnt):
        """Check the localized entity against the reference entity.

        Generator yielding (severity, position, message, category) tuples:
        - a warning if the reference value itself can't be parsed,
        - an error if the localized value or entity isn't well-formed,
        - a warning per entity referenced only by the localization,
        - warnings and errors if the reference looks like a number, a CSS
          length or a CSS width/height spec, and the localization differs,
        - whatever processContent yields, if a subclass sets that.
        """
        refValue, l10nValue = refEnt.val, l10nEnt.val
        # find entities the refValue references,
        # reusing markup from DTDParser.
        reflist = self.known_entities(refValue)
        entities = ''.join('<!ENTITY %s "">' % s for s in sorted(reflist))
        parser = sax.make_parser()
        parser.setFeature(sax.handler.feature_external_ges, False)

        parser.setContentHandler(self.defaulthandler)
        try:
            parser.parse(StringIO(self.tmpl %
                                  (entities, refValue.encode('utf-8'))))
            # also catch stray %
            # by parsing the complete entity definition inside the DOCTYPE
            parser.parse(StringIO(self.tmpl %
                                  (refEnt.all.encode('utf-8') + entities,
                                   '&%s;' % refEnt.key.encode('utf-8'))))
        except sax.SAXParseException:
            yield ('warning',
                   (0, 0),
                   "can't parse en-US value", 'xmlparse')

        # find entities the l10nValue references,
        # reusing markup from DTDParser.
        l10nlist = self.entities_for_value(l10nValue)
        missing = sorted(l10nlist - reflist)
        _entities = entities + ''.join('<!ENTITY %s "">' % s for s in missing)
        warntmpl = u'Referencing unknown entity `%s`'
        if reflist:
            warntmpl += ' (%s known)' % ', '.join(sorted(reflist))
        if self.processContent is not None:
            # collect the text of the localized value for processContent
            self.texthandler.textcontent = ''
            parser.setContentHandler(self.texthandler)
        try:
            parser.parse(StringIO(self.tmpl % (_entities,
                         l10nValue.encode('utf-8'))))
            # also catch stray %
            # if this fails, we need to subtract the entity definition
            parser.setContentHandler(self.defaulthandler)
            parser.parse(StringIO(self.tmpl % (
                l10nEnt.all.encode('utf-8') + _entities,
                '&%s;' % l10nEnt.key.encode('utf-8'))))
        except sax.SAXParseException as e:
            # xml parse error, yield error
            # sometimes, the error is reported on our fake closing
            # element, make that the end of the last line
            # The DOCTYPE takes the first line of the dummy document, so
            # after subtracting one, line 1 is the first line of the value.
            lnr = e.getLineNumber() - 1
            lines = l10nValue.splitlines()
            if lnr > len(lines):
                lnr = len(lines)
                # An empty value has no lines at all, don't index into
                # the empty list then.
                col = len(lines[-1]) if lines else 0
            else:
                col = e.getColumnNumber()
                if lnr == 1:
                    # first line starts with <elem>, subtract
                    col -= len("<elem>")
                elif lnr == 0:
                    col -= len("<!DOCTYPE elem [")  # first line is DOCTYPE
            yield ('error', (lnr, col), ' '.join(e.args), 'xmlparse')

        for key in missing:
            yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
                   'xmlparse')

        # Number check
        if self.num.match(refValue) and not self.num.match(l10nValue):
            yield ('warning', 0, 'reference is a number', 'number')
        # CSS checks
        # just a length, width="100em"
        if self.length.match(refValue) and not self.length.match(l10nValue):
            yield ('error', 0, 'reference is a CSS length', 'css')
        # real CSS spec, style="width:100px;"
        if self.style.match(refValue):
            if not self.style.match(l10nValue):
                yield ('error', 0, 'reference is a CSS spec', 'css')
            else:
                # warn if different properties or units
                # map of property to unit, the number itself may differ
                refMap = dict((s, u) for s, _, u in
                              self.spec.findall(refValue))
                msgs = []
                for s, _, u in self.spec.findall(l10nValue):
                    if s not in refMap:
                        msgs.insert(0, '%s only in l10n' % s)
                        continue
                    else:
                        ru = refMap.pop(s)
                        if u != ru:
                            msgs.append("units for %s don't match "
                                        "(%s != %s)" % (s, u, ru))
                # what's left in refMap wasn't found in the localization
                for s in refMap:
                    msgs.insert(0, '%s only in reference' % s)
                if msgs:
                    yield ('warning', 0, ', '.join(msgs), 'css')

        if self.processContent is not None:
            for t in self.processContent(self.texthandler.textcontent):
                yield t


class PrincessAndroid(DTDChecker):
    """Checker for the string values that Android puts into an XML container.

    http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling  # noqa
    has more info. Check for unescaped apostrophes and bad unicode escapes.
    """
    # a value that starts with a quote or apostrophe and ends with the same
    quoted = re.compile("(?P<q>[\"']).*(?P=q)$")

    def unicode_escape(self, str):
        """Helper method to try to decode all unicode escapes in a string.

        Nothing is returned; the only outcome is a UnicodeDecodeError if
        the string holds a bad escape.

        The standard python decode for unicode-escape needs ascii input.
        To get there, the unicode string is encoded to ascii with
        backslashreplace, i.e., all non-ascii unicode chars get unicode
        escaped themselves, and then all of that is rolled back.
        An error from that is an error in the original string, but
        backslashreplace changes string positions quite badly, so the
        positions in the error are recomputed to apply to the original
        string before the error is raised again.
        See also the last check in TestAndroid.test_android_dtd, with a
        lengthy chinese string.
        """
        escaped = str.encode('ascii', 'backslashreplace')
        try:
            escaped.decode('unicode-escape')
        except UnicodeDecodeError as e:
            encoding, data, start, end, reason = e.args
            bad_length = len(data[start:end])
            # Everything in front of the error decodes fine, and its
            # decoded length is the position in the original string.
            real_start = len(data[:start].decode('unicode-escape'))
            raise UnicodeDecodeError(encoding, data, real_start,
                                     real_start + bad_length, reason)

    @classmethod
    def use(cls, file):
        """Use this Checker only for DTD files in the modules
        embedding/android and mobile/android/base.
        """
        if file.module not in ("embedding/android", "mobile/android/base"):
            return False
        return cls.pattern.match(file.file)

    def processContent(self, val):
        """Actual check code.

        Generator yielding an error for a bad unicode escape, and one for
        each unescaped quote or apostrophe. If the complete string is
        quoted, only the quote character used for that needs escaping
        inside.
        """
        # first, try to decode unicode escapes
        try:
            self.unicode_escape(val)
        except UnicodeDecodeError as e:
            yield ('error', e.args[2], e.args[4], 'android')
        # Then look for unescaped single or double quotes. Which of them
        # are a problem depends on whether the complete string is quoted.
        quoted = self.quoted.match(val)
        if quoted is None:
            quote, offset = "[\"']", -1
        else:
            quote, offset = quoted.group('q'), 0
            val = val[1:-1]  # strip quotes
        stray_quot = re.compile(r"[\\\\]*(%s)" % quote)

        for hit in stray_quot.finditer(val):
            if len(hit.group(0)) % 2 == 0:
                # odd number of backslashes in front, the quote is escaped
                continue
            # found an unescaped single or double quote, which message?
            if hit.group(1) == '"':
                msg = (u"Quotes in Android DTDs need escaping with \\\" "
                       u"or \\u0022, or put string in apostrophes.")
            else:
                msg = (u"Apostrophes in Android DTDs need escaping with "
                       u"\\' or \\u0027, or use \u2019, or put string in "
                       u"quotes.")
            yield ('error', hit.end(0) + offset, msg, 'android')


def getChecker(file, reference=None):
    '''Return a checker instance for the given file, or None.

    The checker classes are asked in the order PropertiesChecker,
    PrincessAndroid, DTDChecker, and the first one that wants the file is
    used. reference is passed on to the two DTD checkers.
    '''
    if PropertiesChecker.use(file):
        checker = PropertiesChecker()
    elif PrincessAndroid.use(file):
        # needs to be asked before the generic DTDChecker
        checker = PrincessAndroid(reference)
    elif DTDChecker.use(file):
        checker = DTDChecker(reference)
    else:
        checker = None
    return checker