Bug 1292720 - Remove support for UCS-2 mode Python

Heroku is now using a UCS-4 mode Python, matching Vagrant/Travis, so the
UCS-2 code paths are no longer needed.
2016-12-26 02:06:55 +00:00 · 2016-12-26 02:06:55 +00:00 · 02428ca8f7
--- a/tests/log_parser/test_store_failure_lines.py
+++ b/tests/log_parser/test_store_failure_lines.py
@ -3,7 +3,6 @@ import responses
 from django.conf import settings
 from requests.exceptions import HTTPError

-from treeherder.etl.text import char_to_codepoint_ucs2
 from treeherder.log_parser.failureline import (store_failure_lines,
                                               write_failure_lines)
 from treeherder.model.models import (FailureLine,
@ -131,20 +130,6 @@ def test_store_error_summary_500(activate_responses, test_repository, jm, eleven
    assert log_obj.status == JobLog.FAILED


-def test_char_data_to_codepoint_ucs2():
-    # Unbelivably, putting the two codepoints in a string seems to cause them to be
-    # interpreted as a single character, but only in unit tests, and only sometimes.
-    # Since we only use indexing operations, putting the codepoints in a tuple is
-    # equivalent to a lenth 2 string.
-    data = [
-        ((u"\ud800", u"\udc00"), 0x010000),
-        ((u"\udbff", u"\udfff"), 0x10FFFF),
-        ((u"\uda00", u"\uddff"), 0x0901ff),
-    ]
-    for value, expected in data:
-        assert char_to_codepoint_ucs2(value) == expected
-
-
 def test_store_error_summary_duplicate(activate_responses, test_repository, jm, eleven_jobs_stored):
    log_url = 'http://my-log.mozilla.org'
    job = Job.objects.get(guid=jm.get_job(1)[0]['job_guid'])
--- a/treeherder/etl/text.py
+++ b/treeherder/etl/text.py
@ -1,28 +1,13 @@
 import re

-
-def char_to_codepoint_ucs4(x):
-    return ord(x)
-
-
-def char_to_codepoint_ucs2(x):
-    return (0x10000 + (ord(x[0]) - 0xD800) * 0x400 +
-            (ord(x[1]) - 0xDC00))
-
+if len(u"\U0010FFFF") != 1:
+    raise Exception('Python has been compiled in UCS-2 mode which is not supported.')

 # Regexp that matches all non-BMP unicode characters.
-if len(u"\U0010FFFF") == 1:
-    filter_re = re.compile(ur"([\U00010000-\U0010FFFF])", re.U)
-    char_to_codepoint = char_to_codepoint_ucs4
-else:
-    # Python is compiled as the UCS2 variant so we have to match two
-    # bytes in a surrogate pair. Then we have to decode the two bytes
-    # according to UTF16 rules to get a single codepoint
-    filter_re = re.compile(ur"([\uD800-\uDBFF][\uDC00-\uDFFF])", re.U)
-    char_to_codepoint = char_to_codepoint_ucs2
+filter_re = re.compile(ur"([\U00010000-\U0010FFFF])", re.U)


 def astral_filter(text):
    if text is None:
        return text
-    return filter_re.sub(lambda x: "<U+%s>" % hex(char_to_codepoint(x.group(0)))[2:].zfill(6).upper(), text)
+    return filter_re.sub(lambda x: "<U+%s>" % hex(ord(x.group(0)))[2:].zfill(6).upper(), text)