Mirror of https://github.com/mozilla/taar.git
Fix the utility to grab S3 JSON files
This additionally adds new test coverage and works around a travis-ci/issues/7940 .
This commit is contained in:
Parent
8dcc454f7a
Commit
188b0824fb
|
@ -15,6 +15,8 @@ install:
|
|||
- pip install -e . --process-dependency-links
|
||||
- pip install -r test_requirements.txt
|
||||
script:
|
||||
# The line below is to work around https://github.com/travis-ci/travis-ci/issues/7940
|
||||
- export BOTO_CONFIG=/dev/null
|
||||
- flake8 taar tests
|
||||
- py.test --cov taar tests
|
||||
after_success:
|
||||
|
|
|
@ -1,10 +1,14 @@
|
|||
import json
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
import boto3
|
||||
from botocore.exceptions import ClientError
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import requests
|
||||
import shutil
|
||||
from botocore.exceptions import ClientError
|
||||
from tempfile import gettempdir, NamedTemporaryFile
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def fetch_json(uri):
|
||||
|
@ -24,21 +28,44 @@ def fetch_json(uri):
|
|||
return r.json()
|
||||
|
||||
|
||||
def get_s3_cache_filename(s3_bucket, s3_key):
    """Return the local cache path for an S3 object.

    The bucket and key are joined with underscores and any '/' in the
    key is flattened, producing a single flat filename inside the
    system temporary directory.
    """
    flat_name = '_'.join([s3_bucket, s3_key]).replace('/', '_')
    return os.path.join(gettempdir(), flat_name)
|
||||
|
||||
|
||||
def get_s3_json_content(s3_bucket, s3_key):
    """Download and parse a json file stored on AWS S3.

    The file is downloaded and then cached for future use.

    :param s3_bucket: name of the S3 bucket holding the object.
    :param s3_key: key of the JSON object within the bucket.
    :return: the parsed JSON content, or None when the download fails
             or the cached file does not contain valid JSON.
    """
    local_path = get_s3_cache_filename(s3_bucket, s3_key)

    if not os.path.exists(local_path):
        # Download into a NamedTemporaryFile first so a failed or partial
        # download never leaves a broken file at local_path; the temp
        # file is removed automatically when the context exits.
        with NamedTemporaryFile() as temp_file:
            try:
                s3 = boto3.client('s3')
                s3.download_fileobj(s3_bucket, s3_key, temp_file)
                # Flush the file so the copy below sees all the bytes.
                temp_file.flush()
            except ClientError:
                logger.exception("Failed to download from S3", extra={
                    "bucket": s3_bucket,
                    "key": s3_key})
                return None

            # Publish the cache file only after a complete download.
            with open(local_path, 'wb') as data:
                temp_file.seek(0)
                shutil.copyfileobj(temp_file, data)

    # It can happen to have corrupted files. Account for the
    # sad reality of life.
    try:
        with open(local_path, 'r') as data:
            return json.loads(data.read())
    except ValueError:
        # Remove the corrupted cache file so the next call re-downloads it.
        # Use the module-level logger (was logging.error, which bypasses
        # any handlers configured for this module's logger).
        logger.error("Removing corrupted S3 cache", extra={"cache_path": local_path})
        os.remove(local_path)

    return None
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
pytest
|
||||
pytest-cov
|
||||
flake8
|
||||
moto
|
||||
responses
|
||||
coveralls
|
|
@ -0,0 +1,31 @@
|
|||
import boto3
|
||||
import os
|
||||
import taar.recommenders.utils as utils
|
||||
from moto import mock_s3
|
||||
|
||||
|
||||
@mock_s3
def test_get_non_existing():
    """A missing S3 key yields None and leaves no cache file behind."""
    bucket_name = 'test-bucket'
    missing_key = 'non-existing.json'

    s3 = boto3.resource('s3', region_name='us-west-2')
    s3.create_bucket(Bucket=bucket_name)

    assert utils.get_s3_json_content(bucket_name, missing_key) is None
    assert not os.path.exists(utils.get_s3_cache_filename(bucket_name, missing_key))
|
||||
|
||||
|
||||
@mock_s3
def test_get_corrupted():
    """Invalid JSON on S3 yields None and the cache file is cleaned up."""
    bucket_name = 'test-bucket'
    bad_key = 'corrupted.json'

    s3 = boto3.resource('s3', region_name='us-west-2')
    s3.create_bucket(Bucket=bucket_name)

    # Write a corrupted file to the mocked S3.
    s3.Object(bucket_name, bad_key).put(Body='This is invalid JSON.')

    assert utils.get_s3_json_content(bucket_name, bad_key) is None
    assert not os.path.exists(utils.get_s3_cache_filename(bucket_name, bad_key))
|
Loading…
Link in new issue