[AIRFLOW-4564] ACI bugfixes and improvements (#5319)

This commit is contained in:
Asgeir Berland 2019-07-01 18:23:02 +02:00 коммит произвёл Ash Berlin-Taylor
Родитель f0460ccb31
Коммит 722379adc7
5 изменённых файлов: 218 добавлений и 101 удалений

Просмотреть файл

@ -27,6 +27,7 @@ from azure.common.client_factory import get_client_from_auth_file
from azure.common.credentials import ServicePrincipalCredentials
from azure.mgmt.containerinstance import ContainerInstanceManagementClient
from zope.deprecation import deprecation
class AzureContainerInstanceHook(BaseHook):
@ -92,6 +93,7 @@ class AzureContainerInstanceHook(BaseHook):
name,
container_group)
@deprecation.deprecate("get_state_exitcode_details() is deprecated. Related method is get_state()")
def get_state_exitcode_details(self, resource_group, name):
"""
Get the state and exitcode of a container group
@ -104,17 +106,11 @@ class AzureContainerInstanceHook(BaseHook):
If the exitcode is unknown 0 is returned.
:rtype: tuple(state,exitcode,details)
"""
current_state = self._get_instance_view(resource_group, name).current_state
return (current_state.state,
current_state.exit_code,
current_state.detail_status)
def _get_instance_view(self, resource_group, name):
response = self.connection.container_groups.get(resource_group,
name,
raw=False)
return response.containers[0].instance_view
cg_state = self.get_state(resource_group, name)
c_state = cg_state.containers[0].instance_view.current_state
return (c_state.state, c_state.exit_code, c_state.detail_status)
@deprecation.deprecate("get_messages() is deprecated. Related method is get_state()")
def get_messages(self, resource_group, name):
"""
Get the messages of a container group
@ -126,10 +122,25 @@ class AzureContainerInstanceHook(BaseHook):
:return: A list of the event messages
:rtype: list[str]
"""
instance_view = self._get_instance_view(resource_group, name)
cg_state = self.get_state(resource_group, name)
instance_view = cg_state.containers[0].instance_view
return [event.message for event in instance_view.events]
def get_state(self, resource_group, name):
"""
Get the state of a container group
:param resource_group: the name of the resource group
:type resource_group: str
:param name: the name of the container group
:type name: str
:return: ContainerGroup
:rtype: ~azure.mgmt.containerinstance.models.ContainerGroup
"""
return self.connection.container_groups.get(resource_group,
name,
raw=False)
def get_logs(self, resource_group, name, tail=1000):
"""
Get the tail from logs of a container group

Просмотреть файл

@ -18,8 +18,9 @@
# under the License.
from collections import namedtuple
from typing import Sequence, Dict
from time import sleep
from typing import Dict, Sequence
import re
from airflow.contrib.hooks.azure_container_instance_hook import AzureContainerInstanceHook
from airflow.contrib.hooks.azure_container_registry_hook import AzureContainerRegistryHook
@ -43,7 +44,9 @@ Volume = namedtuple(
['conn_id', 'account_name', 'share_name', 'mount_path', 'read_only'],
)
DEFAULT_ENVIRONMENT_VARIABLES = {} # type: Dict[str, str]
DEFAULT_SECURED_VARIABLES = [] # type: Sequence[str]
DEFAULT_VOLUMES = [] # type: Sequence[Volume]
DEFAULT_MEMORY_IN_GB = 2.0
DEFAULT_CPU = 1.0
@ -72,66 +75,101 @@ class AzureContainerInstancesOperator(BaseOperator):
:param environment_variables: key,value pairs containing environment
variables which will be passed to the running container
:type environment_variables: dict
:param volumes: list of volumes to be mounted to the container.
:param secured_variables: names of environmental variables that should not
be exposed outside the container (typically passwords).
:type secured_variables: [str]
:param volumes: list of ``Volume`` tuples to be mounted to the container.
Currently only Azure Fileshares are supported.
:type volumes: list[<conn_id, account_name, share_name, mount_path, read_only>]
:param memory_in_gb: the amount of memory to allocate to this container
:type memory_in_gb: double
:param cpu: the number of cpus to allocate to this container
:type cpu: double
:param gpu: GPU Resource for the container.
:type gpu: azure.mgmt.containerinstance.models.GpuResource
:param command: the command to run inside the container
:type command: str
:type command: [str]
:param container_timeout: max time allowed for the execution of
the container instance.
:type container_timeout: datetime.timedelta
:Example:
**Example**::
>>> a = AzureContainerInstancesOperator(
'azure_service_principal',
'azure_registry_user',
'my-resource-group',
'my-container-name-{{ ds }}',
'myprivateregistry.azurecr.io/my_container:latest',
'westeurope',
{'EXECUTION_DATE': '{{ ds }}'},
[('azure_wasb_conn_id',
'my_storage_container',
'my_fileshare',
'/input-data',
True),],
memory_in_gb=14.0,
cpu=4.0,
command='python /app/myfile.py',
task_id='start_container'
)
AzureContainerInstancesOperator(
"azure_service_principal",
"azure_registry_user",
"my-resource-group",
"my-container-name-{{ ds }}",
"myprivateregistry.azurecr.io/my_container:latest",
"westeurope",
{"MODEL_PATH": "my_value",
"POSTGRES_LOGIN": "{{ macros.connection('postgres_default').login }}"
"POSTGRES_PASSWORD": "{{ macros.connection('postgres_default').password }}",
"JOB_GUID": "{{ ti.xcom_pull(task_ids='task1', key='guid') }}" },
['POSTGRES_PASSWORD'],
[("azure_wasb_conn_id",
"my_storage_container",
"my_fileshare",
"/input-data",
True),],
memory_in_gb=14.0,
cpu=4.0,
gpu=GpuResource(count=1, sku='K80'),
command=["/bin/echo", "world"],
container_timeout=timedelta(hours=2),
task_id="start_container"
)
"""
template_fields = ('name', 'environment_variables')
template_fields = ('name', 'image', 'command', 'environment_variables')
@apply_defaults
def __init__(self, ci_conn_id, registry_conn_id, resource_group, name, image, region,
environment_variables=None, volumes=None, memory_in_gb=None, cpu=None,
command=None, remove_on_error=True, fail_if_exists=True, *args, **kwargs):
def __init__(self,
ci_conn_id,
registry_conn_id,
resource_group,
name,
image,
region,
environment_variables=None,
secured_variables=None,
volumes=None,
memory_in_gb=None,
cpu=None,
gpu=None,
command=None,
remove_on_error=True,
fail_if_exists=True,
*args,
**kwargs):
super().__init__(*args, **kwargs)
self.ci_conn_id = ci_conn_id
self.resource_group = resource_group
self.name = name
self.name = self._check_name(name)
self.image = image
self.region = region
self.registry_conn_id = registry_conn_id
self.environment_variables = environment_variables or DEFAULT_ENVIRONMENT_VARIABLES
self.secured_variables = secured_variables or DEFAULT_SECURED_VARIABLES
self.volumes = volumes or DEFAULT_VOLUMES
self.memory_in_gb = memory_in_gb or DEFAULT_MEMORY_IN_GB
self.cpu = cpu or DEFAULT_CPU
self.gpu = gpu
self.command = command
self.remove_on_error = remove_on_error
self.fail_if_exists = fail_if_exists
self._ci_hook = None
def execute(self, context):
ci_hook = AzureContainerInstanceHook(self.ci_conn_id)
# Check name again in case it was templated.
self._check_name(self.name)
self._ci_hook = AzureContainerInstanceHook(self.ci_conn_id)
if self.fail_if_exists:
self.log.info("Testing if container group already exists")
if ci_hook.exists(self.resource_group, self.name):
if self._ci_hook.exists(self.resource_group, self.name):
raise AirflowException("Container group exists")
if self.registry_conn_id:
@ -142,7 +180,11 @@ class AzureContainerInstancesOperator(BaseOperator):
environment_variables = []
for key, value in self.environment_variables.items():
environment_variables.append(EnvironmentVariable(key, value))
if key in self.secured_variables:
e = EnvironmentVariable(name=key, secure_value=value)
else:
e = EnvironmentVariable(name=key, value=value)
environment_variables.append(e)
volumes = []
volume_mounts = []
@ -154,16 +196,22 @@ class AzureContainerInstancesOperator(BaseOperator):
share_name,
account_name,
read_only))
volume_mounts.append(VolumeMount(mount_name, mount_path, read_only))
volume_mounts.append(VolumeMount(name=mount_name,
mount_path=mount_path,
read_only=read_only))
exit_code = 1
try:
self.log.info("Starting container group with %.1f cpu %.1f mem",
self.cpu, self.memory_in_gb)
if self.gpu:
self.log.info("GPU count: %.1f, GPU SKU: %s",
self.gpu.count, self.gpu.sku)
resources = ResourceRequirements(requests=ResourceRequests(
memory_in_gb=self.memory_in_gb,
cpu=self.cpu))
cpu=self.cpu,
gpu=self.gpu))
container = Container(
name=self.name,
@ -181,16 +229,17 @@ class AzureContainerInstancesOperator(BaseOperator):
restart_policy='Never',
os_type='Linux')
ci_hook.create_or_update(self.resource_group, self.name, container_group)
self._ci_hook.create_or_update(self.resource_group, self.name, container_group)
self.log.info("Container group started %s/%s", self.resource_group, self.name)
exit_code = self._monitor_logging(ci_hook, self.resource_group, self.name)
exit_code = self._monitor_logging(self._ci_hook, self.resource_group, self.name)
self.log.info("Container had exit code: %s", exit_code)
if exit_code != 0:
raise AirflowException("Container had a non-zero exit code, %s"
% exit_code)
return exit_code
except CloudError:
self.log.exception("Could not start container group")
@ -198,29 +247,47 @@ class AzureContainerInstancesOperator(BaseOperator):
finally:
if exit_code == 0 or self.remove_on_error:
self.log.info("Deleting container group")
try:
ci_hook.delete(self.resource_group, self.name)
except Exception:
self.log.exception("Could not delete container group")
self.on_kill()
def on_kill(self):
if self.remove_on_error:
self.log.info("Deleting container group")
try:
self._ci_hook.delete(self.resource_group, self.name)
except Exception:
self.log.exception("Could not delete container group")
def _monitor_logging(self, ci_hook, resource_group, name):
last_state = None
last_message_logged = None
last_line_logged = None
for _ in range(43200): # roughly 12 hours
while True:
try:
state, exit_code, detail_status = ci_hook.get_state_exitcode_details(resource_group, name)
cg_state = self._ci_hook.get_state(resource_group, name)
instance_view = cg_state.containers[0].instance_view
# If there is no instance view, we show the provisioning state
if instance_view is not None:
c_state = instance_view.current_state
state, exit_code, detail_status = (c_state.state,
c_state.exit_code,
c_state.detail_status)
messages = [event.message for event in instance_view.events]
last_message_logged = self._log_last(messages, last_message_logged)
else:
state = cg_state.provisioning_state
exit_code = 0
detail_status = "Provisioning"
if state != last_state:
self.log.info("Container group state changed to %s", state)
last_state = state
messages = ci_hook.get_messages(resource_group, name)
last_message_logged = self._log_last(messages, last_message_logged)
if state in ["Running", "Terminated"]:
try:
logs = ci_hook.get_logs(resource_group, name)
logs = self._ci_hook.get_logs(resource_group, name)
last_line_logged = self._log_last(logs, last_line_logged)
except CloudError:
self.log.exception("Exception while getting logs from "
@ -230,6 +297,12 @@ class AzureContainerInstancesOperator(BaseOperator):
self.log.info("Container exited with detail_status %s", detail_status)
return exit_code
if state == "Failed":
self.log.info("Azure provision failure")
return 1
except AirflowTaskTimeout:
raise
except CloudError as err:
if 'ResourceNotFound' in str(err):
self.log.warning("ResourceNotFound, container is probably removed "
@ -241,10 +314,7 @@ class AzureContainerInstancesOperator(BaseOperator):
except Exception:
self.log.exception("Exception while getting container groups")
sleep(1)
# no return -> hence still running
raise AirflowTaskTimeout("Did not complete on time")
sleep(1)
def _log_last(self, logs, last_line_logged):
if logs:
@ -261,3 +331,15 @@ class AzureContainerInstancesOperator(BaseOperator):
self.log.info(line.rstrip())
return logs[-1]
@staticmethod
def _check_name(name):
if '{{' in name:
# Let macros pass as they cannot be checked at construction time
return name
regex_check = re.match("[a-z0-9]([-a-z0-9]*[a-z0-9])?", name)
if regex_check is None or regex_check.group() != name:
raise AirflowException('ACI name must match regex [a-z0-9]([-a-z0-9]*[a-z0-9])? (like "my-name")')
if len(name) > 63:
raise AirflowException('ACI name cannot be longer than 63 characters')
return name

Просмотреть файл

@ -168,11 +168,11 @@ aws = [
]
azure = [
'azure-storage>=0.34.0',
'azure-mgmt-resource==1.2.2',
'azure-mgmt-datalake-store==0.4.0',
'azure-datalake-store==0.0.19',
'azure-mgmt-resource>=2.2.0',
'azure-mgmt-datalake-store>=0.5.0',
'azure-datalake-store>=0.0.45',
'azure-cosmos>=3.0.1',
'azure-mgmt-containerinstance',
'azure-mgmt-containerinstance>=1.5.0',
]
cassandra = ['cassandra-driver>=3.13.0']
celery = [

Просмотреть файл

@ -20,7 +20,6 @@
import json
import unittest
from unittest.mock import patch
from collections import namedtuple
from airflow.models import Connection
from airflow.contrib.hooks.azure_container_instance_hook import AzureContainerInstanceHook
@ -28,8 +27,6 @@ from airflow.utils import db
from azure.mgmt.containerinstance.models import (Container,
ContainerGroup,
ContainerState,
Event,
Logs,
ResourceRequests,
ResourceRequirements)
@ -63,32 +60,10 @@ class TestAzureContainerInstanceHook(unittest.TestCase):
self.testHook.create_or_update('resource_group', 'aci-test', container_group_mock)
create_or_update_mock.assert_called_with('resource_group', 'aci-test', container_group_mock)
@patch('airflow.contrib.hooks.azure_container_instance_hook'
'.AzureContainerInstanceHook._get_instance_view')
def test_get_state_exitcode_details(self, get_instance_view_mock):
expected_state = ContainerState(state='testing', exit_code=1, detail_status='details')
instance_view = {"current_state": expected_state}
named_instance = namedtuple("InstanceView", instance_view.keys())(*instance_view.values())
get_instance_view_mock.return_value = named_instance
state, exit_code, details = self.testHook.get_state_exitcode_details('resource-group', 'test')
self.assertEqual(state, expected_state.state)
self.assertEqual(exit_code, expected_state.exit_code)
self.assertEqual(details, expected_state.detail_status)
@patch('airflow.contrib.hooks.azure_container_instance_hook'
'.AzureContainerInstanceHook._get_instance_view')
def test_get_messages(self, get_instance_view_mock):
expected_messages = ['test1', 'test2']
events = [Event(message=m) for m in expected_messages]
instance_view = {"events": events}
named_instance = namedtuple("Events", instance_view.keys())(*instance_view.values())
get_instance_view_mock.return_value = named_instance
messages = self.testHook.get_messages('resource-group', 'test')
self.assertSequenceEqual(messages, expected_messages)
@patch('azure.mgmt.containerinstance.operations.ContainerGroupsOperations.get')
def test_get_state(self, get_state_mock):
self.testHook.get_state('resource_group', 'aci-test')
get_state_mock.assert_called_with('resource_group', 'aci-test', raw=False)
@patch('azure.mgmt.containerinstance.operations.ContainerOperations.list_logs')
def test_get_logs(self, list_logs_mock):

Просмотреть файл

@ -18,19 +18,46 @@
# under the License.
#
from collections import namedtuple
from airflow.exceptions import AirflowException
from airflow.contrib.operators.azure_container_instances_operator import AzureContainerInstancesOperator
from azure.mgmt.containerinstance.models import (ContainerState,
Event)
from tests.compat import mock
import unittest
def make_mock_cg(container_state, events=[]):
"""
Make a mock Container Group as the underlying azure Models have read-only attributes
See https://docs.microsoft.com/en-us/rest/api/container-instances/containergroups
"""
instance_view_dict = {"current_state": container_state,
"events": events}
instance_view = namedtuple("InstanceView",
instance_view_dict.keys())(*instance_view_dict.values())
container_dict = {"instance_view": instance_view}
container = namedtuple("Container", container_dict.keys())(*container_dict.values())
container_g_dict = {"containers": [container]}
container_g = namedtuple("ContainerGroup",
container_g_dict.keys())(*container_g_dict.values())
return container_g
class TestACIOperator(unittest.TestCase):
@mock.patch("airflow.contrib.operators."
"azure_container_instances_operator.AzureContainerInstanceHook")
def test_execute(self, aci_mock):
aci_mock.return_value.get_state_exitcode_details.return_value = "Terminated", 0, "test"
expected_c_state = ContainerState(state='Terminated', exit_code=0, detail_status='test')
expected_cg = make_mock_cg(expected_c_state)
aci_mock.return_value.get_state.return_value = expected_cg
aci_mock.return_value.exists.return_value = False
aci = AzureContainerInstancesOperator(ci_conn_id=None,
@ -63,7 +90,10 @@ class TestACIOperator(unittest.TestCase):
@mock.patch("airflow.contrib.operators."
"azure_container_instances_operator.AzureContainerInstanceHook")
def test_execute_with_failures(self, aci_mock):
aci_mock.return_value.get_state_exitcode_details.return_value = "Terminated", 1, "test"
expected_c_state = ContainerState(state='Terminated', exit_code=1, detail_status='test')
expected_cg = make_mock_cg(expected_c_state)
aci_mock.return_value.get_state.return_value = expected_cg
aci_mock.return_value.exists.return_value = False
aci = AzureContainerInstancesOperator(ci_conn_id=None,
@ -81,9 +111,14 @@ class TestACIOperator(unittest.TestCase):
@mock.patch("airflow.contrib.operators."
"azure_container_instances_operator.AzureContainerInstanceHook")
def test_execute_with_messages_logs(self, aci_mock):
aci_mock.return_value.get_state_exitcode_details.side_effect = [("Running", 0, "test"),
("Terminated", 0, "test")]
aci_mock.return_value.get_messages.return_value = ["test", "messages"]
events = [Event(message="test"), Event(message="messages")]
expected_c_state1 = ContainerState(state='Running', exit_code=0, detail_status='test')
expected_cg1 = make_mock_cg(expected_c_state1, events)
expected_c_state2 = ContainerState(state='Terminated', exit_code=0, detail_status='test')
expected_cg2 = make_mock_cg(expected_c_state2, events)
aci_mock.return_value.get_state.side_effect = [expected_cg1,
expected_cg2]
aci_mock.return_value.get_logs.return_value = ["test", "logs"]
aci_mock.return_value.exists.return_value = False
@ -97,12 +132,26 @@ class TestACIOperator(unittest.TestCase):
aci.execute(None)
self.assertEqual(aci_mock.return_value.create_or_update.call_count, 1)
self.assertEqual(aci_mock.return_value.get_state_exitcode_details.call_count, 2)
self.assertEqual(aci_mock.return_value.get_messages.call_count, 2)
self.assertEqual(aci_mock.return_value.get_state.call_count, 2)
self.assertEqual(aci_mock.return_value.get_logs.call_count, 2)
self.assertEqual(aci_mock.return_value.delete.call_count, 1)
def test_name_checker(self):
valid_names = ['test-dash', 'name-with-length---63' * 3]
invalid_names = ['test_underscore',
'name-with-length---84' * 4,
'name-ending-with-dash-',
'-name-starting-with-dash']
for name in invalid_names:
with self.assertRaises(AirflowException):
AzureContainerInstancesOperator._check_name(name)
for name in valid_names:
checked_name = AzureContainerInstancesOperator._check_name(name)
self.assertEqual(checked_name, name)
if __name__ == '__main__':
unittest.main()