Add Azure File ingress support

Fred Park 2016-10-15 13:56:56 -07:00
Parent 33300c551c
Commit ed84011383
4 changed files with 150 additions and 59 deletions

View file

@@ -7,7 +7,8 @@ more information.
 - Ingress from local machine with `files` in global configuration
   - To GlusterFS shared volume
   - To Azure Blob Storage
-- Ingress from Azure Blob Storage with `input_data` in pool and jobs
+  - To Azure File Storage
+- Ingress from Azure Blob or File Storage with `input_data` in pool and jobs
 configuration
   - Pool-level: to compute nodes
   - Job-level: to compute nodes running the specified job

View file

@@ -53,9 +53,9 @@ _MAX_READ_BLOCKSIZE_BYTES = 4194304
 _FILE_SPLIT_PREFIX = '_shipyard-'
 
 
-def _process_blob_input_data(blob_client, config, input_data, on_task):
+def _process_storage_input_data(blob_client, config, input_data, on_task):
     # type: (azure.storage.blob.BlockBlobService, dict, dict, bool) -> str
-    """Process blob input data to ingress
+    """Process Azure storage input data to ingress
     :param azure.storage.blob.BlockBlobService blob_client: blob client
     :param dict config: configuration dict
     :param dict spec: config spec with input_data
@@ -67,7 +67,44 @@ def _process_blob_input_data(blob_client, config, input_data, on_task):
     for xfer in input_data:
         storage_settings = config['credentials']['storage'][
             xfer['storage_account_settings']]
-        container = xfer['container']
+        try:
+            container = xfer['container']
+            if container is not None and len(container) == 0:
+                container = None
+        except KeyError:
+            container = None
+        try:
+            fshare = xfer['file_share']
+            if fshare is not None and len(fshare) == 0:
+                fshare = None
+        except KeyError:
+            fshare = None
+        if container is None and fshare is None:
+            raise ValueError('container or file_share not specified')
+        elif container is not None and fshare is not None:
+            raise ValueError(
+                'cannot specify both container and file_share at the '
+                'same time')
+        try:
+            eo = xfer['blobxfer_extra_options']
+            if eo is None:
+                eo = ''
+        except KeyError:
+            eo = ''
+        # configure for file share
+        if fshare is not None:
+            if '--fileshare' not in eo:
+                eo = '--fileshare {}'.format(eo)
+            # create saskey for file share with 7day expiry with rl perm
+            saskey = convoy.storage.create_file_share_rl_saskey(
+                storage_settings, fshare)
+            # set container as fshare
+            container = fshare
+            del fshare
+        else:
+            # create saskey for container with 7day expiry with rl perm
+            saskey = convoy.storage.create_blob_container_rl_saskey(
+                storage_settings, container)
         try:
             include = xfer['include']
             if include is not None:
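The validation added in this hunk reduces to a simple rule: exactly one of `container` or `file_share` must be set (empty strings count as unset), and `--fileshare` is prepended to the blobxfer extra options when a file share is the target. A minimal standalone sketch of that rule follows; the helper name `_parse_storage_target` is hypothetical and not part of the commit.

# Hypothetical sketch of the parsing rule above; not part of the commit.
# Assumes an xfer dict shaped like the `azure_storage` config object.
def _parse_storage_target(xfer):
    # treat missing and empty values identically
    container = xfer.get('container') or None
    fshare = xfer.get('file_share') or None
    if container is None and fshare is None:
        raise ValueError('container or file_share not specified')
    if container is not None and fshare is not None:
        raise ValueError(
            'cannot specify both container and file_share at the same time')
    eo = xfer.get('blobxfer_extra_options') or ''
    if fshare is not None and '--fileshare' not in eo:
        # route blobxfer to the file service instead of the blob service
        eo = '--fileshare {}'.format(eo)
    return container or fshare, eo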
@@ -91,15 +128,6 @@ def _process_blob_input_data(blob_client, config, input_data, on_task):
                 raise
         if on_task and dst is None or len(dst) == 0:
             dst = '$AZ_BATCH_TASK_WORKING_DIR'
-        try:
-            eo = xfer['blobxfer_extra_options']
-            if eo is None:
-                eo = ''
-        except KeyError:
-            eo = ''
-        # create saskey for container with 7day expiry with rl perm
-        saskey = convoy.storage.create_blob_container_rl_saskey(
-            storage_settings, container)
         # construct argument
         # sa:ep:saskey:container:include:eo:dst
         args.append('"{}:{}:{}:{}:{}:{}:{}"'.format(
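The trailing comment documents the colon-delimited layout of each ingress argument handed to the node-side script: storage account, endpoint, SAS key, container (or file share), include filter, blobxfer extra options, and destination. A sketch of one such argument follows; all values are illustrative examples, not taken from the commit.

# Illustrative only: compose one ingress argument in the
# sa:ep:saskey:container:include:eo:dst layout noted above.
sa = 'mystorageaccount'             # storage account name
ep = 'core.windows.net'             # storage endpoint suffix
saskey = 'sv=...'                   # SAS with read+list permissions
container = 'poolcontainer'         # container, or share when --fileshare
include = '*.bin'                   # optional include filter
eo = '--fileshare '                 # blobxfer extra options ('' for blob)
dst = '$AZ_BATCH_TASK_WORKING_DIR'  # destination on the compute node
arg = '"{}:{}:{}:{}:{}:{}:{}"'.format(
    sa, ep, saskey, container, include, eo, dst)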
@@ -128,8 +156,8 @@ def process_input_data(blob_client, config, bifile, spec, on_task=False):
         if key == 'azure_batch':
             # TODO implement compute node ingress
             raise NotImplementedError()
-        elif key == 'azure_blob':
-            blobargs = _process_blob_input_data(
+        elif key == 'azure_storage':
+            blobargs = _process_storage_input_data(
                 blob_client, config, input_data[key], on_task)
             ret = '$AZ_BATCH_NODE_SHARED_DIR/{} {}'.format(
                 bifile[0], ' '.join(blobargs))
@@ -732,8 +760,32 @@ def ingress_data(batch_client, config, rls=None, kind=None):
                     container = None
             except KeyError:
                 container = None
-            if container is None:
-                raise ValueError('container is invalid')
+            try:
+                fshare = fdict['destination']['data_transfer']['file_share']
+                if fshare is not None and len(fshare) == 0:
+                    fshare = None
+            except KeyError:
+                fshare = None
+            if container is None and fshare is None:
+                raise ValueError('container or file_share not specified')
+            elif container is not None and fshare is not None:
+                raise ValueError(
+                    'cannot specify both container and file_share at the '
+                    'same time for source {}'.format(src))
+            try:
+                eo = fdict['destination']['data_transfer'][
+                    'blobxfer_extra_options']
+                if eo is None:
+                    eo = ''
+            except KeyError:
+                eo = ''
+            # append appropriate option for fshare
+            if fshare is not None:
+                if '--fileshare' not in eo:
+                    eo = '--fileshare {}'.format(eo)
+                # set container as fshare
+                container = fshare
+                del fshare
             if src_incl is not None:
                 if len(src_incl) > 1:
                     raise ValueError(
@@ -745,13 +797,6 @@ def ingress_data(batch_client, config, rls=None, kind=None):
                 raise ValueError(
                     'exclude cannot be specified for ingress to Azure Blob '
                     'Storage')
-            try:
-                eo = fdict['destination']['data_transfer'][
-                    'blobxfer_extra_options']
-                if eo is None:
-                    eo = ''
-            except KeyError:
-                eo = ''
             thr = _azure_blob_storage_transfer(
                 config['credentials']['storage'][storage], container, src,
                 src_incl, eo)
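With the `--fileshare` option folded into `eo`, the same transfer path serves both blob containers and file shares. A hypothetical sketch follows of how a transfer thread might shell out to blobxfer, assuming blobxfer 0.x CLI conventions; `_blobxfer_upload` is not the commit's code.

# Hypothetical sketch, not the commit's code. When '--fileshare' is present
# in eo, `container` names an Azure File share instead of a blob container.
import shlex
import subprocess

def _blobxfer_upload(sa, saskey, container, src, eo):
    # blobxfer <storageaccount> <container> <localresource> [options]
    cmd = ['blobxfer', sa, container, src,
           '--upload', '--saskey', saskey] + shlex.split(eo)
    subprocess.check_call(cmd)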

View file

@@ -34,6 +34,7 @@ except ImportError:
 # non-stdlib imports
 import azure.common
 import azure.storage.blob as azureblob
+import azure.storage.file as azurefile
 import azure.storage.queue as azurequeue
 import azure.storage.table as azuretable
 # local imports
@@ -185,6 +186,26 @@ def create_blob_container_rl_saskey(storage_settings, container):
     )
 
 
+def create_file_share_rl_saskey(storage_settings, file_share):
+    # type: (dict, str) -> str
+    """Create a saskey for a file share with a 7day expiry time and rl perm
+    :param dict storage_settings: storage settings
+    :param str file_share: file share
+    :rtype: str
+    :return: saskey
+    """
+    file_client = azurefile.FileService(
+        account_name=storage_settings['account'],
+        account_key=storage_settings['account_key'],
+        endpoint_suffix=storage_settings['endpoint'])
+    return file_client.generate_share_shared_access_signature(
+        file_share,
+        azurefile.SharePermissions.READ |
+        azurefile.SharePermissions.LIST,
+        expiry=datetime.datetime.utcnow() + datetime.timedelta(days=7)
+    )
+
+
 def _add_global_resource(
         queue_client, table_client, config, pk, p2pcsd, grtype):
     # type: (azurequeue.QueueService, azuretable.TableService, dict, str,
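A usage sketch for the new helper; the `storage_settings` values below are illustrative, and the keys mirror those the function reads.

# Usage sketch for create_file_share_rl_saskey; values are examples only.
storage_settings = {
    'account': 'mystorageaccount',
    'account_key': 'base64key==',
    'endpoint': 'core.windows.net',
}
saskey = create_file_share_rl_saskey(storage_settings, 'myshare')
# a read/list URL for the share, valid for 7 days
url = 'https://{}.file.{}/{}?{}'.format(
    storage_settings['account'], storage_settings['endpoint'],
    'myshare', saskey)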

View file

@@ -224,9 +224,10 @@ supported.
 `files` is an optional property that specifies data that should be ingressed
 from a location accessible by the local machine (i.e., machine invoking
 `shipyard.py` to a shared file system location accessible by compute nodes
-in the pool). `files` is a json list of objects, which allows for multiple
-sources to destinations to be ingressed during the same invocation. Each
-object within the `files` list contains the following members:
+in the pool or Azure Blob or File Storage). `files` is a json list of objects,
+which allows for multiple sources to destinations to be ingressed during the
+same invocation. Each object within the `files` list contains the following
+members:
 * (required) `source` property contains the following members:
   * (required) `path` is a local path. A single file or a directory
     can be specified. Filters below will be ignored if `path` is a file and
@@ -246,12 +247,14 @@ object within the `files` list contains the following members:
     ingress.
 * (required) `destination` property contains the following members:
   * (required) `shared_data_volume` or `storage_account_settings` for data
-    ingress to a GlusterFS volume or Azure Blob Storage. You may specify one
-    or the other, but not both in the same object. Please see below in the
-    `shared_data_volumes` for information on how to set up a GlusterFS share.
+    ingress to a GlusterFS volume or Azure Blob or File Storage. You may
+    specify one or the other, but not both in the same object. Please see
+    below in the `shared_data_volumes` for information on how to set up a
+    GlusterFS share.
   * (required) `data_transfer` specifies how the transfer should take place.
     The following list contains members for GlusterFS ingress when a GlusterFS
-    volume is provided for `shared_data_volume`:
+    volume is provided for `shared_data_volume` (see below for ingressing to
+    Azure Blob or File Storage):
     * (required) `method` specified which method should be used to ingress
       data, which should be one of: `scp`, `multinode_scp`, `rsync+ssh` or
       `multinode_rsync+ssh`. `scp` will use secure copy to copy a file or a
@@ -297,12 +300,17 @@ object within the `files` list contains the following members:
       maximum of 6 concurrent scp sessions to the pool. The default is 1 if
       not specified or omitted.
   * (required) `data_transfer` specifies how the transfer should take place.
-    When Azure Blob Storage is selected as the destination for data ingress,
-    [blobxfer](https://github.com/Azure/blobxfer) is invoked. The following
-    list contains members for Azure Blob Storage ingress when a storage
-    account link is provided for `storage_account_settings`:
-    * (required) `container` is the container to upload to. The container
-      need not be created beforehand.
+    When Azure Blob or File Storage is selected as the destination for data
+    ingress, [blobxfer](https://github.com/Azure/blobxfer) is invoked. The
+    following list contains members for Azure Blob or File Storage ingress
+    when a storage account link is provided for `storage_account_settings`:
+    * (required) `container` or `file_share` is required when uploading to
+      Azure Blob Storage or Azure File Storage, respectively. `container`
+      specifies which container to upload to for Azure Blob Storage while
+      `file_share` specifies which file share to upload to for Azure File
+      Storage. Only one of these properties can be specified per
+      `data_transfer` object. The container or file share need not be created
+      beforehand.
     * (optional) `blobxfer_extra_options` are any extra options to pass to
       `blobxfer`. In the example above, `--no-computefilemd5` will force
       `blobxfer` to skip MD5 calculation on files ingressed.
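Putting the documented members together, a `files` entry targeting a file share differs from a blob ingress entry only in the `file_share` property. An illustrative entry follows, shown as the Python dict the configuration json deserializes to; all values are examples.

# Illustrative `files` entry for ingress to an Azure File Storage share;
# values are examples only. Note file_share in place of container.
files_entry = {
    'source': {
        'path': '/path/to/local/data',
    },
    'destination': {
        'storage_account_settings': 'mystorageaccount',
        'data_transfer': {
            'file_share': 'myfileshare',
            'blobxfer_extra_options': '--no-computefilemd5',
        },
    },
}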
@@ -370,7 +378,7 @@ The pool schema is as follows:
         "block_until_all_global_resources_loaded": true,
         "transfer_files_on_pool_creation": false,
         "input_data": {
-            "azure_blob": [
+            "azure_storage": [
                 {
                     "storage_account_settings": "mystorageaccount",
                     "container": "poolcontainer",
@@ -423,29 +431,35 @@ from entering ready state until all Docker images are loaded. This defaults
 to `true`.
 * (optional) `transfer_files_on_pool_creation` will ingress all `files`
 specified in the `global_resources` section of the configuration json when
-the pool is created. If files are to be ingressed to Azure Blob Storage,
-then data movement operations are overlapped with the creation of the pool.
-If files are to be ingressed to a shared file system on the compute nodes,
-then the files are ingressed after the pool is created and the shared file
-system is ready. Files can be ingressed to both Azure Blob Storage and a
+the pool is created. If files are to be ingressed to Azure Blob or File
+Storage, then data movement operations are overlapped with the creation of the
+pool. If files are to be ingressed to a shared file system on the compute
+nodes, then the files are ingressed after the pool is created and the shared
+file system is ready. Files can be ingressed to both Azure Blob Storage and a
 shared file system during the same pool creation invocation. If this property
 is set to `true` then `block_until_all_global_resources_loaded` will be force
 disabled. If omitted, this property defaults to `false`.
 * (optional) `input_data` is an object containing data that should be
 ingressed to all compute nodes as part of node preparation. It is
 important to note that if you are combining this action with `files` and
-are ingressing data to Blob storage as part of pool creation, that the blob
-containers defined here will be downloaded as soon as the compute node is
-ready to do so. This may result in the blob container/blobs not being ready
-in time for the `input_data` transfer. It is up to you to ensure that these
-two operations do not overlap. If there is a possibility of overlap, then you
-should ingress data defined in `files` prior to pool creation and disable
-the option above `transfer_files_on_pool_creation`. This object currently only
-supports `azure_blob` as a member.
-  * `azure_blob` contains the following members:
+are ingressing data to Azure Blob or File storage as part of pool creation,
+that the blob containers or file shares defined here will be downloaded as
+soon as the compute node is ready to do so. This may result in the blob
+container/blobs or file share/files not being ready in time for the
+`input_data` transfer. It is up to you to ensure that these two operations do
+not overlap. If there is a possibility of overlap, then you should ingress
+data defined in `files` prior to pool creation and disable the option above
+`transfer_files_on_pool_creation`. This object currently only supports
+`azure_storage` as a member.
+  * `azure_storage` contains the following members:
     * (required) `storage_account_settings` contains a storage account link
       as defined in the credentials json.
-    * (required) `container` the container to transfer.
+    * (required) `container` or `file_share` is required when uploading to
+      Azure Blob Storage or Azure File Storage, respectively. `container`
+      specifies which container to upload to for Azure Blob Storage while
+      `file_share` specifies which file share to upload to for Azure File
+      Storage. Only one of these properties can be specified per
+      `data_transfer` object.
     * (optional) `include` property defines an optional include filter.
       Although this property is an array, it is only allowed to have 1
      maximum filter.
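An illustrative pool-level `input_data` entry referencing a file share, shown as the Python dict the configuration json deserializes to; values are examples, and `destination` mirrors the `dst` field consumed by the data movement code above.

# Illustrative pool-level input_data entry for an Azure File share;
# values are examples only and follow the members documented above.
input_data = {
    'azure_storage': [
        {
            'storage_account_settings': 'mystorageaccount',
            'file_share': 'poolshare',
            'include': ['*.dat'],
            'destination': '$AZ_BATCH_NODE_SHARED_DIR/poolshare',
        },
    ],
}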
@@ -588,11 +602,16 @@ For example, if `job-1`:`task-1` is run on compute node A and then
 `job-1`:`task-2` is run on compute node B, then this `input_data` is ingressed
 to both compute node A and B. However, if `job-1`:`task-3` is run on
 compute node A, then the `input_data` is not transferred again. This object
-currently only supports `azure_blob` as a member.
-  * `azure_blob` contains the following members:
+currently only supports `azure_storage` as a member.
+  * `azure_storage` contains the following members:
     * (required) `storage_account_settings` contains a storage account link
       as defined in the credentials json.
-    * (required) `container` the container to transfer.
+    * (required) `container` or `file_share` is required when uploading to
+      Azure Blob Storage or Azure File Storage, respectively. `container`
+      specifies which container to upload to for Azure Blob Storage while
+      `file_share` specifies which file share to upload to for Azure File
+      Storage. Only one of these properties can be specified per
+      `data_transfer` object.
     * (optional) `include` property defines an optional include filter.
       Although this property is an array, it is only allowed to have 1
       maximum filter.
@@ -637,11 +656,16 @@ currently only supports `azure_blob` as a member.
     This is optional.
   * (optional) `input_data` is an object containing data that should be
     ingressed for this specific task. This object currently only supports
-    `azure_blob` as a member.
-    * `azure_blob` contains the following members:
+    `azure_storage` as a member.
+    * `azure_storage` contains the following members:
       * (required) `storage_account_settings` contains a storage account link
        as defined in the credentials json.
-      * (required) `container` the container to transfer.
+      * (required) `container` or `file_share` is required when uploading to
+        Azure Blob Storage or Azure File Storage, respectively. `container`
+        specifies which container to upload to for Azure Blob Storage while
+        `file_share` specifies which file share to upload to for Azure File
+        Storage. Only one of these properties can be specified per
+        `data_transfer` object.
       * (optional) `include` property defines an optional include filter.
         Although this property is an array, it is only allowed to have 1
         maximum filter.