Add arbitrary remote url synccopy

Fred Park 2019-05-17 17:08:41 +00:00
Parent 5a2797140e
Commit 6ce66b396b
No known key found for this signature
GPG key ID: 3C4D545F457737EB
14 changed files with 172 additions and 45 deletions

View file

@ -3,7 +3,12 @@
## [Unreleased]
### Added
- Support ability to set the Content Type property explicitly
- Server side copy support for `synccopy` commands. By default, `synccopy`
operations now use server side copies, eliminating the machine initiating
the operation as a potential bottleneck. Arbitrary URLs that reference a
single object are now supported in the `--remote-path` argument
([#93](https://github.com/Azure/blobxfer/issues/93)).
- Support setting the Content Type property explicitly
([#95](https://github.com/Azure/blobxfer/issues/95)). Specifying this
option overrides the automatically inferred MIME type of the file.
@ -14,6 +19,9 @@ Microsoft Container Registry
- Binary builds are now built against Python 3.7.3
- Windows Docker image uses Python 3.7.3
### Fixed
- Allow `--rename` in `synccopy` operations
### Removed
- Python 3.4 support dropped

View file

@ -28,7 +28,9 @@ within a file or object
* `replica` mode allows replication of a file across multiple destinations
including to multiple storage accounts
* Synchronous copy with cross-mode (object transform) replication support
(including block-level copies for Block blobs)
* By default, leverages server-side copies
* Arbitrary URL copy support
* Exact block-level copies for block blobs
* Client-side encryption support
* Support all Azure Blob types and Azure Files for both upload and download
* Advanced skip options for rsync-like operations

View file

@ -80,6 +80,7 @@ class StorageEntity(object):
self._raw_metadata = None
self._access_tier = None
self._content_type = None
self._is_arbitrary_url = False
self.replica_targets = None
@property
@ -130,7 +131,10 @@ class StorageEntity(object):
:rtype: str
:return: remote path of entity
"""
return '{}/{}'.format(self._container, self._name)
if self._is_arbitrary_url:
return self._name
else:
return '{}/{}'.format(self._container, self._name)
@property
def lmt(self):
@ -339,6 +343,16 @@ class StorageEntity(object):
"""
self._content_type = value
@property
def is_arbitrary_url(self):
# type: (StorageEntity) -> bool
"""Is an arbitrary URL
:param StorageEntity self: this
:rtype: bool
:return: arbitrary URL
"""
return self._is_arbitrary_url
def populate_from_blob(self, sa, blob, vio=None, store_raw_metadata=False):
# type: (StorageEntity, blobxfer.operations.azure.StorageAccount,
# azure.storage.blob.models.Blob) -> None
@ -446,3 +460,19 @@ class StorageEntity(object):
else:
self._client = sa.block_blob_client
self._mode = StorageModes.Block
def populate_from_arbitrary_url(self, remote_path, size):
# type: (StorageEntity, str, int) -> None
"""Populate properties from an arbitrary url
:param StorageEntity self: this
:param str remote_path: remote path
:param int size: content length
"""
# fake a client
self._client = lambda: None
setattr(self._client, 'primary_endpoint', remote_path.split('/')[2])
# set attributes
self._is_arbitrary_url = True
self._container = None
self._name = remote_path
self._size = size
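The `populate_from_arbitrary_url` addition works because Python function objects accept arbitrary attributes: a throwaway `lambda` stands in for a real storage client, carrying only the `primary_endpoint` attribute that downstream code reads. A minimal standalone sketch of the same pattern (the URL and variable names are illustrative, not part of blobxfer):
```python
# Hypothetical arbitrary source URL
remote_path = 'https://example.com/mycontainer/object.bin'

client = lambda: None  # stand-in object; it is never called
# function objects accept arbitrary attributes, so code that only reads
# client.primary_endpoint keeps working without a real storage client
client.primary_endpoint = remote_path.split('/')[2]

print(client.primary_endpoint)  # -> example.com
```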

View file

@ -45,6 +45,7 @@ import blobxfer.util
# create logger
logger = logging.getLogger(__name__)
# global defines
_DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216
_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304
# named tuples
Offsets = collections.namedtuple(
@ -339,7 +340,10 @@ class Descriptor(object):
else:
return -1
else:
return _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES
if self._dst_ase.mode == blobxfer.models.azure.StorageModes.Block:
return _DEFAULT_AUTO_CHUNKSIZE_BYTES
else:
return _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES
def _compute_total_chunks(self, chunk_size):
# type: (Descriptor, int) -> int
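The effect of the new 16 MiB auto chunk size for block blob server-side copies can be checked with quick arithmetic (the transfer size below is illustrative):
```python
# With the constants above, a 1 GiB block blob server-side copy issues
# 64 chunks at 16 MiB instead of 256 chunks at the 4 MiB non-block limit.
size = 1 << 30           # 1 GiB, illustrative
print(size // 16777216)  # -> 64
print(size // 4194304)   # -> 256
```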

View file

@ -396,6 +396,14 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
super().__init__()
self._path_map = {}
def add_arbitrary_remote_url(self, remote_path):
# type: (SourcePath, str) -> None
"""Add an arbitrary remote URL
:param SourcePath self: this
:param str remote_path: remote path
"""
self._paths.append(remote_path)
def add_path_with_storage_account(self, remote_path, storage_account):
# type: (SourcePath, str, str) -> None
"""Add a path with an associated storage account
@ -617,6 +625,21 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
continue
yield ase
def _populate_from_arbitrary_url(self, remote_path):
# type: (SourcePath, str) -> StorageEntity
"""Internal generator for Azure remote blobs
:param SourcePath self: this
:param str remote_path: remote path
:rtype: StorageEntity
:return: Azure storage entity object
"""
# HEAD remote path to retrieve length
response = requests.head(remote_path)
ase = blobxfer.models.azure.StorageEntity(container=None)
ase.populate_from_arbitrary_url(
remote_path, int(response.headers['Content-Length']))
return ase
def _populate_from_list_blobs(self, creds, options, dry_run):
# type: (SourcePath, StorageCredentials, Any, bool) -> StorageEntity
"""Internal generator for Azure remote blobs
@ -630,6 +653,12 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
is_synccopy = isinstance(options, blobxfer.models.options.SyncCopy)
for _path in self._paths:
rpath = str(_path)
if (is_synccopy and
(rpath.lower().startswith('http://') or
rpath.lower().startswith('https://'))):
ase = self._populate_from_arbitrary_url(rpath)
yield ase
continue
sa = creds.get_storage_account(self.lookup_storage_account(rpath))
# ensure at least read permissions
if not sa.can_read_object:
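Since an arbitrary URL cannot be listed like a container, the source iterator above sizes the object with a single HEAD request. A hedged standalone sketch of that step; unlike the one-liner above, it adds a timeout, redirect handling, and a status check, and it assumes the server returns a `Content-Length` header:
```python
import requests

def head_content_length(url):
    # HEAD transfers only headers, so sizing costs no body download
    response = requests.head(url, timeout=10, allow_redirects=True)
    response.raise_for_status()
    # servers using chunked transfer encoding may omit Content-Length
    return int(response.headers['Content-Length'])

print(head_content_length(
    'https://raw.githubusercontent.com/Azure/blobxfer/master/README.md'))
```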

View file

@ -140,17 +140,20 @@ def put_block_from_url(src_ase, dst_ase, offsets, timeout=None):
:param blobxfer.models.upload.Offsets offsets: upload offsets
:param int timeout: timeout
"""
if blobxfer.util.is_not_empty(src_ase.client.account_key):
sas = src_ase.client.generate_blob_shared_access_signature(
container_name=src_ase.container,
blob_name=src_ase.name,
permission=azure.storage.blob.BlobPermissions(read=True),
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=1),
)
if src_ase.is_arbitrary_url:
src_url = src_ase.path
else:
sas = src_ase.client.sas_token
src_url = 'https://{}/{}?{}'.format(
src_ase.client.primary_endpoint, src_ase.path, sas)
if blobxfer.util.is_not_empty(src_ase.client.account_key):
sas = src_ase.client.generate_blob_shared_access_signature(
container_name=src_ase.container,
blob_name=src_ase.name,
permission=azure.storage.blob.BlobPermissions(read=True),
expiry=datetime.datetime.utcnow() + datetime.timedelta(days=1),
)
else:
sas = src_ase.client.sas_token
src_url = 'https://{}/{}?{}'.format(
src_ase.client.primary_endpoint, src_ase.path, sas)
dst_ase.client.put_block_from_url(
container_name=dst_ase.container,
blob_name=dst_ase.name,
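The rewritten source-URL selection in `put_block_from_url` reduces to: pass arbitrary URLs through verbatim, and sign Azure sources with a read SAS, freshly generated from the account key when one is available, else the client's existing SAS token. An illustrative reduction, not the actual blobxfer function; `src` and `generate_read_sas` are stand-ins:
```python
def build_copy_source_url(src, generate_read_sas):
    # arbitrary URLs are already complete; use them as-is
    if src.is_arbitrary_url:
        return src.path
    # Azure sources need endpoint/path plus a SAS for authorization
    if src.client.account_key:
        sas = generate_read_sas(src)  # 1-day read SAS, as above
    else:
        sas = src.client.sas_token
    return 'https://{}/{}?{}'.format(
        src.client.primary_endpoint, src.path, sas)
```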

View file

@ -245,7 +245,8 @@ class SyncCopy(object):
# prepare remote file for download
# if remote file is a block blob, need to retrieve block list
if (src_ase.mode == dst_ase.mode ==
blobxfer.models.azure.StorageModes.Block):
blobxfer.models.azure.StorageModes.Block and
not src_ase.is_arbitrary_url):
bl = blobxfer.operations.azure.blob.block.get_committed_block_list(
src_ase)
else:
@ -316,8 +317,6 @@ class SyncCopy(object):
if data is not None:
blobxfer.operations.azure.blob.append.append_block(ase, data)
elif ase.mode == blobxfer.models.azure.StorageModes.Block:
src_url = 'https://{}/{}'.format(sd.src_entity._client.primary_endpoint, sd.src_entity.path)
print(src_url, offsets.range_start, offsets.range_end, sd.is_one_shot_block_blob, sd.is_server_side_copyable)
# handle one-shot uploads
if sd.is_one_shot_block_blob:
if blobxfer.util.is_not_empty(sd.src_entity.md5):
@ -605,8 +604,8 @@ class SyncCopy(object):
:rtype: SynccopyAction
:return: synccopy action
"""
# if remote file doesn't exist, copy
if dst is None or dst.from_local:
# if src is arbitrary or remote file doesn't exist, copy
if src.is_arbitrary_url or dst is None or dst.from_local:
return SynccopyAction.Copy
# check overwrite option
if not self._spec.options.overwrite:
@ -716,7 +715,11 @@ class SyncCopy(object):
raise RuntimeError(
'attempting rename multiple files to a directory')
else:
name = str(pathlib.Path(name) / src_ase.name)
if src_ase.is_arbitrary_url:
tmp = '/'.join(src_ase.name.split('/')[3:])
name = str(pathlib.Path(name) / tmp)
else:
name = str(pathlib.Path(name) / src_ase.name)
# translate source mode to dest mode
dst_mode = self._translate_src_mode_to_dst_mode(src_ase.mode)
dst_ase = self._check_for_existing_remote(sa, cont, name, dst_mode)
@ -856,7 +859,7 @@ class SyncCopy(object):
self._update_progress_bar()
# check for exceptions
if len(self._exceptions) > 0:
logger.error('exceptions encountered while downloading')
logger.error('exceptions encountered during synccopy')
# raise the first one
raise self._exceptions[0]
# check for mismatches
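The rename branch above derives the destination blob name from an arbitrary URL by dropping the scheme and host, i.e. the first three `/`-separated segments. In isolation (the function name and values are illustrative):
```python
import pathlib

def renamed_dest(name, src_name, is_arbitrary_url):
    if is_arbitrary_url:
        # 'https://host/a/b.txt'.split('/')[3:] -> ['a', 'b.txt']
        src_name = '/'.join(src_name.split('/')[3:])
    return str(pathlib.Path(name) / src_name)

print(renamed_dest('mydir', 'https://example.com/a/b.txt', True))
# -> mydir/a/b.txt (mydir\a\b.txt on Windows)
```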

View file

@ -1002,6 +1002,19 @@ def _sync_copy_dest_storage_url_option(f):
callback=callback)(f)
def _sync_copy_source_url(f):
def callback(ctx, param, value):
clictx = ctx.ensure_object(CliContext)
clictx.cli_options['sync_copy_source_url'] = value
return value
return click.option(
'--sync-copy-source-url',
expose_value=False,
default=None,
help='Remote URL for synccopy source',
callback=callback)(f)
def upload_options(f):
f = _stripe_chunk_size_bytes_option(f)
f = _strip_components_option(f)
@ -1059,6 +1072,7 @@ def download_options(f):
def sync_copy_options(f):
f = _sync_copy_source_url(f)
f = _sync_copy_dest_storage_url_option(f)
f = _sync_copy_dest_storage_account_option(f)
f = _sync_copy_dest_sas_option(f)
@ -1071,6 +1085,7 @@ def sync_copy_options(f):
f = _skip_on_filesize_match_option(f)
f = _server_side_copy_option(f)
f = _sas_option(f)
f = _rename_option(f)
f = _remote_path_option(f)
f = _overwrite_option(f)
f = _mode_option(f)

View file

@ -182,6 +182,8 @@ def add_cli_options(cli_options, action):
sync_copy_dest_remote_path
}
]
if 'accounts' not in azstorage:
azstorage['accounts'] = {}
azstorage['accounts'][sync_copy_dest_storage_account] = (
cli_options.get('sync_copy_dest_access_key') or
cli_options.get('sync_copy_dest_sas')
@ -189,7 +191,6 @@ def add_cli_options(cli_options, action):
else:
sync_copy_dest = None
arg = {
'source': [sa_rp] if sa_rp[storage_account] is not None else None,
'destination': sync_copy_dest,
'include': cli_options.get('include'),
'exclude': cli_options.get('exclude'),
@ -209,6 +210,17 @@ def add_cli_options(cli_options, action):
},
},
}
if storage_account is not None:
arg['source'] = (
[sa_rp] if sa_rp[storage_account] is not None else None
)
else:
src_url = cli_options.get('remote_path')
if src_url is None:
raise ValueError('--remote-path not specified')
arg['source'] = [{
'*': src_url
}]
elif action == TransferAction.Upload:
arg = {
'source': [local_resource] if local_resource is not None else None,
@ -671,13 +683,20 @@ def create_synccopy_specifications(ctx_cli_options, config):
for src in conf['source']:
sa = next(iter(src))
asp = blobxfer.operations.azure.SourcePath()
asp.add_path_with_storage_account(src[sa], sa)
incl = _merge_setting(cli_conf, conf, 'include', default=None)
if blobxfer.util.is_not_empty(incl):
asp.add_includes(incl)
excl = _merge_setting(cli_conf, conf, 'exclude', default=None)
if blobxfer.util.is_not_empty(excl):
asp.add_excludes(excl)
if sa != '*':
asp.add_path_with_storage_account(src[sa], sa)
incl = _merge_setting(cli_conf, conf, 'include', default=None)
if blobxfer.util.is_not_empty(incl):
asp.add_includes(incl)
excl = _merge_setting(cli_conf, conf, 'exclude', default=None)
if blobxfer.util.is_not_empty(excl):
asp.add_excludes(excl)
else:
if not scs.options.server_side_copy:
raise ValueError(
'Server side copy must be enabled for arbitrary '
'source remote paths')
asp.add_arbitrary_remote_url(src[sa])
scs.add_azure_source_path(asp)
# create remote destination paths
for dst in conf['destination']:
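Both the CLI translation and `create_synccopy_specifications` above rely on the same convention: each synccopy source is a one-key mapping, and the `*` key marks an arbitrary URL (which additionally requires server side copy to be enabled). A small sketch of that dispatch with illustrative values:
```python
sources = [
    {'mystorageaccount': 'mysourcecontainer'},     # Azure path source
    {'*': 'https://example.com/data/object.bin'},  # arbitrary URL source
]
for src in sources:
    sa = next(iter(src))  # the single key of the mapping
    if sa == '*':
        print('arbitrary URL source:', src[sa])
    else:
        print('Azure source:', sa, '->', src[sa])
```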

View file

@ -53,7 +53,8 @@ configuration file:
the Authentication sub-section below under Options.
### `synccopy`
Synchronously copies remote Azure paths to other remote Azure paths. This
Synchronously copies remote paths (Azure or arbitrary URLs) to other remote
Azure paths. By default, copies occur on the Azure Storage servers. This
command requires, at the minimum, the following options if invoked without
a YAML configuration file:
@ -64,8 +65,11 @@ a YAML configuration file:
* `--mode` specifies the source Azure Storage mode. This defaults
to `auto` which will source from Azure Blob storage (any blob
type). To source from Azure File storage, set this option to `file`.
* `--remote-path` for the source remote Azure path. This must have, at
the minimum, a container or file share name.
* `--remote-path` for the source remote path. If an Azure path, this
must have, at the minimum, a container or file share name. For
an arbitrary URL, this must be a complete URL that starts with
the proper protocol, e.g., `http://` or `https://`. Arbitrary URL
support is limited to a single object.
* `--storage-account` storage account for the source remote Azure path
or the environment variable `BLOBXFER_STORAGE_ACCOUNT`
* Remote Azure Storage _destination_ reference using one of two methods:
@ -124,7 +128,7 @@ to be output.
* `-q` or `--quiet` enables quiet mode
* `--recursive` or `--no-recursive` controls if the source path should be
recursively uploaded or downloaded.
* `--remote-path` is the remote Azure path. This path must contain the
* `--remote-path` is a remote path. If an Azure path, this must contain the
Blob container or File share at the beginning, e.g., `mycontainer/vdir`
* `--restore-file-lmt` will set the last access and modified times of a
downloaded file to the modified time set in Azure storage. This option can
@ -306,6 +310,8 @@ behavior.
This can only be used when transferring a single source file to a destination
and can be used with any command. This is automatically enabled when
using `stdin` as a source.
* `--server-side-copy` or `--no-server-side-copy` enables or disables
server side copies for synccopy operations. By default, this is enabled.
* `--stdin-as-page-blob-size` allows a page blob size to be set if known
beforehand when using `stdin` as a source and the destination is a page blob.
This value will automatically be page blob boundary aligned.
@ -386,6 +392,11 @@ blobxfer upload --config myconfig.yaml
blobxfer synccopy --storage-account mystorageaccount --sas "mysastoken" --remote-path mysourcecontainer --sync-copy-dest-storage-account mydestaccount --sync-copy-dest-storage-account-key "mydestkey" --sync-copy-dest-remote-path mydestcontainer
```
#### Synchronously Copy an Arbitrary URL
```shell
blobxfer synccopy --remote-path "https://raw.githubusercontent.com/Azure/blobxfer/master/README.md" --sync-copy-dest-storage-account mydestaccount --sync-copy-dest-storage-account-key "mydestkey" --sync-copy-dest-remote-path mydestcontainer
```
#### Synchronously Copy using a YAML Configuration File
```shell
blobxfer synccopy --config myconfig.yaml

View file

@ -315,7 +315,8 @@ synccopy:
```
* `source` is a list of storage account to remote path mappings. All sources
are copied to each destination specified.
are copied to each destination specified. To use an arbitrary URL, specify
the map as `*: https://some.url/path`.
* `destination` is a list of storage account to remote path mappings
* `include` is a list of include patterns
* `exclude` is a list of exclude patterns

View file

@ -71,8 +71,10 @@ remote path of `mycontainer/mydir/` will not work as intended as, internally,
Blob Storage or General Purpose V2 Storage accounts. Please see
[this article](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-storage-tiers)
for more information.
* Synchronous copy operations are limited to transfers within the same Azure
cloud (i.e., the source and destination `endpoint` must match). For example,
you can synchronously copy within or between storage accounts in Azure
Public cloud, e.g., `core.windows.net` but not between Azure clouds, e.g.,
`core.windows.net` and `core.usgovcloudapi.net`.
* Synchronous copy operations between Azure source and remote paths are
limited to transfers within the same Azure cloud (i.e., the source and
destination `endpoint` must match). For example, you can synchronously copy
within or between storage accounts in Azure Public cloud,
e.g., `core.windows.net` but not between Azure clouds, e.g.,
`core.windows.net` and `core.usgovcloudapi.net`. Single object copies between
clouds can be performed by providing a full source URL as the `--remote-path`.

View file

@ -29,16 +29,16 @@ packages = [
]
install_requires = [
'azure-storage-blob~=1.5.0',
'azure-storage-file~=1.4.0',
'azure-storage-blob~=2.0.1',
'azure-storage-file~=2.0.1',
'bitstring~=3.1.5',
'click~=7.0',
'cryptography~=2.6.1',
'future~=0.17.1',
'pathlib2>=2.3.3;python_version<"3.5"',
'python-dateutil~=2.8.0',
'requests~=2.21.0',
'ruamel.yaml~=0.15.91',
'requests~=2.22.0',
'ruamel.yaml~=0.15.96',
'scandir>=1.10.0;python_version<"3.5"',
]

View file

@ -1,5 +1,5 @@
coverage==4.5.3
flake8==3.6.0
mock==2.0.0; python_version < '3.3'
pytest==4.4.1
pytest-cov==2.6.1
pytest==4.5.0
pytest-cov==2.7.1