Mirror of https://github.com/Azure/blobxfer.git

Add arbitrary remote url synccopy

Parent: 5a2797140e
Commit: 6ce66b396b

CHANGELOG.md (10 changes)
@@ -3,7 +3,12 @@
 ## [Unreleased]
 
 ### Added
+- Server side copy support for `synccopy` commands. By default, `synccopy`
+  operations will now use server side copies, eliminating the machine
+  initiating the operation as a potential bottleneck. Arbitrary URLs that
+  reference a single object are now supported in the `--remote-path` argument
+  ([#93](https://github.com/Azure/blobxfer/issues/93)).
-- Support ability to set the Content Type property explicitly
+- Support setting the Content Type property explicitly
   ([#95](https://github.com/Azure/blobxfer/issues/95)). Specifying this
   option will override automatically inferring the MIME type of the file.

@@ -14,6 +19,9 @@ Microsoft Container Registry
 - Binary builds are now built against Python 3.7.3
 - Windows Docker image uses Python 3.7.3
 
+### Fixed
+- Allow `--rename` in `synccopy` operations
+
 ### Removed
 - Python 3.4 support dropped
 

@@ -28,7 +28,9 @@ within a file or object
 * `replica` mode allows replication of a file across multiple destinations
   including to multiple storage accounts
 * Synchronous copy with cross-mode (object transform) replication support
-  (including block-level copies for Block blobs)
+  * By default, leverages server-side copies
+  * Arbitrary URL copy support
+  * Exact block-level copies for block blobs
 * Client-side encryption support
 * Support all Azure Blob types and Azure Files for both upload and download
 * Advanced skip options for rsync-like operations

@@ -80,6 +80,7 @@ class StorageEntity(object):
         self._raw_metadata = None
         self._access_tier = None
         self._content_type = None
+        self._is_arbitrary_url = False
         self.replica_targets = None
 
     @property

@@ -130,7 +131,10 @@ class StorageEntity(object):
         :rtype: str
         :return: remote path of entity
         """
-        return '{}/{}'.format(self._container, self._name)
+        if self._is_arbitrary_url:
+            return self._name
+        else:
+            return '{}/{}'.format(self._container, self._name)
 
     @property
     def lmt(self):

@@ -339,6 +343,16 @@ class StorageEntity(object):
         """
         self._content_type = value
 
+    @property
+    def is_arbitrary_url(self):
+        # type: (StorageEntity) -> bool
+        """Is an arbitrary URL
+        :param StorageEntity self: this
+        :rtype: bool
+        :return: arbitrary URL
+        """
+        return self._is_arbitrary_url
+
     def populate_from_blob(self, sa, blob, vio=None, store_raw_metadata=False):
         # type: (StorageEntity, blobxfer.operations.azure.StorageAccount,
         #        azure.storage.blob.models.Blob) -> None

@@ -446,3 +460,19 @@ class StorageEntity(object):
         else:
             self._client = sa.block_blob_client
             self._mode = StorageModes.Block
+
+    def populate_from_arbitrary_url(self, remote_path, size):
+        # type: (StorageEntity, str, int) -> None
+        """Populate properties from an arbitrary url
+        :param StorageEntity self: this
+        :param str remote_path: remote path
+        :param int size: content length
+        """
+        # fake a client
+        self._client = lambda: None
+        setattr(self._client, 'primary_endpoint', remote_path.split('/')[2])
+        # set attributes
+        self._is_arbitrary_url = True
+        self._container = None
+        self._name = remote_path
+        self._size = size

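The `populate_from_arbitrary_url` helper fakes a storage client by attaching attributes to a bare lambda, which is all the downstream URL formatting needs. A minimal sketch of the trick, using a hypothetical URL:

```python
# Functions are ordinary objects in Python, so attributes can be attached
# to a lambda; downstream code only reads `client.primary_endpoint`.
remote_path = 'https://host.example.com/container/blob.bin'  # hypothetical

client = lambda: None  # stand-in object; never actually invoked
setattr(client, 'primary_endpoint', remote_path.split('/')[2])

# split('/') on an http(s) URL yields ['https:', '', 'host.example.com', ...],
# so index 2 is the host, which is also why only http(s) URLs work here.
assert client.primary_endpoint == 'host.example.com'
```
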
@@ -45,6 +45,7 @@ import blobxfer.util
 # create logger
 logger = logging.getLogger(__name__)
 # global defines
+_DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216
 _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304
 # named tuples
 Offsets = collections.namedtuple(

@@ -339,7 +340,10 @@ class Descriptor(object):
             else:
                 return -1
         else:
-            return _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES
+            if self._dst_ase.mode == blobxfer.models.azure.StorageModes.Block:
+                return _DEFAULT_AUTO_CHUNKSIZE_BYTES
+            else:
+                return _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES
 
     def _compute_total_chunks(self, chunk_size):
         # type: (Descriptor, int) -> int

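The new else-branch auto-sizes chunks by destination mode: block blob destinations get 16 MiB chunks, everything else keeps the 4 MiB non-block ceiling. A sketch of just that rule; the helper name is illustrative, and the real code first honors explicitly configured chunk sizes:

```python
_DEFAULT_AUTO_CHUNKSIZE_BYTES = 16777216      # 16 MiB, block blob destinations
_MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES = 4194304  # 4 MiB, all other blob modes

def auto_chunk_size(dst_is_block_blob):
    # illustrative condensation of the branch added above
    if dst_is_block_blob:
        return _DEFAULT_AUTO_CHUNKSIZE_BYTES
    return _MAX_NONBLOCK_BLOB_CHUNKSIZE_BYTES

assert auto_chunk_size(True) == 16 * 1024 * 1024
assert auto_chunk_size(False) == 4 * 1024 * 1024
```
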
@@ -396,6 +396,14 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
         super().__init__()
         self._path_map = {}
 
+    def add_arbitrary_remote_url(self, remote_path):
+        # type: (SourcePath, str) -> None
+        """Add an arbitrary remote URL
+        :param SourcePath self: this
+        :param str remote_path: remote path
+        """
+        self._paths.append(remote_path)
+
     def add_path_with_storage_account(self, remote_path, storage_account):
         # type: (SourcePath, str, str) -> None
         """Add a path with an associated storage account

@@ -617,6 +625,21 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
                     continue
                 yield ase
 
+    def _populate_from_arbitrary_url(self, remote_path):
+        # type: (SourcePath, str) -> StorageEntity
+        """Populate a storage entity from an arbitrary URL
+        :param SourcePath self: this
+        :param str remote_path: remote path
+        :rtype: StorageEntity
+        :return: Azure storage entity object
+        """
+        # HEAD remote path to retrieve length
+        response = requests.head(remote_path)
+        ase = blobxfer.models.azure.StorageEntity(container=None)
+        ase.populate_from_arbitrary_url(
+            remote_path, int(response.headers['Content-Length']))
+        return ase
+
     def _populate_from_list_blobs(self, creds, options, dry_run):
         # type: (SourcePath, StorageCredentials, Any, bool) -> StorageEntity
         """Internal generator for Azure remote blobs

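The size of an arbitrary source is discovered with a plain HTTP HEAD. A self-contained sketch with a hypothetical URL; note that `requests.head` does not follow redirects by default, and a server that omits `Content-Length` would raise a `KeyError` here:

```python
import requests

url = 'https://example.com/data/file.bin'  # hypothetical single object
response = requests.head(url)
response.raise_for_status()  # defensive; the hunk above skips this check
size = int(response.headers['Content-Length'])
print('remote object is', size, 'bytes')
```
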
@@ -630,6 +653,12 @@ class SourcePath(blobxfer.models._BaseSourcePaths):
         is_synccopy = isinstance(options, blobxfer.models.options.SyncCopy)
         for _path in self._paths:
             rpath = str(_path)
+            if (is_synccopy and
+                    (rpath.lower().startswith('http://') or
+                     rpath.lower().startswith('https://'))):
+                ase = self._populate_from_arbitrary_url(rpath)
+                yield ase
+                continue
             sa = creds.get_storage_account(self.lookup_storage_account(rpath))
             # ensure at least read permissions
             if not sa.can_read_object:

@@ -140,17 +140,20 @@ def put_block_from_url(src_ase, dst_ase, offsets, timeout=None):
     :param blobxfer.models.upload.Offsets offsets: upload offsets
     :param int timeout: timeout
     """
-    if blobxfer.util.is_not_empty(src_ase.client.account_key):
-        sas = src_ase.client.generate_blob_shared_access_signature(
-            container_name=src_ase.container,
-            blob_name=src_ase.name,
-            permission=azure.storage.blob.BlobPermissions(read=True),
-            expiry=datetime.datetime.utcnow() + datetime.timedelta(days=1),
-        )
+    if src_ase.is_arbitrary_url:
+        src_url = src_ase.path
     else:
-        sas = src_ase.client.sas_token
-    src_url = 'https://{}/{}?{}'.format(
-        src_ase.client.primary_endpoint, src_ase.path, sas)
+        if blobxfer.util.is_not_empty(src_ase.client.account_key):
+            sas = src_ase.client.generate_blob_shared_access_signature(
+                container_name=src_ase.container,
+                blob_name=src_ase.name,
+                permission=azure.storage.blob.BlobPermissions(read=True),
+                expiry=datetime.datetime.utcnow() + datetime.timedelta(days=1),
+            )
+        else:
+            sas = src_ase.client.sas_token
+        src_url = 'https://{}/{}?{}'.format(
+            src_ase.client.primary_endpoint, src_ase.path, sas)
     dst_ase.client.put_block_from_url(
         container_name=dst_ase.container,
         blob_name=dst_ase.name,

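The net effect of this hunk: an arbitrary source URL is handed to the service verbatim (so it must be publicly readable), while an Azure source is signed with a temporary read-only SAS. A hedged condensation of that decision; this is a restatement for clarity, not the actual function, and the entity attributes mirror those used above:

```python
import datetime

import azure.storage.blob


def build_source_url(ase):
    # condensed sketch of the branch above
    if ase.is_arbitrary_url:
        return ase.path  # already a complete http(s) URL
    if ase.client.account_key:
        # sign with a 1-day, read-only SAS derived from the account key
        sas = ase.client.generate_blob_shared_access_signature(
            container_name=ase.container,
            blob_name=ase.name,
            permission=azure.storage.blob.BlobPermissions(read=True),
            expiry=datetime.datetime.utcnow() + datetime.timedelta(days=1),
        )
    else:
        sas = ase.client.sas_token  # fall back to the provided SAS
    return 'https://{}/{}?{}'.format(
        ase.client.primary_endpoint, ase.path, sas)
```
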
@@ -245,7 +245,8 @@ class SyncCopy(object):
         # prepare remote file for download
         # if remote file is a block blob, need to retrieve block list
         if (src_ase.mode == dst_ase.mode ==
-                blobxfer.models.azure.StorageModes.Block):
+                blobxfer.models.azure.StorageModes.Block and
+                not src_ase.is_arbitrary_url):
             bl = blobxfer.operations.azure.blob.block.get_committed_block_list(
                 src_ase)
         else:

@@ -316,8 +317,6 @@ class SyncCopy(object):
             if data is not None:
                 blobxfer.operations.azure.blob.append.append_block(ase, data)
         elif ase.mode == blobxfer.models.azure.StorageModes.Block:
-            src_url = 'https://{}/{}'.format(sd.src_entity._client.primary_endpoint, sd.src_entity.path)
-            print(src_url, offsets.range_start, offsets.range_end, sd.is_one_shot_block_blob, sd.is_server_side_copyable)
             # handle one-shot uploads
             if sd.is_one_shot_block_blob:
                 if blobxfer.util.is_not_empty(sd.src_entity.md5):

@@ -605,8 +604,8 @@ class SyncCopy(object):
         :rtype: SynccopyAction
         :return: synccopy action
         """
-        # if remote file doesn't exist, copy
-        if dst is None or dst.from_local:
+        # if src is arbitrary or remote file doesn't exist, copy
+        if src.is_arbitrary_url or dst is None or dst.from_local:
             return SynccopyAction.Copy
         # check overwrite option
         if not self._spec.options.overwrite:

@@ -716,7 +715,11 @@ class SyncCopy(object):
                 raise RuntimeError(
                     'attempting rename multiple files to a directory')
             else:
-                name = str(pathlib.Path(name) / src_ase.name)
+                if src_ase.is_arbitrary_url:
+                    tmp = '/'.join(src_ase.name.split('/')[3:])
+                    name = str(pathlib.Path(name) / tmp)
+                else:
+                    name = str(pathlib.Path(name) / src_ase.name)
         # translate source mode to dest mode
         dst_mode = self._translate_src_mode_to_dst_mode(src_ase.mode)
         dst_ase = self._check_for_existing_remote(sa, cont, name, dst_mode)

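For renames, the scheme and host of an arbitrary URL are stripped so only the object path lands under the destination directory. The slicing works like this (hypothetical URL):

```python
url = 'https://host.example.com/container/dir/file.bin'  # hypothetical
# split('/') -> ['https:', '', 'host.example.com', 'container', 'dir', 'file.bin']
# so [3:] keeps everything after the host
print('/'.join(url.split('/')[3:]))  # -> container/dir/file.bin
```
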
@@ -856,7 +859,7 @@ class SyncCopy(object):
                 self._update_progress_bar()
         # check for exceptions
         if len(self._exceptions) > 0:
-            logger.error('exceptions encountered while downloading')
+            logger.error('exceptions encountered during synccopy')
             # raise the first one
             raise self._exceptions[0]
         # check for mismatches

cli/cli.py (15 changes)

@@ -1002,6 +1002,19 @@ def _sync_copy_dest_storage_url_option(f):
         callback=callback)(f)
 
 
+def _sync_copy_source_url(f):
+    def callback(ctx, param, value):
+        clictx = ctx.ensure_object(CliContext)
+        clictx.cli_options['sync_copy_source_url'] = value
+        return value
+    return click.option(
+        '--sync-copy-source-url',
+        expose_value=False,
+        default=None,
+        help='Remote URL for synccopy source',
+        callback=callback)(f)
+
+
 def upload_options(f):
     f = _stripe_chunk_size_bytes_option(f)
     f = _strip_components_option(f)

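`_sync_copy_source_url` follows the option-decorator pattern used throughout cli.py: `expose_value=False` keeps the value out of the command signature, and the callback stashes it on a shared context object instead. A runnable sketch of that pattern with illustrative names:

```python
import click


class CliContext(object):
    def __init__(self):
        self.cli_options = {}


def _example_url_option(f):  # illustrative stand-in for the helper above
    def callback(ctx, param, value):
        # store the parsed value on the shared context object
        clictx = ctx.ensure_object(CliContext)
        clictx.cli_options['example_url'] = value
        return value
    return click.option(
        '--example-url',
        expose_value=False,  # value is not passed to the command function
        default=None,
        help='Example URL option',
        callback=callback)(f)


@click.command()
@_example_url_option
@click.pass_context
def show(ctx):
    click.echo(str(ctx.ensure_object(CliContext).cli_options))
```
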
@@ -1059,6 +1072,7 @@ def download_options(f):
 
 
 def sync_copy_options(f):
+    f = _sync_copy_source_url(f)
     f = _sync_copy_dest_storage_url_option(f)
     f = _sync_copy_dest_storage_account_option(f)
     f = _sync_copy_dest_sas_option(f)

@@ -1071,6 +1085,7 @@ def sync_copy_options(f):
     f = _skip_on_filesize_match_option(f)
+    f = _server_side_copy_option(f)
     f = _sas_option(f)
     f = _rename_option(f)
     f = _remote_path_option(f)
     f = _overwrite_option(f)
     f = _mode_option(f)

@@ -182,6 +182,8 @@ def add_cli_options(cli_options, action):
                     sync_copy_dest_remote_path
                 }
             ]
+            if 'accounts' not in azstorage:
+                azstorage['accounts'] = {}
             azstorage['accounts'][sync_copy_dest_storage_account] = (
                 cli_options.get('sync_copy_dest_access_key') or
                 cli_options.get('sync_copy_dest_sas')

@@ -189,7 +191,6 @@ def add_cli_options(cli_options, action):
         else:
             sync_copy_dest = None
         arg = {
-            'source': [sa_rp] if sa_rp[storage_account] is not None else None,
             'destination': sync_copy_dest,
             'include': cli_options.get('include'),
             'exclude': cli_options.get('exclude'),

@@ -209,6 +210,17 @@ def add_cli_options(cli_options, action):
                 },
             },
         }
+        if storage_account is not None:
+            arg['source'] = (
+                [sa_rp] if sa_rp[storage_account] is not None else None
+            )
+        else:
+            src_url = cli_options.get('remote_path')
+            if src_url is None:
+                raise ValueError('--remote-path not specified')
+            arg['source'] = [{
+                '*': src_url
+            }]
     elif action == TransferAction.Upload:
         arg = {
             'source': [local_resource] if local_resource is not None else None,

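So an arbitrary `--remote-path` URL given without a `--storage-account` is encoded as a source map keyed by the `*` sentinel, which `create_synccopy_specifications` (next hunk) peels apart. A sketch of the round trip with an illustrative URL:

```python
# producer side: the CLI remote path becomes a '*'-keyed source map
src = {'*': 'https://some.url/path'}  # illustrative

# consumer side: the single key distinguishes the two source kinds
sa = next(iter(src))
if sa == '*':
    print('arbitrary URL source:', src[sa])
else:
    print('storage account', sa, '->', src[sa])
```
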
@@ -671,13 +683,20 @@ def create_synccopy_specifications(ctx_cli_options, config):
         for src in conf['source']:
             sa = next(iter(src))
             asp = blobxfer.operations.azure.SourcePath()
-            asp.add_path_with_storage_account(src[sa], sa)
-            incl = _merge_setting(cli_conf, conf, 'include', default=None)
-            if blobxfer.util.is_not_empty(incl):
-                asp.add_includes(incl)
-            excl = _merge_setting(cli_conf, conf, 'exclude', default=None)
-            if blobxfer.util.is_not_empty(excl):
-                asp.add_excludes(excl)
+            if sa != '*':
+                asp.add_path_with_storage_account(src[sa], sa)
+                incl = _merge_setting(cli_conf, conf, 'include', default=None)
+                if blobxfer.util.is_not_empty(incl):
+                    asp.add_includes(incl)
+                excl = _merge_setting(cli_conf, conf, 'exclude', default=None)
+                if blobxfer.util.is_not_empty(excl):
+                    asp.add_excludes(excl)
+            else:
+                if not scs.options.server_side_copy:
+                    raise ValueError(
+                        'Server side copy must be enabled for arbitrary '
+                        'source remote paths')
+                asp.add_arbitrary_remote_url(src[sa])
             scs.add_azure_source_path(asp)
         # create remote destination paths
         for dst in conf['destination']:

@@ -53,7 +53,8 @@ configuration file:
 the Authentication sub-section below under Options.
 
 ### `synccopy`
-Synchronously copies remote Azure paths to other remote Azure paths. This
+Synchronously copies remote paths (Azure or arbitrary URLs) to other remote
+Azure paths. By default, copies occur on the Azure Storage servers. This
 command requires at the minimum, the following options if invoked without
 a YAML configuration file:
 

@@ -64,8 +65,11 @@ a YAML configuration file:
 * `--mode` specifies the source Azure Storage mode. This defaults
   to `auto` which will source from Azure Blob storage (any blob
   type). To source from Azure File storage, set this option to `file`.
-* `--remote-path` for the source remote Azure path. This must have, at
-  the minimum, a container or file share name.
+* `--remote-path` for the source remote path. If an Azure path, this
+  must have, at the minimum, a container or file share name. For
+  an arbitrary URL, this must be a complete URL that starts with
+  the proper protocol, e.g., `http://` or `https://`. Arbitrary URL
+  support is limited to a single object.
 * `--storage-account` storage account for the source remote Azure path
   or the environment variable `BLOBXFER_STORAGE_ACCOUNT`
 * Remote Azure Storage _destination_ reference using one of two methods:

@@ -124,7 +128,7 @@ to be output.
 * `-q` or `--quiet` enables quiet mode
 * `--recursive` or `--no-recursive` controls if the source path should be
   recursively uploaded or downloaded.
-* `--remote-path` is the remote Azure path. This path must contain the
+* `--remote-path` is a remote path. If an Azure path, this must contain the
   Blob container or File share at the beginning, e.g., `mycontainer/vdir`
 * `--restore-file-lmt` will set the last access and modified times of a
   downloaded file to the modified time set in Azure storage. This option can

@@ -306,6 +310,8 @@ behavior.
 This can only be used when transferring a single source file to a destination
 and can be used with any command. This is automatically enabled when
 using `stdin` as a source.
+* `--server-side-copy` or `--no-server-side-copy` enables or disables
+  server side copies for synccopy operations. By default, this is enabled.
 * `--stdin-as-page-blob-size` allows a page blob size to be set if known
   beforehand when using `stdin` as a source and the destination is a page blob.
   This value will automatically be page blob boundary aligned.

@@ -386,6 +392,11 @@ blobxfer upload --config myconfig.yaml
 blobxfer synccopy --storage-account mystorageaccount --sas "mysastoken" --remote-path mysourcecontainer --sync-copy-dest-storage-account mydestaccount --sync-copy-dest-storage-account-key "mydestkey" --sync-copy-dest-remote-path mydestcontainer
 ```
 
+#### Synchronously Copy an Arbitrary URL
+```shell
+blobxfer synccopy --remote-path "https://raw.githubusercontent.com/Azure/blobxfer/master/README.md" --sync-copy-dest-storage-account mydestaccount --sync-copy-dest-storage-account-key "mydestkey" --sync-copy-dest-remote-path mydestcontainer
+```
+
 #### Synchronously Copy using a YAML Configuration File
 ```shell
 blobxfer synccopy --config myconfig.yaml

@@ -315,7 +315,8 @@ synccopy:
 ```
 
 * `source` is a list of storage account to remote path mappings. All sources
-  are copied to each destination specified.
+  are copied to each destination specified. To use an arbitrary URL, specify
+  the map as `*: https://some.url/path`.
 * `destination` is a list of storage account to remote path mappings
 * `include` is a list of include patterns
 * `exclude` is a list of exclude patterns

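For reference, a `*`-keyed source entry parses to the same mapping the settings code checks for. A quick sketch using ruamel.yaml (already a dependency); note the `*` key is quoted here, since a bare `*` at the start of a YAML scalar would otherwise be read as an alias:

```python
import ruamel.yaml

doc = """
synccopy:
  source:
    - '*': https://some.url/path
"""
data = ruamel.yaml.YAML(typ='safe').load(doc)
src = data['synccopy']['source'][0]
assert next(iter(src)) == '*'          # the arbitrary-URL sentinel
assert src['*'] == 'https://some.url/path'
```
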
@@ -71,8 +71,10 @@ remote path of `mycontainer/mydir/` will not work as intended as, internally,
 Blob Storage or General Purpose V2 Storage accounts. Please see
 [this article](https://docs.microsoft.com/en-us/azure/storage/blobs/storage-blob-storage-tiers)
 for more information.
-* Synchronous copy operations are limited to transfers within the same Azure
-  cloud (i.e., the source and destination `endpoint` must match). For example,
-  you can synchronously copy within or between storage accounts in Azure
-  Public cloud, e.g., `core.windows.net`, but not between Azure clouds, e.g.,
-  `core.windows.net` and `core.usgovcloudapi.net`.
+* Synchronous copy operations between Azure source and remote paths are
+  limited to transfers within the same Azure cloud (i.e., the source and
+  destination `endpoint` must match). For example, you can synchronously copy
+  within or between storage accounts in Azure Public cloud,
+  e.g., `core.windows.net`, but not between Azure clouds, e.g.,
+  `core.windows.net` and `core.usgovcloudapi.net`. Single object copies between
+  clouds can be performed by providing a full source URL as the `--remote-path`.

setup.py (8 changes)

@@ -29,16 +29,16 @@ packages = [
 ]
 
 install_requires = [
-    'azure-storage-blob~=1.5.0',
-    'azure-storage-file~=1.4.0',
+    'azure-storage-blob~=2.0.1',
+    'azure-storage-file~=2.0.1',
     'bitstring~=3.1.5',
     'click~=7.0',
     'cryptography~=2.6.1',
     'future~=0.17.1',
     'pathlib2>=2.3.3;python_version<"3.5"',
     'python-dateutil~=2.8.0',
-    'requests~=2.21.0',
-    'ruamel.yaml~=0.15.91',
+    'requests~=2.22.0',
+    'ruamel.yaml~=0.15.96',
     'scandir>=1.10.0;python_version<"3.5"',
 ]

@@ -1,5 +1,5 @@
 coverage==4.5.3
 flake8==3.6.0
 mock==2.0.0; python_version < '3.3'
-pytest==4.4.1
-pytest-cov==2.6.1
+pytest==4.5.0
+pytest-cov==2.7.1