- Rename some options
- Make thread join more robust on Python2
This commit is contained in:
Fred Park 2017-06-02 08:16:46 -07:00
Parent b7782619d1
Commit e1d97fa3cb
12 changed files with 129 additions and 61 deletions

View file

@ -2,7 +2,7 @@
## [Unreleased]
## [1.0.0a2] - 2017-06-02
## [1.0.0a3] - 2017-06-02
### Changed
- From scratch rewrite providing a consistent CLI experience and a vast
array of new and advanced features. Please see the
@ -201,8 +201,8 @@ usage documentation carefully when upgrading from 0.12.1.
`--no-skiponmatch`.
- 0.8.2: performance regression fixes
[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a2...HEAD
[1.0.0a2]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a2
[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a3...HEAD
[1.0.0a3]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a3
[0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1
[0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0
[0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5

View file

@ -105,13 +105,14 @@ class Concurrency(object):
"""Concurrency Options"""
def __init__(
self, crypto_processes, md5_processes, disk_threads,
transfer_threads):
transfer_threads, is_download=None):
"""Ctor for Concurrency Options
:param Concurrency self: this
:param int crypto_processes: number of crypto procs
:param int md5_processes: number of md5 procs
:param int disk_threads: number of disk threads
:param int transfer_threads: number of transfer threads
:param bool is_download: download hint
"""
self.crypto_processes = crypto_processes
self.md5_processes = md5_processes
@ -131,6 +132,9 @@ class Concurrency(object):
# cap maximum number of disk threads from cpu count to 64
if self.disk_threads > 64:
self.disk_threads = 64
# for downloads, cap disk threads to lower value
if is_download and self.disk_threads > 16:
self.disk_threads = 16
auto_disk = True
if self.transfer_threads is None or self.transfer_threads < 1:
if auto_disk:

View file

@ -430,7 +430,7 @@ class Downloader(object):
if terminate:
self._download_terminate = terminate
for thr in self._disk_threads:
thr.join()
blobxfer.util.join_thread(thr)
def _wait_for_transfer_threads(self, terminate):
# type: (Downloader, bool) -> None
@ -441,7 +441,7 @@ class Downloader(object):
if terminate:
self._download_terminate = terminate
for thr in self._transfer_threads:
thr.join()
blobxfer.util.join_thread(thr)
def _worker_thread_transfer(self):
# type: (Downloader) -> None
@ -452,7 +452,7 @@ class Downloader(object):
while not self.termination_check:
try:
if len(self._disk_set) > max_set_len:
time.sleep(0.2)
time.sleep(0.1)
continue
else:
dd = self._transfer_queue.get(block=False, timeout=0.1)
@ -792,8 +792,8 @@ class Downloader(object):
'KeyboardInterrupt detected, force terminating '
'processes and threads (this may take a while)...')
try:
self._wait_for_transfer_threads(terminate=True)
self._wait_for_disk_threads(terminate=True)
self._wait_for_transfer_threads(terminate=True)
finally:
self._cleanup_temporary_files()
raise

View file

@ -447,10 +447,10 @@ class Uploader(object):
while not self.termination_check:
try:
if len(self._transfer_set) > max_set_len:
time.sleep(0.2)
time.sleep(0.1)
continue
else:
ud = self._upload_queue.get(False, 0.1)
ud = self._upload_queue.get(block=False, timeout=0.1)
except queue.Empty:
continue
try:

View file

@ -124,6 +124,20 @@ def is_not_empty(obj):
return obj is not None and len(obj) > 0
def join_thread(thr):
# type: (threading.Thread) -> None
"""Join a thread
:param threading.Thread thr: thread to join
"""
if on_python2():
while True:
thr.join(timeout=1)
if not thr.isAlive():
break
else:
thr.join()
def merge_dict(dict1, dict2):
# type: (dict, dict) -> dict
"""Recursively merge dictionaries: dict2 on to dict1. This differs

View file

@ -22,4 +22,4 @@
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
__version__ = '1.0.0a2'
__version__ = '1.0.0a3'

View file

@ -62,13 +62,15 @@ class CliContext(object):
self.credentials = None
self.general_options = None
def initialize(self):
# type: (CliContext) -> None
def initialize(self, action):
# type: (CliContext, settings.TransferAction) -> None
"""Initialize context
:param CliContext self: this
:param settings.TransferAction action: transfer action
"""
self._init_config()
self.general_options = settings.create_general_options(self.config)
self.general_options = settings.create_general_options(
self.config, action)
self.credentials = settings.create_azure_storage_credentials(
self.config, self.general_options)
@ -164,7 +166,8 @@ def _log_file_option(f):
'--log-file',
expose_value=False,
default=None,
help='Log to file specified',
help='Log to file specified; this must be specified for progress '
'bar to show',
callback=callback)(f)
@ -191,7 +194,8 @@ def _progress_bar_option(f):
'--progress-bar/--no-progress-bar',
expose_value=False,
default=True,
help='Display progress bar instead of console logs [True]',
help='Display progress bar instead of console logs; log file must '
'be specified [True]',
callback=callback)(f)
@ -254,22 +258,22 @@ def _local_resource_option(f):
clictx.cli_options['local_resource'] = value
return value
return click.option(
'--local-resource',
'--local-path',
expose_value=False,
help='Local resource; use - for stdin',
help='Local path; use - for stdin',
callback=callback)(f)
def _storage_account_name_option(f):
def _storage_account_option(f):
def callback(ctx, param, value):
clictx = ctx.ensure_object(CliContext)
clictx.cli_options['storage_account'] = value
return value
return click.option(
'--storage-account-name',
'--storage-account',
expose_value=False,
help='Storage account name',
envvar='BLOBXFER_STORAGE_ACCOUNT_NAME',
envvar='BLOBXFER_STORAGE_ACCOUNT',
callback=callback)(f)
@ -301,7 +305,7 @@ def common_options(f):
def upload_download_options(f):
f = _remote_path_option(f)
f = _storage_account_name_option(f)
f = _storage_account_option(f)
f = _local_resource_option(f)
return f
@ -633,16 +637,16 @@ def _sync_copy_dest_access_key_option(f):
callback=callback)(f)
def _sync_copy_dest_storage_account_name_option(f):
def _sync_copy_dest_storage_account_option(f):
def callback(ctx, param, value):
clictx = ctx.ensure_object(CliContext)
clictx.cli_options['sync_copy_dest_storage_account'] = value
return value
return click.option(
'--sync-copy-dest-storage-account-name',
'--sync-copy-dest-storage-account',
expose_value=False,
help='Storage account name for synccopy destination',
envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME',
envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT',
callback=callback)(f)
@ -721,11 +725,11 @@ def download_options(f):
def sync_copy_options(f):
f = _sync_copy_dest_storage_account_name_option(f)
f = _sync_copy_dest_storage_account_option(f)
f = _sync_copy_dest_sas_option(f)
f = _sync_copy_dest_remote_path_option(f)
f = _sync_copy_dest_access_key_option(f)
f = _storage_account_name_option(f)
f = _storage_account_option(f)
f = _skip_on_md5_match_option(f)
f = _skip_on_lmt_ge_option(f)
f = _skip_on_filesize_match_option(f)
@ -757,7 +761,7 @@ def cli(ctx):
def download(ctx):
"""Download blobs or files from Azure Storage"""
settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download)
ctx.initialize()
ctx.initialize(settings.TransferAction.Download)
specs = settings.create_download_specifications(ctx.config)
for spec in specs:
blobxfer.api.Downloader(
@ -773,7 +777,7 @@ def synccopy(ctx):
"""Synchronously copy blobs between Azure Storage accounts"""
raise NotImplementedError()
settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy)
ctx.initialize()
ctx.initialize(settings.TransferAction.Synccopy)
@cli.command('upload')
@ -784,7 +788,7 @@ def synccopy(ctx):
def upload(ctx):
"""Upload files to Azure Storage"""
settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload)
ctx.initialize()
ctx.initialize(settings.TransferAction.Upload)
specs = settings.create_upload_specifications(ctx.config)
for spec in specs:
blobxfer.api.Uploader(

View file

@ -61,13 +61,13 @@ def add_cli_options(cli_options, action):
if blobxfer.util.is_none_or_empty(local_resource):
raise KeyError()
except KeyError:
raise ValueError('--local-resource must be specified')
raise ValueError('--local-path must be specified')
try:
storage_account = cli_options['storage_account']
if blobxfer.util.is_none_or_empty(storage_account):
raise KeyError()
except KeyError:
raise ValueError('--storage-account-name must be specified')
raise ValueError('--storage-account must be specified')
try:
remote_path = cli_options['remote_path']
if blobxfer.util.is_none_or_empty(remote_path):
@ -167,7 +167,7 @@ def add_cli_options(cli_options, action):
raise KeyError()
except KeyError:
raise ValueError(
'--sync-copy-dest-storage-account-name must be specified')
'--sync-copy-dest-storage-account must be specified')
try:
sync_copy_dest_remote_path = \
cli_options['sync_copy_dest_remote_path']
@ -278,10 +278,11 @@ def create_azure_storage_credentials(config, general_options):
return creds
def create_general_options(config):
# type: (dict) -> blobxfer.models.options.General
def create_general_options(config, action):
# type: (dict, TransferAction) -> blobxfer.models.options.General
"""Create a General Options object from configuration
:param dict config: config dict
:param TransferAction action: transfer action
:rtype: blobxfer.models.options.General
:return: general options object
"""
@ -292,6 +293,7 @@ def create_general_options(config):
disk_threads=conc.get('disk_threads', 0),
md5_processes=conc.get('md5_processes', 0),
transfer_threads=conc.get('transfer_threads', 0),
is_download=action == TransferAction.Download,
),
log_file=config['options'].get('log_file', None),
progress_bar=config['options'].get('progress_bar', True),

View file

@ -72,9 +72,10 @@ docker pull alfpark/blobxfer
## Troubleshooting
#### `azure.storage` dependency not found
If you get an error that `azure.storage` cannot be found or loaded, then
most likely there was a conflict with this package with other `azure` packages
that share the same base namespace. You can correct this by issuing:
If you get an error such as `ImportError: No module named storage` or that
`azure.storage` cannot be found or loaded, then most likely there is a
conflict between this package and other `azure` packages that share the same
base namespace. You can correct this by issuing:
```shell
# for Python3
pip3 install --upgrade --force-reinstall azure-storage

View file

@ -12,9 +12,9 @@ command will be detailed along with all options available.
### `download`
Downloads a remote Azure path, which may contain many resources, to the
local machine. This command requires, at a minimum, the following options:
* `--storage-account-name`
* `--storage-account`
* `--remote-path`
* `--local-resource`
* `--local-path`
Additionally, an authentication option for the storage account is required.
Please see the Authentication sub-section below under Options.
@ -23,14 +23,14 @@ Please see the Authentication sub-section below under Options.
Uploads a local path to a remote Azure path. The local path may contain
many resources on the local machine. This command requires, at a minimum,
the following options:
* `--local-resource`
* `--storage-account-name`
* `--local-path`
* `--storage-account`
* `--remote-path`
Additionally, an authentication option for the storage account is required.
Please see the Authentication sub-section below under Options.
If piping from `stdin`, `--local-resource` should be set to `-` as per
If piping from `stdin`, `--local-path` should be set to `-` as per
convention.
### `synccopy`
@ -49,9 +49,10 @@ of up to 100MiB, all others have a maximum of 4MiB.
attributes (mode and ownership) should be stored or restored. Note that to
restore uid/gid, `blobxfer` must be run as root or under sudo.
* `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed.
* `--local-resource` is the local resource path. Set to `-` if piping from
* `--local-path` is the local resource path. Set to `-` if piping from
`stdin`.
* `--log-file` specifies the log file to write to.
* `--log-file` specifies the log file to write to. This must be specified
for a progress bar to be output to the console.
* `--mode` is the operating mode. The default is `auto` but may be set to
`append`, `block`, `file`, or `page`. If specified with the `upload`
command, then all files will be uploaded as the specified `mode` type.
@ -61,12 +62,16 @@ with Azure File shares.
* `--overwrite` or `--no-overwrite` controls clobber semantics at the
destination.
* `--progress-bar` or `--no-progress-bar` controls if a progress bar is
output to the console.
output to the console. `--log-file` must be specified for a progress bar
to be output.
* `--recursive` or `--no-recursive` controls if the source path should be
recursively uploaded or downloaded.
* `--remote-path` is the remote Azure path. This path must contain the
Blob container or File share at the beginning, e.g., `mycontainer/vdir`
* `--resume-file` specifies the resume file to write to.
* `--storage-account` specifies the storage account to use. This can
optionally be provided through the `BLOBXFER_STORAGE_ACCOUNT` environment
variable instead.
* `--timeout` is the integral timeout value in seconds to use.
* `-h` or `--help` can be passed at every command level to receive context
sensitive help.
@ -96,7 +101,7 @@ to/from Azure Storage.
### Connection
* `--endpoint` is the Azure Storage endpoint to connect to; the default is
Azure Public regions, or `core.windows.net`.
* `--storage-account-name` is the storage account to connect to.
* `--storage-account` is the storage account to connect to.
### Encryption
* `--rsa-private-key` is the RSA private key in PEM format to use. This can
@ -161,27 +166,27 @@ file path. The default is `1`.
### `download` Examples
#### Download an Entire Encrypted Blob Container to Current Working Directory
```shell
blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-public-key ~/mypubkey.pem
blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-public-key ~/mypubkey.pem
```
#### Download an Entire File Share to Designated Path and Skip On Filesize Matches
```shell
blobxfer download --mode file --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-resource /my/path --skip-on-filesize-match
blobxfer download --mode file --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-path /my/path --skip-on-filesize-match
```
#### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path
```shell
blobxfer download --mode page --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource /my/pageblobs --no-recursive --delete
blobxfer download --mode page --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path /my/pageblobs --no-recursive --delete
```
#### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes
```shell
blobxfer download --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
blobxfer download --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
```
#### Download a Blob Snapshot
```shell
blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-resource .
blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-path .
```
#### Download using a YAML Configuration File
@ -192,27 +197,27 @@ blobxfer download --config myconfig.yaml
### `upload` Examples
#### Upload Current Working Directory as Encrypted Block Blobs Non-recursively
```shell
blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-private-key ~/myprivatekey.pem --no-recursive
blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-private-key ~/myprivatekey.pem --no-recursive
```
#### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files
```shell
blobxfer upload --mode file --storage-account-name mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-resource . --file-md5 --file-attributes --exclude '*.bak'
blobxfer upload --mode file --storage-account mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-path . --file-md5 --file-attributes --exclude '*.bak'
```
#### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks
```shell
blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
```
#### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path
```shell
blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /my/path --file-md5 --skip-on-md5-match --delete
blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /my/path --file-md5 --skip-on-md5-match --delete
```
#### Upload From Piped `stdin`
```shell
curl -fSsL https://some.uri | blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource -
curl -fSsL https://some.uri | blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path -
```
#### Upload using a YAML Configuration File

View file

@ -91,5 +91,5 @@ keep this metadata intact or reconstruction will fail.
+---------------------+
```
In order to take advantage of `stripe` Vectored IO, you must use a YAML
configuration file to define multiple destinations.
In order to take advantage of `stripe` Vectored IO across multiple
destinations, you must use a YAML configuration file.

View file

@ -3,8 +3,8 @@ Please read the following carefully regarding considerations that should
be applied to `blobxfer` performance. Additionally,
please review the
[Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
for an overview of general performance targets that apply to Azure Blobs
and File shares.
for an overview of general performance targets that apply to Azure Blobs,
File shares, and Storage Account types (GRS, LRS, ZRS, etc.).
## Concurrency
* `blobxfer` offers four concurrency knobs. Each one should be tuned for
@ -23,6 +23,44 @@ maximum performance according to your system and network characteristics.
* The thread concurrency options (disk and transfer) can be set to a
non-positive number to be automatically set as a multiple of the number of
cores available on the machine (see the sketch after this list).
* For uploads, there should be a sufficient number of disk threads to ensure
that all transfer threads have work to do. For downloads, there should be
a sufficient number of disk threads to write data to disk so that transfer
threads are not artificially blocked.
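
As a concrete illustration of these knobs, the following minimal sketch (not part of the commit diff) constructs the `Concurrency` options object added in this release; it assumes `Concurrency` lives in the `blobxfer.models.options` module referenced elsewhere in this changeset:

```python
import blobxfer.models.options as options

# Non-positive thread counts are auto-set from the number of CPU cores
# (disk threads are capped at 64); the new is_download hint further caps
# disk threads at 16 for downloads.
conc = options.Concurrency(
    crypto_processes=0,
    md5_processes=0,
    disk_threads=0,
    transfer_threads=0,
    is_download=True,
)
print(conc.disk_threads, conc.transfer_threads)
```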
## Chunk Sizing
Chunk sizing refers to the `chunk_size_bytes` option, the meaning of which
varies depending on whether you are uploading or downloading.
### Uploads
For uploads, chunk sizes correspond to the maximum amount of data to transfer
with a single request. The Azure Storage service imposes maximums depending
upon the type of entity that is being written. For block blobs, the maximum
is 100MiB (although you may "one-shot" up to 256MiB). For page blobs,
append blobs, and Azure Files, the maximum is 4MiB.
For block blobs, setting the chunk size to something greater than 4MiB not
only allows larger object sizes (recall that the maximum number of blocks
for a block blob is 50000, thus with 100MiB blocks you can create a block
blob of approximately 5TiB) but also amortizes the fixed request/response
overhead over a larger amount of data per request. `blobxfer` can
automatically select the proper block size given your file, but will not
automatically tune the chunk size as that depends upon your system and
network characteristics.
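
A quick back-of-the-envelope calculation (a sketch only, not `blobxfer` code) shows how the 50000-block limit and the chunk size bound the maximum block blob size:

```python
from __future__ import division  # true division on Python 2

MIB = 1024 ** 2
TIB = 1024 ** 4
MAX_BLOCKS = 50000  # Azure block blob limit on committed blocks

for chunk_size in (4 * MIB, 100 * MIB):
    max_size = MAX_BLOCKS * chunk_size
    print('{:>3} MiB chunks -> max block blob ~{:.2f} TiB'.format(
        chunk_size // MIB, max_size / TIB))
# 4 MiB chunks cap a block blob at ~0.19 TiB, while 100 MiB chunks allow
# ~4.77 TiB (the approximately 5TiB figure cited above).
```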
### Downloads
For downloads, chunk sizes correspond to the maximum amount of data
requested from the server in a single operation. It is important to keep a
balance
between the chunk size and the number of in-flight operations afforded by
the `transfer_threads` concurrency control. `blobxfer` does not automatically
tune this (but can automatically set it to a value that should work for
most situations) due to varying system and network conditions.
Additionally, disk write performance is typically lower than disk read
performance, so ensure that `disk_threads` is not set too high in order to
avoid thrashing and highly random write patterns.
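
As a rough rule of thumb (an illustrative sketch, not a description of `blobxfer` internals), the amount of data in flight scales with the chunk size multiplied by the number of transfer threads, which is why the two should be balanced against available memory and bandwidth:

```python
MIB = 1024 ** 2

# Assumption for illustration: each transfer thread keeps roughly one chunk
# outstanding at a time.
chunk_size_bytes = 4 * MIB   # hypothetical download chunk size
transfer_threads = 24        # hypothetical transfer thread count

in_flight = chunk_size_bytes * transfer_threads
print('approximate data in flight: {} MiB'.format(in_flight // MIB))  # 96 MiB
```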
## Azure File Share Performance
File share performance can be "slow" or become a bottleneck, especially for