Mirror of https://github.com/Azure/blobxfer.git
Tag for 1.0.0a3 release
- Rename some options
- Make thread join more robust on Python2
This commit is contained in:
Parent
b7782619d1
Commit
e1d97fa3cb
@@ -2,7 +2,7 @@
 ## [Unreleased]
 
-## [1.0.0a2] - 2017-06-02
+## [1.0.0a3] - 2017-06-02
 
 ### Changed
 - From scratch rewrite providing a consistent CLI experience and a vast
   array of new and advanced features. Please see the
@@ -201,8 +201,8 @@ usage documentation carefully when upgrading from 0.12.1.
   `--no-skiponmatch`.
 - 0.8.2: performance regression fixes
 
-[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a2...HEAD
-[1.0.0a2]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a2
+[Unreleased]: https://github.com/Azure/blobxfer/compare/1.0.0a3...HEAD
+[1.0.0a3]: https://github.com/Azure/blobxfer/compare/0.12.1...1.0.0a3
 [0.12.1]: https://github.com/Azure/blobxfer/compare/0.12.0...0.12.1
 [0.12.0]: https://github.com/Azure/blobxfer/compare/0.11.5...0.12.0
 [0.11.5]: https://github.com/Azure/blobxfer/compare/0.11.4...0.11.5

@@ -105,13 +105,14 @@ class Concurrency(object):
     """Concurrency Options"""
     def __init__(
             self, crypto_processes, md5_processes, disk_threads,
-            transfer_threads):
+            transfer_threads, is_download=None):
         """Ctor for Concurrency Options
         :param Concurrency self: this
         :param int crypto_processes: number of crypto procs
         :param int md5_processes: number of md5 procs
         :param int disk_threads: number of disk threads
         :param int transfer_threads: number of transfer threads
+        :param bool is_download: download hint
         """
         self.crypto_processes = crypto_processes
         self.md5_processes = md5_processes
@@ -131,6 +132,9 @@ class Concurrency(object):
             # cap maximum number of disk threads from cpu count to 64
             if self.disk_threads > 64:
                 self.disk_threads = 64
+            # for downloads, cap disk threads to lower value
+            if is_download and self.disk_threads > 16:
+                self.disk_threads = 16
             auto_disk = True
         if self.transfer_threads is None or self.transfer_threads < 1:
             if auto_disk:
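Editor's aside, not part of the commit: a usage sketch of the new `is_download` hint. The import path `blobxfer.models.options` is an assumption (the hunk only shows `class Concurrency(object)`); non-positive counts request auto-selection, which the hunk above caps at 16 disk threads for downloads instead of 64.

```python
from blobxfer.models.options import Concurrency

# auto-select all process/thread counts; the download hint caps
# auto-selected disk threads at 16 rather than 64
conc = Concurrency(
    crypto_processes=0, md5_processes=0, disk_threads=0,
    transfer_threads=0, is_download=True)
print(conc.disk_threads)  # <= 16 regardless of core count
```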
@@ -430,7 +430,7 @@ class Downloader(object):
         if terminate:
             self._download_terminate = terminate
         for thr in self._disk_threads:
-            thr.join()
+            blobxfer.util.join_thread(thr)
 
     def _wait_for_transfer_threads(self, terminate):
         # type: (Downloader, bool) -> None
@@ -441,7 +441,7 @@ class Downloader(object):
         if terminate:
             self._download_terminate = terminate
         for thr in self._transfer_threads:
-            thr.join()
+            blobxfer.util.join_thread(thr)
 
     def _worker_thread_transfer(self):
         # type: (Downloader) -> None
@@ -452,7 +452,7 @@ class Downloader(object):
         while not self.termination_check:
             try:
                 if len(self._disk_set) > max_set_len:
-                    time.sleep(0.2)
+                    time.sleep(0.1)
                     continue
                 else:
                     dd = self._transfer_queue.get(block=False, timeout=0.1)
@@ -792,8 +792,8 @@ class Downloader(object):
                 'KeyboardInterrupt detected, force terminating '
                 'processes and threads (this may take a while)...')
             try:
-                self._wait_for_transfer_threads(terminate=True)
                 self._wait_for_disk_threads(terminate=True)
+                self._wait_for_transfer_threads(terminate=True)
             finally:
                 self._cleanup_temporary_files()
             raise

@@ -447,10 +447,10 @@ class Uploader(object):
         while not self.termination_check:
             try:
                 if len(self._transfer_set) > max_set_len:
-                    time.sleep(0.2)
+                    time.sleep(0.1)
                     continue
                 else:
-                    ud = self._upload_queue.get(False, 0.1)
+                    ud = self._upload_queue.get(block=False, timeout=0.1)
             except queue.Empty:
                 continue
             try:

@@ -124,6 +124,20 @@ def is_not_empty(obj):
     return obj is not None and len(obj) > 0
 
 
+def join_thread(thr):
+    # type: (threading.Thread) -> None
+    """Join a thread
+    :type threading.Thread thr: thread to join
+    """
+    if on_python2():
+        while True:
+            thr.join(timeout=1)
+            if not thr.isAlive():
+                break
+    else:
+        thr.join()
+
+
 def merge_dict(dict1, dict2):
     # type: (dict, dict) -> dict
     """Recursively merge dictionaries: dict2 on to dict1. This differs
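Editor's aside on the `join_thread` helper above: on Python 2, `Thread.join()` with no timeout blocks in an uninterruptible lock acquire, so `KeyboardInterrupt` (Ctrl-C) is not delivered until the thread finishes. Joining in a loop with a timeout keeps the main thread responsive. A minimal standalone sketch of the same pattern; the `worker` function is hypothetical, not from this commit:

```python
import threading
import time


def worker():
    # hypothetical stand-in for a blobxfer disk/transfer thread
    time.sleep(30)


thr = threading.Thread(target=worker)
thr.start()
try:
    # a bare thr.join() on Python 2 would defer Ctrl-C until the
    # thread finishes; a timed join in a loop stays interruptible
    while thr.is_alive():
        thr.join(timeout=1)
except KeyboardInterrupt:
    print('interrupted, not waiting for worker')
```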
@@ -22,4 +22,4 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-__version__ = '1.0.0a2'
+__version__ = '1.0.0a3'

cli/cli.py
@@ -62,13 +62,15 @@ class CliContext(object):
         self.credentials = None
         self.general_options = None
 
-    def initialize(self):
-        # type: (CliContext) -> None
+    def initialize(self, action):
+        # type: (CliContext, settings.TransferAction) -> None
         """Initialize context
         :param CliContext self: this
+        :param settings.TransferAction action: transfer action
         """
         self._init_config()
-        self.general_options = settings.create_general_options(self.config)
+        self.general_options = settings.create_general_options(
+            self.config, action)
         self.credentials = settings.create_azure_storage_credentials(
             self.config, self.general_options)
@@ -164,7 +166,8 @@ def _log_file_option(f):
         '--log-file',
         expose_value=False,
         default=None,
-        help='Log to file specified',
+        help='Log to file specified; this must be specified for progress '
+        'bar to show',
         callback=callback)(f)
 
 
@@ -191,7 +194,8 @@ def _progress_bar_option(f):
         '--progress-bar/--no-progress-bar',
         expose_value=False,
         default=True,
-        help='Display progress bar instead of console logs [True]',
+        help='Display progress bar instead of console logs; log file must '
+        'be specified [True]',
         callback=callback)(f)
 
 
@@ -254,22 +258,22 @@ def _local_resource_option(f):
         clictx.cli_options['local_resource'] = value
         return value
     return click.option(
-        '--local-resource',
+        '--local-path',
         expose_value=False,
-        help='Local resource; use - for stdin',
+        help='Local path; use - for stdin',
         callback=callback)(f)
 
 
-def _storage_account_name_option(f):
+def _storage_account_option(f):
     def callback(ctx, param, value):
         clictx = ctx.ensure_object(CliContext)
         clictx.cli_options['storage_account'] = value
         return value
     return click.option(
-        '--storage-account-name',
+        '--storage-account',
         expose_value=False,
         help='Storage account name',
-        envvar='BLOBXFER_STORAGE_ACCOUNT_NAME',
+        envvar='BLOBXFER_STORAGE_ACCOUNT',
         callback=callback)(f)
@@ -301,7 +305,7 @@ def common_options(f):
 
 def upload_download_options(f):
     f = _remote_path_option(f)
-    f = _storage_account_name_option(f)
+    f = _storage_account_option(f)
     f = _local_resource_option(f)
     return f
 
@@ -633,16 +637,16 @@ def _sync_copy_dest_access_key_option(f):
         callback=callback)(f)
 
 
-def _sync_copy_dest_storage_account_name_option(f):
+def _sync_copy_dest_storage_account_option(f):
     def callback(ctx, param, value):
         clictx = ctx.ensure_object(CliContext)
         clictx.cli_options['sync_copy_dest_storage_account'] = value
         return value
     return click.option(
-        '--sync-copy-dest-storage-account-name',
+        '--sync-copy-dest-storage-account',
         expose_value=False,
         help='Storage account name for synccopy destination',
-        envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT_NAME',
+        envvar='BLOBXFER_SYNC_COPY_DEST_STORAGE_ACCOUNT',
         callback=callback)(f)
@@ -721,11 +725,11 @@ def download_options(f):
 
 
 def sync_copy_options(f):
-    f = _sync_copy_dest_storage_account_name_option(f)
+    f = _sync_copy_dest_storage_account_option(f)
     f = _sync_copy_dest_sas_option(f)
     f = _sync_copy_dest_remote_path_option(f)
     f = _sync_copy_dest_access_key_option(f)
-    f = _storage_account_name_option(f)
+    f = _storage_account_option(f)
     f = _skip_on_md5_match_option(f)
     f = _skip_on_lmt_ge_option(f)
     f = _skip_on_filesize_match_option(f)
@@ -757,7 +761,7 @@ def cli(ctx):
 def download(ctx):
     """Download blobs or files from Azure Storage"""
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Download)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Download)
     specs = settings.create_download_specifications(ctx.config)
     for spec in specs:
         blobxfer.api.Downloader(
@@ -773,7 +777,7 @@ def synccopy(ctx):
     """Synchronously copy blobs between Azure Storage accounts"""
     raise NotImplementedError()
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Synccopy)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Synccopy)
 
 
 @cli.command('upload')
@@ -784,7 +788,7 @@ def synccopy(ctx):
 def upload(ctx):
     """Upload files to Azure Storage"""
     settings.add_cli_options(ctx.cli_options, settings.TransferAction.Upload)
-    ctx.initialize()
+    ctx.initialize(settings.TransferAction.Upload)
     specs = settings.create_upload_specifications(ctx.config)
     for spec in specs:
         blobxfer.api.Uploader(

@@ -61,13 +61,13 @@ def add_cli_options(cli_options, action):
         if blobxfer.util.is_none_or_empty(local_resource):
             raise KeyError()
     except KeyError:
-        raise ValueError('--local-resource must be specified')
+        raise ValueError('--local-path must be specified')
     try:
         storage_account = cli_options['storage_account']
         if blobxfer.util.is_none_or_empty(storage_account):
             raise KeyError()
     except KeyError:
-        raise ValueError('--storage-account-name must be specified')
+        raise ValueError('--storage-account must be specified')
     try:
         remote_path = cli_options['remote_path']
         if blobxfer.util.is_none_or_empty(remote_path):
@@ -167,7 +167,7 @@ def add_cli_options(cli_options, action):
             raise KeyError()
     except KeyError:
         raise ValueError(
-            '--sync-copy-dest-storage-account-name must be specified')
+            '--sync-copy-dest-storage-account must be specified')
     try:
         sync_copy_dest_remote_path = \
             cli_options['sync_copy_dest_remote_path']
@@ -278,10 +278,11 @@ def create_azure_storage_credentials(config, general_options):
     return creds
 
 
-def create_general_options(config):
-    # type: (dict) -> blobxfer.models.options.General
+def create_general_options(config, action):
+    # type: (dict, TransferAction) -> blobxfer.models.options.General
     """Create a General Options object from configuration
     :param dict config: config dict
+    :param TransferAction action: transfer action
     :rtype: blobxfer.models.options.General
     :return: general options object
     """
@@ -292,6 +293,7 @@ def create_general_options(config):
             disk_threads=conc.get('disk_threads', 0),
             md5_processes=conc.get('md5_processes', 0),
             transfer_threads=conc.get('transfer_threads', 0),
+            is_download=action == TransferAction.Download,
         ),
         log_file=config['options'].get('log_file', None),
         progress_bar=config['options'].get('progress_bar', True),

@@ -72,9 +72,10 @@ docker pull alfpark/blobxfer
 
 ## Troubleshooting
 #### `azure.storage` dependency not found
-If you get an error that `azure.storage` cannot be found or loaded, then
-most likely there was a conflict between this package and other `azure`
-packages that share the same base namespace. You can correct this by issuing:
+If you get an error such as `ImportError: No module named storage` or that
+`azure.storage` cannot be found or loaded, then most likely there was a
+conflict between this package and other `azure` packages that share the same
+base namespace. You can correct this by issuing:
 ```shell
 # for Python3
 pip3 install --upgrade --force-reinstall azure-storage

@@ -12,9 +12,9 @@ command will be detailed along with all options available.
 ### `download`
 Downloads a remote Azure path, which may contain many resources, to the
 local machine. This command requires, at the minimum, the following options:
-* `--storage-account-name`
+* `--storage-account`
 * `--remote-path`
-* `--local-resource`
+* `--local-path`
 
 Additionally, an authentication option for the storage account is required.
 Please see the Authentication sub-section below under Options.
@@ -23,14 +23,14 @@ Please see the Authentication sub-section below under Options.
 Uploads a local path to a remote Azure path. The local path may contain
 many resources on the local machine. This command requires, at the minimum,
 the following options:
-* `--local-resource`
-* `--storage-account-name`
+* `--local-path`
+* `--storage-account`
 * `--remote-path`
 
 Additionally, an authentication option for the storage account is required.
 Please see the Authentication sub-section below under Options.
 
-If piping from `stdin`, `--local-resource` should be set to `-` as per
+If piping from `stdin`, `--local-path` should be set to `-` as per
 convention.
 
 ### `synccopy`
@@ -49,9 +49,10 @@ of up to 100MiB, all others have a maximum of 4MiB.
 attributes (mode and ownership) should be stored or restored. Note that to
 restore uid/gid, `blobxfer` must be run as root or under sudo.
 * `--file-md5` or `--no-file-md5` controls if the file MD5 should be computed.
-* `--local-resource` is the local resource path. Set to `-` if piping from
+* `--local-path` is the local resource path. Set to `-` if piping from
 `stdin`.
-* `--log-file` specifies the log file to write to.
+* `--log-file` specifies the log file to write to. This must be specified
+for a progress bar to be output to console.
 * `--mode` is the operating mode. The default is `auto` but may be set to
 `append`, `block`, `file`, or `page`. If specified with the `upload`
 command, then all files will be uploaded as the specified `mode` type.
@@ -61,12 +62,16 @@ with Azure File shares.
 * `--overwrite` or `--no-overwrite` controls clobber semantics at the
 destination.
 * `--progress-bar` or `--no-progress-bar` controls if a progress bar is
-output to the console.
+output to the console. `--log-file` must be specified for a progress bar
+to be output.
 * `--recursive` or `--no-recursive` controls if the source path should be
 recursively uploaded or downloaded.
 * `--remote-path` is the remote Azure path. This path must contain the
 Blob container or File share at the beginning, e.g., `mycontainer/vdir`
 * `--resume-file` specifies the resume file to write to.
+* `--storage-account` specifies the storage account to use. This can be
+optionally provided through an environment variable `BLOBXFER_STORAGE_ACCOUNT`
+instead.
 * `--timeout` is the integral timeout value in seconds to use.
 * `-h` or `--help` can be passed at every command level to receive context
 sensitive help.
@@ -96,7 +101,7 @@ to/from Azure Storage.
 ### Connection
 * `--endpoint` is the Azure Storage endpoint to connect to; the default is
 Azure Public regions, or `core.windows.net`.
-* `--storage-account-name` is the storage account to connect to.
+* `--storage-account` is the storage account to connect to.
 
 ### Encryption
 * `--rsa-private-key` is the RSA private key in PEM format to use. This can
@@ -161,27 +166,27 @@ file path. The default is `1`.
 ### `download` Examples
 #### Download an Entire Encrypted Blob Container to Current Working Directory
 ```shell
-blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-public-key ~/mypubkey.pem
+blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-public-key ~/mypubkey.pem
 ```
 
 #### Download an Entire File Share to Designated Path and Skip On Filesize Matches
 ```shell
-blobxfer download --mode file --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-resource /my/path --skip-on-filesize-match
+blobxfer download --mode file --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path myfileshare --local-path /my/path --skip-on-filesize-match
 ```
 
 #### Download only Page Blobs in Blob Container Virtual Directory Non-recursively and Cleanup Local Path to Match Remote Path
 ```shell
-blobxfer download --mode page --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource /my/pageblobs --no-recursive --delete
+blobxfer download --mode page --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path /my/pageblobs --no-recursive --delete
 ```
 
 #### Resume Incomplete Downloads Matching an Include Pattern and Log to File and Restore POSIX File Attributes
 ```shell
-blobxfer download --storage-account-name mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-resource . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
+blobxfer download --storage-account mystorageaccount --storage-account-key "myaccesskey" --remote-path mycontainer --local-path . --include '*.bin' --resume-file myresumefile.db --log-file blobxfer.log --file-attributes
 ```
 
 #### Download a Blob Snapshot
 ```shell
-blobxfer download --storage-account-name mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-resource .
+blobxfer download --storage-account mystorageaccount --sas "mysastoken" --remote-path "mycontainer/file.bin?snapshot=2017-04-20T02:12:49.0311708Z" --local-path .
 ```
 
 #### Download using a YAML Configuration File
@@ -192,27 +197,27 @@ blobxfer download --config myconfig.yaml
 ### `upload` Examples
 #### Upload Current Working Directory as Encrypted Block Blobs Non-recursively
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource . --rsa-private-key ~/myprivatekey.pem --no-recursive
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path . --rsa-private-key ~/myprivatekey.pem --no-recursive
 ```
 
 #### Upload Specific Path Recursively to a File Share, Store File MD5 and POSIX File Attributes to a File Share and Exclude Some Files
 ```shell
-blobxfer upload --mode file --storage-account-name mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-resource . --file-md5 --file-attributes --exclude '*.bak'
+blobxfer upload --mode file --storage-account mystorageaccount --sas "mysastoken" --remote-path myfileshare --local-path . --file-md5 --file-attributes --exclude '*.bak'
 ```
 
 #### Upload Single File with Resume and Striped Vectored IO into 512MiB Chunks
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /some/huge/file --resume-file hugefileresume.db --distribution-mode stripe --stripe-chunk-size-bytes 536870912
 ```
 
 #### Upload Specific Path but Skip On Any MD5 Matches, Store File MD5 and Cleanup Remote Path to Match Local Path
 ```shell
-blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource /my/path --file-md5 --skip-on-md5-match --delete
+blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path /my/path --file-md5 --skip-on-md5-match --delete
 ```
 
 #### Upload From Piped `stdin`
 ```shell
-curl -fSsL https://some.uri | blobxfer upload --storage-account-name mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-resource -
+curl -fSsL https://some.uri | blobxfer upload --storage-account mystorageaccount --sas "mysastoken" --remote-path mycontainer --local-path -
 ```
 
 #### Upload using a YAML Configuration File

@@ -91,5 +91,5 @@ keep this metadata intact or reconstruction will fail.
 +---------------------+
 ```
 
-In order to take advantage of `stripe` Vectored IO, you must use a YAML
-configuration file to define multiple destinations.
+In order to take advantage of `stripe` Vectored IO across multiple
+destinations, you must use a YAML configuration file.

@@ -3,8 +3,8 @@ Please read the following carefully regarding considerations that should
 be applied with regard to performance and `blobxfer`. Additionally,
 please review the
 [Azure Storage Scalability and Performance Targets](https://azure.microsoft.com/en-us/documentation/articles/storage-scalability-targets/)
-for an overview of general performance targets that apply to Azure Blobs
-and File shares.
+for an overview of general performance targets that apply to Azure Blobs,
+File shares and Storage Account types (GRS, LRS, ZRS, etc).
 
 ## Concurrency
 * `blobxfer` offers four concurrency knobs. Each one should be tuned for
@@ -23,6 +23,44 @@ maximum performance according to your system and network characteristics.
 * The thread concurrency options (disk and transfer) can be set to a
 non-positive number to be automatically set as a multiple of the number of
 cores available on the machine.
+* For uploads, there should be a sufficient number of disk threads to ensure
+that all transfer threads have work to do. For downloads, there should be a
+sufficient number of disk threads to write data to disk so transfer threads
+are not artificially blocked.
+
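Editor's aside: the concurrency knobs above correspond to the keys read by `create_general_options` in the `cli/settings.py` hunks earlier in this commit. A sketch of such an options mapping follows; the exact layout is an assumption, only the key names appear in the diff.

```python
# illustrative only: key names mirror the conc.get(...) and
# config['options'].get(...) reads in the cli/settings.py hunks above
config = {
    'options': {
        'concurrency': {
            'crypto_processes': 0,
            'md5_processes': 0,
            'disk_threads': 0,      # non-positive: auto (capped at 64, or 16 for downloads)
            'transfer_threads': 0,  # non-positive: auto
        },
        'log_file': 'blobxfer.log',
        'progress_bar': True,
    },
}
```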
+## Chunk Sizing
+Chunk sizing refers to the `chunk_size_bytes` option, whose meaning varies
+depending on the context of uploading or downloading.
+
+### Uploads
+For uploads, chunk sizes correspond to the maximum amount of data to transfer
+with a single request. The Azure Storage service imposes maximums depending
+upon the type of entity that is being written. For block blobs, the maximum
+is 100MiB (although you may "one-shot" up to 256MiB). For page blobs, the
+maximum is 4MiB. For append blobs, the maximum is 4MiB. For Azure Files,
+the maximum is 4MiB.
+
+For block blobs, setting the chunk size to something greater than 4MiB will
+not only allow larger file sizes (recall that the maximum number of
+blocks for a block blob is 50000, thus at 100MiB blocks, you can create a
+5TiB block blob object) but will allow you to amortize larger portions of
+data transfer over each request/response overhead. `blobxfer` can
+automatically select the proper block size given your file, but will not
+automatically tune the chunk size as that depends upon your system and
+network characteristics.
+
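Editor's aside: a quick check of the arithmetic above. The maximum block blob size scales linearly with the block (chunk) size under the 50000-block limit:

```python
# illustrative arithmetic for the 50000-block limit cited above
MAX_BLOCKS = 50000
MiB = 1024 ** 2
TiB = 1024 ** 4


def max_block_blob_size(chunk_size_bytes):
    return MAX_BLOCKS * chunk_size_bytes


print(max_block_blob_size(4 * MiB) / TiB)    # ~0.19 TiB with 4MiB blocks
print(max_block_blob_size(100 * MiB) / TiB)  # ~4.77 TiB (the ~5TiB above)
```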
+### Downloads
+For downloads, chunk sizes correspond to the maximum amount of data to
+request from the server for each request. It is important to keep a balance
+between the chunk size and the number of in-flight operations afforded by
+the `transfer_threads` concurrency control. `blobxfer` does not automatically
+tune this (but can automatically set it to a value that should work for
+most situations) due to varying system and network conditions.
+
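Editor's aside: one way to reason about the balance described above is that the data in flight (and roughly the buffer memory required) is bounded by the chunk size times the number of transfer threads. The numbers below are illustrative:

```python
# rough upper bound: each transfer thread keeps one chunk-sized
# request outstanding at a time
MiB = 1024 ** 2


def approx_inflight_bytes(chunk_size_bytes, transfer_threads):
    return chunk_size_bytes * transfer_threads


print(approx_inflight_bytes(4 * MiB, 16) // MiB)   # 64 MiB in flight
print(approx_inflight_bytes(32 * MiB, 16) // MiB)  # 512 MiB in flight
```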
+Additionally, disk write performance is typically lower than disk read
+performance, so you need to ensure that the number of `disk_threads` is not
+set to a very large number to prevent thrashing and highly random write
+patterns.
+
 ## Azure File Share Performance
 File share performance can be "slow" or become a bottleneck, especially for