* update configuration.cfg to secrets.cfg

* fix node count pretty print

* add more unit tests

* configure pytest to only run tests from the 'tests' directory

* add missing space

* remove static call to loading secrets file

* initial commit for automatic wasb support

* wasb native support

* fix for wasb

* bug fix in azure_api

* clean up setup_wasb

* README for wasb

* README updates for wasb

* configure wasb creds in start task environment variables

* add missing merged files

* revert back to working state

* added storage account suffix

* removed zip to secrets
This commit is contained in:
JS 2017-08-22 11:57:45 -04:00 committed by GitHub
Parent 4822172b6a
Commit 577ff296b5
7 changed files: 118 additions and 6 deletions

View file

@@ -129,6 +129,22 @@ azb spark cluster ssh \
--jupyter <local-port>
```
### Connect your cluster to Azure Blob Storage (WASB connection)
Native support for connecting your Spark cluster to Azure Blob Storage (WASB) is pre-built into this package. To enable it, make sure the storage fields in your **secrets.cfg** file are filled out correctly.
Even if you are only testing and have no need to connect to Azure Blob Storage, you still need to fill out the storage fields in your **secrets.cfg** file correctly, as this package requires them.
Once you have filled out **secrets.cfg** with your storage credentials, your Spark jobs will be able to access that storage account.
Please note: if you want to access a different Azure Blob Storage account, you will need to recreate your cluster with a **secrets.cfg** file containing the appropriate storage credentials.
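For reference, the storage section of **secrets.cfg** might look like the following sketch. The section name and the `storageaccountkey`/`storageaccountsuffix` keys match what `azure_api` reads in this commit; `storageaccountname` is inferred, and all values are placeholders:
```
[Storage]
storageaccountname = <STORAGE_ACCOUNT_NAME>
storageaccountkey = <STORAGE_ACCOUNT_KEY>
storageaccountsuffix = core.windows.net
```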
Here's an example of how you may access your data in Blob Storage:
``` python
df = spark.read.csv("wasbs://<STORAGE_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/<BLOB_NAME>")
```
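Writing data back uses the same `wasbs://` URI scheme, for example (container, account, and output path are placeholders):
``` python
df.write.csv("wasbs://<STORAGE_CONTAINER_NAME>@<STORAGE_ACCOUNT_NAME>.blob.core.windows.net/<OUTPUT_PATH>")
```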
### Manage your Spark cluster
You can also see your clusters from the CLI:

View file

``` diff
@@ -9,6 +9,7 @@ global_config = None
 batch_client = None
 batch_config = None
+blob_config = None
 blob_client = None
@@ -23,6 +24,13 @@ class BatchConfig:
         self.account_url = account_url
 
+
+class BlobConfig:
+    def __init__(self, account_key: str, account_name: str, account_suffix: str):
+        self.account_key = account_key
+        self.account_name = account_name
+        self.account_suffix = account_suffix
+
 def get_batch_client():
     """
     :returns: the batch client singleton
@@ -34,7 +42,7 @@ def get_batch_client():
 def get_blob_client():
     """
-    :returns: the batch client singleton
+    :returns: the blob client singleton
     """
     if not blob_client:
         __load_blob_client()
@@ -129,9 +137,18 @@ def __load_batch_client():
         config.account_url)
 
-def __load_blob_client():
+def get_blob_config() -> BlobConfig:
+    if not blob_config:
+        __load_blob_config()
+    return blob_config
+
+
+def __load_blob_config():
     global_config = config.get()
-    global blob_client
+    global blob_config
 
     if not global_config.has_option('Storage', 'storageaccountkey'):
         raise AzureApiInitError("Storage account key is not set in config")
@@ -148,8 +165,21 @@ def __load_blob_client():
     storage_account_suffix = global_config.get(
         'Storage', 'storageaccountsuffix')
 
+    blob_config = BlobConfig(
+        account_key=storage_account_key,
+        account_name=storage_account_name,
+        account_suffix=storage_account_suffix)
+
+
+def __load_blob_client():
+    global blob_client
+    blob_config = get_blob_config()
     # create storage client
     blob_client = create_blob_client(
-        storage_account_key,
-        storage_account_name,
-        storage_account_suffix)
+        blob_config.account_key,
+        blob_config.account_name,
+        blob_config.account_suffix)
```
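The net effect of the refactor above is that storage credentials can now be read without constructing a blob client. A hypothetical caller, assuming the module is imported as `azure_api` (the import path is an assumption for illustration, not part of this diff):
``` python
from dtde.core import azure_api  # assumed import path, for illustration only

blob_config = azure_api.get_blob_config()  # lazily loads the [Storage] section of secrets.cfg
print(blob_config.account_name, blob_config.account_suffix)
```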

View file

``` diff
@@ -48,6 +48,9 @@ def docker_run_cmd() -> str:
     cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME')
     cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY')
     cmd.add_option('-e', 'BATCH_ACCOUNT_URL=$BATCH_ACCOUNT_URL')
+    cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME')
+    cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY')
+    cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX')
     cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID')
     cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID')
     cmd.add_option(
@@ -93,11 +96,18 @@ def generate_cluster_start_task(
 
     # TODO use certificate
     batch_config = azure_api.get_batch_config()
+    blob_config = azure_api.get_blob_config()
     environment_settings = [
         batch_models.EnvironmentSetting(
             name="BATCH_ACCOUNT_KEY", value=batch_config.account_key),
         batch_models.EnvironmentSetting(
             name="BATCH_ACCOUNT_URL", value=batch_config.account_url),
+        batch_models.EnvironmentSetting(
+            name="STORAGE_ACCOUNT_NAME", value=blob_config.account_name),
+        batch_models.EnvironmentSetting(
+            name="STORAGE_ACCOUNT_KEY", value=blob_config.account_key),
+        batch_models.EnvironmentSetting(
+            name="STORAGE_ACCOUNT_SUFFIX", value=blob_config.account_suffix),
         batch_models.EnvironmentSetting(
             name="SPARK_MASTER_UI_PORT", value=spark_master_ui_port),
         batch_models.EnvironmentSetting(
```

View file

``` diff
@@ -37,6 +37,8 @@ def __create_zip():
     ensure_dir(local_tmp_zipfile)
     zipf = zipfile.ZipFile(local_tmp_zipfile, 'w', zipfile.ZIP_DEFLATED)
     zipdir(os.path.join(root, "node_scripts"), zipf)
+
+    # zipf.write()
     zipf.close()
     logging.debug("Zipped file")
```

View file

``` diff
@@ -17,6 +17,18 @@ if [ -d "$custom_script_dir" ]; then
 else
     echo "Custom script dir '$custom_script_dir' doesn't exist. Will not run any custom scripts."
 fi
 
+storage_account_name=$STORAGE_ACCOUNT_NAME
+storage_account_key=$STORAGE_ACCOUNT_KEY
+storage_account_suffix=$STORAGE_ACCOUNT_SUFFIX
+
+if [ -n "$STORAGE_ACCOUNT_NAME" ] && [ -n "$STORAGE_ACCOUNT_KEY" ] && [ -n "$STORAGE_ACCOUNT_SUFFIX" ]; then
+    echo "Setting up WASB connection"
+    bash $(dirname $0)/setup_wasb.sh $storage_account_name $storage_account_key $storage_account_suffix
+else
+    echo "Storage credentials not set"
+fi
+
 echo "Starting setup using Docker"
 pip3 install -r $(dirname $0)/requirements.txt
```

View file

``` diff
@@ -170,3 +170,5 @@ def start_spark_worker():
         "--webui-port", str(config.spark_worker_ui_port)]
     print("Connecting to master with '{0}'".format(" ".join(cmd)))
     call(cmd)
+
+
```

View file

``` diff
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# script to install blob storage connection on each node
+# Usage:
+# setup_wasb.sh [storage_account_name] [storage_account_key] [storage_account_suffix]
+
+storage_account_name=$1
+storage_account_key=$2
+storage_account_suffix=$3
+
+spark_home=/home/spark-2.2.0-bin-hadoop2.7
+
+cd $spark_home/conf
+cp spark-defaults.conf.template spark-defaults.conf
+
+cat >> spark-defaults.conf <<EOF
+spark.jars $spark_home/jars/azure-storage-2.0.0.jar,$spark_home/jars/hadoop-azure-2.7.3.jar
+EOF
+
+cat >> core-site.xml <<EOF
+<?xml version="1.0" encoding="UTF-8"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<configuration>
+  <property>
+    <name>fs.AbstractFileSystem.wasb.impl</name>
+    <value>org.apache.hadoop.fs.azure.Wasb</value>
+  </property>
+  <property>
+    <name>fs.azure.account.key.$storage_account_name.blob.$storage_account_suffix</name>
+    <value>$storage_account_key</value>
+  </property>
+</configuration>
+EOF
+
+cd $spark_home/jars
+
+# install the appropriate jars
+apt-get install -y wget
+wget http://repo1.maven.org/maven2/com/microsoft/azure/azure-storage/2.0.0/azure-storage-2.0.0.jar
+wget http://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/2.7.3/hadoop-azure-2.7.3.jar
```