diff --git a/blobfuse2-perf.yaml b/blobfuse2-perf.yaml
new file mode 100644
index 00000000..760d1c4a
--- /dev/null
+++ b/blobfuse2-perf.yaml
@@ -0,0 +1,150 @@
+schedules:
+  # Cron string: <minute hour day-of-month month day-of-week>
+  # '*' means all, e.g. '*' in day-of-month means every day
+  # Run only on the main branch
+  # 'always' controls whether to run even when there are no new changes
+  # Run this pipeline at 15:00 (UTC) every Saturday
+  - cron: '0 15 * * 6'
+    displayName: 'Weekly Blobfuse2 Perf Run'
+    branches:
+      include:
+        - main
+
+jobs:
+  - job: Blobfuse2_Perf_Test
+    timeoutInMinutes: 2800 # ~2 day timeout
+    strategy:
+      matrix:
+        Ubuntu-20:
+          DistroVer: "Ubn20_PERF"
+          AgentName: "UBN20-PERF"
+          Description: "Blobfuse2 Perf Test"
+
+    pool:
+      name: "Blobfuse Pool"
+      demands:
+        - Agent.Name -equals $(AgentName)
+
+    variables:
+      - group: NightlyBlobFuse
+      - name: MOUNT_DIR
+        value: "/home/vsts/workv2/blob_mnt"
+      - name: TEMP_DIR
+        value: "/home/vsts/workv2/blobfuse2tmp"
+      - name: BLOBFUSE2_CFG
+        value: "/home/tamer/blobfuse2.yaml"
+      - name: GOPATH
+        value: "/home/vsts/workv2/go"
+      - name: ROOT_DIR
+        value: "/home/vsts/workv2/"
+      - name: WORK_DIR
+        value: "/home/vsts/workv2/go/src/azure-storage-fuse"
+
+    steps:
+      - checkout: none
+
+      # Pre-start cleanup
+      - script: |
+          sudo fusermount -u $(MOUNT_DIR)
+          sudo kill -9 `pidof blobfuse2`
+          sudo rm -rf $(ROOT_DIR)
+        displayName: 'PreBuild Cleanup'
+
+      # Create directory structure
+      - script: |
+          sudo rm -rf $(ROOT_DIR)
+          sudo mkdir -p $(ROOT_DIR)
+          sudo chown -R `whoami` $(ROOT_DIR)
+          chmod 777 $(ROOT_DIR)
+          mkdir -p $(ROOT_DIR)/go/src
+        displayName: 'Create Directory Structure'
+
+      # Checkout the code
+      - script: |
+          git clone https://github.com/Azure/azure-storage-fuse
+        displayName: 'Checkout Code'
+        workingDirectory: $(ROOT_DIR)/go/src
+
+      # Pull the branch
+      - script: |
+          git checkout `echo $(Build.SourceBranch) | cut -d "/" -f 1,2 --complement`
+        displayName: 'Checkout Branch'
+        workingDirectory: $(WORK_DIR)
+
+      # -------------------------------------------------------
+      # Pull and build the code
+      - template: 'azure-pipeline-templates/build.yml'
+        parameters:
+          working_directory: $(WORK_DIR)
+          root_dir: $(ROOT_DIR)
+          mount_dir: $(MOUNT_DIR)
+          temp_dir: $(TEMP_DIR)
+          gopath: $(GOPATH)
+          container: cont1
+          skip_ut: true
+
+      - script: |
+          cd $(ROOT_DIR)
+          pip install numpy tensorflow
+        displayName: "Install Python Dependencies"
+        continueOnError: false
+
+      # Run the classification job against the released blobfuse2 binary
+      - script: |
+          cd $(ROOT_DIR)
+          wget https://github.com/Azure/azure-storage-fuse/releases/download/blobfuse2-2.0.0-preview.1/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
+          sudo dpkg -i $(ROOT_DIR)/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
+          sudo apt-get install -f -y
+          sudo apt-get install -y fuse3
+          blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
+          cd $(WORK_DIR)
+          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='binary' --log=$(ROOT_DIR)/blobfuse2-perf.json
+        displayName: "Blobfuse2 ResNet50 Binary"
+        continueOnError: false
+
+      - script: |
+          sudo fusermount -u $(MOUNT_DIR)
+          sudo kill -9 `pidof blobfuse2` || true
+        displayName: "Unmount Blobfuse2 Binary Run"
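+      # The step below repeats the same classification against the blobfuse2 built from this
+      # branch: it regenerates the config with gen-test-config, mounts with the locally built
+      # binary, and appends its numbers to the same JSON log so the report step can compare
+      # the 'main' and 'binary' runs.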
+      - script: |
+          cd $(WORK_DIR)
+          $(WORK_DIR)/blobfuse2 gen-test-config --config-file=azure_key.yaml --container-name=cont1 --temp-path=$(TEMP_DIR) --output-file=$(BLOBFUSE2_CFG)
+          $(WORK_DIR)/blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
+          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='main' --log=$(ROOT_DIR)/blobfuse2-perf.json
+        displayName: "Blobfuse2 ResNet50 on Main"
+        env:
+          NIGHTLY_STO_ACC_NAME: $(PERF_WEEKLY_STO_BLOB_ACC_NAME)
+          NIGHTLY_STO_ACC_KEY: $(PERF_WEEKLY_STO_BLOB_ACC_KEY)
+          ACCOUNT_TYPE: 'block'
+          ACCOUNT_ENDPOINT: 'https://$(PERF_WEEKLY_STO_BLOB_ACC_NAME).blob.core.windows.net'
+        continueOnError: false
+
+      - script: |
+          cd $(WORK_DIR)
+          python3 $(WORK_DIR)/test/perf_test/generate_perf_report.py --metrics=images/second --log=$(ROOT_DIR)/blobfuse2-perf.json
+        displayName: "Perf Regression Test"
+        continueOnError: false
+
+      - publish: $(ROOT_DIR)/blobfuse2-perf.json
+        artifact: Blobfuse2_performance_report
+        displayName: Publish Performance Report
+
+      - script: |
+          sudo fusermount -u $(MOUNT_DIR)
+          sudo kill -9 `pidof blobfuse2` || true
+        displayName: "Unmount Blobfuse2 Main Branch Run"
+
+      # Cleanup
+      - template: 'azure-pipeline-templates/cleanup.yml'
+        parameters:
+          working_dir: $(WORK_DIR)
+          mount_dir: $(MOUNT_DIR)
+          temp_dir: $(TEMP_DIR)
+
+      - script: |
+          sudo rm -rf $(ROOT_DIR)
+          pwd
+          cd /`pwd | cut -d '/' -f 2,3,4,5`
+          sudo rm -rf [0-9]
+        displayName: 'Clean Agent Directories'
+        condition: always()
diff --git a/test/perf_test/generate_perf_report.py b/test/perf_test/generate_perf_report.py
new file mode 100644
index 00000000..3672ad72
--- /dev/null
+++ b/test/perf_test/generate_perf_report.py
@@ -0,0 +1,49 @@
+# Read the perf log JSON file and compare the metrics of two jobs,
+# failing the run if the first job has regressed noticeably against the second.
+
+import json
+import argparse
+import sys
+import math
+
+
+def compare_numbers(job_one, job_two, metrics_list, log_file):
+    regressions = []
+    with open(log_file, mode='r+') as f:
+        data = json.load(f)
+        result = {'performance_diff': {}}
+        for i in metrics_list:
+            # percentage difference of job_one relative to job_two, rounded down
+            metric_value = math.floor(((data[job_one][i] / data[job_two][i]) * 100) - 100)
+            result['performance_diff'][i] = metric_value
+            if metric_value < 0:
+                sys.stdout.write('{} has regressed - there is a perf regression of {}%\n'.format(i, metric_value))
+                if metric_value < -3:
+                    regressions.append(i)
+            else:
+                sys.stdout.write('{} has a perf improvement of {}%\n'.format(i, metric_value))
+        data.update(result)
+        f.seek(0)
+        json.dump(data, f)
+    # raise only after the report has been written so the published log stays complete
+    if regressions:
+        raise ValueError("large perf regression (more than 3%) detected in: {}".format(', '.join(regressions)))
+
+
+if __name__ == "__main__":
+    # parse arguments
+    parser = argparse.ArgumentParser("compare performance")
+    parser.add_argument('-j1', '--job1', default='main', help='name of the first job', required=False)
+    parser.add_argument('-j2', '--job2', default='binary', help='name of the second job', required=False)
+    parser.add_argument('-m', '--metrics', nargs='+', help='metrics to compare from log file', required=True)
+    parser.add_argument('-lf',
+                        '--log',
+                        default="./blobfuse2-perf.json",
+                        help='path of log file',
+                        required=False)
+    args = vars(parser.parse_args())
+    log_file = args['log']
+    job_one_name = args['job1']
+    job_two_name = args['job2']
+    metrics_list = args['metrics']
+
+    compare_numbers(job_one_name, job_two_name, metrics_list, log_file)
diff --git a/test/perf_test/resnet50_classify.py b/test/perf_test/resnet50_classify.py
new file mode 100644
index 00000000..14ae9ac0
--- /dev/null
+++ b/test/perf_test/resnet50_classify.py
@@ -0,0 +1,91 @@
+import os
+import sys
+import time
+import json
+import argparse
+import numpy as np
+from multiprocessing import Pool
+from tensorflow.keras.applications import resnet50
+from tensorflow.keras.preprocessing.image import load_img
+from tensorflow.keras.preprocessing.image import img_to_array
+from tensorflow.keras.applications.imagenet_utils import decode_predictions
+
+# we're not using any GPUs
+os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #15
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
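+# The log written at the end of this script is the same JSON document that
+# generate_perf_report.py consumes: each run adds a top-level key named after --job,
+# shaped roughly like
+#   {"binary": {"time elapsed": ..., "total images": ..., "images/second": ...},
+#    "main":   {...}}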
+
+
+def classify_images(images):
+    # load the model within each worker process since a model can't be shared across processes
+    resnet_model = resnet50.ResNet50(weights='imagenet')
+
+    tic = time.time()
+    sys.stdout.write('starting to process {} images in this process at time: {}\n'.format(len(images), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
+
+    for filename in images:
+        # load image
+        original = load_img(filename, target_size=(224, 224))
+        # transform image
+        numpy_image = img_to_array(original)
+        image_batch = np.expand_dims(numpy_image, axis=0)
+        processed_image = resnet50.preprocess_input(image_batch)
+        # predict
+        predictions = resnet_model.predict(processed_image)
+
+
+def chunks(paths, batch_size):
+    # yield successive batch_size-sized chunks of paths
+    for i in range(0, len(paths), batch_size):
+        yield paths[i:i + batch_size]
+
+
+if __name__ == "__main__":
+    # parse arguments
+    parser = argparse.ArgumentParser("classify dataset")
+    parser.add_argument('-d', '--dataset', help='dataset dir path', required=True)
+    parser.add_argument('-n', '--job', help='name of the resnet job', required=True)
+    parser.add_argument('-p', '--procs', type=int, default=32, help='number of parallel processes', required=False)
+    parser.add_argument('-lf',
+                        '--log',
+                        default="./blobfuse2-perf.json",
+                        help='path of log file',
+                        required=False)
+
+    args = vars(parser.parse_args())
+
+    dataset_path = args['dataset']
+    log_file_path = args['log']
+    job_name = args['job']
+    procs = args['procs']
+
+    # create a pool of worker processes (32 by default)
+    p = Pool(processes=procs)
+    tic = time.time()
+
+    sys.stdout.write('collecting images at time: {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
+    # get the list of files and split it into batches of 10k to be classified
+    images = [os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) for f in filenames]
+    image_subsets = list(chunks(images, 10000))
+
+    # classify each batch on its own worker process
+    p.map(classify_images, image_subsets)
+    p.close()
+    p.join()
+
+    toc = time.time()
+    sys.stdout.write('ended processing dataset at time {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(toc))))
+    sys.stdout.write('time elapsed {}\n'.format(toc - tic))
+
+    result = {job_name: {}}
+    result[job_name]['time elapsed'] = toc - tic
+    result[job_name]['total images'] = len(images)
+    result[job_name]['images/second'] = len(images) / (toc - tic)
+
+    if os.path.exists(log_file_path):
+        f = open(log_file_path, mode='r+')
+        data = json.load(f)
+        data.update(result)
+        f.seek(0)
+        json.dump(data, f)
+    else:
+        f = open(log_file_path, mode='a+')
+        json.dump(result, f)
+    f.close()