[PERF] ResNet50 Weekly Perf Runs (#731)

* added new pipeline

* python cleanup

* test run

* removed print

* testing binary

* testing

* finished

* attempting to fix py issue

* cleanup

* testing

* removed direct read

* fail if regression is too bad

* cleanup

* added dependency step to pipeline

* cleanup
Tamer Sherif 2022-03-17 16:03:09 -07:00, committed via GitHub
Parent: c5d1c7f3ff
Commit: 168ea5d03a
3 changed files: 290 additions, 0 deletions

blobfuse2-perf.yaml (new file)

@@ -0,0 +1,150 @@
schedules:
  # Cron string: <minute hour day-of-month month day-of-week>
  # '*' means all; e.g. '*' in day-of-month means every day
  # Run only on the main branch
  # 'always' controls whether to run even when nothing has changed
  # Run this pipeline every Saturday at 15:00 (UTC)
  - cron: '0 15 * * 6'
    displayName: 'Weekly Blobfuse2 Perf Run'
    branches:
      include:
        - main

jobs:
  - job: Blobfuse2_Perf_Test
    timeoutInMinutes: 2800 # just under a two-day timeout
    strategy:
      matrix:
        Ubuntu-20:
          DistroVer: "Ubn20_PERF"
          AgentName: "UBN20-PERF"
          Description: "Blobfuse2 Perf Test"
    pool:
      name: "Blobfuse Pool"
      demands:
        - Agent.Name -equals $(AgentName)
    variables:
      - group: NightlyBlobFuse
      - name: MOUNT_DIR
        value: "/home/vsts/workv2/blob_mnt"
      - name: TEMP_DIR
        value: "/home/vsts/workv2/blobfuse2tmp"
      - name: BLOBFUSE2_CFG
        value: "/home/tamer/blobfuse2.yaml"
      - name: GOPATH
        value: "/home/vsts/workv2/go"
      - name: ROOT_DIR
        value: "/home/vsts/workv2/"
      - name: WORK_DIR
        value: "/home/vsts/workv2/go/src/azure-storage-fuse"
    steps:
      - checkout: none
      # Pre-start cleanup: unmount and remove any leftovers from a prior run
      - script: |
          sudo fusermount -u $(MOUNT_DIR)
          sudo kill -9 `pidof blobfuse2`
          sudo rm -rf $(ROOT_DIR)
        displayName: 'PreBuild Cleanup'
      # Create directory structure
      - script: |
          sudo rm -rf $(ROOT_DIR)
          sudo mkdir -p $(ROOT_DIR)
          sudo chown -R `whoami` $(ROOT_DIR)
          chmod 777 $(ROOT_DIR)
          mkdir -p $(ROOT_DIR)/go/src
        displayName: 'Create Directory Structure'
      # Checkout the code
      - script: |
          git clone https://github.com/Azure/azure-storage-fuse
        displayName: 'Checkout Code'
        workingDirectory: $(ROOT_DIR)/go/src
      # Pull the branch under test
      - script: |
          git checkout `echo $(Build.SourceBranch) | cut -d "/" -f 1,2 --complement`
        displayName: 'Checkout Branch'
        workingDirectory: $(WORK_DIR)
      # -------------------------------------------------------
      # Pull and build the code
      - template: 'azure-pipeline-templates/build.yml'
        parameters:
          working_directory: $(WORK_DIR)
          root_dir: $(ROOT_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)
          gopath: $(GOPATH)
          container: cont1
          skip_ut: true
      - script: |
          cd $(ROOT_DIR)
          pip install numpy tensorflow
        displayName: "Install Python Dependencies"
        continueOnError: false
      # Mount the latest released binary and classify the dataset through it
      - script: |
          cd $(ROOT_DIR)
          wget https://github.com/Azure/azure-storage-fuse/releases/download/blobfuse2-2.0.0-preview.1/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo dpkg -i $(ROOT_DIR)/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo apt-get install -f
          sudo apt-get install fuse3
          blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='binary' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 Binary"
        continueOnError: false
      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Binary Run"
      # Repeat the same run against the freshly built main-branch binary
      - script: |
          cd $(WORK_DIR)
          $(WORK_DIR)/blobfuse2 gen-test-config --config-file=azure_key.yaml --container-name=cont1 --temp-path=$(TEMP_DIR) --output-file=$(BLOBFUSE2_CFG)
          $(WORK_DIR)/blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='main' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 on Main"
        env:
          NIGHTLY_STO_ACC_NAME: $(PERF_WEEKLY_STO_BLOB_ACC_NAME)
          NIGHTLY_STO_ACC_KEY: $(PERF_WEEKLY_STO_BLOB_ACC_KEY)
          ACCOUNT_TYPE: 'block'
          ACCOUNT_ENDPOINT: 'https://$(PERF_WEEKLY_STO_BLOB_ACC_NAME).blob.core.windows.net'
        continueOnError: false
      # Compare the two runs and fail the job on a large regression
      - script: |
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/generate_perf_report.py --metrics=images/second --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Perf Regression Test"
        continueOnError: false
      - publish: $(ROOT_DIR)/blobfuse2-perf.json
        artifact: Blobfuse2_performance_report
        displayName: Publish Performance Report
      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Main Branch Run"
      # Cleanup
      - template: 'azure-pipeline-templates/cleanup.yml'
        parameters:
          working_dir: $(WORK_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)
      - script: |
          sudo rm -rf ${ROOT_DIR}
          pwd
          cd /`pwd | cut -d '/' -f 2,3,4,5`
          sudo rm -rf [0-9]
        displayName: 'Clean Agent Directories'
        condition: always()
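
For reference, the blobfuse2-perf.json log that ties these steps together ends up with one block per --job name written by resnet50_classify.py, plus the performance_diff block appended by generate_perf_report.py. A rough sketch of its shape, with illustrative numbers rather than real run output:

    {
      "binary": {"time elapsed": 1250.0, "total images": 50000, "images/second": 40.0},
      "main": {"time elapsed": 1190.5, "total images": 50000, "images/second": 42.0},
      "performance_diff": {"images/second": 5}
    }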

generate_perf_report.py (new file)

@@ -0,0 +1,49 @@
# Compare performance metrics between two jobs recorded in the JSON perf log
# and fail loudly when a metric regresses beyond the allowed threshold.
import json
import argparse
import sys
import math

def compare_numbers(job_one, job_two, metrics_list, log_file):
    with open(log_file, mode='r+') as f:
        data = json.load(f)
        result = {'performance_diff': {}}
        regressions = []
        for metric in metrics_list:
            # percentage change of job_one relative to job_two, rounded down
            metric_value = math.floor(((data[job_one][metric] / data[job_two][metric]) * 100) - 100)
            result['performance_diff'][metric] = metric_value
            if metric_value < 0:
                sys.stdout.write('{} has regressed - there is a perf regression of {}%\n'.format(metric, metric_value))
                if metric_value < -3:
                    regressions.append('{} ({}%)'.format(metric, metric_value))
            else:
                sys.stdout.write('{} has a perf improvement of {}%\n'.format(metric, metric_value))
        # write the diff back into the log before failing so the report stays complete
        data.update(result)
        f.seek(0)
        json.dump(data, f)
        f.truncate()
    if regressions:
        raise ValueError("large perf regression detected in: {}".format(", ".join(regressions)))

if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("compare performance")
    parser.add_argument('-j1', '--job1', default='main', help='name of the first job', required=False)
    parser.add_argument('-j2', '--job2', default='binary', help='name of the second job', required=False)
    parser.add_argument('-m', '--metrics', nargs='+', help='metrics to compare from log file', required=True)
    parser.add_argument('-lf', '--log', default="./blobfuse2-perf.json", help='path of log file', required=False)
    args = vars(parser.parse_args())
    compare_numbers(args['job1'], args['job2'], args['metrics'], args['log'])
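
A quick worked check of the diff formula above, using made-up throughput values:

    >>> import math
    >>> main_ips, binary_ips = 38.0, 40.0
    >>> math.floor((main_ips / binary_ips) * 100 - 100)  # -5% regression: below the -3 threshold, so the step fails
    -5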

resnet50_classify.py (new file)

@@ -0,0 +1,91 @@
import os
import sys
import time
import json
import argparse
import numpy as np
from multiprocessing import Pool
from tensorflow.keras.applications import resnet50
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array

# we're not using any GPUs
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #15
os.environ["CUDA_VISIBLE_DEVICES"] = ""

def classify_images(images):
    # load the model inside the worker, since a model can't be shared across processes
    resnet_model = resnet50.ResNet50(weights='imagenet')
    tic = time.time()
    sys.stdout.write('starting to process {} images in this process at time: {}\n'.format(
        len(images), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
    for filename in images:
        # load image
        original = load_img(filename, target_size=(224, 224))
        # transform the image into a preprocessed batch of one
        numpy_image = img_to_array(original)
        image_batch = np.expand_dims(numpy_image, axis=0)
        processed_image = resnet50.preprocess_input(image_batch)
        # predict; the predictions are discarded, only read throughput matters here
        resnet_model.predict(processed_image)

def chunks(paths, batch_size):
    # yield successive batch_size-sized chunks from paths
    for i in range(0, len(paths), batch_size):
        yield paths[i:i + batch_size]

if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("classify dataset")
    parser.add_argument('-d', '--dataset', help='dataset dir path', required=True)
    parser.add_argument('-n', '--job', help='name of the resnet job', required=True)
    parser.add_argument('-p', '--procs', type=int, default=32, help='number of parallel processes', required=False)
    parser.add_argument('-lf', '--log', default="./blobfuse2-perf.json", help='path of log file', required=False)
    args = vars(parser.parse_args())

    dataset_path = args['dataset']
    log_file_path = args['log']
    job_name = args['job']
    procs = args['procs']

    # create a pool of worker processes (32 by default)
    p = Pool(processes=procs)
    tic = time.time()
    sys.stdout.write('collecting images at time: {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
    # get the list of files and split it into batches of 10k to be classified
    images = [os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) for f in filenames]
    image_subsets = list(chunks(images, 10000))
    # dispatch each batch to a worker process
    p.map(classify_images, image_subsets)
    p.close()
    p.join()
    toc = time.time()
    sys.stdout.write('ended processing dataset at time {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(toc))))
    sys.stdout.write('time elapsed {}\n'.format(toc - tic))

    # record throughput for this job in the shared perf log
    result = {job_name: {}}
    result[job_name]['time elapsed'] = toc - tic
    result[job_name]['total images'] = len(images)
    result[job_name]['images/second'] = len(images) / (toc - tic)
    if os.path.exists(log_file_path):
        with open(log_file_path, mode='r+') as f:
            data = json.load(f)
            data.update(result)
            f.seek(0)
            json.dump(data, f)
            f.truncate()
    else:
        with open(log_file_path, mode='w') as f:
            json.dump(result, f)
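
For a quick run outside the pipeline, the classifier can be invoked directly; the dataset path and process count here are illustrative, and the directory only needs to contain image files:

    python3 test/perf_test/resnet50_classify.py \
        --dataset=/mnt/blob_mnt/data \
        --job=binary \
        --procs=8 \
        --log=./blobfuse2-perf.json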