[PERF] ResNet50 Weekly Perf Runs (#731)

* added new pipeline

* python cleanup

* test run

* removed print

* testing binary

* testing

* finished

* attempting to fix py issue

* cleanup

* testing

* removed direct read

* fail if regression is too bad

* cleanup

* added dependency step to pipeline

* cleanup
Tamer Sherif 2022-03-17 16:03:09 -07:00, committed via GitHub
Parent: c5d1c7f3ff
Commit: 168ea5d03a
3 changed files: 290 additions, 0 deletions

blobfuse2-perf.yaml (new file)

@@ -0,0 +1,150 @@
schedules:
  # Cron string: <minute hour day-of-month month day-of-week>
  # '*' means all; e.g. '*' in day-of-month means every day
  # Run only on the main branch
  # 'always' controls whether to run even when nothing has changed
  # Run this pipeline every Saturday at 15:00 (UTC)
  - cron: '0 15 * * 6'
    displayName: 'Weekly Blobfuse2 Perf Run'
    branches:
      include:
        - main

jobs:
  - job: Blobfuse2_Perf_Test
    timeoutInMinutes: 2800 # just under a two-day timeout
    strategy:
      matrix:
        Ubuntu-20:
          DistroVer: "Ubn20_PERF"
          AgentName: "UBN20-PERF"
          Description: "Blobfuse2 Perf Test"
    pool:
      name: "Blobfuse Pool"
      demands:
        - Agent.Name -equals $(AgentName)
    variables:
      - group: NightlyBlobFuse
      - name: MOUNT_DIR
        value: "/home/vsts/workv2/blob_mnt"
      - name: TEMP_DIR
        value: "/home/vsts/workv2/blobfuse2tmp"
      - name: BLOBFUSE2_CFG
        value: "/home/tamer/blobfuse2.yaml"
      - name: GOPATH
        value: "/home/vsts/workv2/go"
      - name: ROOT_DIR
        value: "/home/vsts/workv2/"
      - name: WORK_DIR
        value: "/home/vsts/workv2/go/src/azure-storage-fuse"
    steps:
      - checkout: none
      # Pre-start cleanup: unmount and remove any leftovers from a prior run
      - script: |
          sudo fusermount -u $(MOUNT_DIR)
          sudo kill -9 `pidof blobfuse2`
          sudo rm -rf $(ROOT_DIR)
        displayName: 'PreBuild Cleanup'
      # Create directory structure
      - script: |
          sudo rm -rf $(ROOT_DIR)
          sudo mkdir -p $(ROOT_DIR)
          sudo chown -R `whoami` $(ROOT_DIR)
          chmod 777 $(ROOT_DIR)
          mkdir -p $(ROOT_DIR)/go/src
        displayName: 'Create Directory Structure'
      # Checkout the code
      - script: |
          git clone https://github.com/Azure/azure-storage-fuse
        displayName: 'Checkout Code'
        workingDirectory: $(ROOT_DIR)/go/src
      # Pull the branch under test
      - script: |
          git checkout `echo $(Build.SourceBranch) | cut -d "/" -f 1,2 --complement`
        displayName: 'Checkout Branch'
        workingDirectory: $(WORK_DIR)
      # -------------------------------------------------------
      # Pull and build the code
      - template: 'azure-pipeline-templates/build.yml'
        parameters:
          working_directory: $(WORK_DIR)
          root_dir: $(ROOT_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)
          gopath: $(GOPATH)
          container: cont1
          skip_ut: true
      - script: |
          cd $(ROOT_DIR)
          pip install numpy tensorflow
        displayName: "Install Python Dependencies"
        continueOnError: false
      # Mount the latest released binary and classify the dataset through it
      - script: |
          cd $(ROOT_DIR)
          wget https://github.com/Azure/azure-storage-fuse/releases/download/blobfuse2-2.0.0-preview.1/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo dpkg -i $(ROOT_DIR)/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo apt-get install -f
          sudo apt-get install fuse3
          blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='binary' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 Binary"
        continueOnError: false
      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Binary Run"
      # Repeat the same run against the freshly built main-branch binary
      - script: |
          cd $(WORK_DIR)
          $(WORK_DIR)/blobfuse2 gen-test-config --config-file=azure_key.yaml --container-name=cont1 --temp-path=$(TEMP_DIR) --output-file=$(BLOBFUSE2_CFG)
          $(WORK_DIR)/blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='main' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 on Main"
        env:
          NIGHTLY_STO_ACC_NAME: $(PERF_WEEKLY_STO_BLOB_ACC_NAME)
          NIGHTLY_STO_ACC_KEY: $(PERF_WEEKLY_STO_BLOB_ACC_KEY)
          ACCOUNT_TYPE: 'block'
          ACCOUNT_ENDPOINT: 'https://$(PERF_WEEKLY_STO_BLOB_ACC_NAME).blob.core.windows.net'
        continueOnError: false
      # Compare the two runs and fail the job on a large regression
      - script: |
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/generate_perf_report.py --metrics=images/second --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Perf Regression Test"
        continueOnError: false
      - publish: $(ROOT_DIR)/blobfuse2-perf.json
        artifact: Blobfuse2_performance_report
        displayName: Publish Performance Report
      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Main Branch Run"
      # Cleanup
      - template: 'azure-pipeline-templates/cleanup.yml'
        parameters:
          working_dir: $(WORK_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)
      - script: |
          sudo rm -rf ${ROOT_DIR}
          pwd
          cd /`pwd | cut -d '/' -f 2,3,4,5`
          sudo rm -rf [0-9]
        displayName: 'Clean Agent Directories'
        condition: always()
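
For reference, the blobfuse2-perf.json log that ties these steps together ends up with one block per --job name written by resnet50_classify.py, plus the performance_diff block appended by generate_perf_report.py. A rough sketch of its shape, with illustrative numbers rather than real run output:

    {
      "binary": {"time elapsed": 1250.0, "total images": 50000, "images/second": 40.0},
      "main": {"time elapsed": 1190.5, "total images": 50000, "images/second": 42.0},
      "performance_diff": {"images/second": 5}
    }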

generate_perf_report.py (new file)

@@ -0,0 +1,49 @@
# Compare performance metrics between two jobs recorded in the JSON perf log
# and fail loudly when a metric regresses beyond the allowed threshold.
import json
import argparse
import sys
import math

def compare_numbers(job_one, job_two, metrics_list, log_file):
    with open(log_file, mode='r+') as f:
        data = json.load(f)
        result = {'performance_diff': {}}
        regressions = []
        for metric in metrics_list:
            # percentage change of job_one relative to job_two, rounded down
            metric_value = math.floor(((data[job_one][metric] / data[job_two][metric]) * 100) - 100)
            result['performance_diff'][metric] = metric_value
            if metric_value < 0:
                sys.stdout.write('{} has regressed - there is a perf regression of {}%\n'.format(metric, metric_value))
                if metric_value < -3:
                    regressions.append('{} ({}%)'.format(metric, metric_value))
            else:
                sys.stdout.write('{} has a perf improvement of {}%\n'.format(metric, metric_value))
        # write the diff back into the log before failing so the report stays complete
        data.update(result)
        f.seek(0)
        json.dump(data, f)
        f.truncate()
    if regressions:
        raise ValueError("large perf regression detected in: {}".format(", ".join(regressions)))

if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("compare performance")
    parser.add_argument('-j1', '--job1', default='main', help='name of the first job', required=False)
    parser.add_argument('-j2', '--job2', default='binary', help='name of the second job', required=False)
    parser.add_argument('-m', '--metrics', nargs='+', help='metrics to compare from log file', required=True)
    parser.add_argument('-lf', '--log', default="./blobfuse2-perf.json", help='path of log file', required=False)
    args = vars(parser.parse_args())
    compare_numbers(args['job1'], args['job2'], args['metrics'], args['log'])
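
A quick worked check of the diff formula above, using made-up throughput values:

    >>> import math
    >>> main_ips, binary_ips = 38.0, 40.0
    >>> math.floor((main_ips / binary_ips) * 100 - 100)  # -5% regression: below the -3 threshold, so the step fails
    -5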

resnet50_classify.py (new file)

@@ -0,0 +1,91 @@
import os
import sys
import time
import json
import argparse
import numpy as np
from multiprocessing import Pool
from tensorflow.keras.applications import resnet50
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array

# we're not using any GPUs
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #15
os.environ["CUDA_VISIBLE_DEVICES"] = ""

def classify_images(images):
    # load the model inside the worker, since a model can't be shared across processes
    resnet_model = resnet50.ResNet50(weights='imagenet')
    tic = time.time()
    sys.stdout.write('starting to process {} images in this process at time: {}\n'.format(
        len(images), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
    for filename in images:
        # load image
        original = load_img(filename, target_size=(224, 224))
        # transform the image into a preprocessed batch of one
        numpy_image = img_to_array(original)
        image_batch = np.expand_dims(numpy_image, axis=0)
        processed_image = resnet50.preprocess_input(image_batch)
        # predict; the predictions are discarded, only read throughput matters here
        resnet_model.predict(processed_image)

def chunks(paths, batch_size):
    # yield successive batch_size-sized chunks from paths
    for i in range(0, len(paths), batch_size):
        yield paths[i:i + batch_size]

if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("classify dataset")
    parser.add_argument('-d', '--dataset', help='dataset dir path', required=True)
    parser.add_argument('-n', '--job', help='name of the resnet job', required=True)
    parser.add_argument('-p', '--procs', type=int, default=32, help='number of parallel processes', required=False)
    parser.add_argument('-lf', '--log', default="./blobfuse2-perf.json", help='path of log file', required=False)
    args = vars(parser.parse_args())

    dataset_path = args['dataset']
    log_file_path = args['log']
    job_name = args['job']
    procs = args['procs']

    # create a pool of worker processes (32 by default)
    p = Pool(processes=procs)
    tic = time.time()
    sys.stdout.write('collecting images at time: {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
    # get the list of files and split it into batches of 10k to be classified
    images = [os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) for f in filenames]
    image_subsets = list(chunks(images, 10000))
    # dispatch each batch to a worker process
    p.map(classify_images, image_subsets)
    p.close()
    p.join()
    toc = time.time()
    sys.stdout.write('ended processing dataset at time {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(toc))))
    sys.stdout.write('time elapsed {}\n'.format(toc - tic))

    # record throughput for this job in the shared perf log
    result = {job_name: {}}
    result[job_name]['time elapsed'] = toc - tic
    result[job_name]['total images'] = len(images)
    result[job_name]['images/second'] = len(images) / (toc - tic)
    if os.path.exists(log_file_path):
        with open(log_file_path, mode='r+') as f:
            data = json.load(f)
            data.update(result)
            f.seek(0)
            json.dump(data, f)
            f.truncate()
    else:
        with open(log_file_path, mode='w') as f:
            json.dump(result, f)
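
For a quick run outside the pipeline, the classifier can be invoked directly; the dataset path and process count here are illustrative, and the directory only needs to contain image files:

    python3 test/perf_test/resnet50_classify.py \
        --dataset=/mnt/blob_mnt/data \
        --job=binary \
        --procs=8 \
        --log=./blobfuse2-perf.json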