[PERF] ResNet50 Weekly Perf Runs (#731)
* added new pipeline * python cleanup * test runs (repeated while iterating on the pipeline) * removed print * testing binary * finished * attempting to fix py issue * cleanup * removed direct read * fail if regression is too bad * added dependency step to pipeline * cleanup
Parent
c5d1c7f3ff
Commit
168ea5d03a
@@ -0,0 +1,150 @@
schedules:
  # Cron string: <minute> <hour> <day-of-month> <month> <day-of-week>
  # '*' means all, e.g. '*' in day-of-month means every day
  # Run only on the main branch
  # 'always' controls whether to run only when there are changes or on every schedule
  # Run this pipeline every Saturday at 15:00 (UTC)
  - cron: '0 15 * * 6'
    displayName: 'Weekly Blobfuse2 Perf Run'
    branches:
      include:
        - main

jobs:
  - job: Blobfuse2_Perf_Test
    timeoutInMinutes: 2800 # two day timeout
    strategy:
      matrix:
        Ubuntu-20:
          DistroVer: "Ubn20_PERF"
          AgentName: "UBN20-PERF"
          Description: "Blobfuse2 Perf Test"

    pool:
      name: "Blobfuse Pool"
      demands:
        - Agent.Name -equals $(AgentName)

    variables:
      - group: NightlyBlobFuse
      - name: MOUNT_DIR
        value: "/home/vsts/workv2/blob_mnt"
      - name: TEMP_DIR
        value: "/home/vsts/workv2/blobfuse2tmp"
      - name: BLOBFUSE2_CFG
        value: "/home/tamer/blobfuse2.yaml"
      - name: GOPATH
        value: "/home/vsts/workv2/go"
      - name: ROOT_DIR
        value: "/home/vsts/workv2/"
      - name: WORK_DIR
        value: "/home/vsts/workv2/go/src/azure-storage-fuse"

    steps:
      - checkout: none

      # Prestart cleanup
      - script: |
          sudo fusermount -u $(MOUNT_DIR)
          sudo kill -9 `pidof blobfuse2`
          sudo rm -rf $(ROOT_DIR)
        displayName: 'PreBuild Cleanup'

      # Create directory structure
      - script: |
          sudo rm -rf $(ROOT_DIR)
          sudo mkdir -p $(ROOT_DIR)
          sudo chown -R `whoami` $(ROOT_DIR)
          chmod 777 $(ROOT_DIR)
          mkdir -p $(ROOT_DIR)/go/src
        displayName: 'Create Directory Structure'

      # Checkout the code
      - script: |
          git clone https://github.com/Azure/azure-storage-fuse
        displayName: 'Checkout Code'
        workingDirectory: $(ROOT_DIR)/go/src

      # Pull the branch
      - script: |
          git checkout `echo $(Build.SourceBranch) | cut -d "/" -f 1,2 --complement`
        displayName: 'Checkout Branch'
        workingDirectory: $(WORK_DIR)

      # -------------------------------------------------------
      # Pull and build the code
      - template: 'azure-pipeline-templates/build.yml'
        parameters:
          working_directory: $(WORK_DIR)
          root_dir: $(ROOT_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)
          gopath: $(GOPATH)
          container: cont1
          skip_ut: true

      - script: |
          cd $(ROOT_DIR)
          pip install numpy tensorflow
        displayName: "Install Python Dependencies"
        continueOnError: false

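      # Install the released blobfuse2 binary (2.0.0-preview.1), mount the container with
      # the pre-generated config, and run the ResNet50 workload tagged 'binary' as the baseline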
      - script: |
          cd $(ROOT_DIR)
          wget https://github.com/Azure/azure-storage-fuse/releases/download/blobfuse2-2.0.0-preview.1/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo dpkg -i $(ROOT_DIR)/blobfuse2-2.0.0-preview.1-ubuntu-20.04-x86-64.deb
          sudo apt-get install -f
          sudo apt-get install fuse3
          blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='binary' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 Binary"
        continueOnError: false

      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Binary Run"

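      # Generate a config with the freshly built blobfuse2, remount, and run the same
      # ResNet50 workload tagged 'main' for comparison against the released binary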
      - script: |
          cd $(WORK_DIR)
          $(WORK_DIR)/blobfuse2 gen-test-config --config-file=azure_key.yaml --container-name=cont1 --temp-path=$(TEMP_DIR) --output-file=$(BLOBFUSE2_CFG)
          $(WORK_DIR)/blobfuse2 mount $(MOUNT_DIR) --config-file=$(BLOBFUSE2_CFG) --default-working-dir=$(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/resnet50_classify.py --dataset='$(MOUNT_DIR)/data' --job='main' --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Blobfuse2 ResNet50 on Main"
        env:
          NIGHTLY_STO_ACC_NAME: $(PERF_WEEKLY_STO_BLOB_ACC_NAME)
          NIGHTLY_STO_ACC_KEY: $(PERF_WEEKLY_STO_BLOB_ACC_KEY)
          ACCOUNT_TYPE: 'block'
          ACCOUNT_ENDPOINT: 'https://$(PERF_WEEKLY_STO_BLOB_ACC_NAME).blob.core.windows.net'
        continueOnError: false

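      # Compare the 'main' and 'binary' numbers recorded in blobfuse2-perf.json;
      # generate_perf_report.py fails this step if images/second regresses by more than 3%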
      - script: |
          cd $(WORK_DIR)
          python3 $(WORK_DIR)/test/perf_test/generate_perf_report.py --metrics=images/second --log=$(ROOT_DIR)/blobfuse2-perf.json
        displayName: "Perf Regression Test"
        continueOnError: false

      - publish: $(ROOT_DIR)/blobfuse2-perf.json
        artifact: Blobfuse2_performance_report
        displayName: Publish Performance Report

      - script: |
          sudo fusermount -u ${MOUNT_DIR}
          sudo kill -9 `pidof blobfuse2` || true
        displayName: "Unmount Blobfuse2 Main Branch Run"

      # Cleanup
      - template: 'azure-pipeline-templates/cleanup.yml'
        parameters:
          working_dir: $(WORK_DIR)
          mount_dir: $(MOUNT_DIR)
          temp_dir: $(TEMP_DIR)

      - script: |
          sudo rm -rf ${ROOT_DIR}
          pwd
          cd /`pwd | cut -d '/' -f 2,3,4,5`
          sudo rm -rf [0-9]
        displayName: 'Clean Agent Directories'
        condition: always()

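The pipeline and the two scripts below share one JSON log, blobfuse2-perf.json: resnet50_classify.py appends a block of metrics per run, and generate_perf_report.py appends the computed diff. A minimal sketch of the assumed file shape after both runs and the report step, with purely illustrative numbers:

# Hypothetical contents of blobfuse2-perf.json after the 'binary' and 'main' runs and
# the report step (all values below are made up):
example_log = {
    "binary": {"time elapsed": 4201.7, "total images": 50000, "images/second": 11.9},
    "main": {"time elapsed": 4000.3, "total images": 50000, "images/second": 12.5},
    "performance_diff": {"images/second": 5},
}
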
@@ -0,0 +1,49 @@
# Python program to read the JSON perf log and compare the metrics recorded by two jobs

import json
import argparse
import sys
import math


def compare_numbers(job_one, job_two, metrics_list, log_file):
    f = open(log_file, mode='r+')
    data = json.load(f)
    result = {'performance_diff': {}}
    for i in metrics_list:
        # percentage change of job_one relative to job_two, rounded down
        metric_value = math.floor(((data[job_one][i] / data[job_two][i]) * 100) - 100)
        if metric_value < 0:
            result['performance_diff'][i] = metric_value
            sys.stdout.write('{} has regressed - there is a perf regression of {}%\n'.format(i, metric_value))
            if metric_value < -3:
                raise ValueError("large perf regression in {} detected of {}".format(i, metric_value))
        if metric_value >= 0:
            result['performance_diff'][i] = metric_value
            sys.stdout.write('{} has a perf improvement of {}%\n'.format(i, metric_value))
    data.update(result)
    f.seek(0)
    json.dump(data, f)
    f.close()


if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("compare performance")
    parser.add_argument('-j1', '--job1', default='main', help='name of the first job', required=False)
    parser.add_argument('-j2', '--job2', default='binary', help='name of the second job', required=False)
    parser.add_argument('-m', '--metrics', nargs='+', help='metrics to compare from log file', required=True)
    parser.add_argument('-lf',
                        '--log',
                        default="./blobfuse2-perf.json",
                        help='path of log file',
                        required=False)
    args = vars(parser.parse_args())
    log_file = args['log']
    job_one_name = args['job1']
    job_two_name = args['job2']
    metrics_list = args['metrics']

    compare_numbers(job_one_name, job_two_name, metrics_list, log_file)

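For reference, a rough worked example of the comparison above; the metric name mirrors what resnet50_classify.py records, but the numbers are made up:

import math

# 'main' at 12.5 images/second vs the released binary at 11.9 images/second:
print(math.floor(((12.5 / 11.9) * 100) - 100))   # 5  -> reported as a 5% improvement

# A ~4% slowdown floors to -5, which is below the -3 threshold, so compare_numbers()
# raises ValueError and the "Perf Regression Test" step fails:
print(math.floor(((11.5 / 12.0) * 100) - 100))   # -5
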
@@ -0,0 +1,91 @@
import os
import sys
import time
import json
import argparse
import numpy as np
from multiprocessing import Pool
from tensorflow.keras.applications import resnet50
from tensorflow.keras.preprocessing.image import load_img
from tensorflow.keras.preprocessing.image import img_to_array

# we're not using any GPUs
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #15
os.environ["CUDA_VISIBLE_DEVICES"] = ""


def classify_images(images):
    # we need to load the model within the process since we can't share a model across processes
    resnet_model = resnet50.ResNet50(weights='imagenet')

    tic = time.time()
    sys.stdout.write('starting to process {} images in this process at time: {}\n'.format(len(images), time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))

    for filename in images:
        # load image
        original = load_img(filename, target_size=(224, 224))
        # transform image
        numpy_image = img_to_array(original)
        image_batch = np.expand_dims(numpy_image, axis=0)
        processed_image = resnet50.preprocess_input(image_batch)
        # predict
        predictions = resnet_model.predict(processed_image)


def chunks(paths, batch_size):
    # yield successive batch_size-sized chunks of paths
    for i in range(0, len(paths), batch_size):
        yield paths[i:i + batch_size]


if __name__ == "__main__":
    # parse arguments
    parser = argparse.ArgumentParser("classify dataset")
    parser.add_argument('-d', '--dataset', help='dataset dir path', required=True)
    parser.add_argument('-n', '--job', help='name of the resnet job', required=True)
    parser.add_argument('-p', '--procs', type=int, default=32, help='number of parallel processes', required=False)
    parser.add_argument('-lf',
                        '--log',
                        default="./blobfuse2-perf.json",
                        help='path of log file',
                        required=False)

    args = vars(parser.parse_args())

    # create a pool of worker processes (32 by default)
    dataset_path = args['dataset']
    log_file_path = args['log']
    job_name = args['job']
    procs = args['procs']
    p = Pool(processes=procs)
    tic = time.time()

    sys.stdout.write('collecting images at time: {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(tic))))
    # get the list of files and split them into batches of 10k to be classified
    images = [os.path.join(dp, f) for dp, dn, filenames in os.walk(dataset_path) for f in filenames]
    image_subsets = list(chunks(images, 10000))

    # classify each batch in a worker process
    result = p.map(classify_images, image_subsets)
    p.close()
    p.join()

    toc = time.time()
    sys.stdout.write('ended processing dataset at time {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(toc))))
    sys.stdout.write('time elapsed {}\n'.format((toc - tic)))

    result = {job_name: {}}
    result[job_name]['time elapsed'] = toc - tic
    result[job_name]['total images'] = len(images)
    result[job_name]['images/second'] = len(images) / (toc - tic)

    if os.path.exists(log_file_path):
        f = open(log_file_path, mode='r+')
        data = json.load(f)
        data.update(result)
        f.seek(0)
        json.dump(data, f)
    else:
        f = open(log_file_path, mode='a+')
        json.dump(result, f)
    f.close()
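A minimal, standalone illustration of the batching used above (the helper is repeated here so the snippet runs on its own; the values are toy data, not pipeline output):

# chunks() splits the flat list of image paths into fixed-size batches; each batch is
# then handed to one worker process by Pool.map().
def chunks(paths, batch_size):
    for i in range(0, len(paths), batch_size):
        yield paths[i:i + batch_size]

paths = ['img{}.jpg'.format(i) for i in range(7)]
print(list(chunks(paths, 3)))
# [['img0.jpg', 'img1.jpg', 'img2.jpg'], ['img3.jpg', 'img4.jpg', 'img5.jpg'], ['img6.jpg']]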