action recognition (#535)

* update gitignore
* updates to utils for loading data
* train_test_split function
* update 01, 02 notebooks
* show batch, new notebook
* updates to show batch
* added graph for fit
* notebooks and tests
* refactor
* dataset tests
* dataset tests
* flake8
* black
* test update
* 01_training update
* round deci
* transform notebook
* remove blackmagic cell
* working state
* working state with webcam
* flake8 and black
* update jupyter notebooks
* notebook updates
* fixes to pr comments
* minor fix
* update 02 hmdb notebook
* update 02 hmdb notebook
Parent: eb6644f82d
Commit: 28f474bed0

Some file diffs are hidden because one or more lines are too long.
@@ -0,0 +1,400 @@
abseiling
air drumming
answering questions
applauding
applying cream
archery
arm wrestling
arranging flowers
assembling computer
auctioning
baby waking up
baking cookies
balloon blowing
bandaging
barbequing
bartending
beatboxing
bee keeping
belly dancing
bench pressing
bending back
bending metal
biking through snow
blasting sand
blowing glass
blowing leaves
blowing nose
blowing out candles
bobsledding
bookbinding
bouncing on trampoline
bowling
braiding hair
breading or breadcrumbing
breakdancing
brush painting
brushing hair
brushing teeth
building cabinet
building shed
bungee jumping
busking
canoeing or kayaking
capoeira
carrying baby
cartwheeling
carving pumpkin
catching fish
catching or throwing baseball
catching or throwing frisbee
catching or throwing softball
celebrating
changing oil
changing wheel
checking tires
cheerleading
chopping wood
clapping
clay pottery making
clean and jerk
cleaning floor
cleaning gutters
cleaning pool
cleaning shoes
cleaning toilet
cleaning windows
climbing a rope
climbing ladder
climbing tree
contact juggling
cooking chicken
cooking egg
cooking on campfire
cooking sausages
counting money
country line dancing
cracking neck
crawling baby
crossing river
crying
curling hair
cutting nails
cutting pineapple
cutting watermelon
dancing ballet
dancing charleston
dancing gangnam style
dancing macarena
deadlifting
decorating the christmas tree
digging
dining
disc golfing
diving cliff
dodgeball
doing aerobics
doing laundry
doing nails
drawing
dribbling basketball
drinking
drinking beer
drinking shots
driving car
driving tractor
drop kicking
drumming fingers
dunking basketball
dying hair
eating burger
eating cake
eating carrots
eating chips
eating doughnuts
eating hotdog
eating ice cream
eating spaghetti
eating watermelon
egg hunting
exercising arm
exercising with an exercise ball
extinguishing fire
faceplanting
feeding birds
feeding fish
feeding goats
filling eyebrows
finger snapping
fixing hair
flipping pancake
flying kite
folding clothes
folding napkins
folding paper
front raises
frying vegetables
garbage collecting
gargling
getting a haircut
getting a tattoo
giving or receiving award
golf chipping
golf driving
golf putting
grinding meat
grooming dog
grooming horse
gymnastics tumbling
hammer throw
headbanging
headbutting
high jump
high kick
hitting baseball
hockey stop
holding snake
hopscotch
hoverboarding
hugging
hula hooping
hurdling
hurling (sport)
ice climbing
ice fishing
ice skating
ironing
javelin throw
jetskiing
jogging
juggling balls
juggling fire
juggling soccer ball
jumping into pool
jumpstyle dancing
kicking field goal
kicking soccer ball
kissing
kitesurfing
knitting
krumping
laughing
laying bricks
long jump
lunge
making a cake
making a sandwich
making bed
making jewelry
making pizza
making snowman
making sushi
making tea
marching
massaging back
massaging feet
massaging legs
massaging person's head
milking cow
mopping floor
motorcycling
moving furniture
mowing lawn
news anchoring
opening bottle
opening present
paragliding
parasailing
parkour
passing American football (in game)
passing American football (not in game)
peeling apples
peeling potatoes
petting animal (not cat)
petting cat
picking fruit
planting trees
plastering
playing accordion
playing badminton
playing bagpipes
playing basketball
playing bass guitar
playing cards
playing cello
playing chess
playing clarinet
playing controller
playing cricket
playing cymbals
playing didgeridoo
playing drums
playing flute
playing guitar
playing harmonica
playing harp
playing ice hockey
playing keyboard
playing kickball
playing monopoly
playing organ
playing paintball
playing piano
playing poker
playing recorder
playing saxophone
playing squash or racquetball
playing tennis
playing trombone
playing trumpet
playing ukulele
playing violin
playing volleyball
playing xylophone
pole vault
presenting weather forecast
pull ups
pumping fist
pumping gas
punching bag
punching person (boxing)
push up
pushing car
pushing cart
pushing wheelchair
reading book
reading newspaper
recording music
riding a bike
riding camel
riding elephant
riding mechanical bull
riding mountain bike
riding mule
riding or walking with horse
riding scooter
riding unicycle
ripping paper
robot dancing
rock climbing
rock scissors paper
roller skating
running on treadmill
sailing
salsa dancing
sanding floor
scrambling eggs
scuba diving
setting table
shaking hands
shaking head
sharpening knives
sharpening pencil
shaving head
shaving legs
shearing sheep
shining shoes
shooting basketball
shooting goal (soccer)
shot put
shoveling snow
shredding paper
shuffling cards
side kick
sign language interpreting
singing
situp
skateboarding
ski jumping
skiing (not slalom or crosscountry)
skiing crosscountry
skiing slalom
skipping rope
skydiving
slacklining
slapping
sled dog racing
smoking
smoking hookah
snatch weight lifting
sneezing
sniffing
snorkeling
snowboarding
snowkiting
snowmobiling
somersaulting
spinning poi
spray painting
spraying
springboard diving
squat
sticking tongue out
stomping grapes
stretching arm
stretching leg
strumming guitar
surfing crowd
surfing water
sweeping floor
swimming backstroke
swimming breast stroke
swimming butterfly stroke
swing dancing
swinging legs
swinging on something
sword fighting
tai chi
taking a shower
tango dancing
tap dancing
tapping guitar
tapping pen
tasting beer
tasting food
testifying
texting
throwing axe
throwing ball
throwing discus
tickling
tobogganing
tossing coin
tossing salad
training dog
trapezing
trimming or shaving beard
trimming trees
triple jump
tying bow tie
tying knot (not on a tie)
tying tie
unboxing
unloading truck
using computer
using remote controller (not gaming)
using segway
vault
waiting in line
walking the dog
washing dishes
washing feet
washing hair
washing hands
water skiing
water sliding
watering plants
waxing back
waxing chest
waxing eyebrows
waxing legs
weaving basket
welding
whistling
windsurfing
wrapping present
wrestling
writing
yawning
yoga
zumba
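For reference, a minimal sketch (not part of the commit) of how this 400-class Kinetics label map can be consumed; the file path is hypothetical since the diff does not show where the file lives.

```python
# Minimal sketch: read the Kinetics-400 label map above into an index -> name list.
from pathlib import Path

label_map = Path("data/kinetics400/label_map.txt")  # hypothetical location
labels = [ln.strip() for ln in label_map.read_text().splitlines() if ln.strip()]
assert len(labels) == 400   # "abseiling" ... "zumba"
print(labels[0], labels[-1])
```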
@@ -13,6 +13,8 @@ import pytest
import torch
import urllib.request
import random
import requests

from PIL import Image
from torch import tensor
from pathlib import Path

@@ -53,6 +55,8 @@ from utils_cv.segmentation.model import (
)
from utils_cv.similarity.data import Urls as is_urls
from utils_cv.similarity.model import compute_features_learner
+from utils_cv.action_recognition.data import Urls as ar_urls
+from utils_cv.action_recognition.dataset import VideoDataset


def path_classification_notebooks():

@@ -752,15 +756,65 @@ def od_detections(od_detection_dataset):


@pytest.fixture(scope="session")
-def ar_path(tmp_session) -> str:
+def ar_vid_path(tmp_session) -> str:
    """ Returns the path to the downloaded drinking video. """
-    VID_URL = "https://cvbp.blob.core.windows.net/public/datasets/action_recognition/drinking.mp4"
+    drinking_url = ar_urls.drinking_path
    vid_path = os.path.join(tmp_session, "drinking.mp4")
-    urllib.request.urlretrieve(VID_URL, vid_path)
+    urllib.request.urlretrieve(drinking_url, vid_path)
    return vid_path


# TODO
@pytest.fixture(scope="session")
def ar_milk_bottle_path(tmp_session) -> str:
    """ Returns the path of the milk bottle action dataset. """
    return unzip_url(
        ar_urls.milk_bottle_action_path,
        fpath=tmp_session,
        dest=tmp_session,
        exist_ok=True,
    )


@pytest.fixture(scope="session")
def ar_milk_bottle_dataset(ar_milk_bottle_path) -> VideoDataset:
    """ Returns an instance of a VideoDataset built using the milk bottle dataset. """
    return VideoDataset(ar_milk_bottle_path)


@pytest.fixture(scope="session")
def ar_milk_bottle_split_files(tmp_session) -> tuple:
    """ Returns the downloaded train/test split files for the milk bottle dataset. """
    r = requests.get(ar_urls.milk_bottle_action_test_split)
    test_split_file_path = os.path.join(
        tmp_session, "milk_bottle_action_test_split.txt"
    )
    with open(test_split_file_path, "wb") as f:
        f.write(r.content)

    r = requests.get(ar_urls.milk_bottle_action_train_split)
    train_split_file_path = os.path.join(
        tmp_session, "milk_bottle_action_train_split.txt"
    )
    with open(train_split_file_path, "wb") as f:
        f.write(r.content)

    return (train_split_file_path, test_split_file_path)


@pytest.fixture(scope="session")
def ar_milk_bottle_dataset_with_split_file(
    ar_milk_bottle_path, ar_milk_bottle_split_files,
) -> VideoDataset:
    """ Returns an instance of a VideoDataset built using the milk bottle
    dataset and custom split files. """
    train_split_file_path = ar_milk_bottle_split_files[0]
    test_split_file_path = ar_milk_bottle_split_files[1]
    return VideoDataset(
        ar_milk_bottle_path,
        train_split_file=train_split_file_path,
        test_split_file=test_split_file_path,
    )


# ----- AML Settings ----------------------------------------------------------
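The fixtures above wire the milk bottle dataset and its split files into the test session. A sketch of the equivalent setup outside pytest; the local paths are assumptions (the real tests download everything from `ar_urls` at session start):

```python
# Sketch of what the fixtures above set up, outside of pytest.
from utils_cv.action_recognition.dataset import VideoDataset

data = VideoDataset(
    "data/milkBottleActions",                         # hypothetical unzipped path
    train_split_file="milk_bottle_action_train_split.txt",
    test_split_file="milk_bottle_action_test_split.txt",
)
print(len(data.train_ds), len(data.test_ds))          # 40 / 20 with these splits
```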
@@ -2,6 +2,7 @@
# Licensed under the MIT License.

import os
+import requests
from utils_cv.action_recognition.data import (
    _DatasetSpec,
    Urls,

@@ -21,3 +22,11 @@ def test__DatasetSpec_hmdb():
    hmdb51 = _DatasetSpec(Urls.hmdb51_label_map, 51)
    hmdb51.class_names
    assert os.path.exists(str(data_path() / "label_map.txt"))


def test_urls():
    """ Test that the dataset urls are reachable. """
    for attr, value in Urls.__dict__.items():
        if not str.startswith(attr, "__") and "base" not in attr:
            with requests.get(value):
                pass
@@ -0,0 +1,131 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# This test is based on the test suite implemented for the Recommenders project
# https://github.com/Microsoft/Recommenders/tree/master/tests

import pytest
import torchvision
import torch

from utils_cv.common.misc import Config
from utils_cv.action_recognition.dataset import (
    VideoRecord,
    get_transforms,
    DEFAULT_MEAN,
    DEFAULT_STD,
    get_default_tfms_config,
    VideoDataset,
)


def check_VideoRecord(record: VideoRecord, ar_vid_path: str) -> None:
    """ Checks that property methods work. """
    assert record.path is ar_vid_path
    assert record.label == 0
    assert record.label_name in ("cooking", None)


def test_VideoRecord(ar_vid_path) -> None:
    """ Test the video record initialization. """
    correct_input_one = [ar_vid_path, 0, "cooking"]
    check_VideoRecord(VideoRecord(correct_input_one), ar_vid_path)

    correct_input_two = [ar_vid_path, 0]
    check_VideoRecord(VideoRecord(correct_input_two), ar_vid_path)


def test_VideoRecord_invalid(ar_vid_path) -> None:
    """ Test the video record initialization failure. """
    incorrect_inputs = [
        [ar_vid_path, "0", "cooking", "extra"],
        [ar_vid_path],
        [ar_vid_path, "cooking", 0],
        ["ar_vid_path, 0, cooking"],
        "ar_vid_path, 0, cooking",
    ]
    for inp in incorrect_inputs:
        with pytest.raises(Exception):
            VideoRecord(inp)


def test_get_transforms() -> None:
    """ Test the transforms function. """
    train_tfms = get_transforms(train=True)
    assert isinstance(train_tfms, torchvision.transforms.Compose)

    test_tfms = get_transforms(train=False)
    assert isinstance(test_tfms, torchvision.transforms.Compose)

    conf = Config(
        dict(
            input_size=300,
            im_scale=128,
            resize_keep_ratio=True,
            random_crop=True,
            random_crop_scales=True,
            flip_ratio=0.5,
            mean=DEFAULT_MEAN,
            std=DEFAULT_STD,
        )
    )
    custom_tfms = get_transforms(tfms_config=conf)
    assert isinstance(custom_tfms, torchvision.transforms.Compose)


def test_get_default_tfms_config() -> None:
    """ Test the function that provides basic defaults for train/test. """
    train_default_tfms = get_default_tfms_config(train=True)
    assert train_default_tfms.flip_ratio == 0.5
    assert train_default_tfms.random_crop is True
    assert train_default_tfms.random_crop_scales == (0.6, 1.0)
    assert isinstance(train_default_tfms, Config)

    test_default_tfms = get_default_tfms_config(train=False)
    assert test_default_tfms.flip_ratio == 0.0
    assert test_default_tfms.random_crop is False
    assert test_default_tfms.random_crop_scales is None
    assert isinstance(test_default_tfms, Config)


def test_VideoDataset(ar_milk_bottle_path) -> None:
    """ Test the initialization of the video dataset. """
    dataset = VideoDataset(ar_milk_bottle_path)
    assert isinstance(dataset.train_dl, torch.utils.data.DataLoader)
    assert isinstance(dataset.test_dl, torch.utils.data.DataLoader)
    assert len(dataset) == 60
    assert len(dataset.train_ds) == 45
    assert len(dataset.test_ds) == 15

    # test if train_pct is altered
    dataset = VideoDataset(ar_milk_bottle_path, train_pct=0.5)
    assert len(dataset) == 60
    assert len(dataset.train_ds) == 30
    assert len(dataset.test_ds) == 30


def test_VideoDataset_split_file(
    ar_milk_bottle_path, ar_milk_bottle_split_files,
) -> None:
    """ Tests VideoDataset initialization using split files. """
    dataset = VideoDataset(
        ar_milk_bottle_path,
        train_split_file=ar_milk_bottle_split_files[0],
        test_split_file=ar_milk_bottle_split_files[1],
    )

    assert len(dataset) == 60
    assert len(dataset.train_ds) == 40
    assert len(dataset.test_ds) == 20


def test_VideoDataset_show_batch(ar_milk_bottle_dataset) -> None:
    """ Tests the show batch functionality. """
    # test base case
    ar_milk_bottle_dataset.show_batch()

    # test with set rows
    ar_milk_bottle_dataset.show_batch(rows=3)

    # test with train_or_test == "test"
    ar_milk_bottle_dataset.show_batch(train_or_test="test")
@@ -0,0 +1,26 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

# This test is based on the test suite implemented for the Recommenders project
# https://github.com/Microsoft/Recommenders/tree/master/tests


from utils_cv.action_recognition.model import VideoLearner


def test_VideoLearner(ar_milk_bottle_dataset) -> None:
    """ Test VideoLearner initialization, fit, and evaluate. """
    learner = VideoLearner(ar_milk_bottle_dataset, num_classes=2)
    learner.fit(lr=0.001, epochs=1)
    learner.evaluate()


def test_VideoLearner_using_split_file(
    ar_milk_bottle_dataset_with_split_file,
) -> None:
    """ Test VideoLearner initialization with a split-file dataset. """
    learner = VideoLearner(
        ar_milk_bottle_dataset_with_split_file, num_classes=2
    )
    learner.fit(lr=0.001, epochs=1)
    learner.evaluate()
@@ -4,11 +4,12 @@
# This test is based on the test suite implemented for the Recommenders project
# https://github.com/Microsoft/Recommenders/tree/master/tests

import os
import papermill as pm
import pytest
import scrapbook as sb

+from utils_cv.action_recognition.data import Urls

# Unless manually modified, python3 should be
# the name of the current jupyter kernel
# that runs on the activated conda environment

@@ -22,7 +23,10 @@ def test_00_notebook_run(action_recognition_notebooks):
    pm.execute_notebook(
        notebook_path,
        OUTPUT_NOTEBOOK,
-        parameters=dict(PM_VERSION=pm.__version__),
+        parameters=dict(
+            PM_VERSION=pm.__version__,
+            sample_video_url=Urls.webcam_vid_low_res
+        ),
        kernel_name=KERNEL_NAME,
    )

@@ -34,24 +38,29 @@ def test_00_notebook_run(action_recognition_notebooks):

@pytest.mark.notebooks
def test_01_notebook_run(action_recognition_notebooks):
-    # TODO - this notebook relies on downloading hmdb51, so pass for now
-    pass
+    notebook_path = action_recognition_notebooks["01"]
+    pm.execute_notebook(
+        notebook_path,
+        OUTPUT_NOTEBOOK,
+        parameters=dict(
+            PM_VERSION=pm.__version__,
+            MODEL_INPUT_SIZE=8,
+            EPOCHS=2,
+            BATCH_SIZE=8,
+            LR=0.001,
+        ),
+        kernel_name=KERNEL_NAME,
+    )

-    # notebook_path = classification_notebooks["01"]
-    # pm.execute_notebook(
-    #     notebook_path,
-    #     OUTPUT_NOTEBOOK,
-    #     parameters=dict(PM_VERSION=pm.__version__),
-    #     kernel_name=KERNEL_NAME,
-    # )
-
-    # nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
-    # TODO add some asserts like below
-    # assert len(nb_output.scraps["training_accuracies"].data) == 1
+    nb_output = sb.read_notebook(OUTPUT_NOTEBOOK)
+    assert isinstance(nb_output.scraps["vid_pred_accuracy"].data, float)
+    assert isinstance(nb_output.scraps["clip_pred_accuracy"].data, float)


@pytest.mark.notebooks
def test_02_notebook_run(action_recognition_notebooks):
    # note: we pass on this notebook as it requires having hmdb51
    # downloaded
    pass
@@ -5,6 +5,7 @@ import os
from pathlib import Path
from typing import Union, List
from urllib.request import urlretrieve
+from urllib.parse import urljoin

from ..common.data import data_path

@@ -40,9 +41,35 @@ class _DatasetSpec:


class Urls:
    # base url
    base = "https://cvbp.blob.core.windows.net/public/datasets/action_recognition/"

    # label maps
    kinetics_label_map = "https://github.com/microsoft/ComputerVision/files/3746975/kinetics400_lable_map.txt"
    hmdb51_label_map = "https://github.com/microsoft/ComputerVision/files/3746963/hmdb51_label_map.txt"

    # hmdb51 split files
    hmdb_train_split_1 = urljoin(base, "hmdb51_vid_train_split_1.txt")
    hmdb_test_split_1 = urljoin(base, "hmdb51_vid_test_split_1.txt")

    # testing datasets
    milk_bottle_action_path = urljoin(base, "milkBottleActions.zip")

    # milk bottle action split files
    milk_bottle_action_train_split = urljoin(
        base, "milk_bottle_actions_train_split.txt"
    )
    milk_bottle_action_test_split = urljoin(
        base, "milk_bottle_actions_test_split.txt"
    )

    # test vid
    drinking_path = urljoin(base, "drinking.mp4")

    # webcam sample vids
    webcam_vid = urljoin(base, "action_sample.mp4")
    webcam_vid_low_res = urljoin(base, "action_sample_lowRes.mp4")


KINETICS = _DatasetSpec(
    Urls.kinetics_label_map, 400, os.path.join("data", "kinetics400"),
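Worth noting how these entries compose. A quick stdlib sketch (not part of the commit): `urljoin` only appends the file name because `base` ends in `/` and the second argument is a relative path.

```python
# Sketch: how the Urls entries above resolve.
from urllib.parse import urljoin

base = "https://cvbp.blob.core.windows.net/public/datasets/action_recognition/"
print(urljoin(base, "drinking.mp4"))
# https://cvbp.blob.core.windows.net/public/datasets/action_recognition/drinking.mp4
print(urljoin(base, "/drinking.mp4"))  # a leading "/" would drop the blob path
# https://cvbp.blob.core.windows.net/drinking.mp4
```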
@@ -3,6 +3,7 @@

import os
import copy
import math
from pathlib import Path
import warnings
from typing import Callable, Tuple, Union, List

@@ -20,7 +21,7 @@ from .references import transforms_video as transforms
from .references.functional_video import denormalize

from ..common.misc import Config
-from ..common.gpu import num_devices
+from ..common.gpu import num_devices, db_num_workers

Trans = Callable[[object, dict], Tuple[object, dict]]

@@ -35,18 +36,27 @@ class VideoRecord(object):

    Ex:
    ```
-    path/to/my/clip.mp4 3
-    path/to/another/clip.mp4 32
+    path/to/my/clip_1 3
+    path/to/another/clip_2 32
    ```
    """

    def __init__(self, data: List[str]):
        """ Initializes a VideoRecord.

        Ex.
        data = ["path/to/video.mp4", 2, "cooking"]

        Args:
            data: a list where the first element is the path and the second element is
-                the label
+                the label, and the third element (optional) is the label name
        """
        assert len(data) >= 2 and len(data) <= 3
        assert isinstance(data[0], str)
        assert isinstance(int(data[1]), int)
        if len(data) == 3:
            assert isinstance(data[2], str)

        self._data = data
        self._num_frames = None
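A minimal sketch of constructing a `VideoRecord` per the updated docstring; the path is a placeholder and the third element (label name) is optional.

```python
# Sketch: VideoRecord construction and the new label_name property.
from utils_cv.action_recognition.dataset import VideoRecord

rec = VideoRecord(["path/to/video", 2, "cooking"])
print(rec.path, rec.label, rec.label_name)            # path/to/video 2 cooking
print(VideoRecord(["path/to/video", 2]).label_name)   # None
```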
@@ -66,8 +76,12 @@ class VideoRecord(object):
    def label(self) -> int:
        return int(self._data[1])

+    @property
+    def label_name(self) -> str:
+        return None if len(self._data) <= 2 else self._data[2]


-def get_transforms(train: bool, tfms_config: Config = None) -> Trans:
+def get_transforms(train: bool = True, tfms_config: Config = None) -> Trans:
    """ Get default transformations to apply depending on whether we're
    applying them to the training or the validation set. If no tfms
    configuration is passed in, use the defaults.

    Args:

@@ -78,11 +92,7 @@ def get_transforms(train: bool, tfms_config: Config = None) -> Trans:
        A list of transforms to apply
    """
    if tfms_config is None:
-        tfms_config = (
-            get_default_tfms_config(train=True)
-            if train
-            else get_default_tfms_config(train=False)
-        )
+        tfms_config = get_default_tfms_config(train=train)

    # 1. resize
    tfms = [

@@ -91,6 +101,7 @@ def get_transforms(train: bool = True, tfms_config: Config = None) -> Trans:
            tfms_config.im_scale, tfms_config.resize_keep_ratio
        ),
    ]

    # 2. crop
    if tfms_config.random_crop:
        if tfms_config.random_crop_scales:

@@ -102,8 +113,10 @@ def get_transforms(train: bool = True, tfms_config: Config = None) -> Trans:
    else:
        crop = transforms.CenterCropVideo(tfms_config.input_size)
    tfms.append(crop)

    # 3. flip
    tfms.append(transforms.RandomHorizontalFlipVideo(tfms_config.flip_ratio))

    # 4. normalize
    tfms.append(transforms.NormalizeVideo(tfms_config.mean, tfms_config.std))
@@ -150,6 +163,7 @@ class VideoDataset:
    def __init__(
        self,
        root: str,
+        seed: int = None,
        train_pct: float = 0.75,
        num_samples: int = 1,
        sample_length: int = 8,

@@ -169,6 +183,7 @@ class VideoDataset:

        Args:
            root: Videos directory.
+            seed: random seed
            train_pct: percentage of dataset to use for training
            num_samples: Number of clips to sample from each video.
            sample_length: Number of consecutive frames to sample from a video (i.e. clip length).

@@ -205,6 +220,7 @@ class VideoDataset:
        )

        self.root = root
+        self.seed = seed
        self.num_samples = num_samples
        self.sample_length = sample_length
        self.sample_step = sample_step

@@ -225,16 +241,30 @@ class VideoDataset:
                test_split_file=test_split_file,
            )
            if train_split_file
-            else self.split_train_test(train_pct=train_pct)
+            else self.split_by_folder(train_pct=train_pct)
        )

        # initialize dataloaders
        self.init_data_loaders()

-    def split_train_test(
+    def split_by_folder(
        self, train_pct: float = 0.8
    ) -> Tuple[Dataset, Dataset]:
-        """ Split this dataset into a training and testing set
+        """ Split this dataset into a training and testing set based on the
+        folders that the videos are in.
+
+        ```
+        /data
+        +-- action_class_1
+        |   +-- video_01.mp4
+        |   +-- video_02.mp4
+        |   +-- ...
+        +-- action_class_2
+        |   +-- video_11.mp4
+        |   +-- video_12.mp4
+        |   +-- ...
+        +-- ...
+        ```

        Args:
            train_pct: the ratio of images to use for training vs

@@ -243,7 +273,45 @@ class VideoDataset:
        Return
            A training and testing dataset in that order
        """
-        pass
+        self.video_records = []
+
+        # get all dirs in root (and make sure they are dirs)
+        dirs = []
+        for entry in os.listdir(self.root):
+            if os.path.isdir(os.path.join(self.root, entry)):
+                dirs.append(os.path.join(self.root, entry))
+
+        # add each video in each dir as a video record
+        label = 0
+        self.classes = []
+        for action in dirs:
+            action = os.path.basename(os.path.normpath(action))
+            self.video_records.extend(
+                [
+                    VideoRecord(
+                        [
+                            os.path.join(self.root, action, vid.split(".")[0]),
+                            label,
+                            action,
+                        ]
+                    )
+                    for vid in os.listdir(os.path.join(self.root, action))
+                ]
+            )
+            label += 1
+            self.classes.append(action)
+
+        # random split
+        test_num = math.floor(len(self) * (1 - train_pct))
+        if self.seed:
+            torch.manual_seed(self.seed)
+
+        # set indices
+        indices = torch.randperm(len(self)).tolist()
+        train_range = indices[test_num:]
+        test_range = indices[:test_num]
+
+        return self.split_train_test(train_range, test_range)
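A worked example of the split arithmetic used just above: with the 60-video milk bottle dataset and the default `train_pct=0.75`, the test set gets `floor(60 * 0.25) = 15` videos and the train set the remaining 45, which matches the asserts in the new dataset tests.

```python
# Worked example of split_by_folder's sizing arithmetic.
import math

num_videos, train_pct = 60, 0.75
test_num = math.floor(num_videos * (1 - train_pct))
print(num_videos - test_num, test_num)  # 45 15
```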
    def split_with_file(
        self,

@@ -254,9 +322,9 @@ class VideoDataset:

        Each line in the split file must use the form:
        ```
-        path/to/jumping/video.mp4 3
-        path/to/swimming/video.mp4 5
-        path/to/another/jumping/video.mp4 3
+        path/to/jumping/video_name_1 3
+        path/to/swimming/video_name_2 5
+        path/to/another/jumping/video_name_3 3
        ```

        Args:

@@ -289,6 +357,20 @@ class VideoDataset:
        train_range = indices[:train_len]
        test_range = indices[train_len:]

        return self.split_train_test(train_range, test_range)

+    def split_train_test(
+        self, train_range: torch.Tensor, test_range: torch.Tensor,
+    ) -> Tuple[Dataset, Dataset]:
+        """ Split this dataset into a training and testing set
+
+        Args:
+            train_range: range of indices for training set
+            test_range: range of indices for testing set
+
+        Return
+            A training and testing dataset in that order
+        """
        # create train subset
        train = copy.deepcopy(Subset(self, train_range))
        train.dataset.transforms = self.train_transforms

@@ -315,7 +397,7 @@ class VideoDataset:
            self.train_ds,
            batch_size=self.batch_size * devices,
            shuffle=True,
-            num_workers=0,  # Torch 1.2 has a bug when num-workers > 0 (0 means run a main-processor worker)
+            num_workers=db_num_workers(),
            pin_memory=True,
        )

@@ -323,7 +405,7 @@ class VideoDataset:
            self.test_ds,
            batch_size=self.batch_size * devices,
            shuffle=False,
-            num_workers=0,
+            num_workers=db_num_workers(),
            pin_memory=True,
        )
@@ -421,7 +503,7 @@ class VideoDataset:
    def __getitem__(self, idx: int) -> Tuple[torch.tensor, int]:
        """
        Return:
-            clips (torch.tensor), label (int)
+            (clips (torch.tensor), label (int))
        """
        record = self.video_records[idx]
        video_reader = decord.VideoReader(

@@ -436,11 +518,15 @@ class VideoDataset:
        clips = np.array([self._get_frames(video_reader, o) for o in offsets])

        if self.num_samples == 1:
-            # [T, H, W, C] -> [C, T, H, W]
-            return self.transforms(torch.from_numpy(clips[0])), record.label
-        else:
-            # [S, T, H, W, C] -> [S, C, T, H, W]
-            return (
+            return (
+                # [T, H, W, C] -> [C, T, H, W]
+                self.transforms(torch.from_numpy(clips[0])),
+                record.label,
+            )
+
+        else:
+            return (
+                # [S, T, H, W, C] -> [S, C, T, H, W]
                torch.stack(
                    [self.transforms(torch.from_numpy(c)) for c in clips]
                ),

@@ -449,7 +535,8 @@ class VideoDataset:

    def _show_batch(
        self,
-        batch: List[torch.tensor],
+        images: List[torch.tensor],
+        labels: List[int],
        sample_length: int,
        mean: Tuple[int, int, int] = DEFAULT_MEAN,
        std: Tuple[int, int, int] = DEFAULT_STD,

@@ -458,12 +545,13 @@ class VideoDataset:
        Display a batch of images.

        Args:
-            batch: List of sample (clip) tensors
+            images: List of sample (clip) tensors
+            labels: List of labels
            sample_length: Number of frames to show for each sample
            mean: Normalization mean
            std: Normalization std-dev
        """
-        batch_size = len(batch)
+        batch_size = len(images)
        plt.tight_layout()
        fig, axs = plt.subplots(
            batch_size,

@@ -473,9 +561,9 @@ class VideoDataset:

        for i, ax in enumerate(axs):
            if batch_size == 1:
-                clip = batch[0]
+                clip = images[0]
            else:
-                clip = batch[i]
+                clip = images[i]
            clip = Rearrange("c t h w -> t c h w")(clip)
            if not isinstance(ax, np.ndarray):
                ax = [ax]

@@ -484,15 +572,27 @@ class VideoDataset:
                a.imshow(
                    np.moveaxis(denormalize(clip[j], mean, std).numpy(), 0, -1)
                )
-                pass
+
+                # display label/label_name on the first image
+                if j == 0:
+                    a.text(
+                        x=3,
+                        y=15,
+                        s=f"{labels[i]}",
+                        fontsize=20,
+                        bbox=dict(facecolor="white", alpha=0.80),
+                    )

-    def show_batch(self, train_or_test: str = "train", rows: int = 1) -> None:
+    def show_batch(self, train_or_test: str = "train", rows: int = 2) -> None:
        """Plot the first few samples in the dataset."""
        if train_or_test == "train":
-            batch = [self.train_ds.dataset[i][0] for i in range(rows)]
-        elif train_or_test == "valid":
-            batch = [self.test_ds.dataset[i][0] for i in range(rows)]
+            batch = [self.train_ds[i] for i in range(rows)]
+        elif train_or_test == "test":
+            batch = [self.test_ds[i] for i in range(rows)]
        else:
            raise ValueError("Unknown data type {}".format(train_or_test))

-        self._show_batch(batch, self.sample_length)
+        images = [im[0] for im in batch]
+        labels = [im[1] for im in batch]
+
+        self._show_batch(images, labels, self.sample_length)
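A usage sketch for the reworked `show_batch`: `rows` now defaults to 2, samples come straight from the split subsets, and `train_or_test` accepts `"test"` (previously `"valid"`). The dataset path is a placeholder.

```python
# Sketch: browsing a VideoDataset with the updated show_batch.
from utils_cv.action_recognition.dataset import VideoDataset

data = VideoDataset("data/milkBottleActions")  # hypothetical local path
data.show_batch(rows=3)                        # three samples from the train split
data.show_batch(train_or_test="test")          # samples from the test split
```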
@@ -3,10 +3,13 @@

from collections import OrderedDict
import os
import time
import warnings
-from typing import Union
+import numpy as np
+from typing import Any, Callable, Dict, List, Tuple, Union
from pathlib import Path
+import matplotlib.pyplot as plt
import torch.cuda as cuda
+from sklearn.metrics import accuracy_score

try:
    from apex import amp

@@ -20,7 +23,17 @@ import torch.nn as nn
import torch.optim as optim
import torchvision

-from ..common.misc import Config
+# this
+from collections import deque
+import io
+import decord
+import IPython.display
+from time import sleep, time
+from PIL import Image
+from threading import Thread
+from torchvision.transforms import Compose
+from utils_cv.action_recognition.dataset import get_transforms

from ..common.gpu import torch_device, num_devices
from .dataset import VideoDataset

@@ -43,9 +56,10 @@ class VideoLearner(object):

    def __init__(
        self,
-        dataset: VideoDataset,
-        num_classes: int,  # ie 51 for hmdb51
+        dataset: VideoDataset = None,
+        num_classes: int = None,  # ie 51 for hmdb51
        base_model: str = "ig65m",  # or "kinetics"
+        sample_length: int = None,
    ) -> None:
        """ By default, the Video Learner will use a R2plus1D model. Pass in
        a dataset of type VideoDataset and the Video Learner will initialize

@@ -58,9 +72,21 @@ class VideoLearner(object):
            kinetics. By default it will use the weights from ig65m since it
            tends to attain higher results.
        """
-        self.dataset = dataset
+        # set empty - populated when fit is called
+        self.results = []
+
+        # set num classes
+        self.num_classes = num_classes
+
+        if dataset:
+            self.dataset = dataset
+            self.sample_length = self.dataset.sample_length
+        else:
+            assert sample_length == 8 or sample_length == 32
+            self.sample_length = sample_length

        self.model, self.model_name = self.init_model(
-            self.dataset.sample_length, base_model, num_classes,
+            self.sample_length, base_model, num_classes,
        )

    @staticmethod
@@ -85,12 +111,7 @@ class VideoLearner(object):
        )

        # Decide whether to use pre-trained weights for a DNN trained on 8 or on 32 frames
-        if sample_length <= 8:
-            model_sample_length = 8
-        else:
-            model_sample_length = 32
-
-        model_name = f"r2plus1d_34_{model_sample_length}_{base_model}"
+        model_name = f"r2plus1d_34_{sample_length}_{base_model}"

        print(f"Loading {model_name} model")

@@ -114,17 +135,42 @@ class VideoLearner(object):
            param.requires_grad = True

    def unfreeze(self) -> None:
        """Unfreeze all layers in model"""
        self._set_requires_grad(True)

    def _set_requires_grad(self, requires_grad=True) -> None:
        """ Sets requires_grad on all model parameters. """
        for param in self.model.parameters():
            param.requires_grad = requires_grad

-    def fit(self, train_cfgs) -> None:
+    def fit(
+        self,
+        lr: float,
+        epochs: int,
+        model_dir: str = "checkpoints",
+        model_name: str = None,
+        momentum: float = 0.95,
+        weight_decay: float = 0.0001,
+        mixed_prec: bool = False,
+        use_one_cycle_policy: bool = False,
+        warmup_pct: float = 0.3,
+        lr_gamma: float = 0.1,
+        lr_step_size: float = None,
+        grad_steps: int = 2,
+        save_model: bool = False,
+    ) -> None:
        """ The primary fit function """
-        train_cfgs = Config(train_cfgs)
-
-        model_dir = train_cfgs.get("model_dir", "checkpoints")
+        # set epochs
+        self.epochs = epochs
+
+        # set lr_step_size based on epochs
+        if lr_step_size is None:
+            lr_step_size = np.ceil(2 / 3 * self.epochs)
+
+        # set model name
+        if model_name is None:
+            model_name = self.model_name
+
        os.makedirs(model_dir, exist_ok=True)

        data_loaders = {}
@@ -152,17 +198,16 @@ class VideoLearner(object):
            print(f"\t{name}")

        # create optimizer
-        momentum = train_cfgs.get("momentum", 0.95)
        optimizer = optim.SGD(
            list(named_params_to_update.values()),
-            lr=train_cfgs.lr,
+            lr=lr,
            momentum=momentum,
-            weight_decay=train_cfgs.get("weight_decay", 0.0001),
+            weight_decay=weight_decay,
        )

        # Use mixed-precision if available
        # Currently, only O1 works with DataParallel: See issues https://github.com/NVIDIA/apex/issues/227
-        if train_cfgs.get("mixed_prec", False):
+        if mixed_prec:
            # break if not AMP_AVAILABLE
            assert AMP_AVAILABLE
            # 'O0': Full FP32, 'O1': Conservative, 'O2': Standard, 'O3': Full FP16

@@ -175,22 +220,20 @@ class VideoLearner(object):
        )

        # Learning rate scheduler
-        if train_cfgs.get("use_one_cycle_policy", False):
+        if use_one_cycle_policy:
            # Use warmup with the one-cycle policy
            scheduler = torch.optim.lr_scheduler.OneCycleLR(
                optimizer,
-                max_lr=train_cfgs.lr,
-                total_steps=train_cfgs.epochs,
-                pct_start=train_cfgs.get("warmup_pct", 0.3),
+                max_lr=lr,
+                total_steps=self.epochs,
+                pct_start=warmup_pct,
                base_momentum=0.9 * momentum,
                max_momentum=momentum,
            )
        else:
            # Simple step-decay
            scheduler = torch.optim.lr_scheduler.StepLR(
-                optimizer,
-                step_size=train_cfgs.get("lr_step_size", float("inf")),
-                gamma=train_cfgs.get("lr_gamma", 0.1),
+                optimizer, step_size=lr_step_size, gamma=lr_gamma,
            )

        # DataParallel after amp.initialize

@@ -200,34 +243,42 @@ class VideoLearner(object):

        criterion = nn.CrossEntropyLoss().to(device)

+        # set topk
+        topk = 5
+        if topk >= self.num_classes:
+            topk = self.num_classes

-        for e in range(1, train_cfgs.epochs + 1):
-            print(f"Epoch {e} ==========")
+        for e in range(1, self.epochs + 1):
+            print(
+                f"Epoch {e} ========================================================="
+            )
            print(f"lr={scheduler.get_lr()}")

-            self.train_an_epoch(
-                model,
-                data_loaders,
-                device,
-                criterion,
-                optimizer,
-                grad_steps=train_cfgs.grad_steps,
-                mixed_prec=train_cfgs.mixed_prec,
+            self.results.append(
+                self.train_an_epoch(
+                    model,
+                    data_loaders,
+                    device,
+                    criterion,
+                    optimizer,
+                    grad_steps=grad_steps,
+                    mixed_prec=mixed_prec,
+                    topk=topk,
+                )
            )

            scheduler.step()

-            if train_cfgs.get("save_models", False):
+            if save_model:
                self.save(
                    os.path.join(
                        model_dir,
-                        "{model_name}_{epoch}.pt".format(
-                            model_name=train_cfgs.get(
-                                "model_name", self.model_name
-                            ),
-                            epoch=str(e).zfill(3),
+                        "{model_name}_{epoch}.pt".format(
+                            model_name=model_name, epoch=str(e).zfill(3),
                        ),
                    )
                )
+
+        self.plot_precision_loss_curves()

    @staticmethod
    def train_an_epoch(
@@ -236,9 +287,10 @@ class VideoLearner(object):
        device,
        criterion,
        optimizer,
-        grad_steps=1,
-        mixed_prec=False,
-    ) -> None:
+        grad_steps: int = 1,
+        mixed_prec: bool = False,
+        topk: int = 5,
+    ) -> Dict[str, Any]:
        """Train / validate a model for one epoch.

        Args:

@@ -249,6 +301,7 @@ class VideoLearner(object):
            optimizer: TODO
            grad_steps: If > 1, use gradient accumulation. Useful for larger batching
            mixed_prec: If True, use FP16 + FP32 mixed precision via NVIDIA apex.amp
+            topk: top k classes

        Return:
            dict {

@@ -276,15 +329,19 @@ class VideoLearner(object):
        else:
            model.eval()

+        # set loader
        dl = data_loaders[phase]

+        # collect metrics
        batch_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

-        end = time.time()
+        end = time()
        for step, (inputs, target) in enumerate(dl, start=1):
+            if step % 10 == 0:
+                print(f"  Phase {phase}: batch {step} of {len(dl)}")
            inputs = inputs.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)

@@ -294,7 +351,7 @@ class VideoLearner(object):
            loss = criterion(outputs, target)

            # measure accuracy and record loss
-            prec1, prec5 = accuracy(outputs, target, topk=(1, 5))
+            prec1, prec5 = accuracy(outputs, target, topk=(1, topk))

            losses.update(loss.item(), inputs.size(0))
            top1.update(prec1[0], inputs.size(0))

@@ -317,12 +374,16 @@ class VideoLearner(object):
                    optimizer.zero_grad()

            # measure elapsed time
-            batch_time.update(time.time() - end)
-            end = time.time()
+            batch_time.update(time() - end)
+            end = time()

-        print(
-            f"{phase} took {batch_time.sum:.2f} sec: loss = {losses.avg:.4f}, top1_acc = {top1.avg:.4f}, top5_acc = {top5.avg:.4f}"
-        )
+        print(f"{phase} took {batch_time.sum:.2f} sec ", end="| ")
+        print(f"loss = {losses.avg:.4f} ", end="| ")
+        print(f"top1_acc = {top1.avg:.4f} ", end=" ")
+        if topk >= 5:
+            print(f"| top5_acc = {top5.avg:.4f}", end="")
+        print()

        result[f"{phase}/time"] = batch_time.sum
        result[f"{phase}/loss"] = losses.avg
        result[f"{phase}/top1"] = top1.avg
@@ -330,6 +391,318 @@ class VideoLearner(object):

        return result

    def plot_precision_loss_curves(
        self, figsize: Tuple[int, int] = (10, 5)
    ) -> None:
        """ Plot the validation loss and accuracy recorded while calling `fit`. """
        assert len(self.results) > 0

        fig = plt.figure(figsize=figsize)
        valid_losses = [dic["valid/loss"] for dic in self.results]
        valid_top1 = [float(dic["valid/top1"]) for dic in self.results]

        ax1 = fig.add_subplot(1, 1, 1)
        ax1.set_xlim([0, self.epochs - 1])
        ax1.set_xticks(range(0, self.epochs))
        ax1.set_xlabel("epochs")
        ax1.set_ylabel("loss", color="g")
        ax1.plot(valid_losses, "g-")
        ax2 = ax1.twinx()
        ax2.set_ylabel("top1 %acc", color="b")
        ax2.plot(valid_top1, "b-")
        fig.suptitle("Loss and Average Precision (AP) over Epochs")

    def evaluate(
        self,
        num_samples: int = 10,
        report_every: int = 100,
        train_or_test: str = "test",
    ) -> Dict[str, List]:
        """ Evaluate the model on the validation/test set and return the results.

        Args:
            num_samples: number of samples (clips) of the validation set to test
            report_every: print a line of results every n samples
            train_or_test: use train or test set
        """
        # assert train or test is valid
        assert train_or_test in ["train", "test"]

        # set device and num_gpus
        num_gpus = num_devices()
        device = torch_device()
        torch.backends.cudnn.benchmark = True if cuda.is_available() else False

        # init model with gpu (or not)
        self.model.to(device)
        if num_gpus > 1:
            self.model = nn.DataParallel(self.model)
        self.model.eval()

        # set train or test
        ds = (
            self.dataset.test_ds
            if train_or_test == "test"
            else self.dataset.train_ds
        )

        # set num_samples
        ds.dataset.num_samples = num_samples
        print(
            f"{len(self.dataset.test_ds)} samples of {self.dataset.test_ds[0][0][0].shape}"
        )

        # Loop over all examples in the test set and compute accuracies
        ret = dict(
            infer_times=[],
            video_preds=[],
            video_trues=[],
            clip_preds=[],
            clip_trues=[],
        )

        # inference
        with torch.no_grad():
            for i in range(
                1, len(ds)
            ):  # [::10]: # Skip some examples to speed up accuracy computation
                if i % report_every == 0:
                    print(
                        f"Processing {i} of {len(self.dataset.test_ds)} samples.."
                    )

                # Get model inputs
                inputs, label = ds[i]
                inputs = inputs.to(device, non_blocking=True)

                # Run inference
                start_time = time()
                outputs = self.model(inputs)
                outputs = outputs.cpu().numpy()
                infer_time = time() - start_time
                ret["infer_times"].append(infer_time)

                # Store results
                ret["video_preds"].append(outputs.sum(axis=0).argmax())
                ret["video_trues"].append(label)
                ret["clip_preds"].extend(outputs.argmax(axis=1))
                ret["clip_trues"].extend([label] * num_samples)

        print(
            f"Avg. inference time per video ({len(ds)} clips) =",
            round(np.array(ret["infer_times"]).mean() * 1000, 2),
            "ms",
        )
        print(
            "Video prediction accuracy =",
            round(accuracy_score(ret["video_trues"], ret["video_preds"]), 2),
        )
        print(
            "Clip prediction accuracy =",
            round(accuracy_score(ret["clip_trues"], ret["clip_preds"]), 2),
        )
        return ret
    def _predict(self, frames, transform):
        """Runs prediction on frames, applying transforms before predicting."""
        clip = torch.from_numpy(np.array(frames))
        # Transform frames and append batch dim
        sample = torch.unsqueeze(transform(clip), 0)
        sample = sample.to(torch_device())
        output = self.model(sample)
        scores = nn.functional.softmax(output, dim=1).data.cpu().numpy()[0]
        return scores

    def _filter_labels(
        self,
        id_score_dict: dict,
        labels: List[str],
        threshold: float = 0.0,
        target_labels: List[str] = None,
        filter_labels: List[str] = None,
    ) -> Dict[str, int]:
        """ Given the predictions, filter out the noise based on threshold,
        target labels and filter labels.

        Args:
            id_score_dict: dictionary of predictions
            labels: all labels
            threshold: the min threshold to keep a prediction
            target_labels: exclude any labels not in target labels
            filter_labels: exclude any labels in filter labels

        Returns
            A dictionary of labels and scores
        """
        # Show only actions of interest (target_labels) with a confidence score >= threshold
        result = {}
        for i, s in id_score_dict.items():
            label = labels[i]
            if (
                (s < threshold)
                or (target_labels is not None and label not in target_labels)
                or (filter_labels is not None and label in filter_labels)
            ):
                continue

            if label in result:
                result[label] += s
            else:
                result[label] = s

        return result

    def predict_frames(
        self,
        window: deque,
        scores_cache: deque,
        scores_sum: np.ndarray,
        is_ready: list,
        averaging_size: int,
        score_threshold: float,
        labels: List[str],
        target_labels: List[str],
        transforms: Compose,
        update_println: Callable,
    ) -> None:
        """ Predicts frames """
        # set model device and to eval mode
        self.model.to(torch_device())
        self.model.eval()

        # score
        t = time()
        scores = self._predict(window, transforms)
        dur = time() - t

        # Averaging scores across clips (dense prediction)
        scores_cache.append(scores)
        scores_sum += scores

        if len(scores_cache) == averaging_size:
            scores_avg = scores_sum / averaging_size

            if len(labels) >= 5:
                num_labels = 5
            else:
                num_labels = len(labels) - 1

            top5_id_score_dict = {
                i: scores_avg[i]
                for i in (-scores_avg).argpartition(num_labels - 1)[
                    :num_labels
                ]
            }
            top5_label_score_dict = self._filter_labels(
                top5_id_score_dict,
                labels,
                threshold=score_threshold,
                target_labels=target_labels,
            )
            top5 = sorted(top5_label_score_dict.items(), key=lambda kv: -kv[1])

            # fps and preds
            println = (
                f"{1 // dur} fps"
                + "<p style='font-size:20px'>"
                + "<br>".join([f"{k} ({v:.3f})" for k, v in top5])
                + "</p>"
            )

            # Plot final results nicely
            update_println(println)
            scores_sum -= scores_cache.popleft()

        # Inference done. Ready to run on the next frames.
        window.popleft()
        if is_ready:
            is_ready[0] = True

    def predict_video(
        self,
        video_fpath: str,
        labels: List[str] = None,
        averaging_size: int = 5,
        score_threshold: float = 0.025,
        target_labels: List[str] = None,
        transforms: Compose = None,
    ) -> None:
        """ Load a video and display its frames along with the inference results. """
        # set up video reader
        video_reader = decord.VideoReader(video_fpath)
        print(f"Total frames = {len(video_reader)}")

        # set up ipython jupyter display
        d_video = IPython.display.display("", display_id=1)
        d_caption = IPython.display.display("Preparing...", display_id=2)

        # set vars
        is_ready = [True]
        window = deque()
        scores_cache = deque()

        # use labels if given, else see if we have labels from our dataset
        if not labels:
            if self.dataset.classes:
                labels = self.dataset.classes
            else:
                raise ValueError("No labels found, add labels argument.")
        scores_sum = np.zeros(len(labels))

        # set up transforms
        if not transforms:
            transforms = get_transforms(train=False)

        # set up print function
        def update_println(println):
            d_caption.update(IPython.display.HTML(println))

        while True:
            try:
                frame = video_reader.next().asnumpy()
                if len(frame.shape) != 3:
                    break

                # Start an inference thread when ready
                if is_ready[0]:
                    window.append(frame)
                    if len(window) == self.sample_length:
                        is_ready[0] = False
                        Thread(
                            target=self.predict_frames,
                            args=(
                                window,
                                scores_cache,
                                scores_sum,
                                is_ready,
                                averaging_size,
                                score_threshold,
                                labels,
                                target_labels,
                                transforms,
                                update_println,
                            ),
                        ).start()

                # Show video preview
                f = io.BytesIO()
                im = Image.fromarray(frame)
                im.save(f, "jpeg")

                # resize frames to avoid flicker for windows
                w, h = frame.shape[0], frame.shape[1]
                scale = 300.0 / max(w, h)
                w = round(w * scale)
                h = round(h * scale)
                im = im.resize((h, w))

                d_video.update(IPython.display.Image(data=f.getvalue()))
                sleep(0.03)
            except Exception:
                break

    def save(self, model_path: Union[Path, str]) -> None:
        """ Save the model to a path on disk. """
        torch.save(self.model.state_dict(), model_path)
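An end-to-end sketch of the new `VideoLearner` API introduced by this commit: `fit()` now takes keyword hyperparameters directly rather than a `train_cfgs` dict, `evaluate()` returns video- and clip-level results, and `predict_video()` streams inference in Jupyter. Paths here are hypothetical.

```python
# Sketch of the VideoLearner workflow added in this commit (paths are placeholders).
from utils_cv.action_recognition.dataset import VideoDataset
from utils_cv.action_recognition.model import VideoLearner

data = VideoDataset("data/milkBottleActions")   # hypothetical local dataset
learner = VideoLearner(data, num_classes=2)     # r2plus1d_34 with ig65m weights
learner.fit(lr=0.001, epochs=2)                 # also plots loss/accuracy curves
ret = learner.evaluate()                        # video- and clip-level accuracy
learner.predict_video("data/drinking.mp4")      # streaming inference in Jupyter
learner.save("checkpoints/milk_bottle.pt")
```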
@@ -61,7 +61,7 @@ def read_classes_file(classes_filepath):
    Read a file that maps class names to class IDs. The file should be in the format:
        ActionName1 0
        ActionName2 1

    :param classes_filepath: str
        The filepath of the classes file
    :return: dict
@@ -22,6 +22,14 @@ def data_path() -> Path:
    return data_dir


+def download(url: str, loc: str):
+    """ Download contents of a url into 'loc' """
+    r = requests.get(url)
+    with open(loc, "wb") as f:
+        f.write(r.content)
+    return loc
+
+
def get_files_in_directory(
    directory: str, suffixes: List[str] = None
) -> List[str]:
@@ -51,11 +51,7 @@ def torch_device():

def num_devices():
    """ Gets the number of devices based on cpu/gpu """
-    return (
-        torch.cuda.device_count()
-        if torch.cuda.is_available()
-        else 1
-    )
+    return torch.cuda.device_count() if torch.cuda.is_available() else 1


def db_num_workers(non_windows_num_workers: int = 16):
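The body of `db_num_workers` is truncated in this diff. Its assumed behavior, given how the dataloaders above use it in place of the old `num_workers=0` workaround: return 0 on Windows, where multi-process DataLoader workers are problematic, and `non_windows_num_workers` elsewhere. A sketch under that assumption:

```python
# Assumption: db_num_workers disables DataLoader workers on Windows only.
import platform

def db_num_workers_sketch(non_windows_num_workers: int = 16) -> int:
    return 0 if platform.system() == "Windows" else non_windows_num_workers
```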
@@ -69,7 +69,9 @@ def get_font(size: int = 12) -> ImageFont:
    Tries different fonts and lower/upper case to be compatible with both Linux and Windows.
    """
    font = None
-    for font_name in "Tahoma tahoma Verdana verdana Arial arial Helvetica helvetica DejaVuSans dejavusans".split():
+    for (
+        font_name
+    ) in "Tahoma tahoma Verdana verdana Arial arial Helvetica helvetica DejaVuSans dejavusans".split():
        try:
            font = ImageFont.truetype(f"{font_name}.ttf", size)
        except (AttributeError, IOError):
@@ -88,11 +90,9 @@ def get_font(size: int = 12) -> ImageFont:
class Config(object):
    def __init__(self, config=None, **extras):
        """Dictionary wrapper to access keys as attributes.

        Args:
            config (dict or Config): Configurations
            extras (kwargs): Extra configurations

        Examples:
            >>> cfg = Config({'lr': 0.01}, momentum=0.95)
            or