@@ -1,12 +1,13 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# ORBIT Challenge - Getting Started\n",
"\n",
"This notebook will step you through a simple starter task which you can use to get you started on the (ORBIT Few-Shot Object Recognition Challenge 2022)[https://eval.ai/web/challenges/challenge-page/1438]. In this starter task, you will download a few-shot learning model (Prototypical Networks, Snell et al., 2017) trained on the ORBIT train set, and use it to generate frame predictions on the ORBIT validation set. The predictions will be saved in a JSON in the format required by the Challenge's evaluation server. You can upload this JSON under the 'Starter Task' phase on the evaluation server to check your implementation.\n",
"This notebook will step you through a simple starter task which you can use to get you started on the [ORBIT Few-Shot Object Recognition Challenge 2023](https://eval.ai/web/challenges/challenge-page/1896). In this starter task, you will download a few-shot learning model (Prototypical Networks, Snell et al., 2017) trained on the ORBIT train set, and use it to generate frame predictions on the ORBIT validation set. The predictions will be saved in a JSON in the format required by the Challenge's evaluation server. You can upload this JSON under the 'Starter Task' phase on the evaluation server to check your implementation.\n",
"\n",
"This notebook has been tested using the conda environment specified in [environment.yml](environment.yml)."
]
@@ -30,22 +31,26 @@
"source": [
"from pathlib import Path\n",
"\n",
"validation_path = Path(\"orbit_benchmark/validation\")\n",
"DATA_ROOT = \"orbit_benchmark\"\n",
"DATA_SPLIT = \"validation\"\n",
"validation_path = Path(DATA_ROOT, DATA_SPLIT)\n",
"from pathlib import Path\n",
"\n",
"if not validation_path.is_dir():\n",
" validation_path.parent.mkdir(parents=True, exist_ok=True)\n",
" print(\"downloading validation.zip\")\n",
" !wget -O orbit_benchmark/validation.zip https://city.figshare.com/ndownloader/files/28368351\n",
" print(\"Downloading validation.zip...\")\n",
" !wget -O validation.zip https://city.figshare.com/ndownloader/files/28368351\n",
"\n",
" print(\"unzipping validation.zip...\")\n",
" !unzip -q orbit_benchmark/validation.zip -d orbit_benchmark\n",
" print(\"Unzipping validation.zip...\")\n",
" !unzip -q validation.zip -d {DATA_ROOT}\n",
"\n",
" if not validation_path.is_dir():\n",
" raise ValueError(f\"Path {validation_path} is not a directory.\")\n",
" else:\n",
" print(f\"dataset ready at {validation_path}\")\n",
" print(f\"Dataset ready at {validation_path}.\")\n",
" # You can now delete the zip file.\n",
"else:\n",
" print(f\"dataset already saved at {validation_path}\")"
" print(f\"Dataset already saved at {validation_path}.\")"
]
},
{
@@ -61,41 +66,40 @@
"metadata": {},
"outputs": [],
"source": [
"# Add this repository root to the Python path.\n",
"from pathlib import Path\n",
"from data.queues import UserEpisodicDatasetQueue\n",
"\n",
"DATA_ROOT = \"orbit_benchmark\"\n",
"DATA_SPLIT = \"validation\"\n",
"\n",
"print(\"Creating data queue.\")\n",
"print(\"Creating data queue...\")\n",
"data_queue = UserEpisodicDatasetQueue(\n",
" root=Path(DATA_ROOT, DATA_SPLIT), # path to data\n",
" way_method=\"max\", # sample all objects per user\n",
" object_cap=\"max\", # do not cap number of objects per user\n",
" object_cap=15, # cap number of objects per user to 15 (no ORBIT user has >15 objects)\n",
" shot_method=[\"max\", \"max\"], # sample [all context videos, all target videos] per object\n",
" shots=[5, 2], # only relevant if shot_method contains strings \"specific\" or \"fixed\"\n",
" video_types=[\"clean\", \"clutter\"], # sample clips from [clean context videos, clutter target videos]\n",
" subsample_factor=1, # subsample rate for video frames\n",
" num_clips=[\"random\", \"max\"], # sample [a random number of clips per context video, all target clips per target video]; note if test_mode=True, target clips will be flattened into a list of frames\n",
" clip_length=8, # sample 8 frames per clip\n",
" preload_clips=True, # load clips into memory when sampling a task; if False, load each clip only when it is passed through model \n",
" subsample_factor=30, # subsample rate for video frames if clip_method = uniform\n",
" clip_methods=[\"random\", \"random_200\"], # sample [a random number of clips per context video, 200 random target clips per target video]; note if test_mode=True, target clips will be flattened into a list of frames\n",
" clip_length=1, # sample 1 frame per clip. Can be increased to sample multiple frames per clip.\n",
" frame_size=224, # width and height of frame \n",
" frame_annotations=[], # do not load any frame annotations\n",
" tasks_per_user=1, # sample 1 task per user; if >1 then only frame predictions from the final task per user will be saved\n",
" frame_norm_method='imagenet_inception', # normalize frames using imagenet inception statistics since we're using ViT-B-32 pretrained on ImageNet-21K (see below).\n",
" annotations_to_load=[], # do not load any frame annotations\n",
" filter_by_annotations=[['no_object_not_present_issue'], ['no_object_not_present_issue']], # only includes context and target frames with the 'no_object_not_present_issue' tag\n",
" num_tasks=50, # sample 50 tasks per user\n",
" test_mode=True, # sample test (rather than train) tasks\n",
" with_cluster_labels=False, # use user's personalised object names as labels, rather than broader object categories\n",
" with_caps=False, # do not impose any caps\n",
" shuffle=False) # do not shuffle task data\n",
" with_caps=False, # do not impose any sampling caps\n",
" shuffle=False, # do not shuffle task data\n",
" num_workers=2 # use 2 workers to load data\n",
")\n",
"\n",
"print(f\"Created data queue, queue uses {data_queue.num_workers} workers.\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We now need to set up the model. For the starter task, we will use a few-shot learning model called Prototypical Networks (Snell et al., 2017) which has been pretrained on the ORBIT train users for the CLUVE or Clutter Video Evaluation task (trained on 224x224 frame size, using LITE). First, we download the checkpoint file that corresponds to this model. We then create an instance of the model using the pretrained weights."
"We now need to set up the model. For the starter task, we will use a few-shot learning model called Prototypical Networks (Snell et al., 2017) using a cosine rather a Euclidean distance. The model uses a Vision Transformer feature extractor which has been pre-trained on ImageNet-21K (i.e. 'vit_b_32'). We then meta-train this model on the ORBIT train users for the CLUVE or Clutter Video Evaluation task (trained on 224x224 frame size, using LITE). First, we download the checkpoint file that corresponds to this model. We then create an instance of the model using the pretrained weights."
]
},
{
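A note on the classifier named in the markdown cell above: a Prototypical Networks head with a cosine distance forms one prototype per object by averaging that object's context-clip embeddings, then scores each query embedding by its cosine similarity to every prototype, scaled by a logit scale. The sketch below is illustrative only; the function and tensor names are assumptions for exposition, not the repository's `SingleStepFewShotRecogniser` code.

```python
import torch
import torch.nn.functional as F

def cosine_proto_logits(context_feats: torch.Tensor,
                        context_labels: torch.Tensor,
                        query_feats: torch.Tensor,
                        logit_scale: float = 32.0) -> torch.Tensor:
    """Illustrative cosine-distance ProtoNets head (not the repo implementation).

    context_feats: (N, D) embeddings of context clips
    context_labels: (N,) int64 object labels in [0, num_objects)
    query_feats: (M, D) embeddings of query clips/frames
    returns: (M, num_objects) logits
    """
    num_objects = int(context_labels.max()) + 1
    # one prototype per object = mean embedding of that object's context clips
    prototypes = torch.stack([
        context_feats[context_labels == c].mean(dim=0) for c in range(num_objects)
    ])
    # cosine similarity between every query and every prototype, scaled into logits
    return logit_scale * F.normalize(query_feats, dim=-1) @ F.normalize(prototypes, dim=-1).t()
```

With the Euclidean variant (`classifier="proto"` in the old lines), the last step would instead use the negative squared Euclidean distance to each prototype rather than cosine similarity.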
@@ -104,14 +108,15 @@
"metadata": {},
"outputs": [],
"source": [
"checkpoint_path = Path(\"orbit_pretrained_checkpoints/orbit_cluve_protonets_efficientnetb0_224_lite.pth\")\n",
"checkpoint_path = Path(\"orbit_pretrained_checkpoints\", \"orbit_cluve_protonets_cosine_vit_b_32_224_lite.pth\")\n",
"\n",
"if not checkpoint_path.exists():\n",
" checkpoint_path.parent.mkdir(parents=True, exist_ok=True)\n",
" print(\"downloading checkpoint file\")\n",
" !wget -q -O orbit_pretrained_checkpoints/orbit_cluve_protonets_efficientnetb0_224_lite.pth https://github.com/microsoft/ORBIT-Dataset/raw/master/checkpoints/orbit_cluve_protonets_efficientnetb0_224_lite.pth\n",
" print(f\"checkpoint saved to {checkpoint_path}!\")\n",
" print(\"Downloading checkpoint file...\")\n",
" !wget -O orbit_pretrained_checkpoints/orbit_cluve_protonets_cosine_vit_b_32_224_lite.pth https://taixmachinelearning.blob.core.windows.net/publicbaselines/orbit_cluve_protonets_cosine_vit_b_32_224_lite.pth\n",
" print(f\"Checkpoint saved to {checkpoint_path}.\")\n",
"else:\n",
" print(f\"checkpoint already exists at {checkpoint_path}\")"
" print(f\"Checkpoint already exists at {checkpoint_path}.\")"
]
},
{
@@ -121,7 +126,7 @@
"outputs": [],
"source": [
"import torch\n",
"from models.few_shot_recognisers import SingleStepFewShotRecogniser\n",
"from model.few_shot_recognisers import SingleStepFewShotRecogniser\n",
"\n",
"if torch.cuda.is_available():\n",
" device = torch.device(\"cuda:0\")\n",
@@ -130,38 +135,34 @@
" device = torch.device(\"cpu\")\n",
" map_location = lambda storage, _: storage.cpu()\n",
"\n",
"print(f\"Using device {device}\")\n",
"\n",
"model = SingleStepFewShotRecogniser(\n",
" pretrained_extractor_path=\"features/pretrained/efficientnetb0_imagenet_224.pth\", # path to pretrained feature extractor trained on ImageNet\n",
" feature_extractor=\"efficientnetb0\", # feature extractor is an EfficientNet-B0\n",
" batch_normalisation=\"basic\", # standard batch normalisation rather than task normalisation (Bronskill et al., 2020)\n",
" adapt_features=False, # do not use FiLM Layers\n",
" classifier=\"proto\", # use a Prototypical Networks classifier head\n",
" clip_length=8, # number of frames per clip; frame features are mean-pooled to get the clip feature\n",
" batch_size=4, # number of clips within a task to process at a time\n",
" feature_extractor_name=\"vit_b_32\", # feature extractor is a Vision Transformer\n",
" adapt_features=False, # do not generate FiLM Layers\n",
" classifier=\"proto_cosine\", # use a Prototypical Networks classifier head, with a cosine rather than Euclidean distance metric\n",
" clip_length=1, # number of frames per clip; frame features are mean-pooled to get the clip feature\n",
" batch_size=256, # number of clips within a task to process at a time\n",
" learn_extractor=False, # only relevant when training\n",
" feature_adaptation_method=\"generate\", # only relevant when adapt_features = True\n",
" use_two_gpus=False, # use only 1 GPU; if >1 model is parallelised over 2 GPUs\n",
" num_lite_samples=8 # only relevant when training with LITE\n",
" num_lite_samples=16, # only relevant when training with LITE\n",
" logit_scale=32.0 # scalar to scale logits (increased for proto_cosine, but typically 1.0)\n",
")\n",
"model._set_device(device)\n",
"model._send_to_device()\n",
"model._register_extra_parameters()\n",
"\n",
"checkpoint_path = Path(\"orbit_pretrained_checkpoints\", \"orbit_cluve_protonets_efficientnetb0_224_lite.pth\")\n",
"# load in the pretrained checkpoint weights\n",
"model.load_state_dict(torch.load(checkpoint_path, map_location=map_location), strict=False)\n",
"# set the model to evaluation mode (ensures batch norm modules are in the correct state)\n",
"model.set_test_mode(True)\n",
"print(\"instance of SingleStepFewShotRecogniser created!\")"
"print(f\"Instance of SingleStepFewShotRecogniser created on device {device}.\")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"We are now going to run our data through our model. We go through each task (which corresponds to a user from the dataset, since we specified `tasks_per_user = 1` above) and use the task's context clips to create a personalized model. We then evaluate the personalized model on each frame in the user's target videos.\n",
"We are now going to run our data through our model. We go through each task (50 tasks per user, since we specified `num_tasks = 50` above) and use the task's context clips to create a personalized model. We then evaluate the personalized model on each frame in the task's target videos.\n",
"\n",
"The results from each user will be saved to a JSON file (this is what should be submitted to the evaluation server) and the aggregate stats will be printed to the console."
"The results for each task will be saved to a JSON file (this is what should be submitted to the evaluation server) and the aggregate stats will be printed to the console."
]
},
{
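For intuition on the aggregate stats mentioned in the markdown cell above: frame accuracy is the fraction of a video's frames whose argmax prediction matches the video's label, and those per-video numbers are then averaged at different levels (videos, tasks, objects, users). The snippet below is a simplified, self-contained illustration of that aggregation, not the repository's `TestEvaluator`.

```python
import numpy as np

def frame_accuracy(frame_logits: np.ndarray, video_label: int) -> float:
    """Fraction of frames whose argmax prediction equals the video's label."""
    return float((frame_logits.argmax(axis=-1) == video_label).mean())

# toy example: per-frame logits over 3 objects for two target videos (shapes are illustrative)
video_a = np.array([[2.0, 0.1, 0.3], [1.5, 0.2, 0.1], [0.1, 3.0, 0.2]])  # true label 0
video_b = np.array([[0.1, 0.2, 2.5], [0.3, 0.1, 1.9]])                   # true label 2

per_video = [frame_accuracy(video_a, 0), frame_accuracy(video_b, 2)]
print(per_video)           # [0.666..., 1.0]
print(np.mean(per_video))  # 0.8333... = average over videos
```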
@@ -170,70 +171,86 @@
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from typing import Dict, Tuple\n",
"from utils.data import attach_frame_history\n",
"from data.utils import attach_frame_history\n",
"from utils.eval_metrics import TestEvaluator\n",
"\n",
"output_dir = Path(\"output\", DATA_SPLIT)\n",
"output_dir.mkdir(exist_ok=True, parents=True)\n",
"\n",
"metrics = ['frame_acc', 'video_acc', 'frames_to_recognition']\n",
"metrics = ['frame_acc']\n",
"evaluator = TestEvaluator(metrics, output_dir)\n",
"num_test_tasks = data_queue.num_users * data_queue.tasks_per_user\n",
"num_test_tasks = data_queue.num_users * data_queue.num_tasks\n",
"\n",
"def get_stats_str(label: str, stats: Dict[str, Tuple[float, float]], dps: int=4) -> str:\n",
" stats_str = f\"{label}\\t\"\n",
" stats_str += \"\\t\".join([f\"{stats[metric][0]:.{dps}f} ({stats[metric][1]:.{dps}f})\" for metric in metrics])\n",
"def get_stats_str(stats: Dict[str, Tuple[float, float]], dps: int=2) -> str:\n",
" stats_str = \"\\t\".join([f\"{metric}: {stats[metric][0]*100:.{dps}f} ({stats[metric][1]*100:.{dps}f})\" for metric in metrics])\n",
" return stats_str\n",
"\n",
"print(\"running evaluation\")\n",
"print(\" \\tFrame Accuracy\\tVideo Accuracy\\tFrames to Recognition\")\n",
"for step, task in enumerate(data_queue.get_tasks()):\n",
" with torch.no_grad():\n",
" context_set = task[\"context_clips\"].to(device) # Torch tensor of shape: (N, clip_length, C, H, W), dtype float32\n",
"print(\"Running evaluation...\")\n",
"num_context_clips_per_task = []\n",
"num_target_clips_per_task = []\n",
"with torch.no_grad():\n",
" for step, task in enumerate(data_queue.get_tasks()):\n",
" context_clips = task[\"context_clips\"].to(device) # Torch tensor of shape: (N, clip_length, C, H, W), dtype float32\n",
" context_labels = task[\"context_labels\"].to(device) # Torch tensor of shape: (N), dtype int64\n",
" object_list = task[\"object_list\"] # List of str of length num_objects\n",
" num_context_clips = len(context_clips)\n",
"\n",
" # log task in evaluator\n",
" evaluator.set_task_object_list(object_list)\n",
" #evaluator.set_task_context_paths(task[\"context_paths\"])\n",
" \n",
" # personalise the pre-trained model to the current user\n",
" model.personalise(context_set, context_labels)\n",
" # personalise the pre-trained model to the current task\n",
" model.personalise(context_clips, context_labels)\n",
"\n",
" # loop through each of the user's target videos, and get predictions from the personalised model for every frame\n",
" num_target_clips = 0\n",
" for video_frames, video_paths, video_label in zip(task['target_clips'], task[\"target_paths\"], task['target_labels']):\n",
" # video_frames is a Torch tensor of shape (frame_count, C, H, W), dtype float32\n",
" # video_paths is a Torch tensor of shape (frame_count), dtype object (Path)\n",
" # video_label is single int64\n",
"\n",
" # first, for each frame, attach a short history of its previous frames\n",
" # first, for each frame, attach a short history of its previous frames if clip_length > 1\n",
" video_frames_with_history = attach_frame_history(video_frames, model.clip_length) # Torch tensor of shape: (frame_count, clip_length, C, H, W), dtype float32\n",
" num_target_clips += len(video_frames_with_history)\n",
"\n",
" # get predicted logits for each frame\n",
" logits = model.predict(video_frames_with_history) # Torch tensor of shape: (frame_count, num_objects), dtype float32\n",
" evaluator.append_video(logits, video_label, video_paths, object_list)\n",
" evaluator.append_video(logits, video_label, video_paths)\n",
"\n",
" # reset model for next task \n",
" model._reset()\n",
"\n",
" # check if the user has any more tasks; if tasks_per_user == 1, we reset every time.\n",
" if (step+1) % data_queue.tasks_per_user == 0:\n",
" _, current_user_stats = evaluator.get_mean_stats(current_user=True)\n",
" print(get_stats_str(f\"user {task['user_id']} ({evaluator.current_user+1}/{data_queue.num_users})\", current_user_stats))\n",
" # check if the user has any more tasks; if tasks_per_user == 50, we reset every 50th task.\n",
" if (step+1) % data_queue.num_tasks == 0:\n",
" evaluator.set_current_user(task[\"task_id\"])\n",
" _,_,_,current_video_stats = evaluator.get_mean_stats(current_user=True)\n",
" print(f\"User {task['task_id']} ({evaluator.current_user+1}/{len(data_queue)}) {get_stats_str(current_video_stats)}, avg #context clips/task: {np.mean(num_context_clips_per_task):d}, avg #target clips/task: {np.mean(num_target_clips_per_task):d}\")\n",
" if (step+1) < num_test_tasks:\n",
" num_context_clips_per_task = []\n",
" num_target_clips_per_task = []\n",
" evaluator.next_user()\n",
" else:\n",
" num_context_clips_per_task.append(num_context_clips)\n",
" num_target_clips_per_task.append(num_target_clips)\n",
" evaluator.next_task()\n",
"\n",
"# Compute the aggregate statistics averaged over users and averged over videos. We use the video aggregate stats for the competition.\n",
"stats_per_user, stats_per_video = evaluator.get_mean_stats()\n",
"print(get_stats_str(\"User avg\", stats_per_user))\n",
"print(get_stats_str(\"Video avg\", stats_per_video))\n",
"stats_per_user, stats_per_obj, stats_per_task, stats_per_video = evaluator.get_mean_stats()\n",
"print('-'*20)\n",
"print(f\"Average over all users: {get_stats_str(stats_per_user)}\")\n",
"print(f\"Average over all objects: {get_stats_str(stats_per_obj)}\")\n",
"print(f\"Average over all tasks: {get_stats_str(stats_per_task)}\")\n",
"print(f\"Average over all videos: {get_stats_str(stats_per_video)}\")\n",
"evaluator.save()\n",
"print(f\"results saved to {evaluator.json_results_path}\")"
"print(f\"Results saved to {evaluator.json_results_path}.\")"
]
}
],
"metadata": {
"interpreter": {
"hash": "a7c70594a95b6216cafc8f65304111471c70399665833c76d08f54ea09558935"
},
"kernelspec": {
"display_name": "Python 3.7.7 64-bit ('orbit-dataset': conda)",
"display_name": "orbit-dataset-release",
"language": "python",
"name": "python3"
},
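On the `attach_frame_history` call used in the loop above: conceptually, every frame is expanded into a clip made of itself plus the `clip_length - 1` frames preceding it (with the start of the video padded), so the personalised model can score each frame while still consuming fixed-length clips; with `clip_length=1` each frame simply becomes its own clip. The sketch below is a guess at that behaviour for illustration only; the real helper is the one imported from the repository, and its padding details may differ.

```python
import torch

def attach_frame_history_sketch(video_frames: torch.Tensor, clip_length: int) -> torch.Tensor:
    """Illustrative only: (frame_count, C, H, W) -> (frame_count, clip_length, C, H, W),
    where clip i holds frames [i - clip_length + 1, ..., i], repeating frame 0 at the start."""
    if clip_length == 1:
        return video_frames.unsqueeze(1)
    pad = video_frames[:1].repeat(clip_length - 1, 1, 1, 1)  # pad start by repeating the first frame
    padded = torch.cat([pad, video_frames], dim=0)
    clips = [padded[i:i + clip_length] for i in range(video_frames.shape[0])]
    return torch.stack(clips)

frames = torch.randn(10, 3, 224, 224)
print(attach_frame_history_sketch(frames, 4).shape)  # torch.Size([10, 4, 3, 224, 224])
```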
@@ -247,9 +264,14 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
"version": "3.7.15"
},
"orig_nbformat": 4
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "35a491206864cd1cd42eca7a8f938db8ccb579057bd59c1d04d1dc1f060cefe6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2