r"""
|
|
This program compiles an Excel spreadsheet for manually mapping dataset-specific
|
|
species names to a common taxonomy.
|
|
|
|
We are currently doing the counting of species here instead of as a part of the
|
|
Cosmos DB query
|
|
- see SDK issue in notes.
|
|
|
|
It first goes through the list of datasets in the `datasets` table to find out
|
|
which "species" are in each dataset and the count of its "occurrences" (each
|
|
sequence is counted as 1 if the class label is on the sequence; each image is
|
|
counted as 1 as well if the class label is on the image level; so a
|
|
sequence/image count mixture). This information is saved in a JSON file in the
|
|
`output_dir` for each dataset.
|
|
|
|
Once this information is collected, for each "species" in a dataset, it queries
|
|
the TOP 100 sequences where the "species" is in the list of class names at
|
|
either the sequence or the image level. It samples 7 of these TOP 500 sequences
|
|
(sequences returned by TOP may have little variety) and from each sequence
|
|
samples an image to surface as an example. The spreadsheet is then prepared,
|
|
adding a Bing search URL with the species class name as the query string and
|
|
fields to filter and fill in Excel.
|
|
|
|
Because querying for all species present in a dataset may take a long time, a
|
|
dataset is only queried if it does not yet have a JSON file in the `output_dir`.
|
|
|
|
Also, the spreadsheet creation step is only done for datasets in
|
|
DATASETS_TO_INCLUDE_IN_SPREADSHEET specified below. This is usually the datasets
|
|
just ingested that need their species names mapped next.
|
|
|
|
Leave out the flag `--query-species` if you only want to prepare the spreadsheet
|
|
using previously queried species presence result.
|
|
|
|
|
|
Example invocation:
|
|
python taxonomy_mapping/species_by_dataset.py \
|
|
--output-dir $HOME/megadb_query_results/species_by_dataset_trial \
|
|
--query-species
|
|
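
To only prepare the spreadsheet from previously queried results (see above),
omit the --query-species flag:
    python taxonomy_mapping/species_by_dataset.py \
        --output-dir $HOME/megadb_query_results/species_by_dataset_trial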
"""
|
|
|
|
import argparse
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
import json
|
|
import os
|
|
from random import sample
|
|
from typing import List, Optional
|
|
import urllib.parse
|
|
|
|
from openpyxl import Workbook
|
|
from openpyxl.utils.dataframe import dataframe_to_rows
|
|
import pandas as pd
|
|
from tqdm import tqdm
|
|
|
|
from data_management.megadb.megadb_utils import MegadbUtils
|
|
|
|
|
|
NUMBER_EXAMPLES_PER_SPECIES = 7
|
|
NUMBER_SEQUENCES_TO_QUERY = 500
|
|
|
|
DATASETS_TO_INCLUDE_IN_SPREADSHEET = [
|
|
'ena24',
|
|
'sulross_2018',
|
|
'sulross_2019_spring',
|
|
'sulross_kitfox',
|
|
'idfg_swwlf_2019'
|
|
]
|
|
|
|
|
|
def query_species_by_dataset(megadb_utils: MegadbUtils,
|
|
output_dir: str) -> None:
|
|
"""For each dataset, creates a JSON file specifying species counts.
|
|
|
|
Skips dataset if a JSON file for it already exists.
|
|
"""
|
|
# which datasets are already processed?
|
|
queried_datasets = set(
|
|
i.split('.json')[0] for i in os.listdir(output_dir)
|
|
if i.endswith('.json'))
|
|
|
|
datasets_table = megadb_utils.get_datasets_table()
|
|
dataset_names = [i for i in datasets_table if i not in queried_datasets]
|
|
|
|
print(f'{len(queried_datasets)} datasets already queried. Querying species '
|
|
f'in {len(dataset_names)} datasets...')
|
|
|
|
for dataset_name in dataset_names:
|
|
print(f'Querying dataset {dataset_name}...')
|
|
|
|
# sequence-level query should be fairly fast, ~1 sec
|
|
query_seq_level = '''
|
|
SELECT VALUE seq.class
|
|
FROM seq
|
|
WHERE ARRAY_LENGTH(seq.class) > 0
|
|
AND NOT ARRAY_CONTAINS(seq.class, "empty")
|
|
AND NOT ARRAY_CONTAINS(seq.class, "__label_unavailable")
|
|
'''
|
|
results = megadb_utils.query_sequences_table(
|
|
query_seq_level, partition_key=dataset_name)
|
|
|
|
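        # each item in `results` is one sequence's list of sequence-level class
        # labels; tally every label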
        counter = Counter()
        for i in results:
            counter.update(i)

        # cases when the class field is on the image level (images in a sequence
        # that had different class labels, 'caltech' dataset is like this)
        # this query may take a long time, >1hr
        query_image_level = '''
            SELECT VALUE seq.images
            FROM sequences seq
            WHERE (
                SELECT VALUE COUNT(im)
                FROM im IN seq.images
                WHERE ARRAY_LENGTH(im.class) > 0
            ) > 0
        '''

        start = datetime.now()
        results_im = megadb_utils.query_sequences_table(
            query_image_level, partition_key=dataset_name)
        elapsed = (datetime.now() - start).seconds
        print(f'- image-level query took {elapsed}s')

        for seq_images in results_im:
            for im in seq_images:
                assert 'class' in im
                counter.update(im['class'])
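        # the JSON maps each class label to its occurrence count for this
        # dataset, e.g. {"deer": 102, "bird": 7} (illustrative values only)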
        with open(os.path.join(output_dir, f'{dataset_name}.json'), 'w') as f:
            json.dump(counter, f, indent=2)


def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset."""
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    query_both_levels = f'''
        SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
        FROM seq
        WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
            OR (SELECT VALUE COUNT(im)
                FROM im IN seq.images
                WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)
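    # sample from the (up to NUMBER_SEQUENCES_TO_QUERY) returned sequences rather
    # than taking only the first few, since sequences returned by TOP may have
    # little variety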
    num_samples = min(len(sequences), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(sequences, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        sample_image = sample(seq['images'], 1)[0]  # sample 1 img per sequence
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
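        # resulting URL:
        #   https://<storage_account>.blob.core.windows.net/<container>/<img_path><sas>
        # container_sas_key is assumed to already include the leading '?',
        # since it is appended with no separator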
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"],
            dataset_info["container"],
            img_path,
            dataset_info["container_sas_key"])
        image_urls.append(img_url)

    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls


def make_spreadsheet(megadb_utils: MegadbUtils, output_dir: str) -> None:
    all_classes = set()
    class_in_multiple_ds = {}  # {class_name: bool}
    species_by_dataset = {}  # {dataset_name: {class_name: count}}

    classes_excluded = ['empty', 'car', 'vehicle', 'unidentified', 'unknown',
                        '__label_unavailable', 'error']

    # read species presence info from the JSON files for each dataset
    for file_name in os.listdir(output_dir):
        dataset_name, ext = os.path.splitext(file_name)
        if (ext != '.json') or (dataset_name not in DATASETS_TO_INCLUDE_IN_SPREADSHEET):
            continue
        print(f'Processing dataset {dataset_name}')

        with open(os.path.join(output_dir, file_name)) as f:
            class_counts = json.load(f)

        species_valid = {
            class_name: count for class_name, count in class_counts.items()
            if class_name not in classes_excluded
        }
        # has this class name appeared in a previous dataset?
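        # (False the first time a class name is seen, True in any later dataset)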
        for class_name in species_valid:
            class_in_multiple_ds[class_name] = (class_name in all_classes)
            all_classes.add(class_name)

        species_by_dataset[dataset_name] = species_valid

    # columns to populate the spreadsheet
    col_order = [
        'dataset',
        'occurrences',  # count of sequences/images mixture with this class name
        'species_label',
        'bing_url',
        'is_common',  # is this class name seen already / need to be labeled again?
        'taxonomy_name',
        'common_name',
        'is_typo',  # there is a typo in the class name, but correct taxonomy name can be inferred
        'not_applicable',  # labels like "human-cattle" where a taxonomy name would not be applicable
        'other_notes',  # other info in the class name, like male/female
        'is_new'  # not in previous versions of this spreadsheet
    ]
    for i in range(NUMBER_EXAMPLES_PER_SPECIES):
        col_order.append(f'example{i + 1}')
    col_order.append('example_mislabeled')

    rows = []
    bing_prefix = 'https://www.bing.com/search?q='
    for dataset_name, species_count in species_by_dataset.items():
        print(dataset_name)

        # sort by descending species count
        species_count_tups = sorted(species_count.items(),
                                    key=lambda x: x[1], reverse=True)
        for class_name, class_count in tqdm(species_count_tups):
            row = dict(
                dataset=dataset_name,
                occurrences=class_count,
                species_label=class_name,
                bing_url=bing_prefix + urllib.parse.quote_plus(class_name),
                is_common=class_in_multiple_ds[class_name],
                taxonomy_name='',
                common_name='',
                is_typo='',
                other_notes='',
                not_applicable='',
                is_new=True,
                example_mislabeled='')

            example_images_sas_urls = get_example_images(
                megadb_utils, dataset_name, class_name)
            for i, url in enumerate(example_images_sas_urls):
                row[f'example{i + 1}'] = url

            rows.append(row)

    # make the spreadsheet
    spreadsheet = pd.DataFrame(data=rows, columns=col_order)
    print(spreadsheet.head(5))
    wb = Workbook()
    ws = wb.active
    for r in dataframe_to_rows(spreadsheet, index=False, header=True):
        ws.append(r)

    # hyperlink Bing search URLs
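    # column D is 'bing_url', the 4th entry in col_order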
    for i_row, cell in enumerate(ws['D']):  # TODO hardcoded column number
        if i_row > 0:
            cell.hyperlink = cell.value
            cell.style = 'Hyperlink'

    # hyperlink example image SAS URLs
    # TODO hardcoded columns: change if # of examples or col_order changes
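    # the 11 metadata columns in col_order occupy columns A-K, so
    # example1..example7 land in columns L-R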
    sas_cols = [ws['L'], ws['M'], ws['N'], ws['O'], ws['P'], ws['Q'], ws['R']]
    assert len(sas_cols) == NUMBER_EXAMPLES_PER_SPECIES

    for i_example, ws_col in enumerate(sas_cols):
        for i_row, cell in enumerate(ws_col):
            if i_row > 0 and cell.value is not None:
                if not isinstance(cell.value, str):
                    print(f'WARNING cell.value is {cell.value}, '
                          f'type is {type(cell.value)}')
                    continue
                cell.hyperlink = cell.value
                cell.value = f'example{i_example + 1}'
                cell.style = 'Hyperlink'

    date = datetime.now().strftime('%Y_%m_%d')
    wb.save(os.path.join(output_dir, f'species_by_dataset_{date}.xlsx'))


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-o', '--output-dir', required=True,
        help='Path to directory where the JSONs containing the species counts '
             'for each dataset live')
    parser.add_argument(
        '-q', '--query-species', action='store_true',
        help='Query which species are present in each dataset and save the '
             'results as JSONs before creating the spreadsheet. If omitted, '
             'only the spreadsheet is created from existing query results.')
    args = parser.parse_args()

    assert 'COSMOS_ENDPOINT' in os.environ and 'COSMOS_KEY' in os.environ

    os.makedirs(args.output_dir, exist_ok=True)

    megadb_utils = MegadbUtils()

    if args.query_species:
        query_species_by_dataset(megadb_utils, args.output_dir)
    make_spreadsheet(megadb_utils, args.output_dir)


if __name__ == '__main__':
    main()