From 6cb95c0dcc9821bf4a44475951267b21d69831fa Mon Sep 17 00:00:00 2001 From: Ilya Matiach Date: Fri, 15 Dec 2023 16:38:51 -0500 Subject: [PATCH] fix error when extracting exif metadata features from images in RAI Vision dashboard (#2461) --- ...i_text_insights_save_and_load_scenarios.py | 5 +- responsibleai_vision/requirements-dev.txt | 3 +- .../responsibleai_vision/common/constants.py | 12 ++++ .../rai_vision_insights.py | 6 +- .../utils/feature_extractors.py | 33 +++++++-- .../utils/image_reader.py | 3 +- .../tests/common_vision_utils.py | 11 ++- .../tests/test_feature_extractors.py | 69 +++++++++++++++++++ .../tests/test_image_utils.py | 5 +- .../tests/test_rai_vision_insights.py | 9 +-- 10 files changed, 136 insertions(+), 20 deletions(-) create mode 100644 responsibleai_vision/tests/test_feature_extractors.py diff --git a/responsibleai_text/tests/test_rai_text_insights_save_and_load_scenarios.py b/responsibleai_text/tests/test_rai_text_insights_save_and_load_scenarios.py index f0518a093..be45d7770 100644 --- a/responsibleai_text/tests/test_rai_text_insights_save_and_load_scenarios.py +++ b/responsibleai_text/tests/test_rai_text_insights_save_and_load_scenarios.py @@ -15,7 +15,6 @@ from common_text_utils import (COVID19_EVENTS_LABELS, EMOTION, create_text_classification_pipeline, load_covid19_emergency_event_dataset, load_emotion_dataset) -from huggingface_hub.utils._validators import HFValidationError from rai_text_insights_validator import validate_rai_text_insights from responsibleai._internal.constants import ManagerNames @@ -134,8 +133,8 @@ class TestRAITextInsightsSaveAndLoadScenarios(object): match_msg = 'Can\'t load the configuration' expected_error = OSError else: - match_msg = 'Repo id must' - expected_error = HFValidationError + match_msg = 'local folder' + expected_error = OSError with pytest.raises(expected_error, match=match_msg): without_model_rai_insights = RAITextInsights.load(save_path) assert without_model_rai_insights.model is None diff --git a/responsibleai_vision/requirements-dev.txt b/responsibleai_vision/requirements-dev.txt index dc16683fc..1ef221b7f 100644 --- a/responsibleai_vision/requirements-dev.txt +++ b/responsibleai_vision/requirements-dev.txt @@ -28,4 +28,5 @@ opencv-python fastai mlflow -pydantic<2.0.0 \ No newline at end of file +pydantic<2.0.0 +piexif \ No newline at end of file diff --git a/responsibleai_vision/responsibleai_vision/common/constants.py b/responsibleai_vision/responsibleai_vision/common/constants.py index be47546df..716459d65 100644 --- a/responsibleai_vision/responsibleai_vision/common/constants.py +++ b/responsibleai_vision/responsibleai_vision/common/constants.py @@ -4,6 +4,12 @@ from enum import Enum +class ExtractedFeatures(str, Enum): + """Provide constants related to the extracted image features.""" + + MEAN_PIXEL_VALUE = 'mean_pixel_value' + + class ModelTask(str, Enum): """Provide model task constants. 
@@ -28,6 +34,12 @@ class ImageColumns(str, Enum): IMAGE_DETAILS = 'image_details' +class ImageModes(str, Enum): + """Provide constants related to the image modes.""" + + RGB = 'RGB' + + class ExplainabilityLiterals: """Parameters for explainability method names.""" diff --git a/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py b/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py index 11e5ce28e..38ddc7abc 100644 --- a/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py +++ b/responsibleai_vision/responsibleai_vision/rai_vision_insights/rai_vision_insights.py @@ -37,7 +37,7 @@ from responsibleai.rai_insights.rai_base_insights import RAIBaseInsights from responsibleai.serialization_utilities import serialize_json_safe from responsibleai_vision.common.constants import (CommonTags, ExplainabilityDefaults, - ImageColumns, + ImageColumns, ImageModes, MLFlowSchemaLiterals, ModelTask) from responsibleai_vision.managers.error_analysis_manager import \ @@ -135,7 +135,7 @@ class RAIVisionInsights(RAIBaseInsights): classes: Optional[np.ndarray] = None, serializer: Optional[Any] = None, maximum_rows_for_test: int = 5000, - image_mode: str = "RGB", + image_mode: str = ImageModes.RGB, test_data_path: Optional[str] = None, transformations: Optional[Any] = None, image_downloader: Optional[Any] = None, @@ -267,7 +267,7 @@ class RAIVisionInsights(RAIBaseInsights): serializer) ext_test, ext_features = extract_features( - self.test, self.target_column, self.task_type, + self.test, self.target_column, self.image_mode, self._feature_metadata) self._ext_test = ext_test diff --git a/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py b/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py index b139bd885..65c2c6f53 100644 --- a/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py +++ b/responsibleai_vision/responsibleai_vision/utils/feature_extractors.py @@ -3,6 +3,7 @@ """Defines the feature extractors.""" +import warnings from typing import Optional import pandas as pd @@ -11,13 +12,17 @@ from PIL.ExifTags import TAGS from tqdm import tqdm from responsibleai.feature_metadata import FeatureMetadata +from responsibleai_vision.common.constants import ExtractedFeatures from responsibleai_vision.utils.image_reader import ( get_all_exif_feature_names, get_image_from_path, get_image_pointer_from_path) +MEAN_PIXEL_VALUE = ExtractedFeatures.MEAN_PIXEL_VALUE.value +MAX_CUSTOM_LEN = 100 + def extract_features(image_dataset: pd.DataFrame, - target_column: str, task_type: str, + target_column: str, image_mode: str = None, feature_metadata: Optional[FeatureMetadata] = None): '''Extract tabular data features from the image dataset. @@ -27,8 +32,6 @@ def extract_features(image_dataset: pd.DataFrame, :param target_column: The name of the label column or list of columns. This is a list of columns for multilabel models. :type target_column: str or list[str] - :param task_type: The type of task to be performed. - :type task_type: str :param image_mode: The mode to open the image in. 
        See pillow documentation for all modes:
        https://pillow.readthedocs.io/en/stable/handbook/concepts.html
@@ -45,7 +48,7 @@ def extract_features(image_dataset: pd.DataFrame,
     if feature_metadata and feature_metadata.categorical_features is None:
         feature_metadata.categorical_features = []
     exif_feature_names = get_all_exif_feature_names(image_dataset)
-    feature_names = ["mean_pixel_value"] + exif_feature_names
+    feature_names = [MEAN_PIXEL_VALUE] + exif_feature_names
 
     # append all feature names other than target column and label
     column_names = image_dataset.columns
@@ -58,6 +61,7 @@
             continue
         feature_names.append(column_names[j])
 
+    blacklisted_tags = set()
     # append all features
     for i in tqdm(range(image_dataset.shape[0])):
         image = image_dataset.iloc[i, 0]
@@ -81,9 +85,26 @@
                     # decode bytes
                     if isinstance(data, bytes):
                         data = data.decode()
+                        if len(data) > MAX_CUSTOM_LEN:
+                            data = data[:MAX_CUSTOM_LEN] + '...'
                     if isinstance(data, str):
-                        feature_metadata.categorical_features.append(str(tag))
-                        row_feature_values[feature_names.index(tag)] = data
+                        if not feature_metadata:
+                            feature_metadata = FeatureMetadata()
+                            feature_metadata.categorical_features = []
+                        if tag in feature_names:
+                            feature_metadata.categorical_features.append(
+                                str(tag))
+                            tag_index = feature_names.index(tag)
+                            row_feature_values[tag_index] = data
+                        else:
+                            # in theory this should now never happen with
+                            # latest code, but adding this check for safety
+                            if tag not in blacklisted_tags:
+                                blacklisted_tags.add(tag)
+                                warnings.warn(
+                                    f'Exif tag {tag} could not be found '
+                                    'in the feature names. Ignoring tag '
+                                    'from extracted metadata.')
                     elif isinstance(data, int) or isinstance(data, float):
                         row_feature_values[feature_names.index(tag)] = data
 
diff --git a/responsibleai_vision/responsibleai_vision/utils/image_reader.py b/responsibleai_vision/responsibleai_vision/utils/image_reader.py
index 74d76aaf0..c0f949915 100644
--- a/responsibleai_vision/responsibleai_vision/utils/image_reader.py
+++ b/responsibleai_vision/responsibleai_vision/utils/image_reader.py
@@ -94,7 +94,8 @@ def get_all_exif_feature_names(image_dataset):
                     data = exifdata.get(tag_id)
                     if isinstance(data, str) or \
                             isinstance(data, int) or \
-                            isinstance(data, float):
+                            isinstance(data, float) or \
+                            isinstance(data, bytes):
                         exif_feature_names.add(tag)
     return list(exif_feature_names)
 
diff --git a/responsibleai_vision/tests/common_vision_utils.py b/responsibleai_vision/tests/common_vision_utils.py
index 8255ae1bd..66c397251 100644
--- a/responsibleai_vision/tests/common_vision_utils.py
+++ b/responsibleai_vision/tests/common_vision_utils.py
@@ -12,6 +12,7 @@ from zipfile import ZipFile
 
 import numpy as np
 import pandas as pd
+import piexif
 import shap
 import torch
 import torch.nn as nn
@@ -172,7 +173,7 @@ def retrieve_unzip_file(download_url, data_file):
     os.remove(data_file)
 
 
-def load_fridge_dataset():
+def load_fridge_dataset(add_extra_mixed_metadata=False):
     # create data folder if it doesnt exist.
os.makedirs("data", exist_ok=True) @@ -186,6 +187,14 @@ def load_fridge_dataset(): for folder in os.listdir("./data/fridgeObjects"): for file in os.listdir("./data/fridgeObjects/" + folder): image_path = "./data/fridgeObjects/" + folder + "/" + file + if add_extra_mixed_metadata and file.endswith("1.jpg"): + with Image.open(image_path) as im: + exif_dict = piexif.load(im.info['exif']) + comment = 'Extra metadata for {}'.format(file).encode() + exif_dict['0th'][piexif.ImageIFD.XPComment] = comment + exif_dict['1st'][piexif.ImageIFD.XPComment] = comment + exif_bytes = piexif.dump(exif_dict) + im.save(image_path, exif=exif_bytes) data = data.append({IMAGE: image_path, LABEL: folder}, ignore_index=True) return data diff --git a/responsibleai_vision/tests/test_feature_extractors.py b/responsibleai_vision/tests/test_feature_extractors.py new file mode 100644 index 000000000..0057cf85b --- /dev/null +++ b/responsibleai_vision/tests/test_feature_extractors.py @@ -0,0 +1,69 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +from common_vision_utils import (load_flowers_dataset, load_fridge_dataset, + load_fridge_object_detection_dataset, + load_imagenet_dataset) + +from responsibleai_vision.common.constants import (ExtractedFeatures, + ImageColumns, ImageModes) +from responsibleai_vision.utils.feature_extractors import extract_features + +MEAN_PIXEL_VALUE = ExtractedFeatures.MEAN_PIXEL_VALUE.value +FRIDGE_METADATA_FEATURES = [ + 'Make', 'ResolutionUnit', 'ImageLength', 'ExifOffset', 'Model', + 'GPSInfo', 'ImageWidth', 'DateTime', 'YCbCrPositioning', + 'Software', 'Orientation' +] + + +def validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data): + assert len(extracted_features) == len(data) + assert feature_names[0] == expected_feature_names[0] + for i in range(1, len(feature_names)): + assert feature_names[i] in expected_feature_names + assert len(feature_names) == len(expected_feature_names) + assert len(extracted_features[0]) == len(feature_names) + + +def extract_dataset_features(data): + return extract_features(data, ImageColumns.LABEL, ImageModes.RGB, None) + + +class TestFeatureExtractors(object): + def test_extract_features_fridge_object_detection(self): + data = load_fridge_object_detection_dataset(automl_format=False) + extracted_features, feature_names = extract_dataset_features(data) + expected_feature_names = [MEAN_PIXEL_VALUE] + FRIDGE_METADATA_FEATURES + validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data) + + def test_extract_features_fridge_metadata(self): + data = load_fridge_dataset() + extracted_features, feature_names = extract_dataset_features(data) + expected_feature_names = [MEAN_PIXEL_VALUE] + FRIDGE_METADATA_FEATURES + validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data) + + def test_extract_features_imagenet_metadata(self): + data = load_imagenet_dataset() + extracted_features, feature_names = extract_dataset_features(data) + expected_feature_names = [MEAN_PIXEL_VALUE] + validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data) + + def test_extract_features_flowers_metadata(self): + data = load_flowers_dataset(upscale=False) + extracted_features, feature_names = extract_dataset_features(data) + expected_feature_names = [MEAN_PIXEL_VALUE] + validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data) + + def 
test_extract_features_mixed_exif_XPComment_metadata(self): + data = load_fridge_dataset(add_extra_mixed_metadata=True) + extracted_features, feature_names = extract_dataset_features(data) + expected_feature_names = [MEAN_PIXEL_VALUE, 'XPComment'] + expected_feature_names += FRIDGE_METADATA_FEATURES + validate_extracted_features(extracted_features, feature_names, + expected_feature_names, data) diff --git a/responsibleai_vision/tests/test_image_utils.py b/responsibleai_vision/tests/test_image_utils.py index 641172d53..7e749d693 100644 --- a/responsibleai_vision/tests/test_image_utils.py +++ b/responsibleai_vision/tests/test_image_utils.py @@ -63,6 +63,7 @@ class TestImageUtils(object): assert label_j[5] == o_label_j[IS_CROWD] def test_retry_sessions_match_domain_count(self): + sessions_before_test = len(image_reader_requests_sessions) urls = [f"https://{i}.com/image.png" for i in range(10)] duplicates = urls.copy() urls.extend(duplicates) @@ -72,7 +73,9 @@ class TestImageUtils(object): for url in urls: image_reader_get_retry_session(url) - assert len(image_reader_requests_sessions) == domain_unique_count + new_session_count = len(image_reader_requests_sessions) + new_session_count -= sessions_before_test + assert new_session_count == domain_unique_count @patch("urllib3.connectionpool.HTTPConnectionPool._make_request") def test_retry_sessions_retries_on_conn_failure(self, request_mock): diff --git a/responsibleai_vision/tests/test_rai_vision_insights.py b/responsibleai_vision/tests/test_rai_vision_insights.py index 664cbac2d..330391e01 100644 --- a/responsibleai_vision/tests/test_rai_vision_insights.py +++ b/responsibleai_vision/tests/test_rai_vision_insights.py @@ -25,7 +25,7 @@ from rai_vision_insights_validator import validate_rai_vision_insights from responsibleai.feature_metadata import FeatureMetadata from responsibleai_vision import ModelTask, RAIVisionInsights from responsibleai_vision.common.constants import (ExplainabilityDefaults, - ImageColumns) + ImageColumns, ImageModes) DEFAULT_MAX_EVALS = ExplainabilityDefaults.DEFAULT_MAX_EVALS DEFAULT_NUM_MASKS = ExplainabilityDefaults.DEFAULT_NUM_MASKS @@ -40,7 +40,7 @@ class TestRAIVisionInsights(object): task_type = ModelTask.IMAGE_CLASSIFICATION class_names = load_imagenet_labels() run_rai_insights(pred, data[:3], ImageColumns.LABEL, - task_type, class_names, image_mode='RGB') + task_type, class_names, image_mode=ImageModes.RGB) @pytest.mark.parametrize('max_evals', [None, 10, 200]) def test_rai_insights_image_classification_max_evals(self, max_evals): @@ -51,7 +51,7 @@ class TestRAIVisionInsights(object): # run on a single image to avoid running out of memory on # test machines run_rai_insights(pred, data[:1], ImageColumns.LABEL, - task_type, class_names, image_mode='RGB', + task_type, class_names, image_mode=ImageModes.RGB, test_explainer=True, max_evals=max_evals) @pytest.mark.parametrize('max_evals', [-100, -1, 0]) @@ -63,7 +63,8 @@ class TestRAIVisionInsights(object): with pytest.raises(ValueError, match="max_evals must be greater than 0"): run_rai_insights(pred, data[:1], ImageColumns.LABEL, - task_type, class_names, image_mode='RGB', + task_type, class_names, + image_mode=ImageModes.RGB, test_explainer=True, max_evals=max_evals) def test_rai_insights_image_classification_fridge(self):
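
Note for reviewers: below is a minimal usage sketch of the extract_features signature updated by this patch (the task_type argument is removed and image_mode now follows target_column). The single-row DataFrame, the 'image' column name, and the local JPEG path are illustrative assumptions for the sketch only, not part of the patch.

    import pandas as pd

    from responsibleai_vision.common.constants import ImageColumns, ImageModes
    from responsibleai_vision.utils.feature_extractors import extract_features

    # Hypothetical one-image dataset; extract_features reads the image from
    # column 0 of each row, so the image path column must come first.
    data = pd.DataFrame({'image': ['data/fridgeObjects/can/1.jpg'],
                         ImageColumns.LABEL.value: ['can']})

    # task_type is no longer passed; image_mode takes its place.
    extracted_features, feature_names = extract_features(
        data, ImageColumns.LABEL.value, ImageModes.RGB)

    # feature_names starts with 'mean_pixel_value', followed by any EXIF tags
    # (including byte-valued tags such as XPComment) found across the dataset.
    print(feature_names)

The sketch mirrors the call made by RAIVisionInsights, which now passes self.image_mode instead of self.task_type when extracting the tabular metadata features.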