Add support for YOLO v8 Pose post-processing. (#605)

* Add support for YOLO v8 Pose post-processing.

The output has additional values in the 'mask' data for the keypoints.

- Update the post-processing steps to support extracting and scaling the keypoints (see the sketch below).
- Simplify the existing step that splits out the boxes and scores: if there is no bounding box confidence score to apply to the class scores, use a basic Split operator instead.
  - A per-bounding-box confidence score is only present in YOLO versions prior to 8.
- Update existing tests

TODO: Add unit tests for new Steps. They have been manually validated with the real model for now.
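A minimal sketch of how the new and updated Steps might be composed for a YOLOv8 Pose model, based on the changes in this PR. The class and keypoint counts are illustrative assumptions, and the explicit connections from the original, resized and letter-boxed images to the scaling step are omitted for brevity.

```python
# Sketch only: shows how the Steps added/updated here could fit together for YOLOv8 Pose.
# num_classes/num_key_points are assumptions; the image connections for scaling are omitted.
from onnxruntime_extensions.tools.pre_post_processing import *

num_classes = 1       # YOLOv8 Pose predicts a single 'person' class
num_key_points = 17   # keypoints are appended to each result as 'mask' data (x, y, conf each)

post_processing_steps = [
    Squeeze([0]),       # remove batch dim
    Transpose([1, 0]),  # -> (num_boxes, box + class scores + keypoint data)
    # YOLOv8 has no per-box confidence score, so a plain Split separates the
    # box co-ords, class scores and keypoint data
    Split(num_outputs=3, axis=-1, splits=[4, num_classes, num_key_points * 3]),
    SelectBestBoundingBoxesByNMS(has_mask_data=True, max_detections=10),
    # scale the box and keypoint co-ords back to the original image
    ScaleNMSBoundingBoxesAndKeyPoints(num_key_points=num_key_points),
]
```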

* Changes to support pre-decoded input.

Needs cleanup.

* Support an overall max number of detections as well as per-class detections.

* Expand Identity to support multiple inputs
Fix issue with the incorrect score being selected by NMS (it was the max score rather than the score for the selected class; illustrated below)
Fix TopK usage so result ordering is consistent when TopK is not applied
Add unit tests.
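To illustrate the score fix: NMS selects (class, box) pairs, and the reported score should be the score of the selected class, not the row-wise maximum. A tiny sketch with made-up numbers:

```python
import numpy as np

scores = np.array([[0.9, 0.2],
                   [0.3, 0.8]], dtype=np.float32)  # [num_boxes, num_classes]

class_idx, box_idx = 0, 1                 # suppose NMS kept box 1 for class 0
old_value = scores[box_idx].max()         # 0.8 -> previously reported (incorrect) score
new_value = scores[box_idx, class_idx]    # 0.3 -> score of the class NMS actually selected
```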

* Update docs and some cleanups

* Use Union
This commit is contained in:
Scott McKay 2023-12-20 08:51:29 +10:00 committed by GitHub
Parent 1ccc405bab
Commit da9c01d3e1
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
14 changed files: 1488 additions and 199 deletions

View file

@ -209,12 +209,12 @@ def yolo_detection(model_file: Path, output_file: Path, output_format: str = 'jp
# https://github.com/ultralytics/ultralytics/blob/e5cb35edfc3bbc9d7d7db8a6042778a751f0e39e/examples/YOLOv8-CPP-Inference/inference.cpp#L31-L33
# We always want the box info to be the last dim for each iteration.
# For new variants like YoloV8, we need to add a transpose op to permute the output back.
need_transpose = False
yolo_v8_or_later = False
output_shape = [model_output_shape.dim[i].dim_value if model_output_shape.dim[i].HasField("dim_value") else -1
for i in [-2, -1]]
if output_shape[0] != -1 and output_shape[1] != -1:
need_transpose = output_shape[0] < output_shape[1]
yolo_v8_or_later = output_shape[0] < output_shape[1]
else:
assert len(model.graph.input) == 1, "Doesn't support adding pre and post-processing for multi-inputs model."
try:
@ -233,7 +233,7 @@ Because we need to execute the model to determine the output shape in order to a
outputs = session.run(None, inp)[0]
assert len(outputs.shape) == 3 and outputs.shape[0] == 1, "shape of the first model output is not (1, n, m)"
if outputs.shape[1] < outputs.shape[2]:
need_transpose = True
yolo_v8_or_later = True
assert num_classes+4 == outputs.shape[2] or num_classes+5 == outputs.shape[2], \
"The output shape is neither (1, num_boxes, num_classes+4(reg)) nor (1, num_boxes, num_classes+5(reg+obj))"
@ -251,12 +251,29 @@ Because we need to execute the model to determine the output shape in order to a
Unsqueeze([0]), # add batch, CHW --> 1CHW
]
)
# NMS and drawing boxes
post_processing_steps = [
Squeeze([0]), # - Squeeze to remove batch dimension
SplitOutBoxAndScore(num_classes=num_classes), # Separate bounding box and confidence outputs
SelectBestBoundingBoxesByNMS(), # Apply NMS to suppress bounding boxes
(ScaleBoundingBoxes(), # Scale bounding box coords back to original image
Squeeze([0]), # - Squeeze to remove batch dimension
]
if yolo_v8_or_later:
post_processing_steps += [
Transpose([1, 0]), # transpose to (num_boxes, box+scores)
# split elements into the box and scores for the classes. no confidence value to apply to scores
Split(num_outputs=2, axis=-1, splits=[4, num_classes]),
]
else:
post_processing_steps += [
# Split bounding box from confidence and scores for each class
# Apply confidence to the scores.
SplitOutBoxAndScoreWithConf(num_classes=num_classes),
]
post_processing_steps += [
SelectBestBoundingBoxesByNMS(), # pick best bounding boxes with NonMaxSuppression
# Scale bounding box coords back to original image
(ScaleNMSBoundingBoxesAndKeyPoints(name='ScaleBoundingBoxes'),
[
# A connection from original image to ScaleBoundingBoxes
# A connection from the resized image to ScaleBoundingBoxes
@ -279,9 +296,6 @@ Because we need to execute the model to determine the output shape in order to a
# Encode to jpg/png
ConvertBGRToImage(image_format=output_format),
]
# transpose to (num_boxes, coor+conf) if needed
if need_transpose:
post_processing_steps.insert(1, Transpose([1, 0]))
pipeline.add_post_processing(post_processing_steps)

View file

@ -0,0 +1,371 @@
Module pdoc
===========
Python package `pdoc` provides types, functions, and a command-line
interface for accessing public documentation of Python modules, and
for presenting it in a user-friendly, industry-standard open format.
It is best suited for small- to medium-sized projects with tidy,
hierarchical APIs.
`pdoc` extracts documentation of:
* modules (including submodules),
* functions (including methods, properties, coroutines ...),
* classes, and
* variables (including globals, class variables, and instance variables).
Documentation is extracted from live objects' [docstrings]
using Python's `__doc__` attribute[^execution]. Documentation for
variables is found by examining objects' abstract syntax trees.
[docstrings]: https://docs.python.org/3/glossary.html#term-docstring
[^execution]:
Documented modules are _executed_ in order to provide `__doc__`
attributes. Any [non-fenced] global code in imported modules will
_affect the current runtime environment_.
[non-fenced]: https://stackoverflow.com/questions/19578308/what-is-the-benefit-of-using-main-method-in-python/19578335#19578335
What objects are documented?
----------------------------
[public-private]: #what-objects-are-documented
`pdoc` only extracts _public API_ documentation.[^public]
Code objects (modules, variables, functions, classes, methods) are considered
public in the modules where they are defined (vs. imported from somewhere else)
as long as their _identifiers don't begin with an underscore_ ( \_ ).[^private]
If a module defines [`__all__`][__all__], then only the identifiers contained
in this list are considered public, regardless of where they were defined.
This can be fine-tuned through [`__pdoc__` dict][__pdoc__].
[^public]:
Here, public API refers to the API that is made available
to your project end-users, not the public API e.g. of a
private class that can be reasonably extended elsewhere
by your project developers.
[^private]:
Prefixing private, implementation-specific objects with
an underscore is [a common convention].
[a common convention]: https://docs.python.org/3/tutorial/classes.html#private-variables
[__all__]: https://docs.python.org/3/tutorial/modules.html#importing-from-a-package
Where does `pdoc` get documentation from?
-----------------------------------------
In Python, objects like modules, functions, classes, and methods
have a special attribute `__doc__` which contains that object's
documentation string ([docstring][docstrings]).
For example, the following code defines a function with a docstring
and shows how to access its contents:
>>> def test():
... """This is a docstring."""
... pass
...
>>> test.__doc__
'This is a docstring.'
It's pretty much the same with classes and modules.
See [PEP-257] for Python docstring conventions.
[PEP-257]: https://www.python.org/dev/peps/pep-0257/
These docstrings are set as descriptions for each module, class,
function, and method listed in the documentation produced by `pdoc`.
`pdoc` extends the standard use of docstrings in Python in two
important ways: by allowing methods to inherit docstrings, and
by introducing syntax for docstrings for variables.
### Docstrings inheritance
[docstrings inheritance]: #docstrings-inheritance
`pdoc` considers methods' docstrings inherited from superclass methods',
following the normal class inheritance patterns.
Consider the following code example:
>>> class A:
... def test(self):
... """Docstring for A."""
... pass
...
>>> class B(A):
... def test(self):
... pass
...
>>> A.test.__doc__
'Docstring for A.'
>>> B.test.__doc__
None
In Python, the docstring for `B.test` doesn't exist, even though a
docstring was defined for `A.test`.
By contrast, when `pdoc` generates documentation for code such as above,
it will automatically attach the docstring for `A.test` to
`B.test` if the latter doesn't define its own.
In the default HTML template, such inherited docstrings are greyed out.
### Docstrings for variables
[variable docstrings]: #docstrings-for-variables
Python by itself [doesn't allow docstrings attached to variables][PEP-224].
However, `pdoc` supports documenting module (or global)
variables, class variables, and object instance variables via
two different mechanisms: [PEP-224] and `#:` doc-comments.
For example:
[PEP-224]: http://www.python.org/dev/peps/pep-0224
module_variable = 1
"""PEP 224 docstring for module_variable."""
class C:
#: Documentation comment for class_variable
#: spanning over three lines.
class_variable = 2 #: Assignment line is included.
def __init__(self):
#: Instance variable's doc-comment
self.variable = 3
"""But note, PEP 224 docstrings take precedence."""
While the resulting variables have no `__doc__` attribute,
`pdoc` compensates by reading the source code (when available)
and parsing the syntax tree.
By convention, variables defined in a class' `__init__` method
and attached to `self` are considered and documented as
_instance_ variables.
Class and instance variables can also [inherit docstrings][docstrings inheritance].
Overriding docstrings with `__pdoc__`
-------------------------------------
[__pdoc__]: #overriding-docstrings-with-__pdoc__
Docstrings for objects can be disabled, overridden, or whitelisted with a special
module-level dictionary `__pdoc__`. The _keys_
should be string identifiers within the scope of the module or,
alternatively, fully-qualified reference names. E.g. for instance
variable `self.variable` of class `C`, its module-level identifier is
`'C.variable'`, and `some_package.module.C.variable` its refname.
If `__pdoc__[key] = False`, then `key` (and its members) will be
**excluded from the documentation** of the module.
Conversely, if `__pdoc__[key] = True`, then `key` (and its public members) will be
**included in the documentation** of the module. This can be used to
include documentation of [private objects][public-private],
including special functions such as `__call__`, which are ignored by default.
Alternatively, the _values_ of `__pdoc__` can be the **overriding docstrings**.
This feature is useful when there's no feasible way of
attaching a docstring to something. A good example is a
[namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple):
__pdoc__ = {}
Table = namedtuple('Table', ['types', 'names', 'rows'])
__pdoc__['Table.types'] = 'Types for each column in the table.'
__pdoc__['Table.names'] = 'The names of each column in the table.'
__pdoc__['Table.rows'] = 'Lists corresponding to each row in the table.'
`pdoc` will then show `Table` as a class with documentation for the
`types`, `names` and `rows` members.
.. note::
The assignments to `__pdoc__` need to be placed where they'll be
executed when the module is imported. For example, at the top level
of a module or in the definition of a class.
Supported docstring formats
---------------------------
[docstring-formats]: #supported-docstring-formats
Currently, pure Markdown (with [extensions]), [numpydoc],
and [Google-style] docstrings formats are supported,
along with some [reST directives].
Additionally, if `latex_math` [template config][custom templates] option is enabled,
LaTeX math syntax is supported when placed between
[recognized delimiters]: `\(...\)` for inline equations and
`\[...\]` or `$$...$$` for block equations. Note, you need to escape
your backslashes in Python docstrings (`\\(`, `\\frac{}{}`, ...)
or, alternatively, use [raw string literals].
*[reST]: reStructuredText
[extensions]: https://python-markdown.github.io/extensions/#officially-supported-extensions
[numpydoc]: https://numpydoc.readthedocs.io/
[Google-style]: http://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings
[recognized delimiters]: https://docs.mathjax.org/en/latest/input/tex/delimiters.html
[raw string literals]: https://www.journaldev.com/23598/python-raw-string
### Supported reST directives
[reST directives]: #supported-rest-directives
The following reST directives should work:
* specific and generic [admonitions] (attention, caution, danger,
error, hint, important, note, tip, warning, admonition),
* [`.. image::`][image] or `.. figure::` (without options),
* [`.. include::`][include], with support for the options:
`:start-line:`, `:end-line:`, `:start-after:` and `:end-before:`.
* [`.. math::`][math]
* `.. versionadded::`
* `.. versionchanged::`
* `.. deprecated::`
* `.. todo::`
[admonitions]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#admonitions
[image]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#images
[include]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#including-an-external-document-fragment
[math]: http://docutils.sourceforge.net/docs/ref/rst/directives.html#math
Linking to other identifiers
----------------------------
[cross-linking]: #linking-to-other-identifiers
In your documentation, you may refer to other identifiers in
your modules. When exporting to HTML, linking is automatically
done whenever you surround an identifier with [backticks] ( \` ).
Unless within the current module,
the identifier name must be fully qualified, for example
<code>\`pdoc.Doc.docstring\`</code> is correct (and will link to
`pdoc.Doc.docstring`) while <code>\`Doc.docstring\`</code>
only works within `pdoc` module.
[backticks]: https://en.wikipedia.org/wiki/Grave_accent#Use_in_programming
Command-line interface
----------------------
[cmd]: #command-line-interface
`pdoc` includes a feature-rich "binary" program for producing
HTML and plain text documentation of your modules.
For example, to produce HTML documentation of your whole package
in subdirectory 'build' of the current directory, using the default
HTML template, run:
$ pdoc --html --output-dir build my_package
If you want to omit the source code preview, run:
$ pdoc --html --config show_source_code=False my_package
Find additional template configuration tunables in [custom templates]
section below.
To run a local HTTP server while developing your package or writing
docstrings for it, run:
$ pdoc --http : my_package
To re-build documentation as part of your continuous integration (CI)
best practice, i.e. ensuring all reference links are correct and
up-to-date, make warnings error loudly by setting the environment
variable [`PYTHONWARNINGS`][PYTHONWARNINGS] before running pdoc:
$ export PYTHONWARNINGS='error::UserWarning'
[PYTHONWARNINGS]: https://docs.python.org/3/using/cmdline.html#envvar-PYTHONWARNINGS
For brief usage instructions, type:
$ pdoc --help
Even more usage examples can be found in the [FAQ].
[FAQ]: https://github.com/pdoc3/pdoc/issues?q=is%3Aissue+label%3Aquestion
Programmatic usage
------------------
The main entry point is `pdoc.Module` which wraps a module object
and recursively imports and wraps any submodules and their members.
After all related modules are wrapped (related modules are those that
share the same `pdoc.Context`), you need to call
`pdoc.link_inheritance` with the used `Context` instance to
establish class inheritance links.
Afterwards, you can use `pdoc.Module.html` and `pdoc.Module.text`
methods to output documentation in the desired format.
For example:
import pdoc
modules = ['a', 'b'] # Public submodules are auto-imported
context = pdoc.Context()
modules = [pdoc.Module(mod, context=context)
for mod in modules]
pdoc.link_inheritance(context)
def recursive_htmls(mod):
yield mod.name, mod.html()
for submod in mod.submodules():
yield from recursive_htmls(submod)
for mod in modules:
for module_name, html in recursive_htmls(mod):
... # Process
When documenting a single module, you might find
functions `pdoc.html` and `pdoc.text` handy.
For importing arbitrary modules/files, use `pdoc.import_module`.
Alternatively, use the [runnable script][cmd] included with this package.
Custom templates
----------------
[custom templates]: #custom-templates
To override the built-in HTML/CSS and plain text templates, copy
the relevant templates from `pdoc/templates` directory into a directory
of your choosing and edit them. When you run [pdoc command][cmd]
afterwards, pass the directory path as a parameter to the
`--template-dir` switch.
.. tip::
If you find you only need to apply minor alterations to the HTML template,
see if you can do so by overriding just some of the following, placeholder
sub-templates:
* [_config.mako_]: Basic template configuration, affects the way templates
are rendered.
* _head.mako_: Included just before `</head>`. Best for adding resources and styles.
* _logo.mako_: Included at the very top of the navigation sidebar. Empty by default.
* _credits.mako_: Included in the footer, right before pdoc version string.
See [default template files] for reference.
.. tip::
You can also alter individual [_config.mako_] preferences using the
`--config` command-line switch.
If working with `pdoc` programmatically, _prepend_ the directory with
modified templates into the `directories` list of the
`pdoc.tpl_lookup` object.
[_config.mako_]: https://github.com/pdoc3/pdoc/blob/master/pdoc/templates/config.mako
[default template files]: https://github.com/pdoc3/pdoc/tree/master/pdoc/templates
Compatibility
-------------
`pdoc` requires Python 3.6+.
The last version to support Python 2.x is [pdoc3 0.3.x].
[pdoc3 0.3.x]: https://pypi.org/project/pdoc3/0.3.13/
Contributing
------------
`pdoc` is [on GitHub]. Bug reports and pull requests are welcome.
[on GitHub]: https://github.com/pdoc3/pdoc
License
-------
`pdoc` is licensed under the terms of GNU [AGPL-3.0]{: rel=license} or later,
meaning you can use it for any reasonable purpose and remain in
complete ownership of all the documentation you produce,
but you are also encouraged to make sure any upgrades to `pdoc`
itself find their way back to the community.
[AGPL-3.0]: https://www.gnu.org/licenses/agpl-3.0.html

View file

@ -35,8 +35,10 @@ Classes
* pre_post_processing.step.Debug
* pre_post_processing.steps.general.ArgMax
* pre_post_processing.steps.general.Identity
* pre_post_processing.steps.general.ReverseAxis
* pre_post_processing.steps.general.Softmax
* pre_post_processing.steps.general.Split
* pre_post_processing.steps.general.Squeeze
* pre_post_processing.steps.general.Transpose
* pre_post_processing.steps.general.Unsqueeze
@ -53,9 +55,9 @@ Classes
* pre_post_processing.steps.vision.Normalize
* pre_post_processing.steps.vision.PixelsToYCbCr
* pre_post_processing.steps.vision.Resize
* pre_post_processing.steps.vision.ScaleBoundingBoxes
* pre_post_processing.steps.vision.ScaleNMSBoundingBoxesAndKeyPoints
* pre_post_processing.steps.vision.SelectBestBoundingBoxesByNMS
* pre_post_processing.steps.vision.SplitOutBoxAndScore
* pre_post_processing.steps.vision.SplitOutBoxAndScoreWithConf
* pre_post_processing.steps.vision.YCbCrToPixels
### Class variables

View file

@ -16,6 +16,16 @@ Classes
* pre_post_processing.step.Step
`Identity(num_inputs: int = 1, name: Optional[str] = None)`
: ONNX Identity for all inputs to the Step. Used to pass through values as-is to later Steps.
Args:
name: Optional name of step. Defaults to 'Identity'
### Ancestors (in MRO)
* pre_post_processing.step.Step
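A possible use of the expanded step (names are illustrative): forward two values unchanged so later steps can consume them.

```python
# pass two tensors through as-is; their outputs can be connected to later steps
passthrough = Identity(num_inputs=2, name="PassThroughImages")
```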
`ReverseAxis(axis: int = -1, dim_value: int = -1, name: Optional[str] = None)`
: Reverses the data in an axis by splitting and concatenating in reverse order.
e.g. convert RGB ordered data to BGR.
@ -43,6 +53,18 @@ Classes
* pre_post_processing.step.Step
`Split(num_outputs: int, axis: Optional[int] = None, splits: Optional[List[int]] = None, name: Optional[str] = None)`
: ONNX Split
:param num_outputs: Number of outputs to split the input into. Unequal split is allowed for opset 18+.
:param axis: Axis to split on. Default is 0.
:param splits: Optional length of each output. Sum must equal dim value at 'axis'
:param name: Optional Step name. Defaults to 'Split'
### Ancestors (in MRO)
* pre_post_processing.step.Step
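A hedged example mirroring how this change uses the step to separate YOLOv8 box co-ordinates from class scores (the class count is illustrative):

```python
# split a (num_boxes, 4 + num_classes) tensor into boxes and per-class scores
split_boxes_and_scores = Split(num_outputs=2, axis=-1, splits=[4, 80])
```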
`Squeeze(axes: Optional[List[int]] = None, name: Optional[str] = None)`
: ONNX Squeeze

View file

@ -4,7 +4,7 @@ Module pre_post_processing.steps.nlp
Classes
-------
`BertTokenizer(tokenizer_param: pre_post_processing.steps.nlp.TokenizerParam, name: Optional[str] = None)`
`BertTokenizer(tokenizer_param: pre_post_processing.steps.nlp.TokenizerParam, need_token_type_ids_output: bool = False, name: Optional[str] = None)`
: Base class for a pre or post processing step.
Brief: This step is used to convert the input text into the input_ids, attention_mask, token_type_ids.
@ -18,6 +18,8 @@ Classes
)
name: Optional name of step. Defaults to 'BertTokenizer'
need_token_type_ids_output: the last output, `token_type_ids`, is not required by some BERT-based models (e.g. DistilBert).
This flag optionally adds it to the graph for this step.
### Ancestors (in MRO)

View file

@ -109,7 +109,7 @@ Classes
* pre_post_processing.step.Step
`ImageBytesToFloat(name: Optional[str] = None)`
`ImageBytesToFloat(rescale_factor: float = 0.00392156862745098, name: Optional[str] = None)`
: Convert uint8 or float values in range 0..255 to floating point values in range 0..1
Args:
@ -119,7 +119,7 @@ Classes
* pre_post_processing.step.Step
`LetterBox(target_shape: Union[int, Tuple[int, int]], fill_value=0, name: Optional[str] = None)`
`LetterBox(target_shape: Union[int, Tuple[int, int]], fill_value=0, layout: str = 'HWC', name: Optional[str] = None)`
: Image is channel-last and ordered as BGR.
Mainly used in object detection; it usually follows a Resize operation.
This step either adds a border or crops the image to satisfy the network input.
@ -133,6 +133,7 @@ Classes
Input shape: <uint8_t>{height, width, 3<BGR>}
target_shape: <uint8_t>{out_height, out_width, 3<BGR>}
layout: HWC or CHW are supported
Output shape: specified by target_shape
Args:
@ -195,62 +196,75 @@ Classes
* pre_post_processing.step.Step
`ScaleBoundingBoxes(name: Optional[str] = None)`
: Mapping boxes coordinate to scale in original image.
The coordinate of boxes from detection model is relative to the input image of network,
image is scaled and padded/cropped. So we need to do a linear mapping to get the real coordinate of original image.
`ScaleNMSBoundingBoxesAndKeyPoints(num_key_points: Optional[int] = 0, layout: Optional[str] = 'HWC', name: Optional[str] = None)`
: Scale bounding box and key point coordinates in optional mask data to original image.
Input image goes through Resize and LetterBox steps during pre-processing (in that order), and the output of this
is what the original model runs against.
To display the predictions on the original image we need to apply the reverse size changes to the co-ordinates
of the bounding boxes.
nms_step_output inner dimension has 4 values for the bounding box, 1 for the score, 1 for the selected class,
and the remainder (if any) is the mask data.
The mask data has values for a fixed number of key points. Each key point has an x and y value, and optionally a
confidence value.
input:
box_of_nms_out: output of NMS, shape [num_boxes, 6]
original_image: original image decoded from jpg/png<uint8_t>[H, W, 3<BGR>]
scaled_image: scaled image, but without padding/crop[<uint8_t>[H1, W1, 3<BGR>]
letter_boxed_image: scaled image and with padding/crop[<uint8_t>[H2, W3, 3<BGR>]
nms_step_output: output of SelectBestBoundingBoxesByNMS Step, shape [num_boxes, 6+]
original_image: original image decoded from jpg/png, <uint8_t>[H, W, 3] or [3, H, W]
resized_image: output from Resize pre-processing Step, <uint8_t>[H1, W1, 3] or [3, H1, W1]
letter_boxed_image: output from LetterBox pre-processing Step, <uint8_t>[H2, W2, 3] or [3, H2, W2]
num_key_points: number of key points in each mask data entry, if present. optional.
output:
scaled_box_out: shape [num_boxes, 6] with coordinate mapped to original image.
nms_output_with_scaled_boxes_and_keypoints: input data with boxes and key points scaled to original image.
Args:
name: Optional name of step. Defaults to 'ScaleBoundingBoxes'
num_key_points: Number of key points in mask data. Only required if input has optional mask data.
layout: HWC or CHW. Used to determine where to read the H and W value from the input image shapes.
MUST be the same for all 3 input images.
name: Optional name of step. Defaults to 'ScaleNMSBoundingBoxesAndKeyPoints'
### Ancestors (in MRO)
* pre_post_processing.step.Step
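A sketch of constructing the step for a pose model; the keypoint count and layout are illustrative, and the connections from the original, resized and letter-boxed images are not shown:

```python
# scale NMS output (boxes plus 17 keypoints per box) back to original image co-ordinates
scale_step = ScaleNMSBoundingBoxesAndKeyPoints(num_key_points=17, layout="HWC")
```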
`SelectBestBoundingBoxesByNMS(iou_threshold: float = 0.5, score_threshold: float = 0.67, max_detections: int = 300, name: Optional[str] = None)`
: Non-maximum suppression (NMS) is to filter out redundant bounding boxes.
This step is used to warp the boxes and scores into onnx SelectBestBoundingBoxesByNMS op.
`SelectBestBoundingBoxesByNMS(iou_threshold: Optional[float] = 0.5, score_threshold: Optional[float] = 0.67, max_boxes_per_class: Optional[int] = 100, max_detections: Optional[int] = None, has_mask_data: Optional[bool] = False, name: Optional[str] = None)`
: Non-maximum suppression (NMS) is to select the best bounding boxes.
Input:
boxes: float[num_boxes, 4]
scores: shape float[num_boxes, num_classes]
boxes: float[num_boxes, 4]
scores: float[num_boxes, num_classes]
masks: float[num_boxes, mask_data]. optional
Output:
nms_out: float[_few_num_boxes, 6<coordinate+score+class>]
nms_out: float[_few_num_boxes, <box+score+class+mask_data>]
Args:
Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#SelectBestBoundingBoxesByNMS
for more details about the parameters.
iou_threshold: same as SelectBestBoundingBoxesByNMS op, intersection /union of boxes
score_threshold: If this box's score is lower than score_threshold, it will be removed.
max_detections: max number of boxes to be selected
Args: Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
for more details about the parameters.
iou_threshold: same as NonMaxSuppression op, intersection/union of boxes
score_threshold: If this box's score is lower than score_threshold, it will be removed.
max_boxes_per_class: max number of boxes to be selected per class
max_detections: maximum number of boxes in total. Applied as the last step of processing if specified.
name: Optional name of step. Defaults to 'SelectBestBoundingBoxesByNMS'
### Ancestors (in MRO)
* pre_post_processing.step.Step
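A sketch showing the new parameters; the threshold and limit values are illustrative:

```python
# keep at most 100 boxes per class and 10 overall; has_mask_data=True adds the
# optional 'masks' input so keypoint data flows through to the output
nms_step = SelectBestBoundingBoxesByNMS(iou_threshold=0.5, score_threshold=0.25,
                                        max_boxes_per_class=100, max_detections=10,
                                        has_mask_data=True)
```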
`SplitOutBoxAndScore(num_classes: int = 80, name: Optional[str] = None)`
: Split the output of the model into boxes and scores. This step will also handle the optional object score.
Input shape: <float>{num_boxes, 4/5+num_classes}
`SplitOutBoxAndScoreWithConf(num_classes: int, name: Optional[str] = None)`
: Split the output of the model into boxes and scores, applying the object confidence score.
Input shape: <float>{num_boxes, <4 box co-ords, conf score, num_classes>}
Output shape: <float>{num_boxes, 4}, <float>{num_boxes, num_classes}
|x1,x2,x3,x4, (obj), cls_1, ... cls_num|
|x1,x2,x3,x4, obj_conf, cls_1, ... cls_num|
/\
/ \
|x1,x2,x3,x4| |cls_1, ... clx_num|*(obj)
obj is optional, if it is not present, it will be set to 1.0
This is where 4/5 comes from, '4' represent coordinates and the fifth object probability.
|x1,x2,x3,x4| |cls_1, ... clx_num|*obj_conf
Args:
num_classes: number of classes
name: Optional name of step. Defaults to 'SplitOutBoxAndScore'
name: Optional name of step. Defaults to 'SplitOutBoxAndScoreWithConf'
### Ancestors (in MRO)

View file

@ -186,6 +186,9 @@ class PrePostProcessor:
pre_process_graph.input.append(i)
pre_process_graph.output.append(i)
# connect up the graph input names to the first pre-processing step based on order
self.pre_processors[0]._connect_graph_inputs([vi.name for vi in self._inputs])
for idx, step in enumerate(self.pre_processors):
pre_process_graph = connect_and_run(pre_process_graph, step, self._pre_processor_connections[idx])

Просмотреть файл

@ -48,6 +48,11 @@ class Step(object):
self.input_names[entry.consumer_idx] = entry.producer.output_names[entry.producer_idx]
def _connect_graph_inputs(self, graph_inputs: List[str]):
"Internal method to connect names of the first pre-processor step with the graph inputs"
for i, input_name in enumerate(graph_inputs):
self.input_names[i] = input_name
def apply(self, graph: onnx.GraphProto,
checker_context: onnx.checker.C.CheckerContext,
graph_outputs_to_maintain: List[str]):

View file

@ -6,6 +6,46 @@ from typing import List, Optional
from ..step import Step
class Identity(Step):
"""
ONNX Identity for all inputs to the Step. Used to pass through values as-is to later Steps.
"""
def __init__(self, num_inputs: int = 1, name: Optional[str] = None):
"""
Args:
name: Optional name of step. Defaults to 'Identity'
"""
super().__init__([f"in_{x}" for x in range(0, num_inputs)],
[f"out_{x}" for x in range(0, num_inputs)],
name)
self._num_inputs = num_inputs
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
inputs = []
outputs = []
identity_nodes = []
for i in range(0, self._num_inputs):
input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, i)
inputs.append(f"{input_type_str}[{input_shape_str}] {self.input_names[i]}")
outputs.append(f"{input_type_str}[{input_shape_str}] {self.output_names[i]}")
identity_nodes.append(f"{self.output_names[i]} = Identity({self.input_names[i]})")
identity_node_text = '\n'.join(identity_nodes)
converter_graph = onnx.parser.parse_graph(
f"""\
identities ({', '.join(inputs)}) => ({', '.join(outputs)})
{{
{identity_node_text}
}}
"""
)
return converter_graph
class ReverseAxis(Step):
"""
Reverses the data in an axis by splitting and concatenating in reverse order.
@ -63,6 +103,78 @@ class ReverseAxis(Step):
return reverse_graph
class Split(Step):
"""
ONNX Split
"""
def __init__(self,
num_outputs: int,
axis: Optional[int] = None,
splits: Optional[List[int]] = None,
name: Optional[str] = None):
"""
:param num_outputs: Number of outputs to split the input into. Unequal split is allowed for opset 18+.
:param axis: Axis to split on. Default is 0.
:param splits: Optional length of each output. Sum must equal dim value at 'axis'
:param name: Optional Step name. Defaults to 'Split'
"""
output_names = [f"{name if name else self.__class__.__name__}_{x}" for x in range(0, num_outputs)]
super().__init__(["data"], output_names, name)
self._num_outputs = num_outputs
self._axis = axis if axis else 0
self._splits = splits
if splits and len(splits) != num_outputs:
raise ValueError("Splits length must match num_outputs")
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, 0)
dims = input_shape_str.split(",")
axis = (self._axis + len(dims)) if self._axis < 0 else self._axis
# calculate dim value of axis being split for each output
if self._splits:
split_dim_strs = [str(x) for x in self._splits]
splits_input_name = ", split_sizes"
splits_const = (f"split_sizes = Constant <value = int64[{self._num_outputs}] "
f"{{{', '.join(split_dim_strs)}}}>()")
else:
if dims[axis].isdigit():
split_dim_str = str(int(dims[axis]) // self._num_outputs)
else:
split_dim_str = f"{dims[axis]}_/_{self._num_outputs}"
split_dim_strs = [split_dim_str] * self._num_outputs
splits_input_name = ""
splits_const = ""
split_outputs = []
for i in range(0, self._num_outputs):
dims[axis] = split_dim_strs[i]
split_outputs.append(f"{input_type_str}[{','.join(dims)}] {self.output_names[i]}")
# num_outputs attribute is required if opset 18+ and not providing splits input
num_outputs = ""
if onnx_opset >= 18 and not self._splits:
num_outputs = ", num_outputs = {self._num_outputs}"
split_graph = onnx.parser.parse_graph(
f"""\
split ({input_type_str}[{input_shape_str}] {self.input_names[0]})
=> ({",".join(split_outputs)})
{{
{splits_const}
{",".join(self.output_names)}
= Split <axis={self._axis} {num_outputs}>({self.input_names[0]} {splits_input_name})
}}
"""
)
return split_graph
class Squeeze(Step):
"""
ONNX Squeeze

View file

@ -384,8 +384,9 @@ class Resize(Step):
# Resize-18 has the attribute "not_larger/not_smaller" to specify the resize policy, however
# we want to support older opsets as well.
assert (self.policy_ in ["not_smaller", "not_larger"],
f"Unsupported resize policy of {self.policy_}, must be 'not_smaller' or 'not_larger'")
assert self.policy_ in ["not_smaller", "not_larger"], \
f"Unsupported resize policy of {self.policy_}, must be 'not_smaller' or 'not_larger'"
ratio_resize_func = "ReduceMax"
if self.policy_ == "not_larger":
ratio_resize_func = "ReduceMin"
@ -712,10 +713,12 @@ class LetterBox(Step):
Input shape: <uint8_t>{height, width, 3<BGR>}
target_shape: <uint8_t>{out_height, out_width, 3<BGR>}
layout: HWC or CHW are supported
Output shape: specified by target_shape
"""
def __init__(self, target_shape: Union[int, Tuple[int, int]], fill_value=0, name: Optional[str] = None):
def __init__(self, target_shape: Union[int, Tuple[int, int]], fill_value=0, layout: str = "HWC",
name: Optional[str] = None):
"""
Args:
target_shape: the size of the output image
@ -727,19 +730,32 @@ class LetterBox(Step):
self.target_shape_ = target_shape
self.fill_value_ = fill_value
if layout != "HWC" and layout != "CHW":
raise ValueError("Invalid layout. Only HWC and CHW are supported")
self.layout_ = layout
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
assert len(input0_shape_str.split(',')) == 3, "expected HWC or CHW input"
assert len(input0_shape_str.split(',')) == 3, " expected BGR image"
target_shape = f"{self.target_shape_[0]}, {self.target_shape_[1]}"
target_shape_str = f"{self.target_shape_[0]}, {self.target_shape_[1]}, 3"
if self.layout_ == "HWC":
target_shape_str = f"{target_shape}, 3"
split_input_shape_output = "h, w, c"
concat_input_order = "half_pad_hw, i64_0, remainder_pad_hw, i64_0"
else:
target_shape_str = f"3, {target_shape}"
split_input_shape_output = "c, h, w"
concat_input_order = "i64_0, half_pad_hw, i64_0, remainder_pad_hw"
split_input_shape_attr = "axis = 0"
if onnx_opset >= 18:
# Split now requires the number of outputs to be specified even though that can be easily inferred...
split_input_shape_attr += f", num_outputs = 3"
converter_graph = onnx.parser.parse_graph(
graph_text = (
f"""\
LetterBox (uint8[{input0_shape_str}] {self.input_names[0]})
=> (uint8[{target_shape_str}] {self.output_names[0]})
@ -749,77 +765,63 @@ class LetterBox(Step):
i64_0 = Constant <value = int64[1] {{0}}>()
const_val = Constant <value = uint8[1] {{{self.fill_value_}}}> ()
image_shape = Shape ({self.input_names[0]})
h,w,c = Split <{split_input_shape_attr}> (image_shape)
{split_input_shape_output} = Split <{split_input_shape_attr}> (image_shape)
hw = Concat <axis = 0> (h, w)
pad_hw = Sub (target_size, hw)
half_pad_hw = Div (pad_hw, i64_2)
remainder_pad_hw = Sub (pad_hw, half_pad_hw)
pad_value = Concat <axis = 0> (half_pad_hw, i64_0,remainder_pad_hw,i64_0)
pad_value = Concat <axis = 0> ({concat_input_order})
{self.output_names[0]} = Pad({self.input_names[0]}, pad_value, const_val)
}}
"""
)
converter_graph = onnx.parser.parse_graph(graph_text)
return converter_graph
class SplitOutBoxAndScore(Step):
class SplitOutBoxAndScoreWithConf(Step):
r"""
Split the output of the model into boxes and scores. This step will also handle the optional object score.
Input shape: <float>{num_boxes, 4/5+num_classes}
Split the output of the model into boxes and scores, applying the object confidence score.
Input shape: <float>{num_boxes, <4 box co-ords, conf score, num_classes>}
Output shape: <float>{num_boxes, 4}, <float>{num_boxes, num_classes}
|x1,x2,x3,x4, (obj), cls_1, ... cls_num|
|x1,x2,x3,x4, obj_conf, cls_1, ... cls_num|
/\
/ \
|x1,x2,x3,x4| |cls_1, ... clx_num|*(obj)
obj is optional, if it is not present, it will be set to 1.0
This is where 4/5 comes from, '4' represent coordinates and the fifth object probability.
|x1,x2,x3,x4| |cls_1, ... clx_num|*obj_conf
"""
def __init__(self, num_classes:int = 80, name: Optional[str] = None):
def __init__(self, num_classes: int, name: Optional[str] = None):
"""
Args:
num_classes: number of classes
name: Optional name of step. Defaults to 'SplitOutBoxAndScore'
name: Optional name of step. Defaults to 'SplitOutBoxAndScoreWithConf'
"""
super().__init__(["box_and_score"], ["_pre_boxes", "_pre_scores"], name)
super().__init__(["box_conf_scores"], ["boxes", "scores"], name)
self.num_classes_ = num_classes
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
input_shape_list = input0_shape_str.split(',')
assert len(input_shape_list) == 2, " expected [num_boxes, 4/5+num_classes]"
assert len(input_shape_list) == 2, " expected [num_boxes, 5+num_classes]"
target_shape_str_0 = f"{input_shape_list[0]}, 4"
target_shape_str_1 = f"{input_shape_list[0]}, _{self._step_num}_class"
converter_graph = onnx.parser.parse_graph(
f"""\
SplitOutBoxAndScore (float[{input0_shape_str}] {self.input_names[0]})
=> (float[{target_shape_str_0}] {self.output_names[0]}, float[{target_shape_str_1}] {self.output_names[1]})
SplitOutBoxConfidenceAndScore (float[{input0_shape_str}] {self.input_names[0]})
=> (float[{target_shape_str_0}] {self.output_names[0]},
float[{target_shape_str_1}] {self.output_names[1]})
{{
split_sizes = Constant <value = int64[3] {{4, 1, {self.num_classes_}}}>()
{self.output_names[0]}, conf, orig_scores = Split <axis=-1>({self.input_names[0]}, split_sizes)
i64_neg1 = Constant <value = int64[1] {{-1}}>()
i64_4 = Constant <value = int64[1] {{4}}>()
i64_0 = Constant <value = int64[1] {{0}}>()
fp32_1 = Constant <value = float[1] {{1.0}}>()
i64_classes = Constant <value = int64[1] {{{self.num_classes_}}}>()
out_shape = Shape ({self.input_names[0]})
class_and_coor_dim = Gather (out_shape, i64_neg1)
coor_and_obj = Sub (class_and_coor_dim, i64_classes)
obj_0_or_1 = Sub (coor_and_obj, i64_4)
bool_num_obj_0_or_1 = Cast<to=9>(obj_0_or_1)
box_obj_class_concat = Concat <axis = 0> (i64_4, obj_0_or_1, i64_classes)
boxes_o, scores_obj_o, scores_cls_o = Split <axis = -1> ({self.input_names[0]}, box_obj_class_concat)
scores_obj_not_null = Concat <axis = -1> (scores_obj_o, boxes_o)
coef_obj_cat = Where(bool_num_obj_0_or_1, scores_obj_not_null,fp32_1)
coef_obj = Gather <axis=-1> (coef_obj_cat, i64_0)
scores_o = Mul (scores_cls_o, coef_obj)
{self.output_names[0]} = Identity (boxes_o)
{self.output_names[1]} = Identity (scores_o)
scores_with_conf = Mul(orig_scores, conf)
{self.output_names[1]} = Identity (scores_with_conf)
}}
"""
)
@ -828,33 +830,42 @@ class SplitOutBoxAndScore(Step):
class SelectBestBoundingBoxesByNMS(Step):
"""
Non-maximum suppression (NMS) is to filter out redundant bounding boxes.
This step is used to warp the boxes and scores into onnx SelectBestBoundingBoxesByNMS op.
Non-maximum suppression (NMS) is to select the best bounding boxes.
Input:
boxes: float[num_boxes, 4]
scores: shape float[num_boxes, num_classes]
boxes: float[num_boxes, 4]
scores: float[num_boxes, num_classes]
masks: float[num_boxes, mask_data]. optional
Output:
nms_out: float[_few_num_boxes, 6<coordinate+score+class>]
nms_out: float[_few_num_boxes, <box+score+class+mask_data>]
"""
def __init__(self, iou_threshold:float = 0.5, score_threshold:float = 0.67,
max_detections:int = 300, name: Optional[str] = None):
def __init__(self,
iou_threshold: Optional[float] = 0.5,
score_threshold: Optional[float] = 0.67,
max_boxes_per_class: Optional[int] = 100,
max_detections: Optional[int] = None,
has_mask_data: Optional[bool] = False, name: Optional[str] = None):
"""
Args:
Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#SelectBestBoundingBoxesByNMS
for more details about the parameters.
iou_threshold: same as SelectBestBoundingBoxesByNMS op, intersection /union of boxes
score_threshold: If this box's score is lower than score_threshold, it will be removed.
max_detections: max number of boxes to be selected
Args: Please refer to https://github.com/onnx/onnx/blob/main/docs/Operators.md#NonMaxSuppression
for more details about the parameters.
iou_threshold: same as NonMaxSuppression op, intersection/union of boxes
score_threshold: If this box's score is lower than score_threshold, it will be removed.
max_boxes_per_class: max number of boxes to be selected per class
max_detections: maximum number of boxes in total. Applied as the last step of processing if specified.
name: Optional name of step. Defaults to 'SelectBestBoundingBoxesByNMS'
"""
super().__init__(["boxes", "scores"], ["nms_out"], name)
inputs = ["boxes", "scores"]
if has_mask_data:
inputs.append("masks")
super().__init__(inputs, ["nms_out"], name)
self.iou_threshold_ = iou_threshold
self.score_threshold_ = score_threshold
self.max_boxes_per_class_ = max_boxes_per_class
self.max_detections_ = max_detections
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
input0_type_str, input0_shape_str = self._get_input_type_and_shape_strs(graph, 0)
input1_type_str, input1_shape_str = self._get_input_type_and_shape_strs(graph, 1)
@ -862,104 +873,264 @@ class SelectBestBoundingBoxesByNMS(Step):
input0_shape_list = input0_shape_str.split(',')
assert len(input0_shape_list) == 2, " expected [num_boxes, 4]"
target_shape_str = f"_{self._step_num}_nms_boxes, 6"
has_mask_input = len(self.input_names) == 3
reduce_score = '(score_select_nm,i64_neg1)' if onnx_opset >= 18 else '<axes=[-1]>(score_select_nm)'
input_2 = ""
mask_i = ""
mask_select = ""
concat_for_output = "boxes_select, score_select, class_select"
output_size_str = "6"
# reduce_score picks the class with the best score for the selected box
reduce_score = '(score_select_nm, i64_neg1)' if onnx_opset >= 18 else '<axes=[-1]>(score_select_nm)'
converter_graph = onnx.parser.parse_graph(
f"""\
SelectBestBoundingBoxesByNMS (float[{input0_shape_str}] {self.input_names[0]},float[{input1_shape_str}] {self.input_names[1]})
=> (float[{target_shape_str}] {self.output_names[0]})
if has_mask_input:
input2_type_str, input2_shape_str = self._get_input_type_and_shape_strs(graph, 2)
input_2 = f", float[{input2_shape_str}] {self.input_names[2]}"
mask_i = f"masks_i = Identity({self.input_names[2]})"
mask_select = "mask_select = Gather <axis=0>(masks_i, box_idxs)"
concat_for_output += ", mask_select"
mask_size_str = input2_shape_str.split(",")[-1]
if mask_size_str.isnumeric():
output_size_str = str(6 + int(mask_size_str))
else:
output_size_str = f"_step{self._step_num}_6_+_mask_size"
if self.max_detections_:
# squeeze scores from [num_results, 1] to [num_results]
# use TopK to find the best scores for the selected boxes, but only if the number of results is
# greater than max_detections, and there are results (otherwise calling TopK is invalid).
# We sort the selected indices to maintain the original ordering for consistency when TopK isn't required
apply_max_detections = \
f"""
max_detections = Constant <value = int64[1] {{{self.max_detections_}}}>()
num_results = Shape(scores)
num_results_less_than_max = Less(num_results, max_detections)
k = Where(num_results_less_than_max, num_results, max_detections)
have_results = Greater(k, i64_0)
final_results = If<
then_branch=then_graph() =>
(float[_{self._step_num}_selected_boxes, {output_size_str}] then_output)
{{
topk_scores, topk_i = TopK<axis = 0>(scores, k)
# use Unique to sort. no onnx op seems to provide that directly.
sorted_topk_i = Unique<sorted=1>(topk_i)
then_output = Gather<axis = 0>(merged_results, sorted_topk_i)
}},
else_branch=else_graph() =>
(float[_{self._step_num}_selected_boxes, {output_size_str}] else_output)
{{
else_output = Identity(merged_results)
}}>
(have_results)
"""
else:
apply_max_detections = "final_results = Identity(merged_results)"
graph_text = \
f"""
SelectBestBoundingBoxesByNMS (float[{input0_shape_str}] {self.input_names[0]},
float[{input1_shape_str}] {self.input_names[1]}
{input_2})
=> (float[_{self._step_num}_selected_boxes, {output_size_str}] {self.output_names[0]})
{{
i64_2 = Constant <value = int64[1] {{2}}>()
i64_neg1 = Constant <value = int64[1] {{-1}}>()
i64_0 = Constant <value = int64[1] {{0}}>()
i64_1 = Constant <value = int64[1] {{1}}>()
i64_max_obj = Constant <value = int64[1] {{{self.max_detections_}}}>()
i64_neg1 = Constant <value = int64[1] {{-1}}>()
fp32_iou_th = Constant <value = float[1] {{{self.iou_threshold_}}}>()
fp32_score_th = Constant <value = float[1] {{{self.score_threshold_}}}>()
i64_2 = Constant <value = int64[1] {{2}}>()
i64_1_2 = Constant <value = int64[2] {{1, 2}}>()
max_per_class = Constant <value = int64[1] {{{self.max_boxes_per_class_}}}>()
iou_th = Constant <value = float[1] {{{self.iou_threshold_}}}>()
score_th = Constant <value = float[1] {{{self.score_threshold_}}}>()
boxes_i = Identity ({self.input_names[0]})
boxes_i = Identity({self.input_names[0]})
scores_i = Identity({self.input_names[1]})
{mask_i}
scores_c_b = Transpose<perm=[1,0]>(scores_i)
batch_boxes = Unsqueeze(boxes_i, i64_0)
batch_scores = Unsqueeze(scores_c_b, i64_0)
nmsbox = NonMaxSuppression<center_point_box =1>(batch_boxes, batch_scores, i64_max_obj,fp32_iou_th,fp32_score_th)
classes_i64 = Gather <axis=-1>(nmsbox,i64_1)
class_select = Cast <to = 1>(classes_i64)
# NMS returns [num_selected_boxes, 3] where each entry is [batch, class idx, box idx]
nmsbox = NonMaxSuppression<center_point_box=1>(batch_boxes, batch_scores, max_per_class,
iou_th, score_th)
# extract class values
nms_classes = Gather<axis=-1>(nmsbox, i64_1)
class_select = Cast<to = 1>(nms_classes)
boxes_idx_us = Gather <axis=-1>(nmsbox,i64_2)
boxes_idx = Squeeze(boxes_idx_us, i64_neg1)
boxes_select = Gather <axis=0>(boxes_i, boxes_idx)
# extract box indexes and select box info using them.
nms_boxes = Gather<axis=-1>(nmsbox, i64_2)
box_idxs = Squeeze(nms_boxes, i64_neg1)
boxes_select = Gather<axis=0>(boxes_i, box_idxs)
score_select_nm = Gather <axis=0>(scores_i, boxes_idx)
score_select = ReduceMax{reduce_score}
{self.output_names[0]} = Concat <axis = -1> (boxes_select, score_select, class_select)
# scores_c_b is [classes, boxes]
# box_class_idxs is [selected_boxes, 2] where the 2 values are class idx, box idx
class_box_idxs = Gather<axis=-1>(nmsbox, i64_1_2)
scores = GatherND(scores_c_b, class_box_idxs)
score_select = Unsqueeze(scores, i64_neg1)
{mask_select}
merged_results = Concat <axis = -1> ({concat_for_output})
{apply_max_detections}
{self.output_names[0]} = Identity(final_results)
}}
"""
)
converter_graph = onnx.parser.parse_graph(graph_text)
return converter_graph
class ScaleBoundingBoxes(Step):
class ScaleNMSBoundingBoxesAndKeyPoints(Step):
"""
Mapping boxes coordinate to scale in original image.
The coordinate of boxes from detection model is relative to the input image of network,
image is scaled and padded/cropped. So we need to do a linear mapping to get the real coordinate of original image.
Scale bounding box and key point coordinates in optional mask data to original image.
Input image goes through Resize and LetterBox steps during pre-processing (in that order), and the output of this
is what the original model runs against.
To display the predictions on the original image we need to apply the reverse size changes to the co-ordinates
of the bounding boxes.
nms_step_output inner dimension has 4 values for the bounding box, 1 for the score, 1 for the selected class,
and the remainder (if any) is the mask data.
The mask data has values for a fixed number of key points. Each key point has an x and y value, and optionally a
confidence value.
input:
box_of_nms_out: output of NMS, shape [num_boxes, 6]
original_image: original image decoded from jpg/png<uint8_t>[H, W, 3<BGR>]
scaled_image: scaled image, but without padding/crop[<uint8_t>[H1, W1, 3<BGR>]
letter_boxed_image: scaled image and with padding/crop[<uint8_t>[H2, W3, 3<BGR>]
nms_step_output: output of SelectBestBoundingBoxesByNMS Step, shape [num_boxes, 6+]
original_image: original image decoded from jpg/png, <uint8_t>[H, W, 3] or [3, H, W]
resized_image: output from Resize pre-processing Step, <uint8_t>[H1, W1, 3] or [3, H1, W1]
letter_boxed_image: output from LetterBox pre-processing Step, <uint8_t>[H2, W2, 3] or [3, H2, W2]
num_key_points: number of key points in each mask data entry, if present. optional.
output:
scaled_box_out: shape [num_boxes, 6] with coordinate mapped to original image.
nms_output_with_scaled_boxes_and_keypoints: input data with boxes and key points scaled to original image.
"""
def __init__(self, name: Optional[str] = None):
def __init__(self, num_key_points: Optional[int] = 0, layout: Optional[str] = "HWC", name: Optional[str] = None):
"""
Args:
name: Optional name of step. Defaults to 'ScaleBoundingBoxes'
num_key_points: Number of key points in mask data. Only required if input has optional mask data.
layout: HWC or CHW. Used to determine where to read the H and W value from the input image shapes.
MUST be the same for all 3 input images.
name: Optional name of step. Defaults to 'ScaleNMSBoundingBoxesAndKeyPoints'
"""
super().__init__(["box_of_nms_out", "original_image", "scaled_image",
"letter_boxed_image"], ["scaled_box_out"], name)
super().__init__(["nms_step_output", "original_image", "resized_image", "letter_boxed_image"],
["nms_output_with_scaled_boxes_and_keypoints"], name)
self._num_key_points = num_key_points
if layout != "HWC" and layout != "CHW":
raise ValueError("Invalid layout. Only HWC and CHW are supported")
self.layout_ = layout
def _create_graph_for_step(self, graph: onnx.GraphProto, onnx_opset: int):
graph_input_param = []
target_shape = []
for idx,input_name in enumerate(self.input_names):
graph_input_params = []
for idx, input_name in enumerate(self.input_names):
input_type_str, input_shape_str = self._get_input_type_and_shape_strs(graph, idx)
graph_input_param.append(f"{input_type_str}[{input_shape_str}] {input_name}")
target_shape.append(input_shape_str)
graph_input_param = ','.join(graph_input_param)
graph_input_params.append(f"{input_type_str}[{input_shape_str}] {input_name}")
target_shape = target_shape[:1]
graph_output_param = []
for idx,output_name in enumerate(self.output_names):
graph_output_param.append(f"float[{target_shape[idx]}] {output_name}")
graph_output_param = ','.join(graph_output_param)
graph_input_params = ', '.join(graph_input_params)
def split_num_ouputs(num_outputs: int):
split_input_shape_attr= ''
if self.layout_ == "HWC":
orig_image_h_w_c = "oh, ow, oc"
scaled_image_h_w_c = "sh, sw, sc"
letterboxed_image_h_w_c = "lh, lw, lc"
else:
orig_image_h_w_c = "oc, oh, ow"
scaled_image_h_w_c = "sc, sh, sw"
letterboxed_image_h_w_c = "lc, lh, lw"
def split_num_outputs(num_outputs: int):
split_input_shape_attr = ''
if onnx_opset >= 18:
split_input_shape_attr = f", num_outputs = {num_outputs}"
return split_input_shape_attr
converter_graph = onnx.parser.parse_graph(
nms_output_type_str, nms_output_shape_str = self._get_input_type_and_shape_strs(graph, 0)
nms_output_shape = nms_output_shape_str.split(',')
data_size_per_result = nms_output_shape[-1]
if not data_size_per_result.isnumeric():
# this should be known when adding pre-processing
raise ValueError("Shape of input must have numeric value for the mask data size")
data_num_splits = 3 # splits of nms data into box[:2], box[2:4] , score+class, [mask]
data_split_sizes = "2, 2, 2" # sizes of the splits
score_class_masks = "score_class" # output name/s for trailing output/s from Split
keypoint_processing = "" # operators to process the keypoints
scaled_keypoints = "" # optional output from keypoint scaling
data_size = int(data_size_per_result)
if data_size > 6:
# we have mask data to split out
data_num_splits = 4
keypoint_data_size = data_size - 6
data_split_sizes += f", {keypoint_data_size}"
score_class_masks = "score_class, masks"
scaled_keypoints = ", scaled_keypoints"
values_per_keypoint = int(keypoint_data_size / self._num_key_points)
reshape_keypoints_to = ",".join([str(self._num_key_points), str(values_per_keypoint)])
if keypoint_data_size > 2:
# split into xy and conf
keypoints_xy_and_conf_from_keypoints = \
f"""
keypoints_split_sizes = Constant <value = int64[2] {{2, {values_per_keypoint - 2}}}>()
keypoints_xy, conf = Split <axis = -1>(keypoints, keypoints_split_sizes)
"""
# need to re-combine after scaling
scaled_keypoints_and_conf = "scaled_keypoints_and_conf = Concat <axis=-1>(scaled_keypoints_xy, conf)"
else:
# use the keypoint data as-is as we don't have 'conf' data to split out
keypoints_xy_and_conf_from_keypoints = "keypoints_xy = Identity(keypoints)"
scaled_keypoints_and_conf = "scaled_keypoints_and_conf = Identity(scaled_keypoints_xy)"
keypoint_processing = \
f"""
reshape_keypoints_to = Constant <value = int64[2] {{{reshape_keypoints_to}}}>()
input_shape = Shape ({self.input_names[0]})
i64_0 = Constant <value = int64[1] {{0}}>()
num_boxes = Gather <axis=0>(input_shape, i64_0)
reshape_masks_to = Concat<axis=-1> (num_boxes, reshape_keypoints_to)
keypoints = Reshape(masks, reshape_masks_to)
{keypoints_xy_and_conf_from_keypoints}
offset_keypoints_xy = Sub (keypoints_xy, f_half_pad_wh)
scaled_keypoints_xy = Mul (offset_keypoints_xy, ratios)
{scaled_keypoints_and_conf}
orig_shape = Shape(masks)
scaled_keypoints = Reshape(scaled_keypoints_and_conf, orig_shape)
"""
graph_text = \
f"""\
ScaleBoundingBoxes ({graph_input_param})
=> ({graph_output_param})
ScaleNMSBoundingBoxesAndKeyPoints
({graph_input_params}) => ({nms_output_type_str}[{nms_output_shape_str}] {self.output_names[0]})
{{
i64_2 = Constant <value = int64[1] {{2}}>()
data_split_sizes = Constant <value = int64[{data_num_splits}] {{{data_split_sizes}}}>()
boxes_xy, boxes_wh_or_xy, {score_class_masks} = Split <axis=-1>({self.input_names[0]}, data_split_sizes)
ori_shape = Shape ({self.input_names[1]})
scaled_shape = Shape ({self.input_names[2]})
lettered_shape = Shape ({self.input_names[3]})
oh,ow,oc = Split <axis = 0 {split_num_ouputs(3)}> (ori_shape)
sh,sw,sc = Split <axis = 0 {split_num_ouputs(3)}> (scaled_shape)
lh,lw,lc = Split <axis = 0 {split_num_ouputs(3)}> (lettered_shape)
{orig_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (ori_shape)
{scaled_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (scaled_shape)
{letterboxed_image_h_w_c} = Split <axis = 0 {split_num_outputs(3)}> (lettered_shape)
swh = Concat <axis = -1> (sw,sh)
lwh = Concat <axis = -1> (lw,lh)
@ -971,14 +1142,16 @@ class ScaleBoundingBoxes(Step):
half_pad_wh = Div (pad_wh, i64_2)
f_half_pad_wh = Cast <to = 1> (half_pad_wh)
boxes_xy,boxes_wh_orxy,boxes_score_class = Split <axis=-1 {split_num_ouputs(3)}>({self.input_names[0]})
offset_boxes_xy = Sub (boxes_xy, f_half_pad_wh)
restored_boxes = Concat <axis=-1> (offset_boxes_xy, boxes_wh_orxy)
scaled_boxes_coor = Mul (restored_boxes, ratios)
restored_boxes_res = Concat <axis=-1> (scaled_boxes_coor, boxes_score_class)
{self.output_names[0]} = Identity (restored_boxes_res)
restored_boxes = Concat <axis=-1> (offset_boxes_xy, boxes_wh_or_xy)
scaled_boxes = Mul (restored_boxes, ratios)
{keypoint_processing}
{self.output_names[0]} = Concat <axis=-1> (scaled_boxes, score_class {scaled_keypoints})
}}
"""
)
return converter_graph
converter_graph = onnx.parser.parse_graph(graph_text)
return converter_graph

View file

@ -17,7 +17,7 @@ from onnxruntime_extensions import get_library_path
from onnxruntime_extensions.tools import add_pre_post_processing_to_model as add_ppp
from onnxruntime_extensions.tools import add_HuggingFace_CLIPImageProcessor_to_model as add_clip_feature
from onnxruntime_extensions.tools import pre_post_processing as pre_post_processing
from onnxruntime_extensions.tools.pre_post_processing.steps import *
from onnxruntime_extensions.tools.pre_post_processing import *
script_dir = os.path.dirname(os.path.realpath(__file__))
@ -579,23 +579,35 @@ class TestToolsAddPrePostProcessingToModel(unittest.TestCase):
image_ref = np.frombuffer(open(output_img, 'rb').read(), dtype=np.uint8)
self.assertEqual((image_ref == output).all(), True)
def create_pipeline_and_run_for_nms(self, output_model: Path, length: int,
iou_threshold: float = 0.5,
score_threshold: float = 0.7,
max_detections: int = 10):
def _create_pipeline_and_run_for_nms(self, output_model: Path,
has_conf_value: bool,
iou_threshold: float = 0.5,
score_threshold: float = 0.7,
max_detections: int = 100,
max_boxes_per_class: int = 100,
num_classes: int = 1):
import onnx
create_named_value = pre_post_processing.utils.create_named_value
inputs = [create_named_value("box_and_score", onnx.TensorProto.FLOAT, ["num_boxes", length])]
length = (5 if has_conf_value else 4) + num_classes
# [ num_boxes, <4 points for box, optional conf, one score per class> ]
inputs = [create_named_value("_input", onnx.TensorProto.FLOAT, ["num_boxes", length])]
onnx_opset = 16
pipeline = pre_post_processing.PrePostProcessor(inputs, onnx_opset)
pipeline.add_post_processing([
SplitOutBoxAndScore(num_classes=1),
SelectBestBoundingBoxesByNMS(iou_threshold=iou_threshold, score_threshold=score_threshold,
max_detections=max_detections),
])
if has_conf_value:
pipeline.add_post_processing([
SplitOutBoxAndScoreWithConf(num_classes=num_classes),
SelectBestBoundingBoxesByNMS(iou_threshold=iou_threshold, score_threshold=score_threshold,
max_boxes_per_class=max_boxes_per_class, max_detections=max_detections),
])
else:
pipeline.add_post_processing([
# split the 4 bounding box co-ords from the class scores
Split(num_outputs=2, axis=-1, splits=[4, num_classes]),
SelectBestBoundingBoxesByNMS(iou_threshold=iou_threshold, score_threshold=score_threshold,
max_boxes_per_class=max_boxes_per_class, max_detections=max_detections),
])
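        # For illustration with num_classes=1, each input row looks like:
        #   has_conf_value=True  -> [cx, cy, w, h, conf, class0_score]; SplitOutBoxAndScoreWithConf
        #                           applies the box confidence to the class scores before NMS
        #   has_conf_value=False -> [cx, cy, w, h, class0_score]; a plain Split is sufficient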
graph_def = onnx.parser.parse_graph(
f"""\
@ -615,7 +627,7 @@ class TestToolsAddPrePostProcessingToModel(unittest.TestCase):
def test_NMS_and_drawing_box_without_confOfObj(self):
output_model = (self.temp4onnx / "nms.onnx").resolve()
self.create_pipeline_and_run_for_nms(output_model, iou_threshold=0.9, length=5)
self._create_pipeline_and_run_for_nms(output_model, iou_threshold=0.9, has_conf_value=False)
input_data = [
[0, 0, 240, 240, 0.75],
[10, 10, 240, 240, 0.75],
@ -635,7 +647,7 @@ class TestToolsAddPrePostProcessingToModel(unittest.TestCase):
def test_NMS_and_drawing_box_with_confOfObj(self):
output_model = (self.temp4onnx / "nms.onnx").resolve()
self.create_pipeline_and_run_for_nms(output_model, iou_threshold=0.9, score_threshold=0.5, length=6)
self._create_pipeline_and_run_for_nms(output_model, iou_threshold=0.9, score_threshold=0.5, has_conf_value=True)
input_data = [
[0, 0, 240, 240, 0.75, 0.9],
[10, 10, 240, 240, 0.75, 0.9],
@ -676,12 +688,302 @@ class TestToolsAddPrePostProcessingToModel(unittest.TestCase):
idx = 0
for iou_threshold in [0.9, 0.75, 0.5]:
for score_threshold in [0.5, 0.8, 0.9]:
self.create_pipeline_and_run_for_nms(
output_model, iou_threshold=iou_threshold, score_threshold=score_threshold, length=6)
self._create_pipeline_and_run_for_nms(
output_model, iou_threshold=iou_threshold, score_threshold=score_threshold, has_conf_value=True)
out = get_model_output()
self.assertEqual(out.size, expected_size[idx])
idx += 1
def test_NMS_max_detections(self):
def run_test(max_per_class, max_overall):
output_model = (self.temp4onnx / "nms_max_det.onnx").resolve()
self._create_pipeline_and_run_for_nms(output_model, has_conf_value=False, iou_threshold=0.95, num_classes=2,
max_boxes_per_class=max_per_class, max_detections=max_overall)
input_data = [
[25, 25, 10, 10, 0.75, 0.85],
[100, 100, 10, 10, 0.91, 0.72],
[25, 150, 10, 10, 0.83, 0.93],
[150, 150, 10, 10, 0.87, 0.77],
]
input_data = np.array(input_data, dtype=np.float32)
num_classes = 2
# max results is returning both classes for every bounding box
num_to_select = min(max_overall, num_classes * len(input_data))
num_selected = 0
num_selected_per_class = [0 for i in range(0, num_classes)]
results_expected = [[] for i in range(0, num_classes)]
scores = input_data[:, -2:].copy() # copy as we set values to 0 as we go along
# pick the initial set of results based on score
cur_result = 0
while num_selected < num_to_select and cur_result < scores.size:
cur_result += 1 # we may run out of results before we select enough
expected = []
best_score = scores.max()
idx = int(scores.argmax() / num_classes) # find row best score came from. num_classes entries per row.
selected_class = np.where(scores[idx] == best_score)[0][0] # find index of best score
scores[idx][selected_class] = 0. # set the score to 0 so it doesn't get selected again
if num_selected_per_class[selected_class] == max_per_class:
continue
box = np.array(input_data[idx][:4])
expected += box.tolist()
expected.append(best_score)
expected.append(selected_class)
results_expected[selected_class].append(expected)
num_selected_per_class[selected_class] += 1
num_selected += 1
so = ort.SessionOptions()
so.register_custom_ops_library(get_library_path())
ort_sess = ort.InferenceSession(str(output_model), providers=['CPUExecutionProvider'], sess_options=so)
# flatten the per-class entries from
# {num_classes, num selected results, result size} to {num_classes * num_results, result size}
results_expected = [np.asarray(entry) for entry in results_expected if len(entry) > 0]
results_expected = np.concatenate(results_expected).reshape((-1, 6))
outputs = ort_sess.run(None, {'_input': input_data})
results_actual = outputs[0]
self.assertEqual(results_expected.shape, results_actual.shape)
compared = np.isclose(results_expected, results_actual)
self.assertTrue(compared.all(),
msg=f"\nExpected={results_expected}\nActual={results_actual}\nCompared={compared}")
run_test(100, 3) # max overall trims
run_test(1, 100) # max per class trims
run_test(1, 1) # max per class and max overall trim
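        # With 4 well-separated boxes (nothing is suppressed at iou_threshold=0.95) and 2 classes there are
        # 8 candidate results, so roughly: run_test(100, 3) keeps 3 (overall cap), run_test(1, 100) keeps 2
        # (one per class), and run_test(1, 1) keeps 1 (both caps apply).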
# Create pipeline to run NMS and scaling.
# Scaling should handle converting back to co-ordinates in the original image that was resized and letterboxed
def _create_pipeline_and_run_for_nms_and_scaling(self, output_model: Path,
orig_image_shape: List[int], # 3 dims, HWC or CHW
resized_image_shape: List[int],
letterboxed_image_shape: List[int],
num_classes: int = 1,
has_key_points: bool = False,
key_points_have_conf: bool = False,
):
# channels are 3 so infer layout from shape
layout = "HWC" if orig_image_shape[-1] == 3 else "CHW"
mask_data_size = 0
if has_key_points:
# 3 keypoints, each with x and y values and an optional conf value
mask_data_size = 3 * (3 if key_points_have_conf else 2)
result_data_size = 4 + num_classes + mask_data_size
# create graph to provide outputs for post-processing
inputs = [utils.create_named_value("results", onnx.TensorProto.FLOAT, ["num_boxes", result_data_size]),
utils.create_named_value("orig_img", onnx.TensorProto.UINT8, orig_image_shape),
utils.create_named_value("resized_img", onnx.TensorProto.UINT8, resized_image_shape),
utils.create_named_value("letterboxed_img", onnx.TensorProto.UINT8, letterboxed_image_shape),
]
graph_input_strings = [f"float[num_boxes, {result_data_size}] results",
f"uint8[{','.join([str(i) for i in orig_image_shape])}] orig_img",
f"uint8[{','.join([str(i) for i in resized_image_shape])}] resized_img",
f"uint8[{','.join([str(i) for i in letterboxed_image_shape])}] letterboxed_img",
]
graph_output_strings = [s + "_out" for s in graph_input_strings]
graph_nodes = "\n".join([f"{input.name}_out = Identity({input.name})" for input in inputs])
onnx_opset = 16
graph_text = \
f"""pass_through ({', '.join(graph_input_strings)}) => ({', '.join(graph_output_strings)})
{{
{graph_nodes}
}}"""
graph_def = onnx.parser.parse_graph(graph_text)
onnx_import = onnx.helper.make_operatorsetid('', onnx_opset)
ir_version = onnx.helper.find_min_ir_version_for([onnx_import])
input_model = onnx.helper.make_model_gen_version(graph_def, opset_imports=[onnx_import], ir_version=ir_version)
# if there is mask data containing keypoints we need to split that out
splits = [4, num_classes]
if has_key_points:
splits.append(mask_data_size)
pipeline = pre_post_processing.PrePostProcessor(inputs, onnx_opset)
post_processing = [
# pass through model inputs via a Step so the original, resized and letterboxed shapes are available
# to use in the IoMapEntry for scaling
Identity(num_inputs=4, name="InputsPassThrough"),
Split(num_outputs=len(splits), axis=1, splits=splits),
SelectBestBoundingBoxesByNMS(iou_threshold=0.7, score_threshold=0.25, has_mask_data=has_key_points),
# Scale boxes and key point coords back to original image. Mask data has 3 key points per box.
(ScaleNMSBoundingBoxesAndKeyPoints(num_key_points=3, layout=layout),
[
# A default connection from SelectBestBoundingBoxesByNMS for input 0
# A connection from original image
# A connection from the resized image
# A connection from the LetterBoxed image
# We use the images to calculate the scale factor and offset.
# With scale and offset, we can scale the bounding box and key points back to the original image.
utils.IoMapEntry("InputsPassThrough", producer_idx=1, consumer_idx=1),
utils.IoMapEntry("InputsPassThrough", producer_idx=2, consumer_idx=2),
utils.IoMapEntry("InputsPassThrough", producer_idx=3, consumer_idx=3),
]),
]
pipeline.add_post_processing(post_processing)
new_model = pipeline.run(input_model)
onnx.save_model(new_model, str(output_model))
def _run_nms_scaling_test(self, channels_last: bool = True, num_classes: int = 1,
has_key_points: bool = False, key_points_have_conf: bool = False):
model_name = (f"nms_{'HWC' if channels_last else 'CHW'}_c{num_classes}_"
f"kp{has_key_points}_kpc{key_points_have_conf}")
output_model = (self.temp4onnx / f"{model_name}.onnx").resolve()
if channels_last:
h_dim, w_dim = 0, 1
orig_image_shape = [400, 500, 3] # HWC
resized_image_shape = [320, 400, 3] # Resize to not_smaller 400 x 400
letterboxed_image_shape = [400, 400, 3] # letterbox to 400 x 400
else:
h_dim, w_dim = 1, 2
orig_image_shape = [3, 400, 500]
resized_image_shape = [3, 320, 400]
letterboxed_image_shape = [3, 400, 400]
scale_ratio = 500 / 400 # we kept the aspect ratio
# width and height padding to apply to the first 2 values of the box as the format is XYWH
# / 2 as the letterbox padding is split evenly between both sides of the image
half_pad_h = (letterboxed_image_shape[h_dim] - resized_image_shape[h_dim]) / 2
half_pad_w = (letterboxed_image_shape[w_dim] - resized_image_shape[w_dim]) / 2
letterbox_padding = np.array([half_pad_w, half_pad_h, 0, 0], dtype=np.float32)
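        # with the shapes above: scale_ratio = 500 / 400 = 1.25, half_pad_h = (400 - 320) / 2 = 40,
        # half_pad_w = (400 - 400) / 2 = 0, so letterbox_padding = [0, 40, 0, 0]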
# default score threshold is 0.25 so this will ensure no results are thrown away due to the score
np.random.seed(123)
# scores0 = np.random.uniform(low=0.5, high=1.0, size=num_classes)
# scores1 = scores0 - 0.1 # first result should win if picking a single result and be first in NMS output
# scores = [scores0, scores1]
scores = np.random.uniform(low=0.5, high=1.0, size=(2, num_classes))
if has_key_points:
if key_points_have_conf:
keypoints = [[5., 5., .8, 10., 10., .8, 60., 60., .9],
[60., 60., .9, 80., 80., .6, 150., 120., .5]]
else:
keypoints = [[5., 5., 10., 10., 60., 60.],
[60., 60., 80., 80., 150., 120.]]
else:
keypoints = [[], []]
# 4 for box, num_classes scores, key point data
input_data = [
[50., 50., 100., 100., *scores[0], *keypoints[0]],
[80., 80., 100., 100., *scores[1], *keypoints[1]],
]
input_data = np.array(input_data, dtype=np.float32)
model_inputs = {
"results": input_data,
"orig_img": np.ones(orig_image_shape, dtype=np.uint8),
"resized_img": np.ones(resized_image_shape, dtype=np.uint8),
"letterboxed_img": np.ones(letterboxed_image_shape, dtype=np.uint8),
}
# for each result, manually scale box and keypoints to validate. check for correct class and score info.
# we aren't limiting results based on max classes per box or max overall matches so we expect all classes
# to be returned as results for both bounding boxes.
# the NMS output is sorted by class first and score second, so we assemble the results on a per-class basis
# and flatten to compare with the actual results
results_expected = [[] for i in range(0, num_classes)]
num_selected = 0
while num_selected < num_classes * len(input_data):
expected = []
best_score = scores.max()
idx = int(scores.argmax() / num_classes) # find row best score came from. num_classes entries per row.
box = np.array(input_data[idx][:4])
box -= letterbox_padding
box *= scale_ratio
expected += box.tolist()
selected_class = np.where(scores[idx] == best_score)[0][0] # find index of best score
expected.append(best_score)
expected.append(selected_class)
# set the score to 0 so it doesn't get selected again
scores[idx][selected_class] = 0.
# keypoints
values_per_entry = 3 if key_points_have_conf else 2
for kp_idx, kp in enumerate(input_data[idx][4 + num_classes:]):
if kp_idx % values_per_entry == 0:
# x coord
expected.append((kp - letterbox_padding[0]) * scale_ratio)
elif kp_idx % values_per_entry == 1:
# y coord
expected.append((kp - letterbox_padding[1]) * scale_ratio)
else:
assert key_points_have_conf
# confidence score should match input
expected.append(keypoints[idx][kp_idx])
results_expected[selected_class].append(expected)
num_selected += 1
self._create_pipeline_and_run_for_nms_and_scaling(
output_model, orig_image_shape, resized_image_shape, letterboxed_image_shape,
num_classes, has_key_points, key_points_have_conf)
so = ort.SessionOptions()
so.register_custom_ops_library(get_library_path())
ort_sess = ort.InferenceSession(str(output_model), providers=['CPUExecutionProvider'], sess_options=so)
outputs = ort_sess.run(None, model_inputs)
results_actual = outputs[0]
# flatten the per-class entries. we are returning results for all classes of both bounding boxes so should be
# equal to scores.size
# {num_classes, num results, result size} to {num_classes * num_results, result size}
results_expected = np.asarray(results_expected).reshape((scores.size, -1))
self.assertEqual(results_expected.shape, results_actual.shape)
compared = np.isclose(results_expected, results_actual)
self.assertTrue(compared.all(),
msg=f"\nExpected={results_expected}\nActual={results_actual}\nCompared={compared}")
def test_NMS_with_scaling_and_keypoints(self):
"""
Test selecting bounding boxes with NMS and scaling the results.
Include testing of when there are key points in mask data in the results (used by pose models)
"""
for channels_last in [True, False]:
for num_classes in [1, 4]:
for has_key_points in [True, False]:
# it only makes sense to have keypoints when there's a single class as the keypoints are
# per bounding box. e.g. if you have a bounding box and classes of person and dog, each class would
# require totally different keypoints
if not has_key_points or num_classes == 1:
msg = (f"Running test with layout={'HWC' if channels_last else 'CHW'} "
f"num_classes={num_classes} has_key_points={has_key_points}")
print(msg)
self._run_nms_scaling_test(channels_last, num_classes, has_key_points)
if has_key_points:
key_points_have_conf = True
print(msg + " key_points_have_conf=True")
self._run_nms_scaling_test(channels_last, num_classes, has_key_points, key_points_have_conf)
def test_FastestDet(self):
# https://github.com/dog-qiuqiu/FastestDet
# a minor fix is needed to accommodate the yolo output format, which includes the bounding box regression values.
@ -689,7 +991,7 @@ class TestToolsAddPrePostProcessingToModel(unittest.TestCase):
output_model = os.path.join(test_data_dir, "FastestDet.updated.onnx")
input_image_path = os.path.join(test_data_dir, "wolves.jpg")
add_ppp.yolo_detection(Path(input_model), Path(output_model),input_shape=(352,352))
add_ppp.yolo_detection(Path(input_model), Path(output_model), input_shape=(352, 352))
so = ort.SessionOptions()
so.register_custom_ops_library(get_library_path())

Binary file added: tutorials/data/bus.jpg (134 KiB, not displayed)

View file

@ -5,7 +5,8 @@ import numpy
from pathlib import Path
import onnxruntime_extensions
def get_yolov8_model(onnx_model_name: str):
def get_yolo_model(version: int, onnx_model_name: str):
# install yolov8
from pip._internal import main as pipmain
try:
@ -13,12 +14,12 @@ def get_yolov8_model(onnx_model_name: str):
except ImportError:
pipmain(['install', 'ultralytics'])
import ultralytics
pt_model = Path("yolov8n.pt")
pt_model = Path(f"yolov{version}n.pt")
model = ultralytics.YOLO(str(pt_model)) # load a pretrained model
success = model.export(format="onnx") # export the model to ONNX format
assert success, "Failed to export yolov8n.pt to onnx"
exported_filename = model.export(format="onnx") # export the model to ONNX format
assert exported_filename, f"Failed to export yolov{version}n.pt to onnx"
import shutil
shutil.move(pt_model.with_suffix('.onnx'), onnx_model_name)
shutil.move(exported_filename, onnx_model_name)
def add_pre_post_processing_to_yolo(input_model_file: Path, output_model_file: Path):
@ -29,13 +30,11 @@ def add_pre_post_processing_to_yolo(input_model_file: Path, output_model_file: P
input_model_file (Path): The onnx yolo model.
output_model_file (Path): where to save the final onnx model.
"""
if not Path(input_model_file).is_file():
get_yolov8_model(input_model_file)
from onnxruntime_extensions.tools import add_pre_post_processing_to_model as add_ppp
add_ppp.yolo_detection(input_model_file, output_model_file, "jpg", onnx_opset=18)
def test_inference(onnx_model_file:Path):
def run_inference(onnx_model_file: Path):
import onnxruntime as ort
import numpy as np
@ -48,14 +47,23 @@ def test_inference(onnx_model_file:Path):
inname = [i.name for i in session.get_inputs()]
inp = {inname[0]: image}
outputs = session.run(['image_out'], inp)[0]
open('../test/data/result.jpg', 'wb').write(outputs)
output = session.run(['image_out'], inp)[0]
output_filename = '../test/data/result.jpg'
open(output_filename, 'wb').write(output)
from PIL import Image
Image.open(output_filename).show()
if __name__ == '__main__':
print("checking the model...")
onnx_model_name = Path("../test/data/yolov8n.onnx")
# YOLO version. Tested with 5 and 8.
version = 8
onnx_model_name = Path(f"../test/data/yolov{version}n.onnx")
if not onnx_model_name.exists():
print("Fetching original model...")
get_yolo_model(version, str(onnx_model_name))
onnx_e2e_model_name = onnx_model_name.with_suffix(suffix=".with_pre_post_processing.onnx")
print("Adding pre/post processing...")
add_pre_post_processing_to_yolo(onnx_model_name, onnx_e2e_model_name)
test_inference(onnx_e2e_model_name)
print("Testing updated model...")
run_inference(onnx_e2e_model_name)

View file

@ -0,0 +1,261 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import onnx.shape_inference
import onnxruntime_extensions
from onnxruntime_extensions.tools.pre_post_processing import *
from pathlib import Path
from PIL import Image, ImageDraw
def get_yolov8_pose_model(onnx_model_name: str):
# install yolov8
from pip._internal import main as pipmain
try:
import ultralytics
except ImportError:
pipmain(['install', 'ultralytics'])
import ultralytics
pt_model = Path("yolov8n-pose.pt")
model = ultralytics.YOLO(str(pt_model)) # load a pretrained model
success = model.export(format="onnx") # export the model to ONNX format
assert success, "Failed to export yolov8n-pose.pt to onnx"
import shutil
shutil.move(pt_model.with_suffix('.onnx'), onnx_model_name)
def add_pre_post_processing_to_yolo(input_model_file: Path, output_model_file: Path,
output_image: bool = False,
decode_input: bool = True,
input_shape: Optional[List[Union[int, str]]] = None):
"""Construct the pipeline for an end2end model with pre and post processing.
The final model can take raw image binary as inputs and output the result in raw image file.
Args:
input_model_file (Path): The onnx yolo model.
output_model_file (Path): where to save the final onnx model.
output_image (bool): Model will draw bounding boxes on the original image and output that. It will NOT draw
the keypoints as there's no custom operator to handle that currently.
If false, the output will have the same shape as the original model, with all the co-ordinates updated
to match the original input image.
decode_input: Input is jpg/png to decode. Alternative is to provide RGB data
input_shape: Input shape if RGB data is being provided. Can use symbolic dimensions. Either the first or last
dimension must be 3 to determine if layout is HWC or CHW.
"""
if not Path(input_model_file).is_file():
print("Fetching the model...")
get_yolov8_pose_model(str(input_model_file))
print("Adding pre/post processing to the model...")
model = onnx.load(str(input_model_file.resolve(strict=True)))
model_with_shape_info = onnx.shape_inference.infer_shapes(model)
model_input_shape = model_with_shape_info.graph.input[0].type.tensor_type.shape
model_output_shape = model_with_shape_info.graph.output[0].type.tensor_type.shape
# infer the input sizes from the model.
w_in = model_input_shape.dim[-1].dim_value
h_in = model_input_shape.dim[-2].dim_value
assert w_in == 640 and h_in == 640 # expected values
# output is [1, 56, 8400]
# there are 56 values per box: 4 box co-ords, 1 class score and 51 keypoint values (17 keypoints x 3), for 8400 candidate boxes
classes_masks_out = model_output_shape.dim[1].dim_value
boxes_out = model_output_shape.dim[2].dim_value
assert classes_masks_out == 56
assert boxes_out == 8400
# layout of image prior to Resize and LetterBox being run. post-processing needs to know this to determine where
# to get the original H and W from
if decode_input:
inputs = [create_named_value("image", onnx.TensorProto.UINT8, ["num_bytes"])]
# ConvertImageToBGR produces HWC output
decoded_image_layout = "HWC"
else:
assert input_shape and len(input_shape) == 3, "3D input shape is required if decode_input is false."
if input_shape[0] == 3:
decoded_image_layout = "CHW"
elif input_shape[2] == 3:
decoded_image_layout = "HWC"
else:
raise ValueError("Invalid input shape. Either first or last dimension must be 3.")
inputs = [create_named_value("decoded_image", onnx.TensorProto.UINT8, input_shape)]
onnx_opset = 18
pipeline = PrePostProcessor(inputs, onnx_opset)
pre_processing_steps = []
if decode_input:
pre_processing_steps.append(ConvertImageToBGR(name="ImageHWC")) # jpg/png image to BGR in HWC layout
else:
# use Identity if we don't need to call ChannelsLastToChannelsFirst as the next step
if decoded_image_layout == "CHW":
pre_processing_steps.append(Identity(name="DecodedImageCHW"))
if decoded_image_layout == "HWC":
pre_processing_steps.append(ChannelsLastToChannelsFirst(name="DecodedImageCHW")) # HWC to CHW
pre_processing_steps += [
# Resize an arbitrary sized image to a fixed size in not_larger policy
Resize((h_in, w_in), policy='not_larger', layout='CHW'),
# padding or cropping the image to (h_in, w_in)
LetterBox(target_shape=(h_in, w_in), layout='CHW'),
ImageBytesToFloat(), # Convert to float in range 0..1
Unsqueeze([0]), # add batch, CHW --> 1CHW
]
pipeline.add_pre_processing(pre_processing_steps)
# NMS and drawing boxes
post_processing_steps = [
Squeeze([0]), # - Squeeze to remove batch dimension from [batch, 56, 8400] output (shapes after each step are summarised below this list)
Transpose([1, 0]), # reverse so result info is inner dim
# split the 56 elements into the box, score for the 1 class, and mask info (17 locations x 3 values)
Split(num_outputs=3, axis=1, splits=[4, 1, 51]),
# Apply NMS to select best boxes. iou and score values match
# https://github.com/ultralytics/ultralytics/blob/e7bd159a44cf7426c0f33ed9b413ef4439505a03/ultralytics/models/yolo/pose/predict.py#L34-L35
SelectBestBoundingBoxesByNMS(iou_threshold=0.7, score_threshold=0.25, has_mask_data=True),
# Scale boxes and key point coords back to original image. Mask data has 17 key points per box.
(ScaleNMSBoundingBoxesAndKeyPoints(num_key_points=17, layout='CHW'),
[
# A default connection from SelectBestBoundingBoxesByNMS for input 0
# A connection from original image to input 1
# A connection from the resized image to input 2
# A connection from the LetterBoxed image to input 3
# We use the three images to calculate the scale factor and offset.
# With scale and offset, we can scale the bounding box and key points back to the original image.
utils.IoMapEntry("DecodedImageCHW", producer_idx=0, consumer_idx=1),
utils.IoMapEntry("Resize", producer_idx=0, consumer_idx=2),
utils.IoMapEntry("LetterBox", producer_idx=0, consumer_idx=3),
]),
]
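    # Rough shape walk-through of the steps above (illustrative, for the default 640x640 pose model):
    #   model output (1, 56, 8400) -> Squeeze -> (56, 8400) -> Transpose -> (8400, 56)
    #   Split -> (8400, 4) boxes, (8400, 1) score, (8400, 51) keypoint data
    #   SelectBestBoundingBoxesByNMS -> (num_selected, 57): box(4), score(1), class(1), keypoints(17 x 3)
    #   ScaleNMSBoundingBoxesAndKeyPoints -> same shape with co-ords mapped back to the original image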
if output_image:
# separate out the bounding boxes from the keypoint data to use the existing steps/custom op to draw the
# bounding boxes.
post_processing_steps += [
Split(num_outputs=2, axis=-1, splits=[6, 51], name="SplitScaledBoxesAndKeypoints"),
(DrawBoundingBoxes(mode='CENTER_XYWH', num_classes=1, colour_by_classes=True),
[
utils.IoMapEntry("OriginalRGBImage", producer_idx=0, consumer_idx=0),
utils.IoMapEntry("SplitScaledBoxesAndKeypoints", producer_idx=0, consumer_idx=1),
]),
# Encode to jpg/png
ConvertBGRToImage(image_format="png"),
]
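        # The Split above separates each 57-value result into the first 6 values (box, score, class) that
        # DrawBoundingBoxes consumes and the 51 keypoint values, which DrawBoundingBoxes does not use.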
pipeline.add_post_processing(post_processing_steps)
new_model = pipeline.run(model)
print("Pre/post proceessing added.")
# run shape inferencing to validate the new model. shape inferencing will fail if any of the new node
# types or shapes are incorrect. infer_shapes returns a copy of the model with ValueInfo populated,
# but we ignore that and save new_model as it is smaller due to not containing the inferred shape information.
_ = onnx.shape_inference.infer_shapes(new_model, strict_mode=True)
onnx.save_model(new_model, str(output_model_file.resolve()))
print("Updated model saved.")
def run_inference(onnx_model_file: Path, output_image: bool = False, model_decodes_image: bool = True):
import onnxruntime as ort
import numpy as np
print("Running the model to validate output.")
providers = ['CPUExecutionProvider']
session_options = ort.SessionOptions()
session_options.register_custom_ops_library(onnxruntime_extensions.get_library_path())
session = ort.InferenceSession(str(onnx_model_file), providers=providers, sess_options=session_options)
input_image_path = './data/bus.jpg'
input_name = [i.name for i in session.get_inputs()]
if model_decodes_image:
image_bytes = np.frombuffer(open(input_image_path, 'rb').read(), dtype=np.uint8)
model_input = {input_name[0]: image_bytes}
else:
rgb_image = np.array(Image.open(input_image_path).convert('RGB'))
rgb_image = rgb_image.transpose((2, 0, 1)) # Channels first
model_input = {input_name[0]: rgb_image}
model_output = ['image_out'] if output_image else ['nms_output_with_scaled_boxes_and_keypoints']
outputs = session.run(model_output, model_input)
if output_image:
image_out = outputs[0]
from io import BytesIO
s = BytesIO(image_out)
Image.open(s).show()
else:
# manually draw the bounding boxes and skeleton just to prove it works
skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12], [7, 13], [6, 7], [6, 8], [7, 9],
[8, 10], [9, 11], [2, 3], [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]
# open original image so we can draw on it
input_image = Image.open(input_image_path).convert('RGB')
input_image_draw = ImageDraw.Draw(input_image)
scaled_nms_output = outputs[0]
for result in scaled_nms_output:
# split the 4 box coords, 1 score, 1 class (ignored), keypoints
(box, score, _, keypoints) = np.split(result, (4, 5, 6))
keypoints = keypoints.reshape((17, 3))
# convert box from centered XYWH to co-ords and draw rectangle
# NOTE: The pytorch model seems to output XYXY co-ords. Not sure why that's different.
half_w = (box[2] / 2)
half_h = (box[3] / 2)
x0 = box[0] - half_w
y0 = box[1] - half_h
x1 = box[0] + half_w
y1 = box[1] + half_h
input_image_draw.rectangle(((x0, y0), (x1, y1)), outline='red', width=4)
# draw skeleton
# See https://github.com/ultralytics/ultralytics/blob/e7bd159a44cf7426c0f33ed9b413ef4439505a03/ultralytics/utils/plotting.py#L171
for i, sk in enumerate(skeleton):
# convert keypoint index in `skeleton` to 0-based index and get keypoint data for it
keypoint1 = keypoints[sk[0] - 1]
keypoint2 = keypoints[sk[1] - 1]
pos1 = (int(keypoint1[0]), int(keypoint1[1]))
pos2 = (int(keypoint2[0]), int(keypoint2[1]))
conf1 = keypoint1[2]
conf2 = keypoint2[2]
if conf1 < 0.5 or conf2 < 0.5:
continue
def coord_valid(coord):
x, y = coord
return 0 <= x < input_image.width and 0 <= y < input_image.height
if coord_valid(pos1) and coord_valid(pos2):
input_image_draw.line((pos1, pos2), fill='yellow', width=2)
print("Displaying original image with bounding boxes and skeletons.")
input_image.show()
if __name__ == '__main__':
onnx_model_name = Path("./data/yolov8n-pose.onnx")
onnx_e2e_model_name = onnx_model_name.with_suffix(suffix=".with_pre_post_processing.onnx")
# default output is the scaled non-max suppression data which matches the original model.
# each result has bounding box (4), score (1), class (1), keypoints(17 x 3) = 57 elements
# bounding box is centered XYWH format.
# alternative is to output the original image with the bounding boxes but no key points drawn.
output_image_with_bounding_boxes = False
for model_decodes_image in [True, False]:
if model_decodes_image:
print("Running with model taking jpg/png as input.")
else:
print("Running with model taking RGB data as input.")
input_shape = None
if not model_decodes_image:
# NOTE: This uses CHW just for the sake of testing both layouts
input_shape = [3, "h_in", "w_in"]
add_pre_post_processing_to_yolo(onnx_model_name, onnx_e2e_model_name, output_image_with_bounding_boxes,
model_decodes_image, input_shape)
run_inference(onnx_e2e_model_name, output_image_with_bounding_boxes, model_decodes_image)