Check partial conversion on FP16 to FP32 AVX Cast kernel (#22091)

### Description Added checks to convert partial vectors in the early stages of the FP16 to FP32 cast using the AVX NE CONVERT ISA. ### Motivation and Context Avoid storing data outside of the output buffer; these checks were missing from the [original PR](https://github.com/microsoft/onnxruntime/pull/21183). This fix prevents memory corruption when the output buffer size is in the range [n*16 + 1, n*16 + 7] with n > 0.
2024-09-16 10:20:06 -06:00 · 2024-09-16 10:20:06 -06:00 · e93f14e00d
--- a/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm
+++ b/onnxruntime/core/mlas/lib/amd64/cvtfp16Avx.asm
@@ -80,6 +80,8 @@ Convert256Vectors:
        jz      ExitRoutine ; If we are done, exit
        cmp     r8, 16      ; If the vector is big enough, we go again
        jae     Convert256Vectors
+        cmp     r8, 8       ; Check if we have enough elements to convert
+        jb      ConvertMaskedVectors



--- a/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S
+++ b/onnxruntime/core/mlas/lib/x86_64/cvtfp16Avx.S
@@ -51,8 +51,6 @@ FUNCTION_ENTRY MlasCastF16ToF32KernelAvx

        test    rdx, rdx      // Check if we have any elements to convert
        jz      ExitRoutine
-
-AVX_NE_CONVERT:
        cmp     rdx, 8
        jb      ConvertMaskedVectors
        cmp     rdx, 16
@@ -75,6 +73,8 @@ Convert256Vectors:
        jz      ExitRoutine     // If we are done, exit
        cmp     rdx, 16         // If the vector is big enough, we go again
        jae     Convert256Vectors
+        cmp	rdx, 8           // Check if we have enough elements to convert
+        jb      ConvertMaskedVectors