hat/samples/sample_gemm_library.hat

#ifdef TOML

# Package-level metadata: who wrote the library, which release this is, and
# the license it ships under.
[description]
author = "John Doe"
comment = "John Doe's GEMM Library"
version = "1.2.3.5"
license_url = "https://www.apache.org/licenses/LICENSE-2.0.html"

[functions]
    # GEMM specialised for fixed matrix sizes: every array argument below
    # carries a literal shape.
    [functions.GEMM_B94D27B9934D3E08]
    name = "GEMM_B94D27B9934D3E08"
    description = "CPU Implementation of the GEMM algorithm: C = alpha * (A * B) + beta * C with specific sizes."
    calling_convention = "stdcall"
    # One inline table per parameter, in the same order as the C prototype in
    # the declaration section.
    # NOTE(review): affine_map presumably holds per-dimension element strides
    # (row-major for these shapes) -- confirm against the HAT schema.
    arguments = [
        { name = "A", description = "left-hand input matrix", logical_type = "affine_array", declared_type = "const float*", element_type = "float", usage = "input", shape = [512, 256], affine_map = [256, 1], affine_offset = 0 },
        { name = "B", description = "right-hand input matrix", logical_type = "affine_array", declared_type = "const float*", element_type = "float", usage = "input", shape = [256, 512], affine_map = [512, 1], affine_offset = 0 },
        { name = "C", description = "result accumulation matrix", logical_type = "affine_array", declared_type = "float*", element_type = "float", usage = "input_output", shape = [512, 512], affine_map = [512, 1], affine_offset = 0 },
        { name = "alpha", description = "Scalar to apply to A*B matrix product", logical_type = "element", declared_type = "float", element_type = "float", usage = "input" },
        { name = "beta", description = "Scalar to apply to the input C matrix before accumulating", logical_type = "element", declared_type = "float", element_type = "float", usage = "input" },
    ]
    return = { name = "return value", description = "GEMM_B94D27B9934D3E08 return value", logical_type = "void", declared_type = "void", element_type = "void", usage = "output" }

        # Tool-specific entries; each sub-table appears to be named after the
        # tool that owns it.
        [functions.GEMM_B94D27B9934D3E08.auxiliary]

            [functions.GEMM_B94D27B9934D3E08.auxiliary.FooToolchain]
            node = "gemm"

            [functions.GEMM_B94D27B9934D3E08.auxiliary.Contoso]
            implementation = "Contoso_GEMM_123"
            contoso_tool_version = "1.0"
            node = "GEMM"

                [functions.GEMM_B94D27B9934D3E08.auxiliary.Contoso.implementation_config]
                PackBMatrix = true


    # GEMM over row-major, non-transposed matrices whose dimensions are passed
    # at call time. Array extents are expressions over the integer arguments.
    [functions.blas_sgemm_row_major]
    name = "blas_sgemm_row_major"
    description = "CPU Implementation of the GEMM algorithm: C = alpha * (A * B) + beta * C with runtime-known sizes and non-transposed row-major matrices."
    calling_convention = "stdcall"
    # Row-major storage: the leading dimension is the stride between rows, so
    # each array spans (leading dimension) * (row count) elements.
    # A has M rows, B has K rows and C has M rows.
    # C is accumulated into, so it is a mutable float* with usage input_output,
    # matching the C prototype in the declaration section.
    arguments = [
        { name = "M", description = "Number of rows in the A and C matrices", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" },
        { name = "N", description = "Number of columns in the B and C matrices", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" },
        { name = "K", description = "Number of columns in the A matrix and rows in the B matrix", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" },
        { name = "alpha", description = "Scalar to apply to A*B matrix product", logical_type = "element", declared_type = "float", element_type = "float", usage = "input" },
        { name = "A", description = "left-hand input matrix", logical_type = "runtime_array", declared_type = "const float*", element_type = "float", usage = "input", size = "lda * M" },
        { name = "lda", description = "Leading dimension step size of A", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" },
        { name = "B", description = "right-hand input matrix", logical_type = "runtime_array", declared_type = "const float*", element_type = "float", usage = "input", size = "ldb * K" },
        { name = "ldb", description = "Leading dimension step size of B", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" },
        { name = "beta", description = "Scalar to apply to the input C matrix before accumulating", logical_type = "element", declared_type = "float", element_type = "float", usage = "input" },
        { name = "C", description = "result accumulation matrix", logical_type = "runtime_array", declared_type = "float*", element_type = "float", usage = "input_output", size = "ldc * M" },
        { name = "ldc", description = "Leading dimension step size of C", logical_type = "element", declared_type = "int", element_type = "int", usage = "input" }
    ]
    return = { name = "return value", description = "blas_sgemm_row_major return value", logical_type = "void", declared_type = "void", element_type = "void", usage = "output" }

# Platform information: what the library needs in order to run, and the
# machine it was tuned for.
[target]
    [target.required]
    os = "windows"

        [target.required.CPU]
        architecture = "x86_64"
        extensions = ["AVX2"]

    [target.optimized_for.CPU]
    name = "Intel Xeon E5-4669 v4"
    family = "Broadwell"
    clock_frequency = 2.2  # presumably GHz -- TODO confirm unit
    cores = 22
    threads = 44

        # NOTE(review): sizes_KB and line_sizes look like one entry per cache
        # level, smallest first -- confirm against the HAT schema.
        [target.optimized_for.CPU.cache]
        instruction_KB = 32
        sizes_KB = [32, 256, 56320]
        line_sizes = [64, 64, 64]

# The dependencies table provides information that a consumer of this .hat file
# will need to act on in order to properly consume the package, such as library
# files to link and dynamic libraries to make available at runtime
[dependencies]
link_target = "sample_gemm.lib"
deploy_files = ["sample_gemm.dll"]
dynamic = [
    { name = "Windows Universal C Runtime", version = "141", target_file = "ucrtbase.dll" },
]

# The compiled_with table provides information that a consumer of this .hat file
# may find useful but may not necessarily need to act on in order to successfully
# consume this package
[compiled_with]
compiler = "MSVC141"
# NOTE(review): these flags use GCC/Clang spelling while compiler names MSVC;
# confirm which toolchain actually produced the binaries.
flags = "-std=c++14 -ffast-math -fno-exceptions -fno-rtti"
crt = "ucrt"
libraries = [
    { name = "LLVM OpenMP", version = "5.1", target_file = "libomp.lib" },
]

[declaration]
# The value of code is the C header text itself, held in a TOML multi-line
# literal string. The TOML guards placed around the string delimiters hide
# all TOML syntax from a C preprocessor (assuming the TOML macro is left
# undefined), while a TOML parser reads each guard outside the string as an
# ordinary comment. Keep the guard lines exactly where they are.
code = '''
#endif // TOML

#pragma once

#include <stdint.h>

#if defined(__cplusplus)
    extern "C"
{
#endif // defined(__cplusplus)

//
// Functions
//

void GEMM_B94D27B9934D3E08(const float* A, const float* B, float* C, float alpha, float beta);

void blas_sgemm_row_major(int M, int N, int K, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc);

#if defined(__cplusplus)
} // extern "C"
#endif // defined(__cplusplus)

#ifdef TOML
'''
#endif // TOML