AMD support (#1430)

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com> Co-authored-by: Jithun Nair <jithun.nair@amd.com> Co-authored-by: rraminen <rraminen@amd.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: okakarpa <okakarpa@amd.com> Co-authored-by: rraminen <rraminen@amd.com> Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: okakarpa <okakarpa@amd.com> Co-authored-by: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
2022-03-02 17:53:35 -08:00 · 2022-03-02 17:53:35 -08:00 · c3c8d5dd93
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@ -1,4 +1,4 @@
-name: Build
+name: unit-tests

 on:
  push:
@ -14,7 +14,7 @@ on:
 jobs:
  # unit tests running on nvidia gpus
  nv-torch12-p40:
-    runs-on: [self-hosted, nvidia, torch12]
+    runs-on: [self-hosted, nvidia, torch12, p40]

    steps:
      - uses: actions/checkout@v2
@ -102,6 +102,43 @@ jobs:
          find examples/pytorch  -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec pip install -r {} \;
          TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed

+  # unit tests running on amd gpus
+  amd:
+    # The type of runner that the job will run on
+    runs-on: [self-hosted, amd]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+      # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+      - uses: actions/checkout@v2
+
+      # Runs a single command using the runners shell
+      - name: environment
+        run: |
+          rocm-smi --showhw
+          which python
+          python --version
+          which hipcc
+          hipcc --version
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+          sudo apt-get update
+          sudo apt-get install -y libaio-dev
+      # Runs a set of commands using the runners shell
+      - name: Install deepspeed
+        run: |
+          pip install .[dev,1bit,autotuning]
+          python -c "from deepspeed.env_report import cli_main; cli_main()"
+          #ds_report
+      # Runs a set of commands using the runners shell
+      - name: Unit tests
+        run: |
+          if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
+          cd tests
+          #TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose unit/
+          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
+          TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
+
  nv-lightning-v100:
    runs-on: [self-hosted, nvidia, torch18, v100]

--- a/csrc/includes/cublas_wrappers.h
+++ b/csrc/includes/cublas_wrappers.h
@ -5,7 +5,9 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#ifndef __HIP_PLATFORM_HCC__
 #include <mma.h>
+#endif
 #include <stdio.h>

 int cublas_gemm_ex(cublasHandle_t handle,
@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
                   const float* A,
                   const float* B,
                   float* C,
+#ifdef __HIP_PLATFORM_HCC__
+                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
+#endif

 int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
                   const __half* A,
                   const __half* B,
                   __half* C,
+#ifdef __HIP_PLATFORM_HCC__
+                   rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+#endif

 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int stride_B,
                                int stride_C,
                                int batch,
+#ifdef __HIP_PLATFORM_HCC__
+                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
+#endif

 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int stride_B,
                                int stride_C,
                                int batch,
+#ifdef __HIP_PLATFORM_HCC__
+                                rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
+#else
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+#endif
--- a/csrc/includes/custom_cuda_layers.h
+++ b/csrc/includes/custom_cuda_layers.h
@ -5,7 +5,15 @@
 #include <stdio.h>
 #include <stdlib.h>

+#ifdef __HIP_PLATFORM_HCC__
+#define HALF_PRECISION_AVAILABLE = 1
+#include <hip/hip_cooperative_groups.h>
+#else
+#if __CUDA_ARCH__ >= 700
+#define HALF_PRECISION_AVAILABLE = 1
+#endif
 #include <cooperative_groups.h>
+#endif
 #include <curand_kernel.h>

 #include "context.h"
--- a/csrc/includes/feed_forward.h
+++ b/csrc/includes/feed_forward.h
@ -43,7 +43,11 @@ public:
                       weights,
                       input_ptr,
                       out,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[0]));
+#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
+#endif
    }
    void Backward(int bsz,
                  const T* out_grad,
@ -68,7 +72,11 @@ public:
                       input_ptr,
                       out_grad,
                       weights_grad,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[1]));
+#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
+#endif

        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_N,
@ -81,7 +89,11 @@ public:
                       weights,
                       out_grad,
                       inp_grad_out,
+#ifdef __HIP_PLATFORM_HCC__
+                       rocblas_gemm_algo(config_.gemm_algos[2]));
+#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
+#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }
--- a/csrc/includes/gemm_test.h
+++ b/csrc/includes/gemm_test.h
@ -2,7 +2,9 @@
 #pragma once

 #include <cuda_fp16.h>
+#ifndef __HIP_PLATFORM_HCC__
 #include <cuda_profiler_api.h>
+#endif
 #include <array>
 #include <cstdio>
 #include <cstdlib>
@ -58,7 +60,11 @@ public:
                           B,
                           A,
                           C,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                           static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
@ -73,7 +79,11 @@ public:
                           A,
                           C,
                           B,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                           static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
@ -88,7 +98,11 @@ public:
                           B,
                           C,
                           A,
+#ifdef __HIP_PLATFORM_HCC__
+                           static_cast<rocblas_gemm_algo>(algo));
+#else
                           static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@ -100,8 +114,12 @@ public:
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

+#ifdef __HIP_PLATFORM_HCC__
+        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
+#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
@ -186,7 +204,11 @@ public:
                                        stride_b,
                                        stride_c,
                                        bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                        static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
@ -216,7 +238,11 @@ public:
                                        stride_b,
                                        stride_c,
                                        bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                        static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
@ -243,7 +269,11 @@ public:
                                        stride_b,
                                        stride_c,
                                        bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                        static_cast<rocblas_gemm_algo>(algo));
+#else
                                        static_cast<cublasGemmAlgo_t>(algo));
+#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
@ -255,8 +285,12 @@ public:
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

+#ifdef __HIP_PLATFORM_HCC__
+        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
+#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
+#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
--- a/csrc/includes/general_kernels.h
+++ b/csrc/includes/general_kernels.h
@ -3,7 +3,11 @@
 #include <stdio.h>
 #include <stdlib.h>

+#ifdef __HIP_PLATFORM_HCC__
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <curand_kernel.h>

 #include "context.h"
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h
@ -0,0 +1,348 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups.h
+ *
+ *  @brief Device side implementation of `Cooperative Group` feature.
+ *
+ *  Defines new types and device API wrappers related to `Cooperative Group`
+ *  feature, which the programmer can directly use in his kernel(s) in order to
+ *  make use of this feature.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+//#if __cplusplus
+#if __cplusplus && defined(__clang__) && defined(__HIP__)
+#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
+#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
+#include <hip/hcc_detail/device_functions.h>
+#endif
+namespace cooperative_groups {
+
+/** \brief The base type of all cooperative group types
+ *
+ *  \details Holds the key properties of a constructed cooperative group type
+ *           object, like the group type, its size, etc
+ */
+/*
+class thread_group {
+ protected:
+  uint32_t _type; // thread_group type
+  uint32_t _size; // total number of threads in the tread_group
+  uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
+                  // LSB represents lane 0, and MSB represents lane 63
+
+  // Construct a thread group, and set thread group type and other essential
+  // thread group properties. This generic thread group is directly constructed
+  // only when the group is supposed to contain only the calling the thread
+  // (throurh the API - `this_thread()`), and in all other cases, this thread
+  // group object is a sub-object of some other derived thread group object
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
+                                uint64_t mask = (uint64_t)0) {
+    _type = type;
+    _size = size;
+    _mask = mask;
+  }
+
+ public:
+  // Total number of threads in the thread group, and this serves the purpose
+  // for all derived cooperative group types since their `size` is directly
+  // saved during the construction
+  __CG_QUALIFIER__ uint32_t size() const {
+    return _size;
+  }
+  // Rank of the calling thread within [0, size())
+  __CG_QUALIFIER__ uint32_t thread_rank() const;
+  // Is this cooperative group type valid?
+  __CG_QUALIFIER__ bool is_valid() const;
+  // synchronize the threads in the thread group
+  __CG_QUALIFIER__ void sync() const;
+};
+*/
+
+class thread_group {
+protected:
+    bool _tiled_partition;  // this_thread_block() constructor sets to false
+    uint32_t _size;         // this_thread_block() constructor sets to size()
+    uint32_t local_rank;    // this_thread_block() constructor sets to thread_rank()
+    uint32_t _mask;
+    uint32_t _type;
+
+public:
+    __CG_QUALIFIER__ thread_group(internal::group_type type,
+                                  uint32_t group_size,
+                                  uint64_t mask = (uint64_t)0)
+    {
+        _type = type;
+        _size = group_size;
+        _mask = mask;
+        local_rank = internal::workgroup::thread_rank();
+    }
+
+    __CG_QUALIFIER__ void tiled_partition(const thread_group& parent, unsigned int tile_size)
+    {
+        if ((ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 || tile_size > 64 ||
+            parent.size() < tile_size)
+            _tiled_partition = false;
+        // xxx : abort
+        _tiled_partition = true;
+        _size = tile_size;
+        local_rank = parent.thread_rank() % tile_size;
+    }
+    __CG_QUALIFIER__ void sync() const;
+    __CG_QUALIFIER__ uint32_t size() const { return _size; }
+    __CG_QUALIFIER__ uint32_t thread_rank() const;
+    __CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const
+    {
+        return (__shfl_down(var, delta, _size));
+    }
+    __CG_QUALIFIER__ float shfl_xor(float var, int mask) const
+    {
+        return (__shfl_xor(var, mask, _size));
+    }
+    __CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const
+    {
+        return (__shfl(var, src_lane, _size));
+    }
+    __CG_QUALIFIER__ bool is_valid() const;
+};
+
+/** \brief The multi-grid cooperative group type
+ *
+ *  \details Represents an inter-device cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           devices, running the (same) kernel on these devices
+ */
+class multi_grid_group : public thread_group {
+    // Only these friend functions are allowed to construct an object of this class
+    // and access its resources
+    friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
+
+protected:
+    // Construct mutli-grid thread group (through the API this_multi_grid())
+    explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
+        : thread_group(internal::cg_multi_grid, size)
+    {
+    }
+
+public:
+    // Number of invocations participating in this multi-grid group. In other
+    // words, the number of GPUs
+    __CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
+    // Rank of this invocation. In other words, an ID number within the range
+    // [0, num_grids()) of the GPU, this kernel is running on
+    __CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
+    __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
+    __CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
+    __CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
+};
+
+/** \brief User exposed API interface to construct multi-grid cooperative
+ *         group type object - `multi_grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ multi_grid_group this_multi_grid()
+{
+    return multi_grid_group(internal::multi_grid::size());
+}
+
+/** \brief The grid cooperative group type
+ *
+ *  \details Represents an inter-workgroup cooperative group type where the
+ *           participating threads within the group spans across multiple
+ *           workgroups running the (same) kernel on the same device
+ */
+class grid_group : public thread_group {
+    // Only these friend functions are allowed to construct an object of this class
+    // and access its resources
+    friend __CG_QUALIFIER__ grid_group this_grid();
+
+protected:
+    // Construct grid thread group (through the API this_grid())
+    explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
+
+public:
+    __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
+    __CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
+    __CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
+};
+
+/** \brief User exposed API interface to construct grid cooperative group type
+ *         object - `grid_group`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `multi_grid_group`. Instead, he should construct it through this
+ *           API function
+ */
+__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
+
+/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
+ *         type
+ *
+ *  \details Represents an intra-workgroup cooperative group type where the
+ *           participating threads within the group are exctly the same threads
+ *           which are participated in the currently executing `workgroup`
+ */
+class thread_block : public thread_group {
+    // Only these friend functions are allowed to construct an object of this
+    // class and access its resources
+    friend __CG_QUALIFIER__ thread_block this_thread_block();
+
+protected:
+    // Construct a workgroup thread group (through the API this_thread_block())
+    explicit __CG_QUALIFIER__ thread_block(uint32_t size)
+        : thread_group(internal::cg_workgroup, size)
+    {
+    }
+
+public:
+    // 3-dimensional block index within the grid
+    __CG_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
+    // 3-dimensional thread index within the block
+    __CG_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
+    __CG_QUALIFIER__ uint32_t thread_rank() const { return internal::workgroup::thread_rank(); }
+    __CG_QUALIFIER__ bool is_valid() const { return internal::workgroup::is_valid(); }
+    __CG_QUALIFIER__ void sync() const { internal::workgroup::sync(); }
+};
+
+/** \brief User exposed API interface to construct workgroup cooperative
+ *         group type object - `thread_block`
+ *
+ *  \details User is not allowed to directly construct an object of type
+ *           `thread_block`. Instead, he should construct it through this API
+ *           function
+ */
+__CG_QUALIFIER__ thread_block this_thread_block()
+{
+    return thread_block(internal::workgroup::size());
+}
+
+/**
+ *  Implementation of all publicly exposed base class APIs
+ */
+__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const
+{
+    switch (this->_type) {
+        case internal::cg_multi_grid: {
+            return (static_cast<const multi_grid_group*>(this)->thread_rank());
+        }
+        case internal::cg_grid: {
+            return (static_cast<const grid_group*>(this)->thread_rank());
+        }
+        case internal::cg_workgroup: {
+            return (static_cast<const thread_block*>(this)->thread_rank());
+        }
+        case internal::cg_coalesced_tile: {
+            return local_rank;
+        }
+        default: {
+            assert(false && "invalid cooperative group type");
+            return -1;
+        }
+    }
+}
+
+__CG_QUALIFIER__ bool thread_group::is_valid() const
+{
+    switch (this->_type) {
+        case internal::cg_multi_grid: {
+            return (static_cast<const multi_grid_group*>(this)->is_valid());
+        }
+        case internal::cg_grid: {
+            return (static_cast<const grid_group*>(this)->is_valid());
+        }
+        case internal::cg_workgroup: {
+            return (static_cast<const thread_block*>(this)->is_valid());
+        }
+        case internal::cg_coalesced_tile: {
+            return _tiled_partition;
+        }
+        default: {
+            assert(false && "invalid cooperative group type");
+            return false;
+        }
+    }
+}
+
+__CG_QUALIFIER__ void thread_group::sync() const
+{
+    switch (this->_type) {
+        case internal::cg_multi_grid: {
+            static_cast<const multi_grid_group*>(this)->sync();
+            break;
+        }
+        case internal::cg_grid: {
+            static_cast<const grid_group*>(this)->sync();
+            break;
+        }
+        case internal::cg_workgroup: {
+            static_cast<const thread_block*>(this)->sync();
+            break;
+        }
+        case internal::cg_coalesced_tile: {
+            if (!_tiled_partition)  // If in a tiled partition, this is a no-op
+                __syncthreads();
+            break;
+        }
+        default: {
+            assert(false && "invalid cooperative group type");
+        }
+    }
+}
+
+/**
+ *  Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
+ *  group type APIs
+ */
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t group_size(CGTy const& g)
+{
+    return g.size();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g)
+{
+    return g.thread_rank();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ bool is_valid(CGTy const& g)
+{
+    return g.is_valid();
+}
+
+template <class CGTy>
+__CG_QUALIFIER__ void sync(CGTy const& g)
+{
+    g.sync();
+}
+
+}  // namespace cooperative_groups
+
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
--- a/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
+++ b/csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h
@ -0,0 +1,163 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/**
+ *  @file  hcc_detail/hip_cooperative_groups_helper.h
+ *
+ *  @brief Device side implementation of cooperative group feature.
+ *
+ *  Defines helper constructs and APIs which aid the types and device API
+ *  wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
+ */
+#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
+
+#if __cplusplus
+
+#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
+#include <hip/hcc_detail/device_functions.h>
+#include <hip/hcc_detail/hip_runtime_api.h>
+#else
+#include <hip/hcc_detail/amd_device_functions.h>
+#endif
+
+#if !defined(__align__)
+#define __align__(x) __attribute__((aligned(x)))
+#endif
+
+#if !defined(__CG_QUALIFIER__)
+#define __CG_QUALIFIER__ __device__ __forceinline__
+#endif
+
+#if !defined(__CG_STATIC_QUALIFIER__)
+#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
+#endif
+
+#if !defined(WAVEFRONT_SIZE)
+#define WAVEFRONT_SIZE 64
+#endif
+
+namespace cooperative_groups {
+
+namespace internal {
+
+/** \brief Enums representing different cooperative group types
+ */
+typedef enum { cg_invalid, cg_multi_grid, cg_grid, cg_workgroup, cg_coalesced_tile } group_type;
+
+/**
+ *  Functionalities related to multi-grid cooperative group type
+ */
+namespace multi_grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
+
+}  // namespace multi_grid
+
+/**
+ *  Functionalities related to grid cooperative group type
+ */
+namespace grid {
+
+__CG_STATIC_QUALIFIER__ uint32_t size()
+{
+    return (uint32_t)((hipBlockDim_z * hipGridDim_z) * (hipBlockDim_y * hipGridDim_y) *
+                      (hipBlockDim_x * hipGridDim_x));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
+{
+    // Compute global id of the workgroup to which the current thread belongs to
+    uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
+                                 (hipBlockIdx_y * hipGridDim_x) + (hipBlockIdx_x));
+
+    // Compute total number of threads being passed to reach current workgroup
+    // within grid
+    uint32_t num_threads_till_current_workgroup =
+        (uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+
+    // Compute thread local rank within current workgroup
+    uint32_t local_thread_rank = (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                                            (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x));
+
+    return (num_threads_till_current_workgroup + local_thread_rank);
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
+
+__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
+
+}  // namespace grid
+
+/**
+ *  Functionalities related to `workgroup` (thread_block in CUDA terminology)
+ *  cooperative group type
+ */
+namespace workgroup {
+
+__CG_STATIC_QUALIFIER__ dim3 group_index()
+{
+    return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y, (uint32_t)hipBlockIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ dim3 thread_index()
+{
+    return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y, (uint32_t)hipThreadIdx_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t size()
+{
+    return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
+}
+
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
+{
+    return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
+                       (hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x)));
+}
+
+__CG_STATIC_QUALIFIER__ bool is_valid()
+{
+    // TODO(mahesha) any functionality need to be added here? I believe not
+    return true;
+}
+
+__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
+
+}  // namespace workgroup
+
+}  // namespace internal
+
+}  // namespace cooperative_groups
+
+#endif  // __cplusplus
+#endif  // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
--- a/csrc/includes/strided_batch_gemm.h
+++ b/csrc/includes/strided_batch_gemm.h
@ -72,7 +72,11 @@ public:
                                    stride_b,
                                    stride_c,
                                    bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+#else
                                    cublasGemmAlgo_t(_config.gemm_algos[0]));
+#endif
    }

    void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
@ -96,7 +100,11 @@ public:
                                    stride_b,
                                    stride_c,
                                    _config.batch_size,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[0]));
+#else
                                    cublasGemmAlgo_t(_config.gemm_algos[0]));
+#endif

        k_buf = _buffer_a;
        q_buf = _buffer_b;
@ -136,7 +144,11 @@ public:
                                    stride_b,
                                    stride_c,
                                    bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[1]));
+#else
                                    cublasGemmAlgo_t(_config.gemm_algos[1]));
+#endif

        // A need to transpose.
        cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
@ -161,7 +173,11 @@ public:
                                    stride_b,
                                    stride_c,
                                    bsz,
+#ifdef __HIP_PLATFORM_HCC__
+                                    rocblas_gemm_algo(_config.gemm_algos[2]));
+#else
                                    cublasGemmAlgo_t(_config.gemm_algos[2]));
+#endif
    }

    inline int GetN() const { return _config.k; }
--- a/csrc/lamb/fused_lamb_cuda_kernel.cu
+++ b/csrc/lamb/fused_lamb_cuda_kernel.cu
@ -14,7 +14,11 @@
 #include <iostream>

 //#include <helper_functions.h>
+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+#include <hip/hip_cooperative_groups.h>
+#else
 #include <cooperative_groups.h>
+#endif
 #include <cuda_runtime_api.h>
 #include <stdio.h>

@ -78,7 +82,11 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
    T a_sum = s_a[tid];
    T b_sum = s_b[tid];

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    // do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256)) {
@ -86,21 +94,33 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
        s_b[tid] = b_sum = b_sum + s_b[tid + 256];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 256) && (tid < 128)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 128];
        s_b[tid] = b_sum = b_sum + s_b[tid + 128];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 128) && (tid < 64)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 64];
        s_b[tid] = b_sum = b_sum + s_b[tid + 64];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

 #if (__CUDA_ARCH__ >= 300)
    if (tid < 32) {
@ -124,42 +144,66 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
        s_b[tid] = b_sum = b_sum + s_b[tid + 32];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 32) && (tid < 16)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 16];
        s_b[tid] = b_sum = b_sum + s_b[tid + 16];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 16) && (tid < 8)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 8];
        s_b[tid] = b_sum = b_sum + s_b[tid + 8];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 8) && (tid < 4)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 4];
        s_b[tid] = b_sum = b_sum + s_b[tid + 4];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 4) && (tid < 2)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 2];
        s_b[tid] = b_sum = b_sum + s_b[tid + 2];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

    if ((blockSize >= 2) && (tid < 1)) {
        s_a[tid] = a_sum = a_sum + s_a[tid + 1];
        s_b[tid] = b_sum = b_sum + s_b[tid + 1];
    }

+#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
+    cta.sync();
+#else
    cg::sync(cta);
+#endif

 #endif

@ -464,7 +508,7 @@ void fused_lamb_cuda(at::Tensor& p,
                        lamb_coeff.data<scalar_t>());
            }));
    }
-    THCudaCheck(cudaGetLastError());
+    C10_CUDA_CHECK(cudaGetLastError());
 }

 // template __device__ void reduce_two_vectors_in_register<float,512>(float a, float b, float* g_a,
--- a/csrc/transformer/cublas_wrappers.cu
+++ b/csrc/transformer/cublas_wrappers.cu
@ -1,5 +1,19 @@
 #include "cublas_wrappers.h"

+#ifdef __HIP_PLATFORM_HCC__
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const float* A,
+                   const float* B,
+                   float* C,
+                   rocblas_gemm_algo algo)
+#else
 int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
@ -12,7 +26,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
                   const float* B,
                   float* C,
                   cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_ex(handle,
+                                            transa,
+                                            transb,
+                                            m,
+                                            n,
+                                            k,
+                                            (const void*)alpha,
+                                            (const void*)A,
+                                            rocblas_datatype_f32_r,
+                                            (transa == rocblas_operation_none) ? m : k,
+                                            (const void*)B,
+                                            rocblas_datatype_f32_r,
+                                            (transb == rocblas_operation_none) ? k : n,
+                                            (const void*)beta,
+                                            C,
+                                            rocblas_datatype_f32_r,
+                                            m,
+                                            C,
+                                            rocblas_datatype_f32_r,
+                                            m,
+                                            rocblas_datatype_f32_r,
+                                            algo,
+                                            0,
+                                            0);
+#else
    cublasStatus_t status = cublasGemmEx(handle,
                                         transa,
                                         transb,
@ -32,8 +73,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
                                         m,
                                         CUDA_R_32F,
                                         algo);
+#endif

+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
    if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
@ -45,6 +91,20 @@ int cublas_gemm_ex(cublasHandle_t handle,
    return 0;
 }

+#ifdef __HIP_PLATFORM_HCC__
+int cublas_gemm_ex(rocblas_handle handle,
+                   rocblas_operation transa,
+                   rocblas_operation transb,
+                   int m,
+                   int n,
+                   int k,
+                   const float* alpha,
+                   const float* beta,
+                   const __half* A,
+                   const __half* B,
+                   __half* C,
+                   rocblas_gemm_algo algo)
+#else
 int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
@ -57,7 +117,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
                   const __half* B,
                   __half* C,
                   cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status = rocblas_gemm_ex(handle,
+                                            transa,
+                                            transb,
+                                            m,
+                                            n,
+                                            k,
+                                            (const void*)alpha,
+                                            (const void*)A,
+                                            rocblas_datatype_f16_r,
+                                            (transa == rocblas_operation_none) ? m : k,
+                                            (const void*)B,
+                                            rocblas_datatype_f16_r,
+                                            (transb == rocblas_operation_none) ? k : n,
+                                            (const void*)beta,
+                                            (void*)C,
+                                            rocblas_datatype_f16_r,
+                                            m,
+                                            (void*)C,
+                                            rocblas_datatype_f16_r,
+                                            m,
+                                            rocblas_datatype_f32_r,
+                                            algo,
+                                            0,
+                                            0);
+#else
    cublasStatus_t status = cublasGemmEx(handle,
                                         transa,
                                         transb,
@ -77,8 +164,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
                                         m,
                                         CUDA_R_32F,
                                         algo);
+#endif

+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
    if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
@ -90,6 +182,24 @@ int cublas_gemm_ex(cublasHandle_t handle,
    return 0;
 }

+#ifdef __HIP_PLATFORM_HCC__
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const float* A,
+                                const float* B,
+                                float* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+#else
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
@ -106,7 +216,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status =
+        rocblas_gemm_strided_batched_ex(handle,
+                                        op_A,
+                                        op_B,
+                                        m,
+                                        n,
+                                        k,
+                                        alpha,
+                                        A,
+                                        rocblas_datatype_f32_r,
+                                        (op_A == rocblas_operation_none) ? m : k,
+                                        stride_A,
+                                        B,
+                                        rocblas_datatype_f32_r,
+                                        (op_B == rocblas_operation_none) ? k : n,
+                                        stride_B,
+                                        beta,
+                                        C,
+                                        rocblas_datatype_f32_r,
+                                        m,
+                                        stride_C,
+                                        C,
+                                        rocblas_datatype_f32_r,
+                                        m,
+                                        stride_C,
+                                        batch,
+                                        rocblas_datatype_f32_r,
+                                        algo,
+                                        0,
+                                        0);
+#else
    cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
                                                       op_A,
                                                       op_B,
@ -130,8 +273,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                                       batch,
                                                       CUDA_R_32F,
                                                       algo);
+#endif

+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
    if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
        fprintf(stderr,
                "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
                batch,
@ -144,6 +292,24 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
    return 0;
 }

+#ifdef __HIP_PLATFORM_HCC__
+int cublas_strided_batched_gemm(rocblas_handle handle,
+                                int m,
+                                int n,
+                                int k,
+                                const float* alpha,
+                                const float* beta,
+                                const __half* A,
+                                const __half* B,
+                                __half* C,
+                                rocblas_operation op_A,
+                                rocblas_operation op_B,
+                                int stride_A,
+                                int stride_B,
+                                int stride_C,
+                                int batch,
+                                rocblas_gemm_algo algo)
+#else
 int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
@ -160,7 +326,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo)
+#endif
 {
+#ifdef __HIP_PLATFORM_HCC__
+    rocblas_status status =
+        rocblas_gemm_strided_batched_ex(handle,
+                                        op_A,
+                                        op_B,
+                                        m,
+                                        n,
+                                        k,
+                                        alpha,
+                                        A,
+                                        rocblas_datatype_f16_r,
+                                        (op_A == rocblas_operation_none) ? m : k,
+                                        stride_A,
+                                        B,
+                                        rocblas_datatype_f16_r,
+                                        (op_B == rocblas_operation_none) ? k : n,
+                                        stride_B,
+                                        beta,
+                                        C,
+                                        rocblas_datatype_f16_r,
+                                        m,
+                                        stride_C,
+                                        C,
+                                        rocblas_datatype_f16_r,
+                                        m,
+                                        stride_C,
+                                        batch,
+                                        rocblas_datatype_f32_r,
+                                        algo,
+                                        0,
+                                        0);
+#else
    cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
                                                       op_A,
                                                       op_B,
@ -184,8 +383,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
                                                       batch,
                                                       CUDA_R_32F,
                                                       algo);
+#endif

+#ifdef __HIP_PLATFORM_HCC__
+    if (status != rocblas_status_success) {
+#else
    if (status != CUBLAS_STATUS_SUCCESS) {
+#endif
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
                m,
--- a/csrc/transformer/ds_transformer_cuda.cpp
+++ b/csrc/transformer/ds_transformer_cuda.cpp
@ -140,7 +140,9 @@ BertTransformerLayer<T>::~BertTransformerLayer()
 template <typename T>
 void BertTransformerLayer<T>::Initialize()
 {
+#ifndef __HIP_PLATFORM_HCC__
    if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
+#endif
 }

 template <typename T>
--- a/csrc/transformer/gelu_kernels.cu
+++ b/csrc/transformer/gelu_kernels.cu
@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int

 __global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int row = blockIdx.x;
    int id = threadIdx.x;
    int loop_stride = blockDim.x;
@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
                                int row_stride,
                                int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int row = blockIdx.x;
    int id = threadIdx.x;
    int loop_stride = blockDim.x;
@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
                            int row_stride,
                            int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int row = blockIdx.x;
    int id = threadIdx.x;
    int loop_stride = blockDim.x;
--- a/csrc/transformer/general_kernels.cu
+++ b/csrc/transformer/general_kernels.cu
@ -11,8 +11,12 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
    __shared__ float tile[TILE_DIM][TILE_DIM + 1];

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+#else
    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
-
+#endif
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    int y_stride = width * TILE_DIM;
--- a/csrc/transformer/inference/csrc/dequantize.cu
+++ b/csrc/transformer/inference/csrc/dequantize.cu
@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
                                  unsigned groups,
                                  unsigned merge_count)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    unsigned merge_hidden = hidden_dim >> merge_count;
    unsigned quantization_stride = (merge_hidden * output_size) / groups;
--- a/csrc/transformer/inference/csrc/gelu.cu
+++ b/csrc/transformer/inference/csrc/gelu.cu
@ -42,7 +42,7 @@ __global__ void fused_bias_gelu(__half* input,
                                int total_count,
                                int intermediate_size)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    float2* input_cast = reinterpret_cast<float2*>(input);
    const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@ -120,7 +120,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,

 __global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    float2* input_cast = reinterpret_cast<float2*>(input);
    const float2* bias_cast = reinterpret_cast<const float2*>(bias);
@ -206,7 +206,7 @@ __global__ void fused_bias_residual(__half* input,
                                    int intermediate_size,
                                    bool add_bias)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    float2* input_cast = reinterpret_cast<float2*>(input);
    const float2* residual_cast = reinterpret_cast<const float2*>(residual);
--- a/csrc/transformer/inference/csrc/normalize.cu
+++ b/csrc/transformer/inference/csrc/normalize.cu
@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
                                               float epsilon,
                                               int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int iteration_stride = blockDim.x;
    int iterations = row_stride / iteration_stride;

@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
                                          int row_stride,
                                          bool preLN)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int iteration_stride = blockDim.x;

    cg::thread_block b = cg::this_thread_block();
--- a/csrc/transformer/inference/csrc/softmax.cu
+++ b/csrc/transformer/inference/csrc/softmax.cu
@ -38,7 +38,7 @@ __global__ void attn_softmax_v2(__half* vals,
                                int iterations,
                                int reduceWidth)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    cg::thread_block b = cg::this_thread_block();
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
--- a/csrc/transformer/inference/includes/custom_cuda_layers.h
+++ b/csrc/transformer/inference/includes/custom_cuda_layers.h
@ -1,6 +1,15 @@
 #pragma once

+#ifdef __HIP_PLATFORM_HCC__
+#define HALF_PRECISION_AVAILABLE = 1
+#include <hip/hip_cooperative_groups.h>
+#else
+#if __CUDA_ARCH__ >= 700
+#define HALF_PRECISION_AVAILABLE = 1
+#endif
 #include <cooperative_groups.h>
+#endif
+
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <stdio.h>
--- a/csrc/transformer/normalize_kernels.cu
+++ b/csrc/transformer/normalize_kernels.cu
@ -28,8 +28,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
-
+#endif
    int row = blockIdx.x;
    int id = threadIdx.x;
    int gid = id / WARP_SIZE;
@ -125,12 +129,17 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                               __half* means,
                                               int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    int iteration_stride = blockDim.x;
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
+#else
    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -322,7 +331,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
+#else
    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -416,13 +430,18 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
                                               __half* vars,
                                               int row_stride)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    int iteration_stride = blockDim.x;
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
+    g.tiled_partition(b, 32);
+#else
    cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -634,7 +653,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
    __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+#else
    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+#endif

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int offset = threadIdx.y * width + idx;
@ -701,7 +725,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
    __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+#else
    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+#endif

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int offset = threadIdx.y * width + idx;
@ -766,7 +795,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -866,7 +900,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1081,7 +1120,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1176,7 +1220,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1380,7 +1429,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
    __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+#else
    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+#endif

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int offset = threadIdx.y * width + idx;
@ -1442,7 +1496,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
    __shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
+    g.tiled_partition(b, TILE_DIM);
+#else
    cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
+#endif

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int offset = threadIdx.y * width + idx;
@ -1501,7 +1560,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1605,7 +1669,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1823,7 +1892,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -1926,7 +2000,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
    int iterations = row_stride / iteration_stride;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
--- a/csrc/transformer/softmax_kernels.cu
+++ b/csrc/transformer/softmax_kernels.cu
@ -34,7 +34,12 @@ __global__ void attn_softmax(float* vals,
    int block_width = blockStride * seq_length;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
+#else
    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+#endif

    int batch = blockIdx.y;
    int row = blockIdx.x;
@ -156,7 +161,7 @@ __global__ void attn_softmax(__half* vals,
                             int seq_length,
                             int iterations)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    __shared__ float partialSum[MAX_WARP_NUM];

    int warp_num = blockDim.x >> WARP_SIZE_BITS;
@ -165,7 +170,12 @@ __global__ void attn_softmax(__half* vals,
    int block_width = blockStride * seq_length;

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
+#else
    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+#endif

    int batch = blockIdx.y;
    int row = blockIdx.x;
@ -449,7 +459,12 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
                          : MAX_THREAD_ITERATIONS);

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
+    g.tiled_partition(b, tbSize);
+#else
    cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
+#endif

    int row = blockIdx.x;
    int id = threadIdx.x;
@ -524,7 +539,12 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
    }

    cg::thread_block b = cg::this_thread_block();
+#ifdef __HIP_PLATFORM_HCC__
+    cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
+    g.tiled_partition(b, WARP_SIZE);
+#else
    cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
+#endif

    for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);

--- a/csrc/transformer/transform_kernels.cu
+++ b/csrc/transformer/transform_kernels.cu
@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
                                       int heads,
                                       int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    int d0_stride = hidden_dim * seq_length;
    int d1_stride = hidden_dim;
@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
                                                int heads,
                                                int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    int d0_stride = hidden_dim * seq_length;
    int d1_stride = hidden_dim;
@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
                                           int seq_length,
                                           int heads)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    __shared__ float4 in_data[3072];

    int d0_stride = hidden_dim * seq_length;
@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
                                         int hidden_dim,
                                         int head_ext)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE

    int d0_stride = hidden_dim * (seq_length / head_ext);
    int d1_stride = hidden_dim;
@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
                                    int seq_length,
                                    int hidden_dim)
 {
-#if __CUDA_ARCH__ >= 700
+#ifdef HALF_PRECISION_AVAILABLE
    __shared__ float4 in_data[3072];

    int d0_stride = hidden_dim * seq_length;
--- a/deepspeed/env_report.py
+++ b/deepspeed/env_report.py
@ -79,6 +79,11 @@ def nvcc_version():

 def debug_report():
    max_dots = 33
+
+    hip_version = 'unknown'
+    if hasattr(torch.version, 'hip'):
+        hip_version = torch.version.hip
+
    report = [
        ("torch install path",
         torch.__path__),
@ -86,6 +91,8 @@ def debug_report():
         torch.__version__),
        ("torch cuda version",
         torch.version.cuda),
+        ("torch hip version",
+         hip_version),
        ("nvcc version",
         nvcc_version()),
        ("deepspeed install path",
@ -94,7 +101,8 @@ def debug_report():
         f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
         ),
        ("deepspeed wheel compiled w.",
-         f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"),
+         f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}, hip {torch_info['hip_version']}"
+         ),
    ]
    print("DeepSpeed general environment info:")
    for name, value in report:
--- a/deepspeed/git_version_info.py
+++ b/deepspeed/git_version_info.py
@ -14,4 +14,4 @@ except ModuleNotFoundError:
    from .ops.op_builder import ALL_OPS
    installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
    compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
-    torch_info = {'version': "0.0", "cuda_version": "0.0"}
+    torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
--- a/deepspeed/ops/init.py
+++ b/deepspeed/ops/init.py
@ -1,6 +1,8 @@
 from . import adam
 from . import adagrad
 from . import lamb
+#from ..git_version_info_installed import installed_ops as __installed_ops__
+#if __installed_ops__['sparse_attn']:
 from . import sparse_attention
 from . import transformer

--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@ -415,27 +415,6 @@ class DeepSpeedEngine(Module):
        """
        return self._global_grad_norm

-    def set_train_batch_size(self, train_batch_size):
-        """Adjust the global batch size by increasing or decreasing the number of
-        micro-batches (i.e., gradient accumulation steps). The size of each micro-batch
-        (i.e., ``train_micro_batch_size_per_gpu``) is not changed.
-        Args:
-            train_batch_size (int): The new global batch size for training.
-        Raises:
-            ValueError: if ``train_batch_size`` is not divisible by the
-                configured micro-batch size and data parallelism.
-        """
-        if train_batch_size % (self.train_micro_batch_size_per_gpu() *
-                               self.dp_world_size) != 0:
-            #print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
-            raise ValueError(
-                f'Train batch size must be divisible by micro-batch data parallelism')
-        new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() *
-                                       self.dp_world_size)
-        # overwrite config
-        self._config.train_batch_size = train_batch_size
-        self._config.gradient_accumulation_steps = new_gas
-
    def get_global_grad_norm(self) -> float:
        """Return the 2-norm of all gradients. If there is model parallelism,
        the norm will be global.
@ -2839,10 +2818,9 @@ class DeepSpeedEngine(Module):
            self.optimizer.state_dict()
            if self.optimizer and not self.zero_optimization() else None
        }
-        torch.save(optimizer_state,
-                   self._get_optimizer_ckpt_name(save_dir,
-                                                 tag,
-                                                 expp_rank))
+        with open(self._get_optimizer_ckpt_name(save_dir, tag, expp_rank), 'wb') as fd:
+            torch.save(optimizer_state, fd)
+            fd.flush()

        # get non-moe parameters
        model_state_dict = self._get_non_moe_state_dict(self.module_state_dict())
@ -2872,7 +2850,9 @@ class DeepSpeedEngine(Module):
            }
            state.update(client_state)
            logger.info(f'Saving model checkpoint: {save_path}')
-            torch.save(state, save_path)
+            with open(save_path, 'wb') as fd:
+                torch.save(state, fd)
+                fd.flush()
        self._curr_save_path = None

    def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
@ -3006,7 +2986,9 @@ class DeepSpeedEngine(Module):
        zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(),
                       ds_config=self.config,
                       ds_version=version)
-        torch.save(zero_sd, zero_checkpoint_name)
+        with open(zero_checkpoint_name, 'wb') as fd:
+            torch.save(zero_sd, fd)
+            fd.flush()
        if self.global_rank == 0:
            self._copy_recovery_script(save_path)
        logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@ -0,0 +1,186 @@
+FROM rocm/pytorch:latest
+
+
+##############################################################################
+# Temporary Installation Directory
+##############################################################################
+ENV STAGE_DIR=/tmp
+RUN mkdir -p ${STAGE_DIR}
+
+##############################################################################
+# Installation/Basic Utilities
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        software-properties-common build-essential autotools-dev \
+        nfs-common pdsh \
+        cmake g++ gcc \
+        curl wget vim tmux emacs less unzip \
+        htop iftop iotop ca-certificates openssh-client openssh-server \
+        rsync iputils-ping net-tools sudo \
+        llvm-9-dev
+
+##############################################################################
+# Installation Latest Git
+##############################################################################
+RUN add-apt-repository ppa:git-core/ppa -y && \
+    apt-get update && \
+    apt-get install -y git && \
+    git --version
+
+##############################################################################
+# Client Liveness & Uncomment Port 22 for SSH Daemon
+##############################################################################
+# Keep SSH client alive from server side
+RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
+RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# Mellanox OFED
+##############################################################################
+#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
+#RUN apt-get install -y libnuma-dev
+#RUN cd ${STAGE_DIR} && \
+#    wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
+#    cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
+#    ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
+#    cd ${STAGE_DIR} && \
+#    rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
+
+##############################################################################
+# OPENMPI
+##############################################################################
+#ENV OPENMPI_BASEVERSION=4.0
+#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
+#RUN cd ${STAGE_DIR} && \
+#    wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
+#    cd openmpi-${OPENMPI_VERSION} && \
+#    ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
+#    make -j"$(nproc)" install && \
+#    ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
+#    # Sanity check:
+#    test -f /usr/local/mpi/bin/mpic++ && \
+#    cd ${STAGE_DIR} && \
+#    rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
+#ENV PATH=/usr/local/mpi/bin:${PATH} \
+#    LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
+## Create a wrapper for OpenMPI to allow running as root by default
+#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
+#    echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
+#    echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
+#    chmod a+x /usr/local/mpi/bin/mpirun
+
+##############################################################################
+# Python
+##############################################################################
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHON_VERSION=3.6
+RUN apt-get install -y python3.6 python3.6-dev && \
+    rm -f /usr/bin/python && \
+    ln -s /usr/bin/python3.6 /usr/bin/python && \
+    curl -O https://bootstrap.pypa.io/get-pip.py && \
+        python get-pip.py && \
+        rm get-pip.py && \
+    pip install --upgrade pip && \
+    # Print python an pip version
+    python -V && pip -V
+RUN pip install pyyaml
+RUN pip install ipython
+
+##############################################################################
+# TensorFlow
+##############################################################################
+RUN pip install tensorflow-rocm
+
+##############################################################################
+# Some Packages
+##############################################################################
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libsndfile-dev \
+        libjpeg-dev \
+        libpng-dev \
+        screen
+RUN pip install psutil \
+                yappi \
+                cffi \
+                ipdb \
+                pandas \
+                matplotlib \
+                py3nvml \
+                pyarrow \
+                graphviz \
+                astor \
+                boto3 \
+                tqdm \
+                sentencepiece \
+                msgpack \
+                requests \
+                pandas \
+                sphinx \
+                sphinx_rtd_theme \
+                scipy \
+                numpy \
+                sklearn \
+                scikit-learn \
+                mpi4py \
+                h5py
+
+##############################################################################
+## SSH daemon port inside container cannot conflict with host OS port
+###############################################################################
+ENV SSH_PORT=2222
+RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
+    sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
+
+##############################################################################
+# PyTorch
+##############################################################################
+#ENV PYTORCH_VERSION=1.2.0
+#ENV TORCHVISION_VERSION=0.4.0
+#ENV TENSORBOARDX_VERSION=1.8
+#RUN pip install torch==${PYTORCH_VERSION}
+#RUN pip install torchvision==${TORCHVISION_VERSION}
+#RUN pip install tensorboardX==${TENSORBOARDX_VERSION}
+
+##############################################################################
+# PyYAML build issue
+# https://stackoverflow.com/a/53926898
+##############################################################################
+RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
+        rm -rf /usr/lib/python3/dist-packages/PyYAML-*
+
+##############################################################################
+## CuPy installation
+###############################################################################
+RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy
+RUN cd ${STAGE_DIR}/cupy && \
+        git submodule update --init && \
+        CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . --no-cache-dir -vvvv
+RUN rm -rf ${STAGE_DIR}/cupy
+
+##############################################################################
+## Add deepspeed user
+###############################################################################
+# Add a deepspeed user with user id 8877
+#RUN useradd --create-home --uid 8877 deepspeed
+#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
+#RUN usermod -aG sudo deepspeed
+#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
+# # Change to non-root privilege
+#USER deepspeed
+
+##############################################################################
+# DeepSpeed
+##############################################################################
+RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
+RUN cd ${STAGE_DIR}/DeepSpeed && \
+        git checkout . && \
+        git checkout master && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \
+        cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
+        DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
+RUN rm -rf ${STAGE_DIR}/DeepSpeed
+RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
--- a/install.sh
+++ b/install.sh
@ -156,7 +156,7 @@ python setup.py $VERBOSE bdist_wheel

 if [ "$local_only" == "1" ]; then
    echo "Installing deepspeed"
-    $PIP_SUDO pip uninstall -y deepspeed
+#    $PIP_SUDO pip uninstall -y deepspeed
    $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
    ds_report
 else
--- a/op_builder/init.py
+++ b/op_builder/init.py
@ -10,9 +10,9 @@ from .transformer import TransformerBuilder
 from .stochastic_transformer import StochasticTransformerBuilder
 from .utils import UtilsBuilder
 from .async_io import AsyncIOBuilder
-from .builder import get_default_compute_capabilities
 from .transformer_inference import InferenceBuilder
 from .quantizer import QuantizerBuilder
+from .builder import get_default_compute_capabilities, OpBuilder

 # TODO: infer this list instead of hard coded
 # List of all available ops
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@ -31,6 +31,9 @@ except ImportError:
        f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
    )

+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+

 def installed_cuda_version():
    import torch.utils.cpp_extension
@ -100,23 +103,10 @@ def assert_no_cuda_mismatch():
            "cuda/cpp extensions without a matching cuda version.")


-def assert_torch_info(torch_info):
-    install_torch_version = torch_info['version']
-    install_cuda_version = torch_info['cuda_version']
-
-    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
-    current_torch_version = ".".join(torch.__version__.split('.')[:2])
-
-    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
-        raise RuntimeError(
-            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
-            "with a different version than what is being used at runtime. Please re-install "
-            f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
-            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
-            f"torch={current_torch_version}, cuda={current_cuda_version}")
-
-
 class OpBuilder(ABC):
+    _rocm_version = None
+    _is_rocm_pytorch = None
+
    def __init__(self, name):
        self.name = name
        self.jit_mode = False
@ -136,6 +126,67 @@ class OpBuilder(ABC):
        '''
        pass

+    @staticmethod
+    def assert_torch_info(torch_info):
+        install_torch_version = torch_info['version']
+        install_cuda_version = torch_info['cuda_version']
+        install_hip_version = torch_info['hip_version']
+
+        if not OpBuilder.is_rocm_pytorch():
+            current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
+        else:
+            current_hip_version = ".".join(torch.version.hip.split('.')[:2])
+
+        current_torch_version = ".".join(torch.__version__.split('.')[:2])
+
+        if not OpBuilder.is_rocm_pytorch():
+            if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
+                raise RuntimeError(
+                    "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
+                    "with a different version than what is being used at runtime. Please re-install "
+                    f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
+                    f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
+                    f"torch={current_torch_version}, cuda={current_cuda_version}")
+        else:
+            if install_hip_version != current_hip_version or install_torch_version != current_torch_version:
+                raise RuntimeError(
+                    "PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed "
+                    "with a different version than what is being used at runtime. Please re-install "
+                    f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
+                    f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:"
+                    f"torch={current_torch_version}, hip={current_hip_version}")
+
+    @staticmethod
+    def is_rocm_pytorch():
+        if OpBuilder._is_rocm_pytorch is not None:
+            return OpBuilder._is_rocm_pytorch
+
+        _is_rocm_pytorch = False
+        if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
+            _is_rocm_pytorch = hasattr(torch.version,
+                                       'hip') and torch.version.hip is not None
+            if _is_rocm_pytorch:
+                from torch.utils.cpp_extension import ROCM_HOME
+                _is_rocm_pytorch = ROCM_HOME is not None
+        OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
+        return OpBuilder._is_rocm_pytorch
+
+    @staticmethod
+    def installed_rocm_version():
+        if OpBuilder._rocm_version:
+            return OpBuilder._rocm_version
+
+        ROCM_MAJOR = '0'
+        ROCM_MINOR = '0'
+        if OpBuilder.is_rocm_pytorch():
+            from torch.utils.cpp_extension import ROCM_HOME
+            with open('/opt/rocm/.info/version-dev', 'r') as file:
+                ROCM_VERSION_DEV_RAW = file.read()
+            ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
+            ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
+        OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
+        return OpBuilder._rocm_version
+
    def include_paths(self):
        '''
        Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
@ -396,7 +447,7 @@ class OpBuilder(ABC):
            # Ensure the op we're about to load was compiled with the same
            # torch/cuda versions we are currently using at runtime.
            if isinstance(self, CUDAOpBuilder):
-                assert_torch_info(torch_info)
+                self.assert_torch_info(torch_info)

            return importlib.import_module(self.absolute_name())
        else:
@ -414,7 +465,7 @@ class OpBuilder(ABC):
                f"Unable to JIT load the {self.name} op due to ninja not being installed."
            )

-        if isinstance(self, CUDAOpBuilder):
+        if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
            assert_no_cuda_mismatch()

        self.jit_mode = True
@ -534,15 +585,26 @@ class CUDAOpBuilder(OpBuilder):

    def builder(self):
        from torch.utils.cpp_extension import CUDAExtension
-        assert_no_cuda_mismatch()
-        return CUDAExtension(name=self.absolute_name(),
-                             sources=self.strip_empty_entries(self.sources()),
-                             include_dirs=self.strip_empty_entries(self.include_paths()),
-                             libraries=self.strip_empty_entries(self.libraries_args()),
-                             extra_compile_args={
-                                 'cxx': self.strip_empty_entries(self.cxx_args()),
-                                 'nvcc': self.strip_empty_entries(self.nvcc_args())
-                             })
+        if not self.is_rocm_pytorch():
+            assert_no_cuda_mismatch()
+        cuda_ext = CUDAExtension(
+            name=self.absolute_name(),
+            sources=self.strip_empty_entries(self.sources()),
+            include_dirs=self.strip_empty_entries(self.include_paths()),
+            libraries=self.strip_empty_entries(self.libraries_args()),
+            extra_compile_args={
+                'cxx': self.strip_empty_entries(self.cxx_args()),
+                'nvcc': self.strip_empty_entries(self.nvcc_args())
+            })
+        if self.is_rocm_pytorch():
+            # hip converts paths to absolute, this converts back to relative
+            sources = cuda_ext.sources
+            curr_file = Path(__file__).parent.parent  # ds root
+            for i in range(len(sources)):
+                src = Path(sources[i])
+                sources[i] = str(src.relative_to(curr_file))
+            cuda_ext.sources = sources
+        return cuda_ext

    def cxx_args(self):
        if sys.platform == "win32":
@ -551,18 +613,29 @@ class CUDAOpBuilder(OpBuilder):
            return ['-O3', '-std=c++14', '-g', '-Wno-reorder']

    def nvcc_args(self):
-        cuda_major, _ = installed_cuda_version()
-        args = [
-            '-O3',
-            '--use_fast_math',
-            '-std=c++17'
-            if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
-            '-U__CUDA_NO_HALF_OPERATORS__',
-            '-U__CUDA_NO_HALF_CONVERSIONS__',
-            '-U__CUDA_NO_HALF2_OPERATORS__'
-        ]
-
-        return args + self.compute_capability_args()
+        args = ['-O3']
+        if self.is_rocm_pytorch():
+            ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+            args += [
+                '-std=c++14',
+                '-U__HIP_NO_HALF_OPERATORS__',
+                '-U__HIP_NO_HALF_CONVERSIONS__',
+                '-U__HIP_NO_HALF2_OPERATORS__',
+                '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
+                '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
+            ]
+        else:
+            cuda_major, _ = installed_cuda_version()
+            args += [
+                '--use_fast_math',
+                '-std=c++17'
+                if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
+                '-U__CUDA_NO_HALF_OPERATORS__',
+                '-U__CUDA_NO_HALF_CONVERSIONS__',
+                '-U__CUDA_NO_HALF2_OPERATORS__'
+            ]
+            args += self.compute_capability_args()
+        return args

    def libraries_args(self):
        if sys.platform == "win32":
@ -573,11 +646,17 @@ class CUDAOpBuilder(OpBuilder):

 class TorchCPUOpBuilder(CUDAOpBuilder):
    def extra_ldflags(self):
-        return ['-lcurand']
+        if not self.is_rocm_pytorch():
+            return ['-lcurand']
+        else:
+            return []

    def cxx_args(self):
        import torch
-        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        if not self.is_rocm_pytorch():
+            CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
+        else:
+            CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
        CPU_ARCH = self.cpu_arch()
        SIMD_WIDTH = self.simd_width()

--- a/op_builder/cpu_adagrad.py
+++ b/op_builder/cpu_adagrad.py
@ -2,8 +2,6 @@
 Copyright 2020 The Microsoft DeepSpeed Team
 """
 import os
-import sys
-import subprocess
 from .builder import TorchCPUOpBuilder


@ -22,5 +20,17 @@ class CPUAdagradBuilder(TorchCPUOpBuilder):

    def include_paths(self):
        import torch
-        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
-        return ['csrc/includes', CUDA_INCLUDE]
+        if not self.is_rocm_pytorch():
+            CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+        else:
+            CUDA_INCLUDE = [
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include",
+                             "rocrand"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include",
+                             "hiprand"),
+            ]
+        return ['csrc/includes'] + CUDA_INCLUDE
--- a/op_builder/cpu_adam.py
+++ b/op_builder/cpu_adam.py
@ -2,8 +2,6 @@
 Copyright 2020 The Microsoft DeepSpeed Team
 """
 import os
-import sys
-import subprocess
 from .builder import TorchCPUOpBuilder


@ -22,5 +20,17 @@ class CPUAdamBuilder(TorchCPUOpBuilder):

    def include_paths(self):
        import torch
-        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
-        return ['csrc/includes', CUDA_INCLUDE]
+        if not self.is_rocm_pytorch():
+            CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
+        else:
+            CUDA_INCLUDE = [
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include",
+                             "rocrand"),
+                os.path.join(torch.utils.cpp_extension.ROCM_HOME,
+                             "include",
+                             "hiprand"),
+            ]
+        return ['csrc/includes'] + CUDA_INCLUDE
--- a/op_builder/fused_adam.py
+++ b/op_builder/fused_adam.py
@ -1,6 +1,7 @@
 """
 Copyright 2020 The Microsoft DeepSpeed Team
 """
+import torch
 from .builder import CUDAOpBuilder


@ -18,14 +19,15 @@ class FusedAdamBuilder(CUDAOpBuilder):
        return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']

    def include_paths(self):
-        return ['csrc/includes']
+        return ['csrc/includes', 'csrc/adam']

    def cxx_args(self):
        args = super().cxx_args()
        return args + self.version_dependent_macros()

    def nvcc_args(self):
-        return ['-lineinfo',
-                '-O3',
-                '--use_fast_math'
-                ] + self.version_dependent_macros() + self.compute_capability_args()
+        nvcc_flags = ['-O3'] + self.version_dependent_macros()
+        if not self.is_rocm_pytorch():
+            nvcc_flags.extend(['-lineinfo',
+                               '--use_fast_math'] + self.compute_capability_args())
+        return nvcc_flags
--- a/op_builder/fused_lamb.py
+++ b/op_builder/fused_lamb.py
@ -1,6 +1,7 @@
 """
 Copyright 2020 The Microsoft DeepSpeed Team
 """
+import torch
 from .builder import CUDAOpBuilder


@ -25,7 +26,14 @@ class FusedLambBuilder(CUDAOpBuilder):
        return args + self.version_dependent_macros()

    def nvcc_args(self):
-        return ['-lineinfo',
-                '-O3',
-                '--use_fast_math'
-                ] + self.version_dependent_macros() + self.compute_capability_args()
+        nvcc_flags = ['-O3'] + self.version_dependent_macros()
+        if self.is_rocm_pytorch():
+            ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+            nvcc_flags += [
+                '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
+                '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
+            ]
+        else:
+            nvcc_flags.extend(['-lineinfo',
+                               '--use_fast_math'] + self.compute_capability_args())
+        return nvcc_flags
--- a/op_builder/sparse_attn.py
+++ b/op_builder/sparse_attn.py
@ -32,6 +32,10 @@ class SparseAttnBuilder(OpBuilder):
        #command_status = list(map(self.command_exists, required_commands))
        #deps_compatible = all(command_status)

+        if self.is_rocm_pytorch():
+            self.warning(f'{self.NAME} is not compatible with ROCM')
+            return False
+
        try:
            import torch
        except ImportError:
--- a/op_builder/transformer.py
+++ b/op_builder/transformer.py
@ -1,6 +1,7 @@
 """
 Copyright 2020 The Microsoft DeepSpeed Team
 """
+import torch
 from .builder import CUDAOpBuilder


@ -28,4 +29,11 @@ class TransformerBuilder(CUDAOpBuilder):
        ]

    def include_paths(self):
-        return ['csrc/includes']
+        includes = ['csrc/includes']
+        if self.is_rocm_pytorch():
+            from torch.utils.cpp_extension import ROCM_HOME
+            includes += [
+                '{}/hiprand/include'.format(ROCM_HOME),
+                '{}/rocrand/include'.format(ROCM_HOME)
+            ]
+        return includes
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,11 @@ except ImportError:
    print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
        'Please visit https://pytorch.org/ to see how to properly install torch on your system.')

-from op_builder import ALL_OPS, get_default_compute_capabilities
+from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
+
+# fetch rocm state
+is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
+rocm_version = OpBuilder.installed_rocm_version()

 RED_START = '\033[31m'
 RED_END = '\033[0m'
@ -51,8 +55,8 @@ def fetch_requirements(path):

 install_requires = fetch_requirements('requirements/requirements.txt')
 extras_require = {
-    '1bit_mpi' : fetch_requirements('requirements/requirements-1bit-mpi.txt'),
-    '1bit': [], # Will add proper cupy version below
+    '1bit': [], # add cupy based on cuda/rocm version
+    '1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
    'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
    'dev': fetch_requirements('requirements/requirements-dev.txt'),
    'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'),
@ -62,9 +66,17 @@ extras_require = {

 # Add specific cupy version to both onebit extension variants
 if torch_available and torch.cuda.is_available():
-    cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
-    extras_require['1bit_mpi'].append(cupy)
-    extras_require['1bit'].append(cupy)
+    cupy = None
+    if is_rocm_pytorch:
+        rocm_major, rocm_minor = rocm_version
+        # XXX cupy support for rocm 5 is not available yet
+        if rocm_major <= 4:
+            cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
+    else:
+        cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
+    if cupy:
+        extras_require['1bit'].append(cupy)
+        extras_require['1bit_mpi'].append(cupy)

 # Make an [all] extra that installs all needed dependencies
 all_extras = set()
@ -207,9 +219,17 @@ else:
 torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
 # Set cuda_version to 0.0 if cpu-only
 cuda_version = "0.0"
+# Set hip_version to 0.0 if cpu-only
+hip_version = "0.0"
 if torch_available and torch.version.cuda is not None:
    cuda_version = ".".join(torch.version.cuda.split('.')[:2])
-torch_info = {"version": torch_version, "cuda_version": cuda_version}
+if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None:
+    hip_version = ".".join(torch.version.hip.split('.')[:2])
+torch_info = {
+    "version": torch_version,
+    "cuda_version": cuda_version,
+    "hip_version": hip_version
+}

 print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
 with open('deepspeed/git_version_info_installed.py', 'w') as fd:
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@ -8,6 +8,9 @@ from torch.multiprocessing import Process
 import deepspeed

 import pytest
+from functools import wraps
+import unittest
+from pathlib import Path

 from pathlib import Path

@ -39,8 +42,15 @@ def set_cuda_visibile():
    if cuda_visible is None:
        # CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead
        import subprocess
-        nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
-        num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n'))
+        is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
+        if is_rocm_pytorch:
+            rocm_smi = subprocess.check_output(['rocm-smi', '--showid'])
+            gpu_ids = filter(lambda s: 'GPU' in s,
+                             rocm_smi.decode('utf-8').strip().split('\n'))
+            num_gpus = len(list(gpu_ids))
+        else:
+            nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
+            num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n'))
        cuda_visible = ",".join(map(str, range(num_gpus)))

    # rotate list based on xdist worker id, example below
@ -94,6 +104,7 @@ def distributed_test(world_size=2, backend='nccl'):

            # make sure all ranks finish at the same time
            torch.distributed.barrier()
+
            # tear down after test completes
            torch.distributed.destroy_process_group()

@ -154,6 +165,6 @@ def distributed_test(world_size=2, backend='nccl'):
    return dist_wrap


-def get_test_path(src):
+def get_test_path(filename):
    curr_path = Path(__file__).parent
-    return str(curr_path.joinpath(src))
+    return str(curr_path.joinpath(filename))
--- a/tests/unit/test_checkpointing.py
+++ b/tests/unit/test_checkpointing.py
@ -186,6 +186,8 @@ def checkpoint_correctness_verification(args,

    trained_model.save_checkpoint(save_folder, tag=save_tag)

+    dist.barrier()
+
    loaded_model = create_deepspeed_model(args=args,
                                          model=models[1],
                                          base_optimizer=base_optimizers[1])
--- a/tests/unit/test_configurable_parallel.py
+++ b/tests/unit/test_configurable_parallel.py
@ -123,7 +123,7 @@ class TestConfigurableMP:
                                  load_lr_scheduler_states=False)

            test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())
-            assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"
+            assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"

        inputs = self.get_inputs()
        _run(inputs)
--- a/tests/unit/test_cuda_forward.py
+++ b/tests/unit/test_cuda_forward.py
@ -15,9 +15,6 @@ import deepspeed

 import sys

-#if not deepspeed.ops.__installed_ops__['transformer']:
-#    pytest.skip("transformer kernels are not installed", allow_module_level=True)
-

 def check_equal(first, second, atol=1e-2, verbose=False):
    if verbose:
@ -266,10 +263,10 @@ def test_forward(batch_size,

@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
                         [
-                             #(8,3,1024,512,16,3,True,False),
-                             #(8,7,1024,512,16,3,True,True),
-                             #(8,3,1024,512,16,3,False,False),
-                             #(8,7,1024,512,16,3,False,True),
+                             (8,3,1024,512,16,3,True,False),
+                             (8,7,1024,512,16,3,True,True),
+                             (8,3,1024,512,16,3,False,False),
+                             (8,7,1024,512,16,3,False,True),
                         ]) # yapf: disable
 def test_forward_with_small_bsz(batch_size,
                                small_bsz,
--- a/tests/unit/test_onebit.py
+++ b/tests/unit/test_onebit.py
@ -12,6 +12,7 @@ import numpy as np
 import time

 from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
+from deepspeed.ops.op_builder import OpBuilder

 PipeTopo = PipeDataParallelTopology
 from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
@ -25,6 +26,12 @@ if TORCH_MAJOR < 1 or TORCH_MINOR < 8:
    pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher",
                allow_module_level=True)

+rocm_version = OpBuilder.installed_rocm_version()
+if rocm_version[0] > 4:
+    pytest.skip(
+        "NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5",
+        allow_module_level=True)
+

 def test_onebitadam_fp16_basic(tmpdir):
    config_dict = {
--- a/tests/unit/test_sparse_attention.py
+++ b/tests/unit/test_sparse_attention.py
@ -252,7 +252,6 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo


 def _skip_on_cuda_compatability():
-    return
    if torch.cuda.get_device_capability()[0] < 7:
        pytest.skip("needs higher compute capability than 7")
    cuda_major = int(torch.version.cuda.split('.')[0]) * 10
--- a/tests/unit/test_zero.py
+++ b/tests/unit/test_zero.py
@ -846,7 +846,7 @@ def test_zero3_param_partitioning_many_params(world_sz: int,
                for _ in range(n_layers))

            for layer_num, module in enumerate(self.modulelist):
-                if dist.get_rank() == 0:
+                with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
                    param: Parameter = module.weight
                    partition_sz = math.ceil(param.numel() / dist.get_world_size())
                    offset = 0