зеркало из https://github.com/microsoft/DeepSpeed.git
AMD support (#1430)
Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com> Co-authored-by: Jithun Nair <jithun.nair@amd.com> Co-authored-by: rraminen <rraminen@amd.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: okakarpa <okakarpa@amd.com> Co-authored-by: rraminen <rraminen@amd.com> Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com> Co-authored-by: Jeff Daily <jeff.daily@amd.com> Co-authored-by: okakarpa <okakarpa@amd.com> Co-authored-by: Ramya Ramineni <62723901+rraminen@users.noreply.github.com>
This commit is contained in:
Родитель
f0304bd103
Коммит
c3c8d5dd93
|
@ -1,4 +1,4 @@
|
|||
name: Build
|
||||
name: unit-tests
|
||||
|
||||
on:
|
||||
push:
|
||||
|
@ -14,7 +14,7 @@ on:
|
|||
jobs:
|
||||
# unit tests running on nvidia gpus
|
||||
nv-torch12-p40:
|
||||
runs-on: [self-hosted, nvidia, torch12]
|
||||
runs-on: [self-hosted, nvidia, torch12, p40]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
@ -102,6 +102,43 @@ jobs:
|
|||
find examples/pytorch -regextype posix-egrep -regex '.*(language-modeling|question-answering|summarization|image-classification|text-classification|translation).*/requirements.txt' -exec pip install -r {} \;
|
||||
TORCH_EXTENSIONS_DIR=./torch-extensions RUN_SLOW=1 pytest --color=yes --durations=0 --verbose tests/deepspeed
|
||||
|
||||
# unit tests running on amd gpus
|
||||
amd:
|
||||
# The type of runner that the job will run on
|
||||
runs-on: [self-hosted, amd]
|
||||
|
||||
# Steps represent a sequence of tasks that will be executed as part of the job
|
||||
steps:
|
||||
# Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
# Runs a single command using the runners shell
|
||||
- name: environment
|
||||
run: |
|
||||
rocm-smi --showhw
|
||||
which python
|
||||
python --version
|
||||
which hipcc
|
||||
hipcc --version
|
||||
python -c "import torch; print('torch:', torch.__version__, torch)"
|
||||
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libaio-dev
|
||||
# Runs a set of commands using the runners shell
|
||||
- name: Install deepspeed
|
||||
run: |
|
||||
pip install .[dev,1bit,autotuning]
|
||||
python -c "from deepspeed.env_report import cli_main; cli_main()"
|
||||
#ds_report
|
||||
# Runs a set of commands using the runners shell
|
||||
- name: Unit tests
|
||||
run: |
|
||||
if [[ -d ./torch-extensions ]]; then rm -rf ./torch-extensions; fi
|
||||
cd tests
|
||||
#TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose unit/
|
||||
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -n 4 -m 'not sequential' unit/
|
||||
TORCH_EXTENSIONS_DIR=./torch-extensions pytest --color=yes --durations=0 --forked --verbose -m 'sequential' unit/
|
||||
|
||||
nv-lightning-v100:
|
||||
runs-on: [self-hosted, nvidia, torch18, v100]
|
||||
|
||||
|
|
|
@ -5,7 +5,9 @@
|
|||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <cuda_runtime.h>
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
#include <mma.h>
|
||||
#endif
|
||||
#include <stdio.h>
|
||||
|
||||
int cublas_gemm_ex(cublasHandle_t handle,
|
||||
|
@ -19,7 +21,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
const float* A,
|
||||
const float* B,
|
||||
float* C,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
|
||||
#else
|
||||
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
|
||||
#endif
|
||||
|
||||
int cublas_gemm_ex(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
|
@ -32,7 +38,11 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
const __half* A,
|
||||
const __half* B,
|
||||
__half* C,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
|
||||
#else
|
||||
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
|
||||
#endif
|
||||
|
||||
int cublas_strided_batched_gemm(cublasHandle_t handle,
|
||||
int m,
|
||||
|
@ -49,7 +59,11 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
int stride_B,
|
||||
int stride_C,
|
||||
int batch,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
|
||||
#else
|
||||
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);
|
||||
#endif
|
||||
|
||||
int cublas_strided_batched_gemm(cublasHandle_t handle,
|
||||
int m,
|
||||
|
@ -66,4 +80,8 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
int stride_B,
|
||||
int stride_C,
|
||||
int batch,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo algo = rocblas_gemm_algo_standard);
|
||||
#else
|
||||
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
|
||||
#endif
|
||||
|
|
|
@ -5,7 +5,15 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define HALF_PRECISION_AVAILABLE = 1
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#define HALF_PRECISION_AVAILABLE = 1
|
||||
#endif
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "context.h"
|
||||
|
|
|
@ -43,7 +43,11 @@ public:
|
|||
weights,
|
||||
input_ptr,
|
||||
out,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(config_.gemm_algos[0]));
|
||||
#else
|
||||
cublasGemmAlgo_t(config_.gemm_algos[0]));
|
||||
#endif
|
||||
}
|
||||
void Backward(int bsz,
|
||||
const T* out_grad,
|
||||
|
@ -68,7 +72,11 @@ public:
|
|||
input_ptr,
|
||||
out_grad,
|
||||
weights_grad,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(config_.gemm_algos[1]));
|
||||
#else
|
||||
cublasGemmAlgo_t(config_.gemm_algos[1]));
|
||||
#endif
|
||||
|
||||
cublas_gemm_ex(_cublasHandle,
|
||||
CUBLAS_OP_N,
|
||||
|
@ -81,7 +89,11 @@ public:
|
|||
weights,
|
||||
out_grad,
|
||||
inp_grad_out,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(config_.gemm_algos[2]));
|
||||
#else
|
||||
cublasGemmAlgo_t(config_.gemm_algos[2]));
|
||||
#endif
|
||||
|
||||
launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
|
||||
}
|
||||
|
|
|
@ -2,7 +2,9 @@
|
|||
#pragma once
|
||||
|
||||
#include <cuda_fp16.h>
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
#include <cuda_profiler_api.h>
|
||||
#endif
|
||||
#include <array>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
@ -58,7 +60,11 @@ public:
|
|||
B,
|
||||
A,
|
||||
C,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
int algo_bw1 = Run(loops, [=](int algo) {
|
||||
|
@ -73,7 +79,11 @@ public:
|
|||
A,
|
||||
C,
|
||||
B,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
int algo_bw2 = Run(loops, [=](int algo) {
|
||||
|
@ -88,7 +98,11 @@ public:
|
|||
B,
|
||||
C,
|
||||
A,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
|
||||
|
@ -100,8 +114,12 @@ public:
|
|||
float fast_latency = (std::numeric_limits<float>::max)();
|
||||
int fast_algo = 0;
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
|
||||
#else
|
||||
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
|
||||
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
|
||||
#endif
|
||||
algo++) {
|
||||
int warm_up = 5;
|
||||
for (int i = 0; i < warm_up; ++i) f(algo);
|
||||
|
@ -186,7 +204,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
int algo_bw1 = Run(loops, [=](int algo) {
|
||||
|
@ -216,7 +238,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
int algo_bw2 = Run(loops, [=](int algo) {
|
||||
|
@ -243,7 +269,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
static_cast<rocblas_gemm_algo>(algo));
|
||||
#else
|
||||
static_cast<cublasGemmAlgo_t>(algo));
|
||||
#endif
|
||||
});
|
||||
|
||||
return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
|
||||
|
@ -255,8 +285,12 @@ public:
|
|||
float fast_latency = (std::numeric_limits<float>::max)();
|
||||
int fast_algo = 0;
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
|
||||
#else
|
||||
for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
|
||||
algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
|
||||
#endif
|
||||
algo++) {
|
||||
int warm_up = 5;
|
||||
for (int i = 0; i < warm_up; ++i) f(algo);
|
||||
|
|
|
@ -3,7 +3,11 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#else
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
#include <curand_kernel.h>
|
||||
|
||||
#include "context.h"
|
||||
|
|
|
@ -0,0 +1,348 @@
|
|||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file hcc_detail/hip_cooperative_groups.h
|
||||
*
|
||||
* @brief Device side implementation of `Cooperative Group` feature.
|
||||
*
|
||||
* Defines new types and device API wrappers related to `Cooperative Group`
|
||||
* feature, which the programmer can directly use in his kernel(s) in order to
|
||||
* make use of this feature.
|
||||
*/
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
||||
|
||||
//#if __cplusplus
|
||||
#if __cplusplus && defined(__clang__) && defined(__HIP__)
|
||||
#include <hip/hcc_detail/hip_cooperative_groups_helper.h>
|
||||
#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
|
||||
#include <hip/hcc_detail/device_functions.h>
|
||||
#endif
|
||||
namespace cooperative_groups {
|
||||
|
||||
/** \brief The base type of all cooperative group types
|
||||
*
|
||||
* \details Holds the key properties of a constructed cooperative group type
|
||||
* object, like the group type, its size, etc
|
||||
*/
|
||||
/*
|
||||
class thread_group {
|
||||
protected:
|
||||
uint32_t _type; // thread_group type
|
||||
uint32_t _size; // total number of threads in the tread_group
|
||||
uint64_t _mask; // Lanemask for coalesced and tiled partitioned group types,
|
||||
// LSB represents lane 0, and MSB represents lane 63
|
||||
|
||||
// Construct a thread group, and set thread group type and other essential
|
||||
// thread group properties. This generic thread group is directly constructed
|
||||
// only when the group is supposed to contain only the calling the thread
|
||||
// (throurh the API - `this_thread()`), and in all other cases, this thread
|
||||
// group object is a sub-object of some other derived thread group object
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size,
|
||||
uint64_t mask = (uint64_t)0) {
|
||||
_type = type;
|
||||
_size = size;
|
||||
_mask = mask;
|
||||
}
|
||||
|
||||
public:
|
||||
// Total number of threads in the thread group, and this serves the purpose
|
||||
// for all derived cooperative group types since their `size` is directly
|
||||
// saved during the construction
|
||||
__CG_QUALIFIER__ uint32_t size() const {
|
||||
return _size;
|
||||
}
|
||||
// Rank of the calling thread within [0, size())
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const;
|
||||
// Is this cooperative group type valid?
|
||||
__CG_QUALIFIER__ bool is_valid() const;
|
||||
// synchronize the threads in the thread group
|
||||
__CG_QUALIFIER__ void sync() const;
|
||||
};
|
||||
*/
|
||||
|
||||
class thread_group {
|
||||
protected:
|
||||
bool _tiled_partition; // this_thread_block() constructor sets to false
|
||||
uint32_t _size; // this_thread_block() constructor sets to size()
|
||||
uint32_t local_rank; // this_thread_block() constructor sets to thread_rank()
|
||||
uint32_t _mask;
|
||||
uint32_t _type;
|
||||
|
||||
public:
|
||||
__CG_QUALIFIER__ thread_group(internal::group_type type,
|
||||
uint32_t group_size,
|
||||
uint64_t mask = (uint64_t)0)
|
||||
{
|
||||
_type = type;
|
||||
_size = group_size;
|
||||
_mask = mask;
|
||||
local_rank = internal::workgroup::thread_rank();
|
||||
}
|
||||
|
||||
__CG_QUALIFIER__ void tiled_partition(const thread_group& parent, unsigned int tile_size)
|
||||
{
|
||||
if ((ceil(log2(tile_size)) == floor(log2(tile_size))) || tile_size == 0 || tile_size > 64 ||
|
||||
parent.size() < tile_size)
|
||||
_tiled_partition = false;
|
||||
// xxx : abort
|
||||
_tiled_partition = true;
|
||||
_size = tile_size;
|
||||
local_rank = parent.thread_rank() % tile_size;
|
||||
}
|
||||
__CG_QUALIFIER__ void sync() const;
|
||||
__CG_QUALIFIER__ uint32_t size() const { return _size; }
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const;
|
||||
__CG_QUALIFIER__ float shfl_down(float var, unsigned int delta) const
|
||||
{
|
||||
return (__shfl_down(var, delta, _size));
|
||||
}
|
||||
__CG_QUALIFIER__ float shfl_xor(float var, int mask) const
|
||||
{
|
||||
return (__shfl_xor(var, mask, _size));
|
||||
}
|
||||
__CG_QUALIFIER__ float shfl(float var, unsigned int src_lane) const
|
||||
{
|
||||
return (__shfl(var, src_lane, _size));
|
||||
}
|
||||
__CG_QUALIFIER__ bool is_valid() const;
|
||||
};
|
||||
|
||||
/** \brief The multi-grid cooperative group type
|
||||
*
|
||||
* \details Represents an inter-device cooperative group type where the
|
||||
* participating threads within the group spans across multiple
|
||||
* devices, running the (same) kernel on these devices
|
||||
*/
|
||||
class multi_grid_group : public thread_group {
|
||||
// Only these friend functions are allowed to construct an object of this class
|
||||
// and access its resources
|
||||
friend __CG_QUALIFIER__ multi_grid_group this_multi_grid();
|
||||
|
||||
protected:
|
||||
// Construct mutli-grid thread group (through the API this_multi_grid())
|
||||
explicit __CG_QUALIFIER__ multi_grid_group(uint32_t size)
|
||||
: thread_group(internal::cg_multi_grid, size)
|
||||
{
|
||||
}
|
||||
|
||||
public:
|
||||
// Number of invocations participating in this multi-grid group. In other
|
||||
// words, the number of GPUs
|
||||
__CG_QUALIFIER__ uint32_t num_grids() { return internal::multi_grid::num_grids(); }
|
||||
// Rank of this invocation. In other words, an ID number within the range
|
||||
// [0, num_grids()) of the GPU, this kernel is running on
|
||||
__CG_QUALIFIER__ uint32_t grid_rank() { return internal::multi_grid::grid_rank(); }
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::multi_grid::thread_rank(); }
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::multi_grid::is_valid(); }
|
||||
__CG_QUALIFIER__ void sync() const { internal::multi_grid::sync(); }
|
||||
};
|
||||
|
||||
/** \brief User exposed API interface to construct multi-grid cooperative
|
||||
* group type object - `multi_grid_group`
|
||||
*
|
||||
* \details User is not allowed to directly construct an object of type
|
||||
* `multi_grid_group`. Instead, he should construct it through this
|
||||
* API function
|
||||
*/
|
||||
__CG_QUALIFIER__ multi_grid_group this_multi_grid()
|
||||
{
|
||||
return multi_grid_group(internal::multi_grid::size());
|
||||
}
|
||||
|
||||
/** \brief The grid cooperative group type
|
||||
*
|
||||
* \details Represents an inter-workgroup cooperative group type where the
|
||||
* participating threads within the group spans across multiple
|
||||
* workgroups running the (same) kernel on the same device
|
||||
*/
|
||||
class grid_group : public thread_group {
|
||||
// Only these friend functions are allowed to construct an object of this class
|
||||
// and access its resources
|
||||
friend __CG_QUALIFIER__ grid_group this_grid();
|
||||
|
||||
protected:
|
||||
// Construct grid thread group (through the API this_grid())
|
||||
explicit __CG_QUALIFIER__ grid_group(uint32_t size) : thread_group(internal::cg_grid, size) {}
|
||||
|
||||
public:
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::grid::thread_rank(); }
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::grid::is_valid(); }
|
||||
__CG_QUALIFIER__ void sync() const { internal::grid::sync(); }
|
||||
};
|
||||
|
||||
/** \brief User exposed API interface to construct grid cooperative group type
|
||||
* object - `grid_group`
|
||||
*
|
||||
* \details User is not allowed to directly construct an object of type
|
||||
* `multi_grid_group`. Instead, he should construct it through this
|
||||
* API function
|
||||
*/
|
||||
__CG_QUALIFIER__ grid_group this_grid() { return grid_group(internal::grid::size()); }
|
||||
|
||||
/** \brief The workgroup (thread-block in CUDA terminology) cooperative group
|
||||
* type
|
||||
*
|
||||
* \details Represents an intra-workgroup cooperative group type where the
|
||||
* participating threads within the group are exctly the same threads
|
||||
* which are participated in the currently executing `workgroup`
|
||||
*/
|
||||
class thread_block : public thread_group {
|
||||
// Only these friend functions are allowed to construct an object of this
|
||||
// class and access its resources
|
||||
friend __CG_QUALIFIER__ thread_block this_thread_block();
|
||||
|
||||
protected:
|
||||
// Construct a workgroup thread group (through the API this_thread_block())
|
||||
explicit __CG_QUALIFIER__ thread_block(uint32_t size)
|
||||
: thread_group(internal::cg_workgroup, size)
|
||||
{
|
||||
}
|
||||
|
||||
public:
|
||||
// 3-dimensional block index within the grid
|
||||
__CG_QUALIFIER__ dim3 group_index() { return internal::workgroup::group_index(); }
|
||||
// 3-dimensional thread index within the block
|
||||
__CG_QUALIFIER__ dim3 thread_index() { return internal::workgroup::thread_index(); }
|
||||
__CG_QUALIFIER__ uint32_t thread_rank() const { return internal::workgroup::thread_rank(); }
|
||||
__CG_QUALIFIER__ bool is_valid() const { return internal::workgroup::is_valid(); }
|
||||
__CG_QUALIFIER__ void sync() const { internal::workgroup::sync(); }
|
||||
};
|
||||
|
||||
/** \brief User exposed API interface to construct workgroup cooperative
|
||||
* group type object - `thread_block`
|
||||
*
|
||||
* \details User is not allowed to directly construct an object of type
|
||||
* `thread_block`. Instead, he should construct it through this API
|
||||
* function
|
||||
*/
|
||||
__CG_QUALIFIER__ thread_block this_thread_block()
|
||||
{
|
||||
return thread_block(internal::workgroup::size());
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of all publicly exposed base class APIs
|
||||
*/
|
||||
__CG_QUALIFIER__ uint32_t thread_group::thread_rank() const
|
||||
{
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
return (static_cast<const multi_grid_group*>(this)->thread_rank());
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
return (static_cast<const grid_group*>(this)->thread_rank());
|
||||
}
|
||||
case internal::cg_workgroup: {
|
||||
return (static_cast<const thread_block*>(this)->thread_rank());
|
||||
}
|
||||
case internal::cg_coalesced_tile: {
|
||||
return local_rank;
|
||||
}
|
||||
default: {
|
||||
assert(false && "invalid cooperative group type");
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__CG_QUALIFIER__ bool thread_group::is_valid() const
|
||||
{
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
return (static_cast<const multi_grid_group*>(this)->is_valid());
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
return (static_cast<const grid_group*>(this)->is_valid());
|
||||
}
|
||||
case internal::cg_workgroup: {
|
||||
return (static_cast<const thread_block*>(this)->is_valid());
|
||||
}
|
||||
case internal::cg_coalesced_tile: {
|
||||
return _tiled_partition;
|
||||
}
|
||||
default: {
|
||||
assert(false && "invalid cooperative group type");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__CG_QUALIFIER__ void thread_group::sync() const
|
||||
{
|
||||
switch (this->_type) {
|
||||
case internal::cg_multi_grid: {
|
||||
static_cast<const multi_grid_group*>(this)->sync();
|
||||
break;
|
||||
}
|
||||
case internal::cg_grid: {
|
||||
static_cast<const grid_group*>(this)->sync();
|
||||
break;
|
||||
}
|
||||
case internal::cg_workgroup: {
|
||||
static_cast<const thread_block*>(this)->sync();
|
||||
break;
|
||||
}
|
||||
case internal::cg_coalesced_tile: {
|
||||
if (!_tiled_partition) // If in a tiled partition, this is a no-op
|
||||
__syncthreads();
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
assert(false && "invalid cooperative group type");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of publicly exposed `wrapper` APIs on top of basic cooperative
|
||||
* group type APIs
|
||||
*/
|
||||
template <class CGTy>
|
||||
__CG_QUALIFIER__ uint32_t group_size(CGTy const& g)
|
||||
{
|
||||
return g.size();
|
||||
}
|
||||
|
||||
template <class CGTy>
|
||||
__CG_QUALIFIER__ uint32_t thread_rank(CGTy const& g)
|
||||
{
|
||||
return g.thread_rank();
|
||||
}
|
||||
|
||||
template <class CGTy>
|
||||
__CG_QUALIFIER__ bool is_valid(CGTy const& g)
|
||||
{
|
||||
return g.is_valid();
|
||||
}
|
||||
|
||||
template <class CGTy>
|
||||
__CG_QUALIFIER__ void sync(CGTy const& g)
|
||||
{
|
||||
g.sync();
|
||||
}
|
||||
|
||||
} // namespace cooperative_groups
|
||||
|
||||
#endif // __cplusplus
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_H
|
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file hcc_detail/hip_cooperative_groups_helper.h
|
||||
*
|
||||
* @brief Device side implementation of cooperative group feature.
|
||||
*
|
||||
* Defines helper constructs and APIs which aid the types and device API
|
||||
* wrappers defined within `hcc_detail/hip_cooperative_groups.h`.
|
||||
*/
|
||||
#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
||||
|
||||
#if __cplusplus
|
||||
|
||||
#if ROCM_VERSION_MAJOR < 5 and ROCM_VERSION_MINOR < 4
|
||||
#include <hip/hcc_detail/device_functions.h>
|
||||
#include <hip/hcc_detail/hip_runtime_api.h>
|
||||
#else
|
||||
#include <hip/hcc_detail/amd_device_functions.h>
|
||||
#endif
|
||||
|
||||
#if !defined(__align__)
|
||||
#define __align__(x) __attribute__((aligned(x)))
|
||||
#endif
|
||||
|
||||
#if !defined(__CG_QUALIFIER__)
|
||||
#define __CG_QUALIFIER__ __device__ __forceinline__
|
||||
#endif
|
||||
|
||||
#if !defined(__CG_STATIC_QUALIFIER__)
|
||||
#define __CG_STATIC_QUALIFIER__ __device__ static __forceinline__
|
||||
#endif
|
||||
|
||||
#if !defined(WAVEFRONT_SIZE)
|
||||
#define WAVEFRONT_SIZE 64
|
||||
#endif
|
||||
|
||||
namespace cooperative_groups {
|
||||
|
||||
namespace internal {
|
||||
|
||||
/** \brief Enums representing different cooperative group types
|
||||
*/
|
||||
typedef enum { cg_invalid, cg_multi_grid, cg_grid, cg_workgroup, cg_coalesced_tile } group_type;
|
||||
|
||||
/**
|
||||
* Functionalities related to multi-grid cooperative group type
|
||||
*/
|
||||
namespace multi_grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
|
||||
|
||||
} // namespace multi_grid
|
||||
|
||||
/**
|
||||
* Functionalities related to grid cooperative group type
|
||||
*/
|
||||
namespace grid {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size()
|
||||
{
|
||||
return (uint32_t)((hipBlockDim_z * hipGridDim_z) * (hipBlockDim_y * hipGridDim_y) *
|
||||
(hipBlockDim_x * hipGridDim_x));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
|
||||
{
|
||||
// Compute global id of the workgroup to which the current thread belongs to
|
||||
uint32_t blkIdx = (uint32_t)((hipBlockIdx_z * hipGridDim_y * hipGridDim_x) +
|
||||
(hipBlockIdx_y * hipGridDim_x) + (hipBlockIdx_x));
|
||||
|
||||
// Compute total number of threads being passed to reach current workgroup
|
||||
// within grid
|
||||
uint32_t num_threads_till_current_workgroup =
|
||||
(uint32_t)(blkIdx * (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
|
||||
|
||||
// Compute thread local rank within current workgroup
|
||||
uint32_t local_thread_rank = (uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
|
||||
(hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x));
|
||||
|
||||
return (num_threads_till_current_workgroup + local_thread_rank);
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
|
||||
|
||||
} // namespace grid
|
||||
|
||||
/**
|
||||
* Functionalities related to `workgroup` (thread_block in CUDA terminology)
|
||||
* cooperative group type
|
||||
*/
|
||||
namespace workgroup {
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 group_index()
|
||||
{
|
||||
return (dim3((uint32_t)hipBlockIdx_x, (uint32_t)hipBlockIdx_y, (uint32_t)hipBlockIdx_z));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ dim3 thread_index()
|
||||
{
|
||||
return (dim3((uint32_t)hipThreadIdx_x, (uint32_t)hipThreadIdx_y, (uint32_t)hipThreadIdx_z));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t size()
|
||||
{
|
||||
return ((uint32_t)(hipBlockDim_x * hipBlockDim_y * hipBlockDim_z));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ uint32_t thread_rank()
|
||||
{
|
||||
return ((uint32_t)((hipThreadIdx_z * hipBlockDim_y * hipBlockDim_x) +
|
||||
(hipThreadIdx_y * hipBlockDim_x) + (hipThreadIdx_x)));
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ bool is_valid()
|
||||
{
|
||||
// TODO(mahesha) any functionality need to be added here? I believe not
|
||||
return true;
|
||||
}
|
||||
|
||||
__CG_STATIC_QUALIFIER__ void sync() { __syncthreads(); }
|
||||
|
||||
} // namespace workgroup
|
||||
|
||||
} // namespace internal
|
||||
|
||||
} // namespace cooperative_groups
|
||||
|
||||
#endif // __cplusplus
|
||||
#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
|
|
@ -72,7 +72,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(_config.gemm_algos[0]));
|
||||
#else
|
||||
cublasGemmAlgo_t(_config.gemm_algos[0]));
|
||||
#endif
|
||||
}
|
||||
|
||||
void ForwardPlusSave(T* output, const T* _buffer_a, const T* _buffer_b, cublasHandle_t handle)
|
||||
|
@ -96,7 +100,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
_config.batch_size,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(_config.gemm_algos[0]));
|
||||
#else
|
||||
cublasGemmAlgo_t(_config.gemm_algos[0]));
|
||||
#endif
|
||||
|
||||
k_buf = _buffer_a;
|
||||
q_buf = _buffer_b;
|
||||
|
@ -136,7 +144,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(_config.gemm_algos[1]));
|
||||
#else
|
||||
cublasGemmAlgo_t(_config.gemm_algos[1]));
|
||||
#endif
|
||||
|
||||
// A need to transpose.
|
||||
cublasOperation_t op_a = (_config.op_A == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
|
||||
|
@ -161,7 +173,11 @@ public:
|
|||
stride_b,
|
||||
stride_c,
|
||||
bsz,
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_gemm_algo(_config.gemm_algos[2]));
|
||||
#else
|
||||
cublasGemmAlgo_t(_config.gemm_algos[2]));
|
||||
#endif
|
||||
}
|
||||
|
||||
inline int GetN() const { return _config.k; }
|
||||
|
|
|
@ -14,7 +14,11 @@
|
|||
#include <iostream>
|
||||
|
||||
//#include <helper_functions.h>
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#else
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <stdio.h>
|
||||
|
||||
|
@ -78,7 +82,11 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
|
|||
T a_sum = s_a[tid];
|
||||
T b_sum = s_b[tid];
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
// do reduction in shared mem
|
||||
if ((blockSize >= 512) && (tid < 256)) {
|
||||
|
@ -86,21 +94,33 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
|
|||
s_b[tid] = b_sum = b_sum + s_b[tid + 256];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 256) && (tid < 128)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 128];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 128];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 128) && (tid < 64)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 64];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 64];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
#if (__CUDA_ARCH__ >= 300)
|
||||
if (tid < 32) {
|
||||
|
@ -124,42 +144,66 @@ __device__ void reduce_block_in_shared_memory(T* s_a, T* s_b, T* g_a, T* g_b)
|
|||
s_b[tid] = b_sum = b_sum + s_b[tid + 32];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 32) && (tid < 16)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 16];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 16];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 16) && (tid < 8)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 8];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 8];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 8) && (tid < 4)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 4];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 4];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 4) && (tid < 2)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 2];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 2];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
if ((blockSize >= 2) && (tid < 1)) {
|
||||
s_a[tid] = a_sum = a_sum + s_a[tid + 1];
|
||||
s_b[tid] = b_sum = b_sum + s_b[tid + 1];
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) && HIP_VERSION > 305
|
||||
cta.sync();
|
||||
#else
|
||||
cg::sync(cta);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -464,7 +508,7 @@ void fused_lamb_cuda(at::Tensor& p,
|
|||
lamb_coeff.data<scalar_t>());
|
||||
}));
|
||||
}
|
||||
THCudaCheck(cudaGetLastError());
|
||||
C10_CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
// template __device__ void reduce_two_vectors_in_register<float,512>(float a, float b, float* g_a,
|
||||
|
|
|
@ -1,5 +1,19 @@
|
|||
#include "cublas_wrappers.h"
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
int cublas_gemm_ex(rocblas_handle handle,
|
||||
rocblas_operation transa,
|
||||
rocblas_operation transb,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float* alpha,
|
||||
const float* beta,
|
||||
const float* A,
|
||||
const float* B,
|
||||
float* C,
|
||||
rocblas_gemm_algo algo)
|
||||
#else
|
||||
int cublas_gemm_ex(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
cublasOperation_t transb,
|
||||
|
@ -12,7 +26,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
const float* B,
|
||||
float* C,
|
||||
cublasGemmAlgo_t algo)
|
||||
#endif
|
||||
{
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_status status = rocblas_gemm_ex(handle,
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
(const void*)alpha,
|
||||
(const void*)A,
|
||||
rocblas_datatype_f32_r,
|
||||
(transa == rocblas_operation_none) ? m : k,
|
||||
(const void*)B,
|
||||
rocblas_datatype_f32_r,
|
||||
(transb == rocblas_operation_none) ? k : n,
|
||||
(const void*)beta,
|
||||
C,
|
||||
rocblas_datatype_f32_r,
|
||||
m,
|
||||
C,
|
||||
rocblas_datatype_f32_r,
|
||||
m,
|
||||
rocblas_datatype_f32_r,
|
||||
algo,
|
||||
0,
|
||||
0);
|
||||
#else
|
||||
cublasStatus_t status = cublasGemmEx(handle,
|
||||
transa,
|
||||
transb,
|
||||
|
@ -32,8 +73,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
m,
|
||||
CUDA_R_32F,
|
||||
algo);
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (status != rocblas_status_success) {
|
||||
#else
|
||||
if (status != CUBLAS_STATUS_SUCCESS) {
|
||||
#endif
|
||||
fprintf(stderr,
|
||||
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
|
||||
m,
|
||||
|
@ -45,6 +91,20 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
int cublas_gemm_ex(rocblas_handle handle,
|
||||
rocblas_operation transa,
|
||||
rocblas_operation transb,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float* alpha,
|
||||
const float* beta,
|
||||
const __half* A,
|
||||
const __half* B,
|
||||
__half* C,
|
||||
rocblas_gemm_algo algo)
|
||||
#else
|
||||
int cublas_gemm_ex(cublasHandle_t handle,
|
||||
cublasOperation_t transa,
|
||||
cublasOperation_t transb,
|
||||
|
@ -57,7 +117,34 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
const __half* B,
|
||||
__half* C,
|
||||
cublasGemmAlgo_t algo)
|
||||
#endif
|
||||
{
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_status status = rocblas_gemm_ex(handle,
|
||||
transa,
|
||||
transb,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
(const void*)alpha,
|
||||
(const void*)A,
|
||||
rocblas_datatype_f16_r,
|
||||
(transa == rocblas_operation_none) ? m : k,
|
||||
(const void*)B,
|
||||
rocblas_datatype_f16_r,
|
||||
(transb == rocblas_operation_none) ? k : n,
|
||||
(const void*)beta,
|
||||
(void*)C,
|
||||
rocblas_datatype_f16_r,
|
||||
m,
|
||||
(void*)C,
|
||||
rocblas_datatype_f16_r,
|
||||
m,
|
||||
rocblas_datatype_f32_r,
|
||||
algo,
|
||||
0,
|
||||
0);
|
||||
#else
|
||||
cublasStatus_t status = cublasGemmEx(handle,
|
||||
transa,
|
||||
transb,
|
||||
|
@ -77,8 +164,13 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
m,
|
||||
CUDA_R_32F,
|
||||
algo);
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (status != rocblas_status_success) {
|
||||
#else
|
||||
if (status != CUBLAS_STATUS_SUCCESS) {
|
||||
#endif
|
||||
fprintf(stderr,
|
||||
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
|
||||
m,
|
||||
|
@ -90,6 +182,24 @@ int cublas_gemm_ex(cublasHandle_t handle,
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
int cublas_strided_batched_gemm(rocblas_handle handle,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float* alpha,
|
||||
const float* beta,
|
||||
const float* A,
|
||||
const float* B,
|
||||
float* C,
|
||||
rocblas_operation op_A,
|
||||
rocblas_operation op_B,
|
||||
int stride_A,
|
||||
int stride_B,
|
||||
int stride_C,
|
||||
int batch,
|
||||
rocblas_gemm_algo algo)
|
||||
#else
|
||||
int cublas_strided_batched_gemm(cublasHandle_t handle,
|
||||
int m,
|
||||
int n,
|
||||
|
@ -106,7 +216,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
int stride_C,
|
||||
int batch,
|
||||
cublasGemmAlgo_t algo)
|
||||
#endif
|
||||
{
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_status status =
|
||||
rocblas_gemm_strided_batched_ex(handle,
|
||||
op_A,
|
||||
op_B,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
alpha,
|
||||
A,
|
||||
rocblas_datatype_f32_r,
|
||||
(op_A == rocblas_operation_none) ? m : k,
|
||||
stride_A,
|
||||
B,
|
||||
rocblas_datatype_f32_r,
|
||||
(op_B == rocblas_operation_none) ? k : n,
|
||||
stride_B,
|
||||
beta,
|
||||
C,
|
||||
rocblas_datatype_f32_r,
|
||||
m,
|
||||
stride_C,
|
||||
C,
|
||||
rocblas_datatype_f32_r,
|
||||
m,
|
||||
stride_C,
|
||||
batch,
|
||||
rocblas_datatype_f32_r,
|
||||
algo,
|
||||
0,
|
||||
0);
|
||||
#else
|
||||
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
|
||||
op_A,
|
||||
op_B,
|
||||
|
@ -130,8 +273,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
batch,
|
||||
CUDA_R_32F,
|
||||
algo);
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (status != rocblas_status_success) {
|
||||
#else
|
||||
if (status != CUBLAS_STATUS_SUCCESS) {
|
||||
#endif
|
||||
fprintf(stderr,
|
||||
"!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d) \n",
|
||||
batch,
|
||||
|
@ -144,6 +292,24 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
int cublas_strided_batched_gemm(rocblas_handle handle,
|
||||
int m,
|
||||
int n,
|
||||
int k,
|
||||
const float* alpha,
|
||||
const float* beta,
|
||||
const __half* A,
|
||||
const __half* B,
|
||||
__half* C,
|
||||
rocblas_operation op_A,
|
||||
rocblas_operation op_B,
|
||||
int stride_A,
|
||||
int stride_B,
|
||||
int stride_C,
|
||||
int batch,
|
||||
rocblas_gemm_algo algo)
|
||||
#else
|
||||
int cublas_strided_batched_gemm(cublasHandle_t handle,
|
||||
int m,
|
||||
int n,
|
||||
|
@ -160,7 +326,40 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
int stride_C,
|
||||
int batch,
|
||||
cublasGemmAlgo_t algo)
|
||||
#endif
|
||||
{
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
rocblas_status status =
|
||||
rocblas_gemm_strided_batched_ex(handle,
|
||||
op_A,
|
||||
op_B,
|
||||
m,
|
||||
n,
|
||||
k,
|
||||
alpha,
|
||||
A,
|
||||
rocblas_datatype_f16_r,
|
||||
(op_A == rocblas_operation_none) ? m : k,
|
||||
stride_A,
|
||||
B,
|
||||
rocblas_datatype_f16_r,
|
||||
(op_B == rocblas_operation_none) ? k : n,
|
||||
stride_B,
|
||||
beta,
|
||||
C,
|
||||
rocblas_datatype_f16_r,
|
||||
m,
|
||||
stride_C,
|
||||
C,
|
||||
rocblas_datatype_f16_r,
|
||||
m,
|
||||
stride_C,
|
||||
batch,
|
||||
rocblas_datatype_f32_r,
|
||||
algo,
|
||||
0,
|
||||
0);
|
||||
#else
|
||||
cublasStatus_t status = cublasGemmStridedBatchedEx(handle,
|
||||
op_A,
|
||||
op_B,
|
||||
|
@ -184,8 +383,13 @@ int cublas_strided_batched_gemm(cublasHandle_t handle,
|
|||
batch,
|
||||
CUDA_R_32F,
|
||||
algo);
|
||||
#endif
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
if (status != rocblas_status_success) {
|
||||
#else
|
||||
if (status != CUBLAS_STATUS_SUCCESS) {
|
||||
#endif
|
||||
fprintf(stderr,
|
||||
"!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d) \n",
|
||||
m,
|
||||
|
|
|
@ -140,7 +140,9 @@ BertTransformerLayer<T>::~BertTransformerLayer()
|
|||
template <typename T>
|
||||
void BertTransformerLayer<T>::Initialize()
|
||||
{
|
||||
#ifndef __HIP_PLATFORM_HCC__
|
||||
if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
|
|
@ -60,7 +60,7 @@ __global__ void gelu_kernel(const float* input, float* vals, int row_stride, int
|
|||
|
||||
__global__ void gelu_kernel(const __half* input, __half* vals, int row_stride, int iterations)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
int loop_stride = blockDim.x;
|
||||
|
@ -131,7 +131,7 @@ __global__ void fused_bias_gelu(const __half* input,
|
|||
int row_stride,
|
||||
int iterations)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
int loop_stride = blockDim.x;
|
||||
|
@ -214,7 +214,7 @@ __global__ void d_gelu_func(__half* d_output,
|
|||
int row_stride,
|
||||
int iterations)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
int loop_stride = blockDim.x;
|
||||
|
|
|
@ -11,8 +11,12 @@ __global__ void column_sum_reduce(const T* __restrict__ inp,
|
|||
__shared__ float tile[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
|
||||
g.tiled_partition(b, TILE_DIM);
|
||||
#else
|
||||
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
|
||||
|
||||
#endif
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
|
||||
int y_stride = width * TILE_DIM;
|
||||
|
|
|
@ -46,7 +46,7 @@ __global__ void dequantize_kernel(__half* output,
|
|||
unsigned groups,
|
||||
unsigned merge_count)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
unsigned merge_hidden = hidden_dim >> merge_count;
|
||||
unsigned quantization_stride = (merge_hidden * output_size) / groups;
|
||||
|
|
|
@ -42,7 +42,7 @@ __global__ void fused_bias_gelu(__half* input,
|
|||
int total_count,
|
||||
int intermediate_size)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
float2* input_cast = reinterpret_cast<float2*>(input);
|
||||
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
|
||||
|
@ -120,7 +120,7 @@ __global__ void fused_bias_add(float* input, const float* bias, int total_count,
|
|||
|
||||
__global__ void fused_bias_add(__half* input, const __half* bias, int total_count, int hidden_size)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
float2* input_cast = reinterpret_cast<float2*>(input);
|
||||
const float2* bias_cast = reinterpret_cast<const float2*>(bias);
|
||||
|
@ -206,7 +206,7 @@ __global__ void fused_bias_residual(__half* input,
|
|||
int intermediate_size,
|
||||
bool add_bias)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
float2* input_cast = reinterpret_cast<float2*>(input);
|
||||
const float2* residual_cast = reinterpret_cast<const float2*>(residual);
|
||||
|
|
|
@ -85,7 +85,7 @@ __global__ void fused_bias_residual_layer_norm(__half* output,
|
|||
float epsilon,
|
||||
int row_stride)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int iteration_stride = blockDim.x;
|
||||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
|
@ -287,7 +287,7 @@ __global__ void fused_residual_layer_norm(__half* norm,
|
|||
int row_stride,
|
||||
bool preLN)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int iteration_stride = blockDim.x;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
|
|
|
@ -38,7 +38,7 @@ __global__ void attn_softmax_v2(__half* vals,
|
|||
int iterations,
|
||||
int reduceWidth)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
|
|
|
@ -1,6 +1,15 @@
|
|||
#pragma once
|
||||
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
#define HALF_PRECISION_AVAILABLE = 1
|
||||
#include <hip/hip_cooperative_groups.h>
|
||||
#else
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#define HALF_PRECISION_AVAILABLE = 1
|
||||
#endif
|
||||
#include <cooperative_groups.h>
|
||||
#endif
|
||||
|
||||
#include <cuda.h>
|
||||
#include <cuda_fp16.h>
|
||||
#include <stdio.h>
|
||||
|
|
|
@ -28,8 +28,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
|
||||
#endif
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
int gid = id / WARP_SIZE;
|
||||
|
@ -125,12 +129,17 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
|
|||
__half* means,
|
||||
int row_stride)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
int iteration_stride = blockDim.x;
|
||||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
|
||||
g.tiled_partition(b, 32);
|
||||
#else
|
||||
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -322,7 +331,12 @@ __global__ void fused_bias_residual_layer_norm(float* vals,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
|
||||
g.tiled_partition(b, 32);
|
||||
#else
|
||||
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -416,13 +430,18 @@ __global__ void fused_bias_residual_layer_norm(__half* vals,
|
|||
__half* vars,
|
||||
int row_stride)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
int iteration_stride = blockDim.x;
|
||||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, 32);
|
||||
g.tiled_partition(b, 32);
|
||||
#else
|
||||
cg::thread_block_tile<32> g = cg::tiled_partition<32>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -634,7 +653,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
|
|||
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
|
||||
g.tiled_partition(b, TILE_DIM);
|
||||
#else
|
||||
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
|
||||
#endif
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int offset = threadIdx.y * width + idx;
|
||||
|
@ -701,7 +725,12 @@ __global__ void LayerNormBackward1(const T* __restrict__ out_grad,
|
|||
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
|
||||
g.tiled_partition(b, TILE_DIM);
|
||||
#else
|
||||
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
|
||||
#endif
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int offset = threadIdx.y * width + idx;
|
||||
|
@ -766,7 +795,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -866,7 +900,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1081,7 +1120,12 @@ __global__ void LayerNormBackward2(const float* out_grad,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1176,7 +1220,12 @@ __global__ void LayerNormBackward2(const __half* out_grad,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1380,7 +1429,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
|
|||
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
|
||||
g.tiled_partition(b, TILE_DIM);
|
||||
#else
|
||||
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
|
||||
#endif
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int offset = threadIdx.y * width + idx;
|
||||
|
@ -1442,7 +1496,12 @@ __global__ void LayerNormBackward1_fused_add(const T* __restrict__ out_grad1,
|
|||
__shared__ float gamma_buffer[TILE_DIM][TILE_DIM + 1];
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, TILE_DIM);
|
||||
g.tiled_partition(b, TILE_DIM);
|
||||
#else
|
||||
cg::thread_block_tile<TILE_DIM> g = cg::tiled_partition<TILE_DIM>(b);
|
||||
#endif
|
||||
|
||||
int idx = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
int offset = threadIdx.y * width + idx;
|
||||
|
@ -1501,7 +1560,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1605,7 +1669,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1823,7 +1892,12 @@ __global__ void LayerNormBackward2_fused_add(const float* out_grad1,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -1926,7 +2000,12 @@ __global__ void LayerNormBackward2_fused_add(const __half* out_grad1,
|
|||
int iterations = row_stride / iteration_stride;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
|
|
@ -34,7 +34,12 @@ __global__ void attn_softmax(float* vals,
|
|||
int block_width = blockStride * seq_length;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
|
||||
g.tiled_partition(b, tbSize);
|
||||
#else
|
||||
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
|
||||
#endif
|
||||
|
||||
int batch = blockIdx.y;
|
||||
int row = blockIdx.x;
|
||||
|
@ -156,7 +161,7 @@ __global__ void attn_softmax(__half* vals,
|
|||
int seq_length,
|
||||
int iterations)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
__shared__ float partialSum[MAX_WARP_NUM];
|
||||
|
||||
int warp_num = blockDim.x >> WARP_SIZE_BITS;
|
||||
|
@ -165,7 +170,12 @@ __global__ void attn_softmax(__half* vals,
|
|||
int block_width = blockStride * seq_length;
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
|
||||
g.tiled_partition(b, tbSize);
|
||||
#else
|
||||
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
|
||||
#endif
|
||||
|
||||
int batch = blockIdx.y;
|
||||
int row = blockIdx.x;
|
||||
|
@ -449,7 +459,12 @@ __global__ void softmax_backward_kernel(T* out_grad, const T* soft_inp, int seq_
|
|||
: MAX_THREAD_ITERATIONS);
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, tbSize);
|
||||
g.tiled_partition(b, tbSize);
|
||||
#else
|
||||
cg::thread_block_tile<tbSize> g = cg::tiled_partition<tbSize>(b);
|
||||
#endif
|
||||
|
||||
int row = blockIdx.x;
|
||||
int id = threadIdx.x;
|
||||
|
@ -524,7 +539,12 @@ __global__ void softmax_backward_kernel_v2(T* grad /* input & output*/,
|
|||
}
|
||||
|
||||
cg::thread_block b = cg::this_thread_block();
|
||||
#ifdef __HIP_PLATFORM_HCC__
|
||||
cg::thread_group g(cg::internal::cg_coalesced_tile, WARP_SIZE);
|
||||
g.tiled_partition(b, WARP_SIZE);
|
||||
#else
|
||||
cg::thread_block_tile<WARP_SIZE> g = cg::tiled_partition<WARP_SIZE>(b);
|
||||
#endif
|
||||
|
||||
for (int i = 1; i < WARP_SIZE; i <<= 1) sum += g.shfl_xor(sum, i);
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@ __global__ void transform_0213<__half>(__half* output,
|
|||
int heads,
|
||||
int head_ext)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
int d0_stride = hidden_dim * seq_length;
|
||||
int d1_stride = hidden_dim;
|
||||
|
@ -219,7 +219,7 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
|
|||
int heads,
|
||||
int head_ext)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
int d0_stride = hidden_dim * seq_length;
|
||||
int d1_stride = hidden_dim;
|
||||
|
@ -289,7 +289,7 @@ __global__ void bias_add_transform_0213_v2(__half* output,
|
|||
int seq_length,
|
||||
int heads)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
__shared__ float4 in_data[3072];
|
||||
|
||||
int d0_stride = hidden_dim * seq_length;
|
||||
|
@ -451,7 +451,7 @@ __global__ void transform4d_0213<__half>(__half* out,
|
|||
int hidden_dim,
|
||||
int head_ext)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
|
||||
int d0_stride = hidden_dim * (seq_length / head_ext);
|
||||
int d1_stride = hidden_dim;
|
||||
|
@ -487,7 +487,7 @@ __global__ void transform4d_0213_v2(__half* out,
|
|||
int seq_length,
|
||||
int hidden_dim)
|
||||
{
|
||||
#if __CUDA_ARCH__ >= 700
|
||||
#ifdef HALF_PRECISION_AVAILABLE
|
||||
__shared__ float4 in_data[3072];
|
||||
|
||||
int d0_stride = hidden_dim * seq_length;
|
||||
|
|
|
@ -79,6 +79,11 @@ def nvcc_version():
|
|||
|
||||
def debug_report():
|
||||
max_dots = 33
|
||||
|
||||
hip_version = 'unknown'
|
||||
if hasattr(torch.version, 'hip'):
|
||||
hip_version = torch.version.hip
|
||||
|
||||
report = [
|
||||
("torch install path",
|
||||
torch.__path__),
|
||||
|
@ -86,6 +91,8 @@ def debug_report():
|
|||
torch.__version__),
|
||||
("torch cuda version",
|
||||
torch.version.cuda),
|
||||
("torch hip version",
|
||||
hip_version),
|
||||
("nvcc version",
|
||||
nvcc_version()),
|
||||
("deepspeed install path",
|
||||
|
@ -94,7 +101,8 @@ def debug_report():
|
|||
f"{deepspeed.__version__}, {deepspeed.__git_hash__}, {deepspeed.__git_branch__}"
|
||||
),
|
||||
("deepspeed wheel compiled w.",
|
||||
f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}"),
|
||||
f"torch {torch_info['version']}, cuda {torch_info['cuda_version']}, hip {torch_info['hip_version']}"
|
||||
),
|
||||
]
|
||||
print("DeepSpeed general environment info:")
|
||||
for name, value in report:
|
||||
|
|
|
@ -14,4 +14,4 @@ except ModuleNotFoundError:
|
|||
from .ops.op_builder import ALL_OPS
|
||||
installed_ops = dict.fromkeys(ALL_OPS.keys(), False)
|
||||
compatible_ops = dict.fromkeys(ALL_OPS.keys(), False)
|
||||
torch_info = {'version': "0.0", "cuda_version": "0.0"}
|
||||
torch_info = {'version': "0.0", "cuda_version": "0.0", "hip_version": "0.0"}
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
from . import adam
|
||||
from . import adagrad
|
||||
from . import lamb
|
||||
#from ..git_version_info_installed import installed_ops as __installed_ops__
|
||||
#if __installed_ops__['sparse_attn']:
|
||||
from . import sparse_attention
|
||||
from . import transformer
|
||||
|
||||
|
|
|
@ -415,27 +415,6 @@ class DeepSpeedEngine(Module):
|
|||
"""
|
||||
return self._global_grad_norm
|
||||
|
||||
def set_train_batch_size(self, train_batch_size):
|
||||
"""Adjust the global batch size by increasing or decreasing the number of
|
||||
micro-batches (i.e., gradient accumulation steps). The size of each micro-batch
|
||||
(i.e., ``train_micro_batch_size_per_gpu``) is not changed.
|
||||
Args:
|
||||
train_batch_size (int): The new global batch size for training.
|
||||
Raises:
|
||||
ValueError: if ``train_batch_size`` is not divisible by the
|
||||
configured micro-batch size and data parallelism.
|
||||
"""
|
||||
if train_batch_size % (self.train_micro_batch_size_per_gpu() *
|
||||
self.dp_world_size) != 0:
|
||||
#print(f'{train_batch_size=} {self.train_micro_batch_size_per_gpu()=} {self.dp_world_size=}')
|
||||
raise ValueError(
|
||||
f'Train batch size must be divisible by micro-batch data parallelism')
|
||||
new_gas = train_batch_size // (self.train_micro_batch_size_per_gpu() *
|
||||
self.dp_world_size)
|
||||
# overwrite config
|
||||
self._config.train_batch_size = train_batch_size
|
||||
self._config.gradient_accumulation_steps = new_gas
|
||||
|
||||
def get_global_grad_norm(self) -> float:
|
||||
"""Return the 2-norm of all gradients. If there is model parallelism,
|
||||
the norm will be global.
|
||||
|
@ -2839,10 +2818,9 @@ class DeepSpeedEngine(Module):
|
|||
self.optimizer.state_dict()
|
||||
if self.optimizer and not self.zero_optimization() else None
|
||||
}
|
||||
torch.save(optimizer_state,
|
||||
self._get_optimizer_ckpt_name(save_dir,
|
||||
tag,
|
||||
expp_rank))
|
||||
with open(self._get_optimizer_ckpt_name(save_dir, tag, expp_rank), 'wb') as fd:
|
||||
torch.save(optimizer_state, fd)
|
||||
fd.flush()
|
||||
|
||||
# get non-moe parameters
|
||||
model_state_dict = self._get_non_moe_state_dict(self.module_state_dict())
|
||||
|
@ -2872,7 +2850,9 @@ class DeepSpeedEngine(Module):
|
|||
}
|
||||
state.update(client_state)
|
||||
logger.info(f'Saving model checkpoint: {save_path}')
|
||||
torch.save(state, save_path)
|
||||
with open(save_path, 'wb') as fd:
|
||||
torch.save(state, fd)
|
||||
fd.flush()
|
||||
self._curr_save_path = None
|
||||
|
||||
def _create_checkpoint_file(self, save_dir, tag, zero_checkpoint):
|
||||
|
@ -3006,7 +2986,9 @@ class DeepSpeedEngine(Module):
|
|||
zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(),
|
||||
ds_config=self.config,
|
||||
ds_version=version)
|
||||
torch.save(zero_sd, zero_checkpoint_name)
|
||||
with open(zero_checkpoint_name, 'wb') as fd:
|
||||
torch.save(zero_sd, fd)
|
||||
fd.flush()
|
||||
if self.global_rank == 0:
|
||||
self._copy_recovery_script(save_path)
|
||||
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
|
||||
|
|
|
@ -0,0 +1,186 @@
|
|||
FROM rocm/pytorch:latest
|
||||
|
||||
|
||||
##############################################################################
|
||||
# Temporary Installation Directory
|
||||
##############################################################################
|
||||
ENV STAGE_DIR=/tmp
|
||||
RUN mkdir -p ${STAGE_DIR}
|
||||
|
||||
##############################################################################
|
||||
# Installation/Basic Utilities
|
||||
##############################################################################
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
software-properties-common build-essential autotools-dev \
|
||||
nfs-common pdsh \
|
||||
cmake g++ gcc \
|
||||
curl wget vim tmux emacs less unzip \
|
||||
htop iftop iotop ca-certificates openssh-client openssh-server \
|
||||
rsync iputils-ping net-tools sudo \
|
||||
llvm-9-dev
|
||||
|
||||
##############################################################################
|
||||
# Installation Latest Git
|
||||
##############################################################################
|
||||
RUN add-apt-repository ppa:git-core/ppa -y && \
|
||||
apt-get update && \
|
||||
apt-get install -y git && \
|
||||
git --version
|
||||
|
||||
##############################################################################
|
||||
# Client Liveness & Uncomment Port 22 for SSH Daemon
|
||||
##############################################################################
|
||||
# Keep SSH client alive from server side
|
||||
RUN echo "ClientAliveInterval 30" >> /etc/ssh/sshd_config
|
||||
RUN cp /etc/ssh/sshd_config ${STAGE_DIR}/sshd_config && \
|
||||
sed "0,/^#Port 22/s//Port 22/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
|
||||
##############################################################################
|
||||
# Mellanox OFED
|
||||
##############################################################################
|
||||
#ENV MLNX_OFED_VERSION=4.6-1.0.1.1
|
||||
#RUN apt-get install -y libnuma-dev
|
||||
#RUN cd ${STAGE_DIR} && \
|
||||
# wget -q -O - http://www.mellanox.com/downloads/ofed/MLNX_OFED-${MLNX_OFED_VERSION}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64.tgz | tar xzf - && \
|
||||
# cd MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64 && \
|
||||
# ./mlnxofedinstall --user-space-only --without-fw-update --all -q && \
|
||||
# cd ${STAGE_DIR} && \
|
||||
# rm -rf ${STAGE_DIR}/MLNX_OFED_LINUX-${MLNX_OFED_VERSION}-ubuntu18.04-x86_64*
|
||||
|
||||
##############################################################################
|
||||
# OPENMPI
|
||||
##############################################################################
|
||||
#ENV OPENMPI_BASEVERSION=4.0
|
||||
#ENV OPENMPI_VERSION=${OPENMPI_BASEVERSION}.1
|
||||
#RUN cd ${STAGE_DIR} && \
|
||||
# wget -q -O - https://download.open-mpi.org/release/open-mpi/v${OPENMPI_BASEVERSION}/openmpi-${OPENMPI_VERSION}.tar.gz | tar xzf - && \
|
||||
# cd openmpi-${OPENMPI_VERSION} && \
|
||||
# ./configure --prefix=/usr/local/openmpi-${OPENMPI_VERSION} && \
|
||||
# make -j"$(nproc)" install && \
|
||||
# ln -s /usr/local/openmpi-${OPENMPI_VERSION} /usr/local/mpi && \
|
||||
# # Sanity check:
|
||||
# test -f /usr/local/mpi/bin/mpic++ && \
|
||||
# cd ${STAGE_DIR} && \
|
||||
# rm -r ${STAGE_DIR}/openmpi-${OPENMPI_VERSION}
|
||||
#ENV PATH=/usr/local/mpi/bin:${PATH} \
|
||||
# LD_LIBRARY_PATH=/usr/local/lib:/usr/local/mpi/lib:/usr/local/mpi/lib64:${LD_LIBRARY_PATH}
|
||||
## Create a wrapper for OpenMPI to allow running as root by default
|
||||
#RUN mv /usr/local/mpi/bin/mpirun /usr/local/mpi/bin/mpirun.real && \
|
||||
# echo '#!/bin/bash' > /usr/local/mpi/bin/mpirun && \
|
||||
# echo 'mpirun.real --allow-run-as-root --prefix /usr/local/mpi "$@"' >> /usr/local/mpi/bin/mpirun && \
|
||||
# chmod a+x /usr/local/mpi/bin/mpirun
|
||||
|
||||
##############################################################################
|
||||
# Python
|
||||
##############################################################################
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV PYTHON_VERSION=3.6
|
||||
RUN apt-get install -y python3.6 python3.6-dev && \
|
||||
rm -f /usr/bin/python && \
|
||||
ln -s /usr/bin/python3.6 /usr/bin/python && \
|
||||
curl -O https://bootstrap.pypa.io/get-pip.py && \
|
||||
python get-pip.py && \
|
||||
rm get-pip.py && \
|
||||
pip install --upgrade pip && \
|
||||
# Print python an pip version
|
||||
python -V && pip -V
|
||||
RUN pip install pyyaml
|
||||
RUN pip install ipython
|
||||
|
||||
##############################################################################
|
||||
# TensorFlow
|
||||
##############################################################################
|
||||
RUN pip install tensorflow-rocm
|
||||
|
||||
##############################################################################
|
||||
# Some Packages
|
||||
##############################################################################
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libsndfile-dev \
|
||||
libjpeg-dev \
|
||||
libpng-dev \
|
||||
screen
|
||||
RUN pip install psutil \
|
||||
yappi \
|
||||
cffi \
|
||||
ipdb \
|
||||
pandas \
|
||||
matplotlib \
|
||||
py3nvml \
|
||||
pyarrow \
|
||||
graphviz \
|
||||
astor \
|
||||
boto3 \
|
||||
tqdm \
|
||||
sentencepiece \
|
||||
msgpack \
|
||||
requests \
|
||||
pandas \
|
||||
sphinx \
|
||||
sphinx_rtd_theme \
|
||||
scipy \
|
||||
numpy \
|
||||
sklearn \
|
||||
scikit-learn \
|
||||
mpi4py \
|
||||
h5py
|
||||
|
||||
##############################################################################
|
||||
## SSH daemon port inside container cannot conflict with host OS port
|
||||
###############################################################################
|
||||
ENV SSH_PORT=2222
|
||||
RUN cat /etc/ssh/sshd_config > ${STAGE_DIR}/sshd_config && \
|
||||
sed "0,/^#Port 22/s//Port ${SSH_PORT}/" ${STAGE_DIR}/sshd_config > /etc/ssh/sshd_config
|
||||
|
||||
##############################################################################
|
||||
# PyTorch
|
||||
##############################################################################
|
||||
#ENV PYTORCH_VERSION=1.2.0
|
||||
#ENV TORCHVISION_VERSION=0.4.0
|
||||
#ENV TENSORBOARDX_VERSION=1.8
|
||||
#RUN pip install torch==${PYTORCH_VERSION}
|
||||
#RUN pip install torchvision==${TORCHVISION_VERSION}
|
||||
#RUN pip install tensorboardX==${TENSORBOARDX_VERSION}
|
||||
|
||||
##############################################################################
|
||||
# PyYAML build issue
|
||||
# https://stackoverflow.com/a/53926898
|
||||
##############################################################################
|
||||
RUN rm -rf /usr/lib/python3/dist-packages/yaml && \
|
||||
rm -rf /usr/lib/python3/dist-packages/PyYAML-*
|
||||
|
||||
##############################################################################
|
||||
## CuPy installation
|
||||
###############################################################################
|
||||
RUN git clone https://github.com/ROCmSoftwarePlatform/cupy ${STAGE_DIR}/cupy
|
||||
RUN cd ${STAGE_DIR}/cupy && \
|
||||
git submodule update --init && \
|
||||
CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm pip install -e . --no-cache-dir -vvvv
|
||||
RUN rm -rf ${STAGE_DIR}/cupy
|
||||
|
||||
##############################################################################
|
||||
## Add deepspeed user
|
||||
###############################################################################
|
||||
# Add a deepspeed user with user id 8877
|
||||
#RUN useradd --create-home --uid 8877 deepspeed
|
||||
#RUN useradd --create-home --uid 1000 --shell /bin/bash deepspeed
|
||||
#RUN usermod -aG sudo deepspeed
|
||||
#RUN echo "deepspeed ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers
|
||||
# # Change to non-root privilege
|
||||
#USER deepspeed
|
||||
|
||||
##############################################################################
|
||||
# DeepSpeed
|
||||
##############################################################################
|
||||
RUN git clone https://github.com/ROCmSoftwarePlatform/DeepSpeed.git ${STAGE_DIR}/DeepSpeed
|
||||
RUN cd ${STAGE_DIR}/DeepSpeed && \
|
||||
git checkout . && \
|
||||
git checkout master && \
|
||||
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups.h && \
|
||||
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups.h /opt/rocm/include/hip/hcc_detail/amd_hip_cooperative_groups.h && \
|
||||
cp -a csrc/includes/patch/hip/hcc_detail/hip_cooperative_groups_helper.h /opt/rocm/include/hip/hcc_detail/hip_cooperative_groups_helper.h && \
|
||||
DS_BUILD_FUSED_ADAM=1 DS_BUILD_FUSED_LAMB=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_TRANSFORMER=1 DS_BUILD_STOCHASTIC_TRANSFORMER=1 DS_BUILD_UTILS=1 ./install.sh --allow_sudo
|
||||
RUN rm -rf ${STAGE_DIR}/DeepSpeed
|
||||
RUN cd ~ && python -c "import deepspeed; print(deepspeed.__version__)"
|
|
@ -156,7 +156,7 @@ python setup.py $VERBOSE bdist_wheel
|
|||
|
||||
if [ "$local_only" == "1" ]; then
|
||||
echo "Installing deepspeed"
|
||||
$PIP_SUDO pip uninstall -y deepspeed
|
||||
# $PIP_SUDO pip uninstall -y deepspeed
|
||||
$PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
|
||||
ds_report
|
||||
else
|
||||
|
|
|
@ -10,9 +10,9 @@ from .transformer import TransformerBuilder
|
|||
from .stochastic_transformer import StochasticTransformerBuilder
|
||||
from .utils import UtilsBuilder
|
||||
from .async_io import AsyncIOBuilder
|
||||
from .builder import get_default_compute_capabilities
|
||||
from .transformer_inference import InferenceBuilder
|
||||
from .quantizer import QuantizerBuilder
|
||||
from .builder import get_default_compute_capabilities, OpBuilder
|
||||
|
||||
# TODO: infer this list instead of hard coded
|
||||
# List of all available ops
|
||||
|
|
|
@ -31,6 +31,9 @@ except ImportError:
|
|||
f"{WARNING} unable to import torch, please install it if you want to pre-compile any deepspeed ops."
|
||||
)
|
||||
|
||||
TORCH_MAJOR = int(torch.__version__.split('.')[0])
|
||||
TORCH_MINOR = int(torch.__version__.split('.')[1])
|
||||
|
||||
|
||||
def installed_cuda_version():
|
||||
import torch.utils.cpp_extension
|
||||
|
@ -100,23 +103,10 @@ def assert_no_cuda_mismatch():
|
|||
"cuda/cpp extensions without a matching cuda version.")
|
||||
|
||||
|
||||
def assert_torch_info(torch_info):
|
||||
install_torch_version = torch_info['version']
|
||||
install_cuda_version = torch_info['cuda_version']
|
||||
|
||||
current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
|
||||
current_torch_version = ".".join(torch.__version__.split('.')[:2])
|
||||
|
||||
if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
|
||||
raise RuntimeError(
|
||||
"PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
|
||||
"with a different version than what is being used at runtime. Please re-install "
|
||||
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
|
||||
f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
|
||||
f"torch={current_torch_version}, cuda={current_cuda_version}")
|
||||
|
||||
|
||||
class OpBuilder(ABC):
|
||||
_rocm_version = None
|
||||
_is_rocm_pytorch = None
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.jit_mode = False
|
||||
|
@ -136,6 +126,67 @@ class OpBuilder(ABC):
|
|||
'''
|
||||
pass
|
||||
|
||||
@staticmethod
|
||||
def assert_torch_info(torch_info):
|
||||
install_torch_version = torch_info['version']
|
||||
install_cuda_version = torch_info['cuda_version']
|
||||
install_hip_version = torch_info['hip_version']
|
||||
|
||||
if not OpBuilder.is_rocm_pytorch():
|
||||
current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
|
||||
else:
|
||||
current_hip_version = ".".join(torch.version.hip.split('.')[:2])
|
||||
|
||||
current_torch_version = ".".join(torch.__version__.split('.')[:2])
|
||||
|
||||
if not OpBuilder.is_rocm_pytorch():
|
||||
if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
|
||||
raise RuntimeError(
|
||||
"PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
|
||||
"with a different version than what is being used at runtime. Please re-install "
|
||||
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
|
||||
f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
|
||||
f"torch={current_torch_version}, cuda={current_cuda_version}")
|
||||
else:
|
||||
if install_hip_version != current_hip_version or install_torch_version != current_torch_version:
|
||||
raise RuntimeError(
|
||||
"PyTorch and HIP version mismatch! DeepSpeed ops were compiled and installed "
|
||||
"with a different version than what is being used at runtime. Please re-install "
|
||||
f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
|
||||
f"torch={install_torch_version}, hip={install_hip_version}, runtime versions:"
|
||||
f"torch={current_torch_version}, hip={current_hip_version}")
|
||||
|
||||
@staticmethod
|
||||
def is_rocm_pytorch():
|
||||
if OpBuilder._is_rocm_pytorch is not None:
|
||||
return OpBuilder._is_rocm_pytorch
|
||||
|
||||
_is_rocm_pytorch = False
|
||||
if TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 5):
|
||||
_is_rocm_pytorch = hasattr(torch.version,
|
||||
'hip') and torch.version.hip is not None
|
||||
if _is_rocm_pytorch:
|
||||
from torch.utils.cpp_extension import ROCM_HOME
|
||||
_is_rocm_pytorch = ROCM_HOME is not None
|
||||
OpBuilder._is_rocm_pytorch = _is_rocm_pytorch
|
||||
return OpBuilder._is_rocm_pytorch
|
||||
|
||||
@staticmethod
|
||||
def installed_rocm_version():
|
||||
if OpBuilder._rocm_version:
|
||||
return OpBuilder._rocm_version
|
||||
|
||||
ROCM_MAJOR = '0'
|
||||
ROCM_MINOR = '0'
|
||||
if OpBuilder.is_rocm_pytorch():
|
||||
from torch.utils.cpp_extension import ROCM_HOME
|
||||
with open('/opt/rocm/.info/version-dev', 'r') as file:
|
||||
ROCM_VERSION_DEV_RAW = file.read()
|
||||
ROCM_MAJOR = ROCM_VERSION_DEV_RAW.split('.')[0]
|
||||
ROCM_MINOR = ROCM_VERSION_DEV_RAW.split('.')[1]
|
||||
OpBuilder._rocm_version = (int(ROCM_MAJOR), int(ROCM_MINOR))
|
||||
return OpBuilder._rocm_version
|
||||
|
||||
def include_paths(self):
|
||||
'''
|
||||
Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
|
||||
|
@ -396,7 +447,7 @@ class OpBuilder(ABC):
|
|||
# Ensure the op we're about to load was compiled with the same
|
||||
# torch/cuda versions we are currently using at runtime.
|
||||
if isinstance(self, CUDAOpBuilder):
|
||||
assert_torch_info(torch_info)
|
||||
self.assert_torch_info(torch_info)
|
||||
|
||||
return importlib.import_module(self.absolute_name())
|
||||
else:
|
||||
|
@ -414,7 +465,7 @@ class OpBuilder(ABC):
|
|||
f"Unable to JIT load the {self.name} op due to ninja not being installed."
|
||||
)
|
||||
|
||||
if isinstance(self, CUDAOpBuilder):
|
||||
if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
|
||||
assert_no_cuda_mismatch()
|
||||
|
||||
self.jit_mode = True
|
||||
|
@ -534,15 +585,26 @@ class CUDAOpBuilder(OpBuilder):
|
|||
|
||||
def builder(self):
|
||||
from torch.utils.cpp_extension import CUDAExtension
|
||||
assert_no_cuda_mismatch()
|
||||
return CUDAExtension(name=self.absolute_name(),
|
||||
sources=self.strip_empty_entries(self.sources()),
|
||||
include_dirs=self.strip_empty_entries(self.include_paths()),
|
||||
libraries=self.strip_empty_entries(self.libraries_args()),
|
||||
extra_compile_args={
|
||||
'cxx': self.strip_empty_entries(self.cxx_args()),
|
||||
'nvcc': self.strip_empty_entries(self.nvcc_args())
|
||||
})
|
||||
if not self.is_rocm_pytorch():
|
||||
assert_no_cuda_mismatch()
|
||||
cuda_ext = CUDAExtension(
|
||||
name=self.absolute_name(),
|
||||
sources=self.strip_empty_entries(self.sources()),
|
||||
include_dirs=self.strip_empty_entries(self.include_paths()),
|
||||
libraries=self.strip_empty_entries(self.libraries_args()),
|
||||
extra_compile_args={
|
||||
'cxx': self.strip_empty_entries(self.cxx_args()),
|
||||
'nvcc': self.strip_empty_entries(self.nvcc_args())
|
||||
})
|
||||
if self.is_rocm_pytorch():
|
||||
# hip converts paths to absolute, this converts back to relative
|
||||
sources = cuda_ext.sources
|
||||
curr_file = Path(__file__).parent.parent # ds root
|
||||
for i in range(len(sources)):
|
||||
src = Path(sources[i])
|
||||
sources[i] = str(src.relative_to(curr_file))
|
||||
cuda_ext.sources = sources
|
||||
return cuda_ext
|
||||
|
||||
def cxx_args(self):
|
||||
if sys.platform == "win32":
|
||||
|
@ -551,18 +613,29 @@ class CUDAOpBuilder(OpBuilder):
|
|||
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
|
||||
|
||||
def nvcc_args(self):
|
||||
cuda_major, _ = installed_cuda_version()
|
||||
args = [
|
||||
'-O3',
|
||||
'--use_fast_math',
|
||||
'-std=c++17'
|
||||
if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
|
||||
'-U__CUDA_NO_HALF_OPERATORS__',
|
||||
'-U__CUDA_NO_HALF_CONVERSIONS__',
|
||||
'-U__CUDA_NO_HALF2_OPERATORS__'
|
||||
]
|
||||
|
||||
return args + self.compute_capability_args()
|
||||
args = ['-O3']
|
||||
if self.is_rocm_pytorch():
|
||||
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
|
||||
args += [
|
||||
'-std=c++14',
|
||||
'-U__HIP_NO_HALF_OPERATORS__',
|
||||
'-U__HIP_NO_HALF_CONVERSIONS__',
|
||||
'-U__HIP_NO_HALF2_OPERATORS__',
|
||||
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
|
||||
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
|
||||
]
|
||||
else:
|
||||
cuda_major, _ = installed_cuda_version()
|
||||
args += [
|
||||
'--use_fast_math',
|
||||
'-std=c++17'
|
||||
if sys.platform == "win32" and cuda_major > 10 else '-std=c++14',
|
||||
'-U__CUDA_NO_HALF_OPERATORS__',
|
||||
'-U__CUDA_NO_HALF_CONVERSIONS__',
|
||||
'-U__CUDA_NO_HALF2_OPERATORS__'
|
||||
]
|
||||
args += self.compute_capability_args()
|
||||
return args
|
||||
|
||||
def libraries_args(self):
|
||||
if sys.platform == "win32":
|
||||
|
@ -573,11 +646,17 @@ class CUDAOpBuilder(OpBuilder):
|
|||
|
||||
class TorchCPUOpBuilder(CUDAOpBuilder):
|
||||
def extra_ldflags(self):
|
||||
return ['-lcurand']
|
||||
if not self.is_rocm_pytorch():
|
||||
return ['-lcurand']
|
||||
else:
|
||||
return []
|
||||
|
||||
def cxx_args(self):
|
||||
import torch
|
||||
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
|
||||
if not self.is_rocm_pytorch():
|
||||
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
|
||||
else:
|
||||
CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.ROCM_HOME, "lib")
|
||||
CPU_ARCH = self.cpu_arch()
|
||||
SIMD_WIDTH = self.simd_width()
|
||||
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
Copyright 2020 The Microsoft DeepSpeed Team
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
from .builder import TorchCPUOpBuilder
|
||||
|
||||
|
||||
|
@ -22,5 +20,17 @@ class CPUAdagradBuilder(TorchCPUOpBuilder):
|
|||
|
||||
def include_paths(self):
|
||||
import torch
|
||||
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
|
||||
return ['csrc/includes', CUDA_INCLUDE]
|
||||
if not self.is_rocm_pytorch():
|
||||
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
|
||||
else:
|
||||
CUDA_INCLUDE = [
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include"),
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include",
|
||||
"rocrand"),
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include",
|
||||
"hiprand"),
|
||||
]
|
||||
return ['csrc/includes'] + CUDA_INCLUDE
|
||||
|
|
|
@ -2,8 +2,6 @@
|
|||
Copyright 2020 The Microsoft DeepSpeed Team
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
from .builder import TorchCPUOpBuilder
|
||||
|
||||
|
||||
|
@ -22,5 +20,17 @@ class CPUAdamBuilder(TorchCPUOpBuilder):
|
|||
|
||||
def include_paths(self):
|
||||
import torch
|
||||
CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
|
||||
return ['csrc/includes', CUDA_INCLUDE]
|
||||
if not self.is_rocm_pytorch():
|
||||
CUDA_INCLUDE = [os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")]
|
||||
else:
|
||||
CUDA_INCLUDE = [
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include"),
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include",
|
||||
"rocrand"),
|
||||
os.path.join(torch.utils.cpp_extension.ROCM_HOME,
|
||||
"include",
|
||||
"hiprand"),
|
||||
]
|
||||
return ['csrc/includes'] + CUDA_INCLUDE
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
"""
|
||||
Copyright 2020 The Microsoft DeepSpeed Team
|
||||
"""
|
||||
import torch
|
||||
from .builder import CUDAOpBuilder
|
||||
|
||||
|
||||
|
@ -18,14 +19,15 @@ class FusedAdamBuilder(CUDAOpBuilder):
|
|||
return ['csrc/adam/fused_adam_frontend.cpp', 'csrc/adam/multi_tensor_adam.cu']
|
||||
|
||||
def include_paths(self):
|
||||
return ['csrc/includes']
|
||||
return ['csrc/includes', 'csrc/adam']
|
||||
|
||||
def cxx_args(self):
|
||||
args = super().cxx_args()
|
||||
return args + self.version_dependent_macros()
|
||||
|
||||
def nvcc_args(self):
|
||||
return ['-lineinfo',
|
||||
'-O3',
|
||||
'--use_fast_math'
|
||||
] + self.version_dependent_macros() + self.compute_capability_args()
|
||||
nvcc_flags = ['-O3'] + self.version_dependent_macros()
|
||||
if not self.is_rocm_pytorch():
|
||||
nvcc_flags.extend(['-lineinfo',
|
||||
'--use_fast_math'] + self.compute_capability_args())
|
||||
return nvcc_flags
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
"""
|
||||
Copyright 2020 The Microsoft DeepSpeed Team
|
||||
"""
|
||||
import torch
|
||||
from .builder import CUDAOpBuilder
|
||||
|
||||
|
||||
|
@ -25,7 +26,14 @@ class FusedLambBuilder(CUDAOpBuilder):
|
|||
return args + self.version_dependent_macros()
|
||||
|
||||
def nvcc_args(self):
|
||||
return ['-lineinfo',
|
||||
'-O3',
|
||||
'--use_fast_math'
|
||||
] + self.version_dependent_macros() + self.compute_capability_args()
|
||||
nvcc_flags = ['-O3'] + self.version_dependent_macros()
|
||||
if self.is_rocm_pytorch():
|
||||
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
|
||||
nvcc_flags += [
|
||||
'-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
|
||||
'-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
|
||||
]
|
||||
else:
|
||||
nvcc_flags.extend(['-lineinfo',
|
||||
'--use_fast_math'] + self.compute_capability_args())
|
||||
return nvcc_flags
|
||||
|
|
|
@ -32,6 +32,10 @@ class SparseAttnBuilder(OpBuilder):
|
|||
#command_status = list(map(self.command_exists, required_commands))
|
||||
#deps_compatible = all(command_status)
|
||||
|
||||
if self.is_rocm_pytorch():
|
||||
self.warning(f'{self.NAME} is not compatible with ROCM')
|
||||
return False
|
||||
|
||||
try:
|
||||
import torch
|
||||
except ImportError:
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
"""
|
||||
Copyright 2020 The Microsoft DeepSpeed Team
|
||||
"""
|
||||
import torch
|
||||
from .builder import CUDAOpBuilder
|
||||
|
||||
|
||||
|
@ -28,4 +29,11 @@ class TransformerBuilder(CUDAOpBuilder):
|
|||
]
|
||||
|
||||
def include_paths(self):
|
||||
return ['csrc/includes']
|
||||
includes = ['csrc/includes']
|
||||
if self.is_rocm_pytorch():
|
||||
from torch.utils.cpp_extension import ROCM_HOME
|
||||
includes += [
|
||||
'{}/hiprand/include'.format(ROCM_HOME),
|
||||
'{}/rocrand/include'.format(ROCM_HOME)
|
||||
]
|
||||
return includes
|
||||
|
|
34
setup.py
34
setup.py
|
@ -32,7 +32,11 @@ except ImportError:
|
|||
print('[WARNING] Unable to import torch, pre-compiling ops will be disabled. ' \
|
||||
'Please visit https://pytorch.org/ to see how to properly install torch on your system.')
|
||||
|
||||
from op_builder import ALL_OPS, get_default_compute_capabilities
|
||||
from op_builder import ALL_OPS, get_default_compute_capabilities, OpBuilder
|
||||
|
||||
# fetch rocm state
|
||||
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
|
||||
rocm_version = OpBuilder.installed_rocm_version()
|
||||
|
||||
RED_START = '\033[31m'
|
||||
RED_END = '\033[0m'
|
||||
|
@ -51,8 +55,8 @@ def fetch_requirements(path):
|
|||
|
||||
install_requires = fetch_requirements('requirements/requirements.txt')
|
||||
extras_require = {
|
||||
'1bit_mpi' : fetch_requirements('requirements/requirements-1bit-mpi.txt'),
|
||||
'1bit': [], # Will add proper cupy version below
|
||||
'1bit': [], # add cupy based on cuda/rocm version
|
||||
'1bit_mpi': fetch_requirements('requirements/requirements-1bit-mpi.txt'),
|
||||
'readthedocs': fetch_requirements('requirements/requirements-readthedocs.txt'),
|
||||
'dev': fetch_requirements('requirements/requirements-dev.txt'),
|
||||
'autotuning': fetch_requirements('requirements/requirements-autotuning.txt'),
|
||||
|
@ -62,9 +66,17 @@ extras_require = {
|
|||
|
||||
# Add specific cupy version to both onebit extension variants
|
||||
if torch_available and torch.cuda.is_available():
|
||||
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
|
||||
extras_require['1bit_mpi'].append(cupy)
|
||||
extras_require['1bit'].append(cupy)
|
||||
cupy = None
|
||||
if is_rocm_pytorch:
|
||||
rocm_major, rocm_minor = rocm_version
|
||||
# XXX cupy support for rocm 5 is not available yet
|
||||
if rocm_major <= 4:
|
||||
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
|
||||
else:
|
||||
cupy = f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}"
|
||||
if cupy:
|
||||
extras_require['1bit'].append(cupy)
|
||||
extras_require['1bit_mpi'].append(cupy)
|
||||
|
||||
# Make an [all] extra that installs all needed dependencies
|
||||
all_extras = set()
|
||||
|
@ -207,9 +219,17 @@ else:
|
|||
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
|
||||
# Set cuda_version to 0.0 if cpu-only
|
||||
cuda_version = "0.0"
|
||||
# Set hip_version to 0.0 if cpu-only
|
||||
hip_version = "0.0"
|
||||
if torch_available and torch.version.cuda is not None:
|
||||
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
|
||||
torch_info = {"version": torch_version, "cuda_version": cuda_version}
|
||||
if torch_available and hasattr(torch.version, 'hip') and torch.version.hip is not None:
|
||||
hip_version = ".".join(torch.version.hip.split('.')[:2])
|
||||
torch_info = {
|
||||
"version": torch_version,
|
||||
"cuda_version": cuda_version,
|
||||
"hip_version": hip_version
|
||||
}
|
||||
|
||||
print(f"version={version_str}, git_hash={git_hash}, git_branch={git_branch}")
|
||||
with open('deepspeed/git_version_info_installed.py', 'w') as fd:
|
||||
|
|
|
@ -8,6 +8,9 @@ from torch.multiprocessing import Process
|
|||
import deepspeed
|
||||
|
||||
import pytest
|
||||
from functools import wraps
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -39,8 +42,15 @@ def set_cuda_visibile():
|
|||
if cuda_visible is None:
|
||||
# CUDA_VISIBLE_DEVICES is not set, discover it from nvidia-smi instead
|
||||
import subprocess
|
||||
nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
|
||||
num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n'))
|
||||
is_rocm_pytorch = hasattr(torch.version, 'hip') and torch.version.hip is not None
|
||||
if is_rocm_pytorch:
|
||||
rocm_smi = subprocess.check_output(['rocm-smi', '--showid'])
|
||||
gpu_ids = filter(lambda s: 'GPU' in s,
|
||||
rocm_smi.decode('utf-8').strip().split('\n'))
|
||||
num_gpus = len(list(gpu_ids))
|
||||
else:
|
||||
nvidia_smi = subprocess.check_output(['nvidia-smi', '--list-gpus'])
|
||||
num_gpus = len(nvidia_smi.decode('utf-8').strip().split('\n'))
|
||||
cuda_visible = ",".join(map(str, range(num_gpus)))
|
||||
|
||||
# rotate list based on xdist worker id, example below
|
||||
|
@ -94,6 +104,7 @@ def distributed_test(world_size=2, backend='nccl'):
|
|||
|
||||
# make sure all ranks finish at the same time
|
||||
torch.distributed.barrier()
|
||||
|
||||
# tear down after test completes
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
|
@ -154,6 +165,6 @@ def distributed_test(world_size=2, backend='nccl'):
|
|||
return dist_wrap
|
||||
|
||||
|
||||
def get_test_path(src):
|
||||
def get_test_path(filename):
|
||||
curr_path = Path(__file__).parent
|
||||
return str(curr_path.joinpath(src))
|
||||
return str(curr_path.joinpath(filename))
|
||||
|
|
|
@ -186,6 +186,8 @@ def checkpoint_correctness_verification(args,
|
|||
|
||||
trained_model.save_checkpoint(save_folder, tag=save_tag)
|
||||
|
||||
dist.barrier()
|
||||
|
||||
loaded_model = create_deepspeed_model(args=args,
|
||||
model=models[1],
|
||||
base_optimizer=base_optimizers[1])
|
||||
|
|
|
@ -123,7 +123,7 @@ class TestConfigurableMP:
|
|||
load_lr_scheduler_states=False)
|
||||
|
||||
test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())
|
||||
assert torch.allclose(baseline, test, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"
|
||||
assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), f"Baseline output {baseline} is not equal to save-then-load output {test}"
|
||||
|
||||
inputs = self.get_inputs()
|
||||
_run(inputs)
|
||||
|
|
|
@ -15,9 +15,6 @@ import deepspeed
|
|||
|
||||
import sys
|
||||
|
||||
#if not deepspeed.ops.__installed_ops__['transformer']:
|
||||
# pytest.skip("transformer kernels are not installed", allow_module_level=True)
|
||||
|
||||
|
||||
def check_equal(first, second, atol=1e-2, verbose=False):
|
||||
if verbose:
|
||||
|
@ -266,10 +263,10 @@ def test_forward(batch_size,
|
|||
|
||||
@pytest.mark.parametrize('batch_size, small_bsz, hidden_size, seq_len, heads, num_layers, is_preln, use_fp16',
|
||||
[
|
||||
#(8,3,1024,512,16,3,True,False),
|
||||
#(8,7,1024,512,16,3,True,True),
|
||||
#(8,3,1024,512,16,3,False,False),
|
||||
#(8,7,1024,512,16,3,False,True),
|
||||
(8,3,1024,512,16,3,True,False),
|
||||
(8,7,1024,512,16,3,True,True),
|
||||
(8,3,1024,512,16,3,False,False),
|
||||
(8,7,1024,512,16,3,False,True),
|
||||
]) # yapf: disable
|
||||
def test_forward_with_small_bsz(batch_size,
|
||||
small_bsz,
|
||||
|
|
|
@ -12,6 +12,7 @@ import numpy as np
|
|||
import time
|
||||
|
||||
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology, PipeModelDataParallelTopology
|
||||
from deepspeed.ops.op_builder import OpBuilder
|
||||
|
||||
PipeTopo = PipeDataParallelTopology
|
||||
from deepspeed.runtime.pipe.module import PipelineModule, LayerSpec
|
||||
|
@ -25,6 +26,12 @@ if TORCH_MAJOR < 1 or TORCH_MINOR < 8:
|
|||
pytest.skip("NCCL-based 1-bit compression requires torch 1.8 or higher",
|
||||
allow_module_level=True)
|
||||
|
||||
rocm_version = OpBuilder.installed_rocm_version()
|
||||
if rocm_version[0] > 4:
|
||||
pytest.skip(
|
||||
"NCCL-based 1-bit compression is not yet supported w. ROCm 5 until cupy supports ROCm 5",
|
||||
allow_module_level=True)
|
||||
|
||||
|
||||
def test_onebitadam_fp16_basic(tmpdir):
|
||||
config_dict = {
|
||||
|
|
|
@ -252,7 +252,6 @@ def init_softmax_inputs(Z, H, M, N, scale, rho, block, dtype, dense_x=True, layo
|
|||
|
||||
|
||||
def _skip_on_cuda_compatability():
|
||||
return
|
||||
if torch.cuda.get_device_capability()[0] < 7:
|
||||
pytest.skip("needs higher compute capability than 7")
|
||||
cuda_major = int(torch.version.cuda.split('.')[0]) * 10
|
||||
|
|
|
@ -846,7 +846,7 @@ def test_zero3_param_partitioning_many_params(world_sz: int,
|
|||
for _ in range(n_layers))
|
||||
|
||||
for layer_num, module in enumerate(self.modulelist):
|
||||
if dist.get_rank() == 0:
|
||||
with deepspeed.zero.GatheredParameters(module.weight, modifier_rank=0):
|
||||
param: Parameter = module.weight
|
||||
partition_sz = math.ceil(param.numel() / dist.get_world_size())
|
||||
offset = 0
|
||||
|
|
Загрузка…
Ссылка в новой задаче