Mirror of https://github.com/microsoft/apex.git
* Pushing for build tests * Contrib files * Removing deprecated checks
This commit is contained in:
Parent: 1bf0d8d4ba
Commit: 325f5a0bec
@@ -8,6 +8,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 #define cudaCheckErrors(msg) \
     do { \
         cudaError_t __err = cudaGetLastError(); \
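The cudaCheckErrors macro is cut off in the context above. For reference, a typical completion of this error-checking idiom looks like the sketch below; the exact message formatting in the file may differ:

    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                exit(1); \
            } \
        } while (0)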
@@ -72,7 +74,7 @@ at::Tensor nhwc_bn_fwd_train(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // Allocate output tensor

@@ -87,13 +89,13 @@ at::Tensor nhwc_bn_fwd_train(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -114,12 +116,12 @@ at::Tensor nhwc_bn_fwd_train(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 

@@ -165,13 +167,13 @@ at::Tensor nhwc_bn_fwd_eval(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -197,7 +199,7 @@ at::Tensor nhwc_bn_fwd_eval(
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 

@@ -244,7 +246,7 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // outputs

@@ -264,13 +266,13 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
-                             x_grad.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+                             x_grad.DATA_PTR<at::Half>(),
                              nullptr,
-                             dy.data<at::Half>());
+                             dy.DATA_PTR<at::Half>());
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -291,12 +293,12 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -8,6 +8,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 //FIXME move the common stuff to common h file
 #define cudaCheckErrors(msg) \
     do { \

@@ -74,7 +76,7 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // Allocate output tensor

@@ -89,15 +91,15 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr,
-                             z.data<at::Half>(),
+                             z.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -118,13 +120,13 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
-  workspace.push_back(bitmask.data<int32_t>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+  workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
 
   workspace.push_back(retired_ctas);

@@ -171,15 +173,15 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr,
-                             z.data<at::Half>(),
+                             z.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -206,7 +208,7 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 

@@ -253,7 +255,7 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // outputs

@@ -274,15 +276,15 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
-                             x_grad.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+                             x_grad.DATA_PTR<at::Half>(),
                              nullptr,
-                             dy.data<at::Half>(),
+                             dy.DATA_PTR<at::Half>(),
                              nullptr,
-                             z_grad.data<at::Half>());
+                             z_grad.DATA_PTR<at::Half>());
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();

@@ -303,13 +305,13 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
-  workspace.push_back(bitmask.data<int32_t>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+  workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -6,6 +6,8 @@
 
 #include <cuda.h>
 
+#include "compat.h"
+
 
 #define cudaCheckErrors(msg) \
     do { \

@@ -114,17 +116,17 @@ int64_t get_buffer_size(const int bn_sync_steps) {
 
 void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
   cudaIpcMemHandle_t my_handle;
-  memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+  memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
   return ipc_mem_registry.getPtr(my_handle, offset);
 }
 
 void close_remote_data(const at::Tensor& handle) {
   cudaIpcMemHandle_t my_handle;
-  memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+  memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
   ipc_mem_registry.releasePtr(my_handle);
 }
 
 void* get_data_ptr(
     const at::Tensor& data) {
-  return data.data<uint8_t>();
+  return data.DATA_PTR<uint8_t>();
 }
@@ -82,6 +82,7 @@
 #include <THC/THCThrustAllocator.cuh>
 
 #include "type_shim.h"
+#include "compat.h"
 
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;

@@ -492,7 +493,7 @@ std::vector<Tensor> host_softmax_xentropy(
     inner_size *= input.size(i);
   // This kernel spawns a block per each element in the batch.
   // XXX: it assumes that inner_size == 1
-  AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
   const int ILP = 2;
   dim3 grid(outer_size);
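AT_CHECK was deprecated in favor of TORCH_CHECK in newer PyTorch releases; the shim in compat.h (shown later in this diff) defines TORCH_CHECK as AT_CHECK when it is missing, so the rewritten call sites compile against both generations of headers:

    // The same line builds everywhere: native TORCH_CHECK on newer PyTorch,
    // expanded back to AT_CHECK by compat.h on older releases.
    TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");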
@@ -504,15 +505,15 @@ std::vector<Tensor> host_softmax_xentropy(
   if (!half_to_float) {
     cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
       <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-        losses.data<accscalar_t>(), max_log_sum_exp.data<scalar_t_0>(),
-        input.data<scalar_t_0>(), labels_.data<int64_t>(),
+        losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
         dim_size, smoothing
     );
   } else {
     cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
       <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-        losses.data<accscalar_t>(), max_log_sum_exp.data<accscalar_t>(),
-        input.data<scalar_t_0>(), labels_.data<int64_t>(),
+        losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<accscalar_t>(),
+        input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
         dim_size, smoothing
     );
   }

@@ -561,7 +562,7 @@ Tensor host_softmax_xentropy_backward(
     inner_size *= logits.size(i);
   // See descriptions of kernels above.
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
   const int ILP = 2;
   dim3 grid(outer_size);

@@ -572,17 +573,17 @@ Tensor host_softmax_xentropy_backward(
   if (!half_to_float) {
     cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
       <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-        gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-        max_log_sum_exp.data<scalar_t_0>(),
-        grad.data<scalar_t_0>(), labels.data<int64_t>(),
+        gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+        max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+        grad.DATA_PTR<scalar_t_0>(), labels.DATA_PTR<int64_t>(),
         smoothing, dim_size
     );
   } else {
     cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
       <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-        gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-        max_log_sum_exp.data<accscalar_t>(),
-        grad.data<accscalar_t>(), labels.data<int64_t>(),
+        gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+        max_log_sum_exp.DATA_PTR<accscalar_t>(),
+        grad.DATA_PTR<accscalar_t>(), labels.DATA_PTR<int64_t>(),
         smoothing, dim_size
     );
   }
@@ -1,3 +1,9 @@
 #ifndef TORCH_CHECK
 #define TORCH_CHECK AT_CHECK
 #endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
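This header is the pivot of the commit: every .data<T>() call site is rewritten to DATA_PTR<T>(), and the macro resolves to whichever accessor the installed PyTorch provides (tensor.data<T>() was deprecated in favor of tensor.data_ptr<T>() in PyTorch 1.3). A sketch of how a call site expands, assuming a CPU float tensor t:

    #include "compat.h"

    void example(at::Tensor t) {
        // With -DVERSION_GE_1_3 this expands to t.data_ptr<float>();
        // without the flag it expands to the pre-1.3 t.data<float>().
        float* p = t.DATA_PTR<float>();
        p[0] = 1.0f;  // raw pointer into the tensor's storage (CPU tensor assumed)
    }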
@@ -190,11 +190,11 @@ void fused_adam_cuda(
       DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
         using accscalar_t = at::acc_type<scalar_t_0, true>;
         adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-            p.data<accscalar_t>(),
-            p_copy.numel() ? p_copy.data<scalar_t_0>() : NULL,
-            m.data<accscalar_t>(),
-            v.data<accscalar_t>(),
-            g.data<scalar_t_0>(),
+            p.DATA_PTR<accscalar_t>(),
+            p_copy.numel() ? p_copy.DATA_PTR<scalar_t_0>() : NULL,
+            m.DATA_PTR<accscalar_t>(),
+            v.DATA_PTR<accscalar_t>(),
+            g.DATA_PTR<scalar_t_0>(),
             beta1,
             beta2,
             eps,
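DISPATCH_FLOAT_AND_HALF is the dtype-dispatch macro from type_shim.h: it switches on the runtime scalar type and instantiates its body once per supported dtype, binding the compile-time type to scalar_t_0 (the trailing 0 is the LEVEL argument). A simplified sketch of the pattern; the real macro in the repo may differ in details:

    #define DISPATCH_FLOAT_AND_HALF(TYPE, LEVEL, NAME, ...)             \
        switch (TYPE) {                                                 \
            case at::ScalarType::Float: {                               \
                using scalar_t_##LEVEL = float;                         \
                __VA_ARGS__;                                            \
                break;                                                  \
            }                                                           \
            case at::ScalarType::Half: {                                \
                using scalar_t_##LEVEL = at::Half;                      \
                __VA_ARGS__;                                            \
                break;                                                  \
            }                                                           \
            default:                                                    \
                AT_ERROR(#NAME, " not implemented for '", toString(TYPE), "'"); \
        }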
@@ -208,11 +208,11 @@ void fused_adam_cuda(
       using namespace at;
       DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
         adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-            p.data<scalar_t_0>(),
+            p.DATA_PTR<scalar_t_0>(),
             NULL, //don't output p_copy for fp32, it's wasted write
-            m.data<scalar_t_0>(),
-            v.data<scalar_t_0>(),
-            g.data<scalar_t_0>(),
+            m.DATA_PTR<scalar_t_0>(),
+            v.DATA_PTR<scalar_t_0>(),
+            g.DATA_PTR<scalar_t_0>(),
             beta1,
             beta2,
             eps,
@@ -687,14 +687,14 @@ void cuda_layer_norm(
     DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       HostApplyLayerNorm(
-        output->data<scalar_t_0>(),
-        mean->data<accscalar_t>(),
-        invvar->data<accscalar_t>(),
-        input->data<scalar_t_0>(),
+        output->DATA_PTR<scalar_t_0>(),
+        mean->DATA_PTR<accscalar_t>(),
+        invvar->DATA_PTR<accscalar_t>(),
+        input->DATA_PTR<scalar_t_0>(),
         n1,n2,
         epsilon,
-        gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
-        beta != NULL ? beta->data<scalar_t_0>() : NULL);
+        gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
+        beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
       )
 }
 

@@ -728,20 +728,20 @@ void HostLayerNormGradient(
     at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
     cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
         dout,
-        input->data<T>(),
+        input->DATA_PTR<T>(),
         n1,n2,
         mean,
         invvar,
         U(epsilon),
-        part_grad_gamma.data<U>(),
-        part_grad_beta.data<U>());
+        part_grad_gamma.DATA_PTR<U>(),
+        part_grad_beta.DATA_PTR<U>());
 
     const dim3 threads3(32,8,1);
     const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1);
     const int nshared3 = threads3.x * threads3.y * sizeof(U);
     cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
-        part_grad_gamma.data<U>(),
-        part_grad_beta.data<U>(),
+        part_grad_gamma.DATA_PTR<U>(),
+        part_grad_beta.DATA_PTR<U>(),
         part_size,
         n1,n2,
         grad_gamma,

@@ -758,7 +758,7 @@ void HostLayerNormGradient(
       0;
   cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
       dout,
-      input->data<T>(),
+      input->DATA_PTR<T>(),
       n1,n2,
       mean,
      invvar,

@@ -790,18 +790,18 @@ void cuda_layer_norm_gradient(
     DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       HostLayerNormGradient(
-        dout->data<scalar_t_0>(),
-        mean->data<accscalar_t>(),
-        invvar->data<accscalar_t>(),
+        dout->DATA_PTR<scalar_t_0>(),
+        mean->DATA_PTR<accscalar_t>(),
+        invvar->DATA_PTR<accscalar_t>(),
         input,
         n1,n2,
         // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
         // if gamma Tensor is NULL on input.
-        gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
-        gamma != NULL ? beta->data<scalar_t_0>() : NULL,
+        gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
+        gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
         epsilon,
-        grad_input->data<scalar_t_0>(),
-        gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
-        gamma != NULL ? grad_beta->data<scalar_t_0>() : NULL);
+        grad_input->DATA_PTR<scalar_t_0>(),
+        gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
+        gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
       )
 }
@@ -96,7 +96,7 @@ void multi_tensor_apply(
           // using accscalar_t = acc_type<scalar_t, true>;
           multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
               chunk_size,
-              noop_flag.data<int>(),
+              noop_flag.DATA_PTR<int>(),
               tl,
               callable,
               args...);
@@ -274,8 +274,8 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
         noop_flag,
         tensor_lists,
         L2NormFunctor<scalar_t_0>(),
-        output.data<float>(),
-        per_tensor ? output_per_tensor.data<float>() : nullptr,
+        output.DATA_PTR<float>(),
+        per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
         per_tensor,
         max_chunks_per_tensor);)
 

@@ -289,10 +289,10 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
   auto ret = at::empty({1}, output.options());
   auto stream = at::cuda::getCurrentCUDAStream();
   cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
-      output.data<float>(),
-      per_tensor ? output_per_tensor.data<float>() : nullptr,
-      ret.data<float>(),
-      per_tensor ? ret_per_tensor.data<float>() : nullptr,
+      output.DATA_PTR<float>(),
+      per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
+      ret.DATA_PTR<float>(),
+      per_tensor ? ret_per_tensor.DATA_PTR<float>() : nullptr,
       per_tensor,
       max_chunks_per_tensor);
 

@@ -344,8 +344,8 @@ void multi_tensor_norm_out_cuda(
         noop_flag,
         tensor_lists,
         MaxNormFunctor<scalar_t_0>(),
-        output.data<float>(),
-        output_per_tensor.data<float>(),
+        output.DATA_PTR<float>(),
+        output_per_tensor.DATA_PTR<float>(),
         true,
         max_chunks_per_tensor);)
   }

@@ -358,8 +358,8 @@ void multi_tensor_norm_out_cuda(
         noop_flag,
         tensor_lists,
         L2NormFunctor<scalar_t_0>(),
-        output.data<float>(),
-        output_per_tensor.data<float>(),
+        output.DATA_PTR<float>(),
+        output_per_tensor.DATA_PTR<float>(),
         true,
         max_chunks_per_tensor);)
   }

@@ -373,10 +373,10 @@ void multi_tensor_norm_out_cuda(
   auto ret = at::empty({1}, output.options());
   auto stream = at::cuda::getCurrentCUDAStream();
   cleanup_v2<<<ntensors, 512, 0, stream>>>(
-      output.data<float>(),
-      output_per_tensor.data<float>(),
-      ret.data<float>(),
-      out.data<float>(),
+      output.DATA_PTR<float>(),
+      output_per_tensor.DATA_PTR<float>(),
+      ret.DATA_PTR<float>(),
+      out.DATA_PTR<float>(),
       true,
       max_chunks_per_tensor,
      norm_type,
@@ -265,7 +265,7 @@ void multi_tensor_lamb_cuda(
         epsilon,
         (adamMode_t) mode,
         weight_decay,
-        std::get<0>(grad_norm_tuple).data<float>(),
+        std::get<0>(grad_norm_tuple).DATA_PTR<float>(),
         max_grad_norm); )
 
   // Compute update norms

@@ -280,8 +280,8 @@ void multi_tensor_lamb_cuda(
         noop_flag,
         grad_param_list,
         LAMBStage2Functor<scalar_t_0>(),
-        std::get<1>(param_norm_tuple).data<float>(),
-        std::get<1>(update_norm_tuple).data<float>(),
+        std::get<1>(param_norm_tuple).DATA_PTR<float>(),
+        std::get<1>(update_norm_tuple).DATA_PTR<float>(),
         lr); )
 
   AT_CUDA_CHECK(cudaGetLastError());
@@ -136,7 +136,7 @@ void multi_tensor_lamb_stage1_cuda(
             noop_flag,
             tensor_lists,
             LAMBStage1Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
-            per_tensor_decay.data<float>(),
+            per_tensor_decay.DATA_PTR<float>(),
             beta1,
             beta2,
             beta1_correction,

@@ -99,8 +99,8 @@ void multi_tensor_lamb_stage2_cuda(
             noop_flag,
             tensor_lists,
             LAMBStage2Functor<scalar_t_0, scalar_t_1>(),
-            per_tensor_param_norm.data<float>(),
-            per_tensor_update_norm.data<float>(),
+            per_tensor_param_norm.DATA_PTR<float>(),
+            per_tensor_update_norm.DATA_PTR<float>(),
             learning_rate); ))
 
   AT_CUDA_CHECK(cudaGetLastError());
@@ -181,7 +181,7 @@ void multi_tensor_novograd_cuda(
         lr,
         (momentMode_t) moment_mode,
         weight_decay,
-        grad_norms.data<float>()); )
+        grad_norms.DATA_PTR<float>()); )
 
   AT_CUDA_CHECK(cudaGetLastError());
 
@@ -1,4 +1,5 @@
 #include <ATen/ATen.h>
+#include "compat.h"
 
 // Forward/backward compatiblity hack around
 // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288

csrc/welford.cu (246 changed lines)
@@ -904,9 +904,9 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_kernel",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     welford_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        out_mean.data<accscalar_t>(),
-        out_var_biased.data<accscalar_t>(),
+        input.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<accscalar_t>(),
+        out_var_biased.DATA_PTR<accscalar_t>(),
         batch_size,
         feature_size,
         space_size);
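The kernel is named for Welford's online algorithm: mean and variance are accumulated in a single pass with a numerically stable per-element update. The scalar recurrence the kernel parallelizes, as a reference sketch:

    // One-pass mean/variance accumulation (scalar reference for the kernel's math).
    struct WelfordState {
        float mean = 0.0f;
        float m2   = 0.0f;  // running sum of squared deviations from the mean
        int   n    = 0;
    };

    void welford_update(WelfordState& s, float x) {
        s.n += 1;
        float delta = x - s.mean;
        s.mean += delta / s.n;
        s.m2 += delta * (x - s.mean);  // note: uses the already-updated mean
    }

    // After the pass: biased variance = s.m2 / s.n, matching out_var_biased above.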
@@ -943,12 +943,12 @@ at::Tensor batchnorm_forward_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        shift.has_value() ? shift.value().data<accscalar_t>() : NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<accscalar_t>() : NULL,
+        out.DATA_PTR<scalar_t_0>(),
         space_size,
         batch_size);
   );
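For reference, the forward kernels in this file apply the affine batch-norm transform elementwise, with inv_std precomputed as 1/sqrt(var + eps). A scalar sketch of the per-element math (not the kernel code itself):

    // y = (x - mean) * inv_std, optionally scaled and shifted.
    float bn_forward_one(float x, float mean, float inv_std,
                         const float* weight, const float* shift) {
        float y = (x - mean) * inv_std;   // normalize
        if (weight) y *= *weight;         // optional scale (gamma)
        if (shift)  y += *shift;          // optional bias (beta)
        return y;
    }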
@@ -961,12 +961,12 @@ at::Tensor batchnorm_forward_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        shift.has_value() ? shift.value().data<scalar_t_0>() : NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>() : NULL,
+        out.DATA_PTR<scalar_t_0>(),
         space_size,
         batch_size);
   );

@@ -1014,14 +1014,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     reduce_bn_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
-        weight.has_value() ? grad_bias.data<accscalar_t>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
+        weight.has_value() ? grad_bias.DATA_PTR<accscalar_t>() : NULL,
         batch_size,
         feature_size,
         space_size);

@@ -1035,14 +1035,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     reduce_bn_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
-        weight.has_value() ? grad_bias.data<scalar_t_0>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
+        weight.has_value() ? grad_bias.DATA_PTR<scalar_t_0>() : NULL,
         batch_size,
         feature_size,
         space_size);

@@ -1083,14 +1083,14 @@ at::Tensor batchnorm_backward_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         space_size,
         batch_size);
   );

@@ -1103,14 +1103,14 @@ at::Tensor batchnorm_backward_CUDA(
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         space_size,
         batch_size);
   );
@@ -1140,11 +1140,11 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
     welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
-        mean_feature_nodes.data<scalar_t_0>(),
-        var_biased.data<scalar_t_0>(),
-        out_mean.data<scalar_t_0>(),
-        out_var.data<scalar_t_0>(),
-        inv_std.data<scalar_t_0>(),
+        mean_feature_nodes.DATA_PTR<scalar_t_0>(),
+        var_biased.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<scalar_t_0>(),
+        out_var.DATA_PTR<scalar_t_0>(),
+        inv_std.DATA_PTR<scalar_t_0>(),
         world_size,
         feature_size,
         eps,

@@ -1182,13 +1182,13 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_c_last",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     welford_kernel_c_last<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        out_mean.data<accscalar_t>(),
-        out_var_biased.data<accscalar_t>(),
+        input.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<accscalar_t>(),
+        out_var_biased.DATA_PTR<accscalar_t>(),
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,

@@ -1225,13 +1225,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride,
         fuse_relu);

@@ -1246,13 +1246,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride,
         fuse_relu);

@@ -1302,18 +1302,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
-        weight.has_value() ?grad_bias.data<accscalar_t>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
+        weight.has_value() ?grad_bias.DATA_PTR<accscalar_t>() : NULL,
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,

@@ -1327,18 +1327,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
-        weight.has_value() ?grad_bias.data<scalar_t_0>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
+        weight.has_value() ?grad_bias.DATA_PTR<scalar_t_0>() : NULL,
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,
@@ -1375,14 +1375,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
   );

@@ -1396,14 +1396,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
   );

@@ -1439,14 +1439,14 @@ at::Tensor relu_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     relu_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
-        out.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
   );

@@ -1460,14 +1460,14 @@ at::Tensor relu_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     relu_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
-        out.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
   );
setup.py (55 changed lines)
@ -67,6 +67,19 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
|
||||||
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
|
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
|
||||||
"You can try commenting out this check (at your own risk).")
|
"You can try commenting out this check (at your own risk).")
|
||||||
+# Set up macros for forward/backward compatibility hack around
+# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
+# and
+# https://github.com/NVIDIA/apex/issues/456
+# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
+version_ge_1_1 = []
+if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+    version_ge_1_1 = ['-DVERSION_GE_1_1']
+version_ge_1_3 = []
+if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+    version_ge_1_3 = ['-DVERSION_GE_1_3']
+version_dependent_macros = version_ge_1_1 + version_ge_1_3
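This block is hoisted to module scope so that every optional extension below (--cuda_ext, --bnp, --xentropy) shares a single copy; the per-branch duplicates are deleted further down in this diff. Worked through: building against torch 1.2 gives TORCH_MAJOR, TORCH_MINOR = 1, 2, so version_dependent_macros == ['-DVERSION_GE_1_1']; against torch 1.3 or newer it becomes ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3'].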
if "--cuda_ext" in sys.argv:
|
if "--cuda_ext" in sys.argv:
|
||||||
from torch.utils.cpp_extension import CUDAExtension
|
from torch.utils.cpp_extension import CUDAExtension
|
||||||
sys.argv.remove("--cuda_ext")
|
sys.argv.remove("--cuda_ext")
|
||||||
|
@ -76,12 +89,6 @@ if "--cuda_ext" in sys.argv:
|
||||||
else:
|
else:
|
||||||
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
|
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
|
||||||
|
|
||||||
# Set up macros for forward/backward compatibility hack around
|
|
||||||
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
|
|
||||||
version_ge_1_1 = []
|
|
||||||
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
|
|
||||||
version_ge_1_1 = ['-DVERSION_GE_1_1']
|
|
||||||
|
|
||||||
     ext_modules.append(
         CUDAExtension(name='amp_C',
                       sources=['csrc/amp_C_frontend.cpp',
@@ -94,30 +101,33 @@ if "--cuda_ext" in sys.argv:
                                'csrc/multi_tensor_adam.cu',
                                'csrc/multi_tensor_novograd.cu',
                                'csrc/multi_tensor_lamb.cu'],
-                      extra_compile_args={'cxx': ['-O3'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc':['-lineinfo',
                                                   '-O3',
                                                   # '--resource-usage',
-                                                  '--use_fast_math']}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
     ext_modules.append(
         CUDAExtension(name='fused_adam_cuda',
                       sources=['csrc/fused_adam_cuda.cpp',
                                'csrc/fused_adam_cuda_kernel.cu'],
-                      extra_compile_args={'cxx': ['-O3',],
+                      extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
                                           'nvcc':['-O3',
-                                                  '--use_fast_math']}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
     ext_modules.append(
         CUDAExtension(name='syncbn',
                       sources=['csrc/syncbn.cpp',
-                               'csrc/welford.cu']))
+                               'csrc/welford.cu'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc':['-O3'] + version_dependent_macros}))
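The syncbn extension previously passed no extra_compile_args at all; it now gets the version defines for both the cxx and nvcc passes, presumably because csrc/syncbn.cpp and csrc/welford.cu were moved onto the DATA_PTR compatibility macro in this same commit.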
     ext_modules.append(
         CUDAExtension(name='fused_layer_norm_cuda',
                       sources=['csrc/layer_norm_cuda.cpp',
                                'csrc/layer_norm_cuda_kernel.cu'],
-                      extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc':['-maxrregcount=50',
                                                   '-O3',
-                                                  '--use_fast_math'] + version_ge_1_1}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
if "--bnp" in sys.argv:
|
if "--bnp" in sys.argv:
|
||||||
from torch.utils.cpp_extension import CUDAExtension
|
from torch.utils.cpp_extension import CUDAExtension
|
||||||
|
@ -129,24 +139,20 @@ if "--bnp" in sys.argv:
|
||||||
     if torch.utils.cpp_extension.CUDA_HOME is None:
         raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
     else:
-        # Set up macros for forward/backward compatibility hack around
-        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-        version_ge_1_1 = []
-        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-            version_ge_1_1 = ['-DVERSION_GE_1_1']
         ext_modules.append(
             CUDAExtension(name='bnp',
                           sources=['apex/contrib/csrc/groupbn/batch_norm.cu',
                                    'apex/contrib/csrc/groupbn/ipc.cu',
                                    'apex/contrib/csrc/groupbn/interface.cpp',
                                    'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'],
-                          extra_compile_args={'cxx': [] + version_ge_1_1,
+                          include_dirs=['csrc'],
+                          extra_compile_args={'cxx': [] + version_dependent_macros,
                                               'nvcc':['-DCUDA_HAS_FP16=1',
                                                       '-D__CUDA_NO_HALF_OPERATORS__',
                                                       '-D__CUDA_NO_HALF_CONVERSIONS__',
                                                       '-D__CUDA_NO_HALF2_OPERATORS__',
                                                       '-gencode',
-                                                      'arch=compute_70,code=sm_70'] + version_ge_1_1}))
+                                                      'arch=compute_70,code=sm_70'] + version_dependent_macros}))
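Note the new include_dirs=['csrc'] argument: the bnp sources live under apex/contrib/csrc/groupbn but now include the shared compatibility header from the top-level csrc directory, and this extra include path is what makes that include resolve. As it appears in those sources:

    // In the apex/contrib/csrc/groupbn sources; resolves to csrc/compat.h
    // because setup.py now passes include_dirs=['csrc'].
    #include "compat.h"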
if "--xentropy" in sys.argv:
|
if "--xentropy" in sys.argv:
|
||||||
from torch.utils.cpp_extension import CUDAExtension
|
from torch.utils.cpp_extension import CUDAExtension
|
||||||
|
@ -158,18 +164,13 @@ if "--xentropy" in sys.argv:
|
||||||
     if torch.utils.cpp_extension.CUDA_HOME is None:
         raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
     else:
-        # Set up macros for forward/backward compatibility hack around
-        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-        version_ge_1_1 = []
-        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-            version_ge_1_1 = ['-DVERSION_GE_1_1']
         ext_modules.append(
             CUDAExtension(name='xentropy_cuda',
                           sources=['apex/contrib/csrc/xentropy/interface.cpp',
                                    'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
                           include_dirs=['csrc'],
-                          extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
+                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
-                                              'nvcc':['-O3'] + version_ge_1_1}))
+                                              'nvcc':['-O3'] + version_dependent_macros}))
 setup(
     name='apex',

@@ -13,7 +13,7 @@ print_red() {
 }

 images=(
-    "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.07-py3-devel"
+    "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.08-py3-devel"
     "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
     "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
     "pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"