Mirror of https://github.com/microsoft/apex.git
* Pushing for build tests * Contrib files * Removing deprecated checks

Parent: 1bf0d8d4ba
Commit: 325f5a0bec
@@ -8,6 +8,8 @@
 #include <cuda.h>
 
+#include "compat.h"
+
 #define cudaCheckErrors(msg) \
     do { \
         cudaError_t __err = cudaGetLastError(); \
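(The hunk above shows only the head of the cudaCheckErrors macro; the rest of its body is outside the diff context. For readers unfamiliar with the idiom, a minimal sketch of how such a CUDA error-check macro is typically completed — everything past cudaGetLastError() here is an assumption, not part of this commit:)

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Hypothetical completion of the truncated macro: report and abort on the
    // last recorded CUDA error, tagging the failure site with file and line.
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                exit(1); \
            } \
        } while (0)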
@@ -72,7 +74,7 @@ at::Tensor nhwc_bn_fwd_train(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // Allocate output tensor
@@ -87,13 +89,13 @@ at::Tensor nhwc_bn_fwd_train(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -114,12 +116,12 @@ at::Tensor nhwc_bn_fwd_train(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -165,13 +167,13 @@ at::Tensor nhwc_bn_fwd_eval(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -197,7 +199,7 @@ at::Tensor nhwc_bn_fwd_eval(
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -244,7 +246,7 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // outputs
@@ -264,13 +266,13 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
-                             x_grad.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+                             x_grad.DATA_PTR<at::Half>(),
                              nullptr,
-                             dy.data<at::Half>());
+                             dy.DATA_PTR<at::Half>());
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -291,12 +293,12 @@ std::vector<at::Tensor> nhwc_bn_bwd(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[2];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -8,6 +8,8 @@
 #include <cuda.h>
 
+#include "compat.h"
+
 //FIXME move the common stuff to common h file
 #define cudaCheckErrors(msg) \
     do { \
@@ -74,7 +76,7 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // Allocate output tensor
@@ -89,15 +91,15 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr,
-                             z.data<at::Half>(),
+                             z.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -118,13 +120,13 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
-  workspace.push_back(bitmask.data<int32_t>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+  workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
 
   workspace.push_back(retired_ctas);
@@ -171,15 +173,15 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
                              nullptr,
-                             y.data<at::Half>(),
+                             y.DATA_PTR<at::Half>(),
                              nullptr,
-                             z.data<at::Half>(),
+                             z.DATA_PTR<at::Half>(),
                              nullptr);
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -206,7 +208,7 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -253,7 +255,7 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   const int C = x.size(3);
 
   // generating new magic number and use that for sync
-  int* magic = magic_tensor.data<int>();
+  int* magic = magic_tensor.DATA_PTR<int>();
   *magic = (*magic + 1) & 0xff;
 
   // outputs
@@ -274,15 +276,15 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   bn->setConstants(momentum, epsilon);
 
   // set pointers within the wrapper
-  bn->setInputOutputPointers(x.data<at::Half>(),
-                             x_grad.data<at::Half>(),
+  bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
+                             x_grad.DATA_PTR<at::Half>(),
                              nullptr,
-                             dy.data<at::Half>(),
+                             dy.DATA_PTR<at::Half>(),
                              nullptr,
-                             z_grad.data<at::Half>());
+                             z_grad.DATA_PTR<at::Half>());
 
-  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
-  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
+  bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
+  bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
 
   // deal with workspace(s)
   auto workspace_bytes = bn->numWorkspaceBytes();
@@ -303,13 +305,13 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
   Workspace ws(total_workspace_bytes);
 
   std::vector<void *> workspace;
-  workspace.push_back(minibatch_mean.data<float>());
-  workspace.push_back(minibatch_inv_var.data<float>());
-  workspace.push_back(bitmask.data<int32_t>());
+  workspace.push_back(minibatch_mean.DATA_PTR<float>());
+  workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
+  workspace.push_back(bitmask.DATA_PTR<int32_t>());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
   const int retired_cta_bytes = workspace_bytes[3];
-  void* retired_ctas = ret_cta.data<uint8_t>();
+  void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
   assert(ret_cta.size(0)>=retired_cta_bytes);
   workspace.push_back(retired_ctas);
 
@@ -6,6 +6,8 @@
 #include <cuda.h>
 
+#include "compat.h"
+
 #define cudaCheckErrors(msg) \
     do { \
@@ -114,17 +116,17 @@ int64_t get_buffer_size(const int bn_sync_steps) {
 
 void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
   cudaIpcMemHandle_t my_handle;
-  memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+  memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
   return ipc_mem_registry.getPtr(my_handle, offset);
 }
 
 void close_remote_data(const at::Tensor& handle) {
   cudaIpcMemHandle_t my_handle;
-  memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
+  memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
   ipc_mem_registry.releasePtr(my_handle);
 }
 
 void* get_data_ptr(
     const at::Tensor& data) {
-  return data.data<uint8_t>();
+  return data.DATA_PTR<uint8_t>();
 }
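(A note on the IPC helpers above: get_remote_data_ptr reconstitutes a cudaIpcMemHandle_t that another process serialized into a byte tensor, then resolves it through ipc_mem_registry. The registry's internals are not part of this diff; the sketch below is a hypothetical illustration assuming it wraps the CUDA IPC runtime calls:)

    #include <cstdint>
    #include <cuda_runtime.h>

    // Hypothetical sketch of resolving a handle: map the exporting process's
    // allocation into this address space, then offset into it.
    void* open_remote(const cudaIpcMemHandle_t& handle, int64_t offset) {
      void* base = nullptr;
      cudaError_t err = cudaIpcOpenMemHandle(&base, handle,
                                             cudaIpcMemLazyEnablePeerAccess);
      if (err != cudaSuccess) return nullptr;
      return static_cast<unsigned char*>(base) + offset;
    }

    void close_remote(void* base) {
      // Must be called with the base pointer returned by cudaIpcOpenMemHandle,
      // not an offset pointer derived from it.
      cudaIpcCloseMemHandle(base);
    }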
@@ -82,6 +82,7 @@
 #include <THC/THCThrustAllocator.cuh>
 
 #include "type_shim.h"
+#include "compat.h"
 
 using Tensor = at::Tensor;
 using TensorList = at::TensorList;
@@ -492,7 +493,7 @@ std::vector<Tensor> host_softmax_xentropy(
     inner_size *= input.size(i);
   // This kernel spawns a block per each element in the batch.
   // XXX: it assumes that inner_size == 1
-  AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
   const int ILP = 2;
   dim3 grid(outer_size);
@@ -504,15 +505,15 @@ std::vector<Tensor> host_softmax_xentropy(
     if (!half_to_float) {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-          losses.data<accscalar_t>(), max_log_sum_exp.data<scalar_t_0>(),
-          input.data<scalar_t_0>(), labels_.data<int64_t>(),
+          losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+          input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
           dim_size, smoothing
       );
     } else {
       cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
         <<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
-          losses.data<accscalar_t>(), max_log_sum_exp.data<accscalar_t>(),
-          input.data<scalar_t_0>(), labels_.data<int64_t>(),
+          losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<accscalar_t>(),
+          input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
           dim_size, smoothing
       );
     }
@@ -561,7 +562,7 @@ Tensor host_softmax_xentropy_backward(
     inner_size *= logits.size(i);
   // See descriptions of kernels above.
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
+  TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
 
   const int ILP = 2;
   dim3 grid(outer_size);
@@ -572,17 +573,17 @@ Tensor host_softmax_xentropy_backward(
     if (!half_to_float) {
       cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-          gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-          max_log_sum_exp.data<scalar_t_0>(),
-          grad.data<scalar_t_0>(), labels.data<int64_t>(),
+          gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+          max_log_sum_exp.DATA_PTR<scalar_t_0>(),
+          grad.DATA_PTR<scalar_t_0>(), labels.DATA_PTR<int64_t>(),
           smoothing, dim_size
       );
     } else {
      cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
         <<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
-          gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
-          max_log_sum_exp.data<accscalar_t>(),
-          grad.data<accscalar_t>(), labels.data<int64_t>(),
+          gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
+          max_log_sum_exp.DATA_PTR<accscalar_t>(),
+          grad.DATA_PTR<accscalar_t>(), labels.DATA_PTR<int64_t>(),
           smoothing, dim_size
       );
     }
@@ -1,3 +1,9 @@
 #ifndef TORCH_CHECK
 #define TORCH_CHECK AT_CHECK
 #endif
+
+#ifdef VERSION_GE_1_3
+#define DATA_PTR data_ptr
+#else
+#define DATA_PTR data
+#endif
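(This header is the pivot of the whole commit: PyTorch 1.3 renamed Tensor::data<T>() to Tensor::data_ptr<T>() and replaced AT_CHECK with TORCH_CHECK, so every call site in the files above is routed through these two shims. A minimal sketch of a call site under the new macros — the function raw() is illustrative, not from the repository:)

    #include <ATen/ATen.h>
    #include "compat.h"

    float* raw(at::Tensor& t) {
      // TORCH_CHECK falls back to AT_CHECK on pre-1.3 PyTorch (see guard above).
      TORCH_CHECK(t.scalar_type() == at::kFloat, "expected a float tensor");
      // Expands to t.data_ptr<float>() when built with -DVERSION_GE_1_3,
      // and to the deprecated t.data<float>() otherwise.
      return t.DATA_PTR<float>();
    }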
@@ -190,11 +190,11 @@ void fused_adam_cuda(
       DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
         using accscalar_t = at::acc_type<scalar_t_0, true>;
         adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-          p.data<accscalar_t>(),
-          p_copy.numel() ? p_copy.data<scalar_t_0>() : NULL,
-          m.data<accscalar_t>(),
-          v.data<accscalar_t>(),
-          g.data<scalar_t_0>(),
+          p.DATA_PTR<accscalar_t>(),
+          p_copy.numel() ? p_copy.DATA_PTR<scalar_t_0>() : NULL,
+          m.DATA_PTR<accscalar_t>(),
+          v.DATA_PTR<accscalar_t>(),
+          g.DATA_PTR<scalar_t_0>(),
           beta1,
           beta2,
           eps,
@@ -208,11 +208,11 @@ void fused_adam_cuda(
       using namespace at;
       DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
         adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
-          p.data<scalar_t_0>(),
+          p.DATA_PTR<scalar_t_0>(),
           NULL, //don't output p_copy for fp32, it's wasted write
-          m.data<scalar_t_0>(),
-          v.data<scalar_t_0>(),
-          g.data<scalar_t_0>(),
+          m.DATA_PTR<scalar_t_0>(),
+          v.DATA_PTR<scalar_t_0>(),
+          g.DATA_PTR<scalar_t_0>(),
           beta1,
           beta2,
           eps,
@@ -687,14 +687,14 @@ void cuda_layer_norm(
     DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       HostApplyLayerNorm(
-        output->data<scalar_t_0>(),
-        mean->data<accscalar_t>(),
-        invvar->data<accscalar_t>(),
-        input->data<scalar_t_0>(),
+        output->DATA_PTR<scalar_t_0>(),
+        mean->DATA_PTR<accscalar_t>(),
+        invvar->DATA_PTR<accscalar_t>(),
+        input->DATA_PTR<scalar_t_0>(),
         n1,n2,
         epsilon,
-        gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
-        beta != NULL ? beta->data<scalar_t_0>() : NULL);
+        gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
+        beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
       )
 }
 
@@ -728,20 +728,20 @@ void HostLayerNormGradient(
     at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
     cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
       dout,
-      input->data<T>(),
+      input->DATA_PTR<T>(),
       n1,n2,
      mean,
       invvar,
       U(epsilon),
-      part_grad_gamma.data<U>(),
-      part_grad_beta.data<U>());
+      part_grad_gamma.DATA_PTR<U>(),
+      part_grad_beta.DATA_PTR<U>());
 
     const dim3 threads3(32,8,1);
     const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1);
     const int nshared3 = threads3.x * threads3.y * sizeof(U);
     cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
-      part_grad_gamma.data<U>(),
-      part_grad_beta.data<U>(),
+      part_grad_gamma.DATA_PTR<U>(),
+      part_grad_beta.DATA_PTR<U>(),
       part_size,
       n1,n2,
       grad_gamma,
@@ -758,7 +758,7 @@ void HostLayerNormGradient(
             0;
     cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
       dout,
-      input->data<T>(),
+      input->DATA_PTR<T>(),
       n1,n2,
       mean,
       invvar,
@@ -790,18 +790,18 @@ void cuda_layer_norm_gradient(
     DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       HostLayerNormGradient(
-        dout->data<scalar_t_0>(),
-        mean->data<accscalar_t>(),
-        invvar->data<accscalar_t>(),
+        dout->DATA_PTR<scalar_t_0>(),
+        mean->DATA_PTR<accscalar_t>(),
+        invvar->DATA_PTR<accscalar_t>(),
         input,
         n1,n2,
         // TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
         // if gamma Tensor is NULL on input.
-        gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
-        gamma != NULL ? beta->data<scalar_t_0>() : NULL,
+        gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
+        gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
         epsilon,
-        grad_input->data<scalar_t_0>(),
-        gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
-        gamma != NULL ? grad_beta->data<scalar_t_0>() : NULL);
+        grad_input->DATA_PTR<scalar_t_0>(),
+        gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
+        gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
       )
 }
 
@@ -96,7 +96,7 @@ void multi_tensor_apply(
         // using accscalar_t = acc_type<scalar_t, true>;
         multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
             chunk_size,
-            noop_flag.data<int>(),
+            noop_flag.DATA_PTR<int>(),
             tl,
             callable,
             args...);
@@ -274,8 +274,8 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
       noop_flag,
       tensor_lists,
       L2NormFunctor<scalar_t_0>(),
-      output.data<float>(),
-      per_tensor ? output_per_tensor.data<float>() : nullptr,
+      output.DATA_PTR<float>(),
+      per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
       per_tensor,
       max_chunks_per_tensor);)
 
@@ -289,10 +289,10 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
   auto ret = at::empty({1}, output.options());
   auto stream = at::cuda::getCurrentCUDAStream();
   cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
-    output.data<float>(),
-    per_tensor ? output_per_tensor.data<float>() : nullptr,
-    ret.data<float>(),
-    per_tensor ? ret_per_tensor.data<float>() : nullptr,
+    output.DATA_PTR<float>(),
+    per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
+    ret.DATA_PTR<float>(),
+    per_tensor ? ret_per_tensor.DATA_PTR<float>() : nullptr,
     per_tensor,
     max_chunks_per_tensor);
 
@@ -344,8 +344,8 @@ void multi_tensor_norm_out_cuda(
       noop_flag,
       tensor_lists,
       MaxNormFunctor<scalar_t_0>(),
-      output.data<float>(),
-      output_per_tensor.data<float>(),
+      output.DATA_PTR<float>(),
+      output_per_tensor.DATA_PTR<float>(),
       true,
       max_chunks_per_tensor);)
   }
@@ -358,8 +358,8 @@ void multi_tensor_norm_out_cuda(
       noop_flag,
       tensor_lists,
       L2NormFunctor<scalar_t_0>(),
-      output.data<float>(),
-      output_per_tensor.data<float>(),
+      output.DATA_PTR<float>(),
+      output_per_tensor.DATA_PTR<float>(),
       true,
       max_chunks_per_tensor);)
   }
@@ -373,10 +373,10 @@ void multi_tensor_norm_out_cuda(
   auto ret = at::empty({1}, output.options());
   auto stream = at::cuda::getCurrentCUDAStream();
   cleanup_v2<<<ntensors, 512, 0, stream>>>(
-    output.data<float>(),
-    output_per_tensor.data<float>(),
-    ret.data<float>(),
-    out.data<float>(),
+    output.DATA_PTR<float>(),
+    output_per_tensor.DATA_PTR<float>(),
+    ret.DATA_PTR<float>(),
+    out.DATA_PTR<float>(),
     true,
     max_chunks_per_tensor,
     norm_type,
@@ -265,7 +265,7 @@ void multi_tensor_lamb_cuda(
         epsilon,
         (adamMode_t) mode,
         weight_decay,
-        std::get<0>(grad_norm_tuple).data<float>(),
+        std::get<0>(grad_norm_tuple).DATA_PTR<float>(),
         max_grad_norm); )
 
   // Compute update norms
@@ -280,8 +280,8 @@ void multi_tensor_lamb_cuda(
       noop_flag,
       grad_param_list,
      LAMBStage2Functor<scalar_t_0>(),
-      std::get<1>(param_norm_tuple).data<float>(),
-      std::get<1>(update_norm_tuple).data<float>(),
+      std::get<1>(param_norm_tuple).DATA_PTR<float>(),
+      std::get<1>(update_norm_tuple).DATA_PTR<float>(),
       lr); )
 
   AT_CUDA_CHECK(cudaGetLastError());
@@ -136,7 +136,7 @@ void multi_tensor_lamb_stage1_cuda(
         noop_flag,
         tensor_lists,
         LAMBStage1Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
-        per_tensor_decay.data<float>(),
+        per_tensor_decay.DATA_PTR<float>(),
         beta1,
         beta2,
         beta1_correction,
@@ -99,8 +99,8 @@ void multi_tensor_lamb_stage2_cuda(
         noop_flag,
         tensor_lists,
         LAMBStage2Functor<scalar_t_0, scalar_t_1>(),
-        per_tensor_param_norm.data<float>(),
-        per_tensor_update_norm.data<float>(),
+        per_tensor_param_norm.DATA_PTR<float>(),
+        per_tensor_update_norm.DATA_PTR<float>(),
         learning_rate); ))
 
   AT_CUDA_CHECK(cudaGetLastError());
@@ -181,7 +181,7 @@ void multi_tensor_novograd_cuda(
       lr,
       (momentMode_t) moment_mode,
      weight_decay,
-      grad_norms.data<float>()); )
+      grad_norms.DATA_PTR<float>()); )
 
   AT_CUDA_CHECK(cudaGetLastError());
 
@@ -1,4 +1,5 @@
 #include <ATen/ATen.h>
+#include "compat.h"
 
 // Forward/backward compatiblity hack around
 // https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288
csrc/welford.cu
@@ -904,9 +904,9 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_kernel",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     welford_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        out_mean.data<accscalar_t>(),
-        out_var_biased.data<accscalar_t>(),
+        input.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<accscalar_t>(),
+        out_var_biased.DATA_PTR<accscalar_t>(),
         batch_size,
         feature_size,
         space_size);
@@ -943,12 +943,12 @@ at::Tensor batchnorm_forward_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       batchnorm_forward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-          input.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-          shift.has_value() ? shift.value().data<accscalar_t>() : NULL,
-          out.data<scalar_t_0>(),
+          input.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+          shift.has_value() ? shift.value().DATA_PTR<accscalar_t>() : NULL,
+          out.DATA_PTR<scalar_t_0>(),
           space_size,
           batch_size);
     );
@@ -961,12 +961,12 @@ at::Tensor batchnorm_forward_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
      using accscalar_t = at::acc_type<scalar_t_0, true>;
       batchnorm_forward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-          input.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-          shift.has_value() ? shift.value().data<scalar_t_0>() : NULL,
-          out.data<scalar_t_0>(),
+          input.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+          shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>() : NULL,
+          out.DATA_PTR<scalar_t_0>(),
           space_size,
           batch_size);
     );
@@ -1014,14 +1014,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       reduce_bn_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-          input.data<scalar_t_0>(),
-          grad_output.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          mean_dy.data<accscalar_t>(),
-          mean_dy_xmu.data<accscalar_t>(),
-          weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
-          weight.has_value() ? grad_bias.data<accscalar_t>() : NULL,
+          input.DATA_PTR<scalar_t_0>(),
+          grad_output.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          mean_dy.DATA_PTR<accscalar_t>(),
+          mean_dy_xmu.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
+          weight.has_value() ? grad_bias.DATA_PTR<accscalar_t>() : NULL,
           batch_size,
          feature_size,
           space_size);
@@ -1035,14 +1035,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       reduce_bn_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-          input.data<scalar_t_0>(),
-          grad_output.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          mean_dy.data<accscalar_t>(),
-          mean_dy_xmu.data<accscalar_t>(),
-          weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
-          weight.has_value() ? grad_bias.data<scalar_t_0>() : NULL,
+          input.DATA_PTR<scalar_t_0>(),
+          grad_output.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          mean_dy.DATA_PTR<accscalar_t>(),
+          mean_dy_xmu.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
+          weight.has_value() ? grad_bias.DATA_PTR<scalar_t_0>() : NULL,
           batch_size,
           feature_size,
           space_size);
@@ -1083,14 +1083,14 @@ at::Tensor batchnorm_backward_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       batchnorm_backward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
-          grad_output.data<scalar_t_0>(),
-          input.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-          mean_dy.data<accscalar_t>(),
-          mean_dy_xmu.data<accscalar_t>(),
-          grad_input.data<scalar_t_0>(),
+          grad_output.DATA_PTR<scalar_t_0>(),
+          input.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+          mean_dy.DATA_PTR<accscalar_t>(),
+          mean_dy_xmu.DATA_PTR<accscalar_t>(),
+          grad_input.DATA_PTR<scalar_t_0>(),
           space_size,
           batch_size);
     );
@@ -1103,14 +1103,14 @@ at::Tensor batchnorm_backward_CUDA(
     DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
       using accscalar_t = at::acc_type<scalar_t_0, true>;
       batchnorm_backward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
-          grad_output.data<scalar_t_0>(),
-          input.data<scalar_t_0>(),
-          mean.data<accscalar_t>(),
-          inv_std.data<accscalar_t>(),
-          weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-          mean_dy.data<accscalar_t>(),
-          mean_dy_xmu.data<accscalar_t>(),
-          grad_input.data<scalar_t_0>(),
+          grad_output.DATA_PTR<scalar_t_0>(),
+          input.DATA_PTR<scalar_t_0>(),
+          mean.DATA_PTR<accscalar_t>(),
+          inv_std.DATA_PTR<accscalar_t>(),
+          weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+          mean_dy.DATA_PTR<accscalar_t>(),
+          mean_dy_xmu.DATA_PTR<accscalar_t>(),
+          grad_input.DATA_PTR<scalar_t_0>(),
           space_size,
           batch_size);
     );
@@ -1140,11 +1140,11 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
     welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
-        mean_feature_nodes.data<scalar_t_0>(),
-        var_biased.data<scalar_t_0>(),
-        out_mean.data<scalar_t_0>(),
-        out_var.data<scalar_t_0>(),
-        inv_std.data<scalar_t_0>(),
+        mean_feature_nodes.DATA_PTR<scalar_t_0>(),
+        var_biased.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<scalar_t_0>(),
+        out_var.DATA_PTR<scalar_t_0>(),
+        inv_std.DATA_PTR<scalar_t_0>(),
        world_size,
        feature_size,
         eps,
@@ -1182,13 +1182,13 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_c_last",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     welford_kernel_c_last<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        out_mean.data<accscalar_t>(),
-        out_var_biased.data<accscalar_t>(),
+        input.DATA_PTR<scalar_t_0>(),
+        out_mean.DATA_PTR<accscalar_t>(),
+        out_var_biased.DATA_PTR<accscalar_t>(),
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,
@@ -1225,13 +1225,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride,
         fuse_relu);
@@ -1246,13 +1246,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
-        out.data<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride,
         fuse_relu);
@@ -1302,18 +1302,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
      <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
-        weight.has_value() ?grad_bias.data<accscalar_t>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
+        weight.has_value() ?grad_bias.DATA_PTR<accscalar_t>() : NULL,
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,
@@ -1327,18 +1327,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
   using namespace at;
   DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
     using accscalar_t = at::acc_type<scalar_t_0, true>;
-    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
-    int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
+    accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
+    int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
     reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        input.data<scalar_t_0>(),
-        grad_output.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
-        weight.has_value() ?grad_bias.data<scalar_t_0>() : NULL,
+        input.DATA_PTR<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
+        weight.has_value() ?grad_bias.DATA_PTR<scalar_t_0>() : NULL,
         staging_data_ptr,
         semaphores_ptr,
         reduction_size,
@@ -1375,14 +1375,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
     );
@@ -1396,14 +1396,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        mean_dy.data<accscalar_t>(),
-        mean_dy_xmu.data<accscalar_t>(),
-        grad_input.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean_dy.DATA_PTR<accscalar_t>(),
+        mean_dy_xmu.DATA_PTR<accscalar_t>(),
+        grad_input.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
     );
@@ -1439,14 +1439,14 @@ at::Tensor relu_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     relu_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
-        shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
-        out.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
     );
@@ -1460,14 +1460,14 @@ at::Tensor relu_backward_c_last_CUDA(
     using accscalar_t = at::acc_type<scalar_t_0, true>;
     relu_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
       <<<grid, block, 0, stream>>>(
-        grad_output.data<scalar_t_0>(),
-        input.data<scalar_t_0>(),
-        z.has_value() ? z.value().data<scalar_t_0>() : NULL,
-        mean.data<accscalar_t>(),
-        inv_std.data<accscalar_t>(),
-        weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
-        shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
-        out.data<scalar_t_0>(),
+        grad_output.DATA_PTR<scalar_t_0>(),
+        input.DATA_PTR<scalar_t_0>(),
+        z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
+        mean.DATA_PTR<accscalar_t>(),
+        inv_std.DATA_PTR<accscalar_t>(),
+        weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
+        shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
+        out.DATA_PTR<scalar_t_0>(),
         reduction_size,
         stride);
     );
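(For context on the file above: the welford_* kernels parallelize Welford's one-pass mean/variance algorithm, which computes both moments in a single sweep without the catastrophic cancellation of the naive sum-of-squares method. A scalar sketch of the update rule the kernels distribute across threads — illustrative only, not the CUDA implementation:)

    #include <cstddef>

    // One-pass Welford update: after processing n values,
    // mean is exact and m2 / n is the biased variance.
    struct Welford {
      double mean = 0.0, m2 = 0.0;
      std::size_t n = 0;
      void update(double x) {
        ++n;
        double delta = x - mean;
        mean += delta / n;
        m2 += delta * (x - mean);  // uses the already-updated mean
      }
      double var_biased() const { return n ? m2 / n : 0.0; }
    };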
setup.py
@@ -67,6 +67,19 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
                            "https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
                            "You can try commenting out this check (at your own risk).")
 
+# Set up macros for forward/backward compatibility hack around
+# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
+# and
+# https://github.com/NVIDIA/apex/issues/456
+# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
+version_ge_1_1 = []
+if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
+    version_ge_1_1 = ['-DVERSION_GE_1_1']
+version_ge_1_3 = []
+if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
+    version_ge_1_3 = ['-DVERSION_GE_1_3']
+version_dependent_macros = version_ge_1_1 + version_ge_1_3
+
 if "--cuda_ext" in sys.argv:
     from torch.utils.cpp_extension import CUDAExtension
     sys.argv.remove("--cuda_ext")
@@ -76,12 +89,6 @@ if "--cuda_ext" in sys.argv:
     else:
         check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
 
-    # Set up macros for forward/backward compatibility hack around
-    # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-    version_ge_1_1 = []
-    if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-        version_ge_1_1 = ['-DVERSION_GE_1_1']
-
     ext_modules.append(
         CUDAExtension(name='amp_C',
                       sources=['csrc/amp_C_frontend.cpp',
@@ -94,30 +101,33 @@ if "--cuda_ext" in sys.argv:
                                'csrc/multi_tensor_adam.cu',
                                'csrc/multi_tensor_novograd.cu',
                                'csrc/multi_tensor_lamb.cu'],
-                      extra_compile_args={'cxx': ['-O3'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc':['-lineinfo',
                                                   '-O3',
                                                   # '--resource-usage',
-                                                  '--use_fast_math']}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
     ext_modules.append(
         CUDAExtension(name='fused_adam_cuda',
                       sources=['csrc/fused_adam_cuda.cpp',
                                'csrc/fused_adam_cuda_kernel.cu'],
-                      extra_compile_args={'cxx': ['-O3',],
+                      extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
                                           'nvcc':['-O3',
-                                                  '--use_fast_math']}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
     ext_modules.append(
         CUDAExtension(name='syncbn',
                       sources=['csrc/syncbn.cpp',
-                               'csrc/welford.cu']))
+                               'csrc/welford.cu'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc':['-O3'] + version_dependent_macros}))
 
     ext_modules.append(
         CUDAExtension(name='fused_layer_norm_cuda',
                       sources=['csrc/layer_norm_cuda.cpp',
                                'csrc/layer_norm_cuda_kernel.cu'],
-                      extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
                                           'nvcc':['-maxrregcount=50',
                                                   '-O3',
-                                                  '--use_fast_math'] + version_ge_1_1}))
+                                                  '--use_fast_math'] + version_dependent_macros}))
 
 if "--bnp" in sys.argv:
     from torch.utils.cpp_extension import CUDAExtension
@@ -129,24 +139,20 @@ if "--bnp" in sys.argv:
     if torch.utils.cpp_extension.CUDA_HOME is None:
         raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
     else:
-        # Set up macros for forward/backward compatibility hack around
-        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-        version_ge_1_1 = []
-        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-            version_ge_1_1 = ['-DVERSION_GE_1_1']
         ext_modules.append(
             CUDAExtension(name='bnp',
                           sources=['apex/contrib/csrc/groupbn/batch_norm.cu',
                                    'apex/contrib/csrc/groupbn/ipc.cu',
                                    'apex/contrib/csrc/groupbn/interface.cpp',
                                    'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'],
-                          extra_compile_args={'cxx': [] + version_ge_1_1,
+                          include_dirs=['csrc'],
+                          extra_compile_args={'cxx': [] + version_dependent_macros,
                                               'nvcc':['-DCUDA_HAS_FP16=1',
                                                       '-D__CUDA_NO_HALF_OPERATORS__',
                                                       '-D__CUDA_NO_HALF_CONVERSIONS__',
                                                       '-D__CUDA_NO_HALF2_OPERATORS__',
                                                       '-gencode',
-                                                      'arch=compute_70,code=sm_70'] + version_ge_1_1}))
+                                                      'arch=compute_70,code=sm_70'] + version_dependent_macros}))
 
 if "--xentropy" in sys.argv:
     from torch.utils.cpp_extension import CUDAExtension
@@ -158,18 +164,13 @@ if "--xentropy" in sys.argv:
     if torch.utils.cpp_extension.CUDA_HOME is None:
         raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
     else:
-        # Set up macros for forward/backward compatibility hack around
-        # https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
-        version_ge_1_1 = []
-        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
-            version_ge_1_1 = ['-DVERSION_GE_1_1']
         ext_modules.append(
             CUDAExtension(name='xentropy_cuda',
                           sources=['apex/contrib/csrc/xentropy/interface.cpp',
                                    'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
                           include_dirs=['csrc'],
-                          extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
-                                              'nvcc':['-O3'] + version_ge_1_1}))
+                          extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                              'nvcc':['-O3'] + version_dependent_macros}))
 
 setup(
     name='apex',
@@ -13,7 +13,7 @@ print_red() {
 }
 
 images=(
-    "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.07-py3-devel"
+    "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.08-py3-devel"
     "gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
     "pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
     "pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"