* Pushing for build tests

* Contrib files

* Removing deprecated checks
mcarilli 2019-09-05 22:33:27 -07:00 committed by GitHub
Parent 1bf0d8d4ba
Commit 325f5a0bec
No known key found for this signature
GPG Key ID: 4AEE18F83AFDEB23
17 changed files with 281 additions and 266 deletions

View File

@ -8,6 +8,8 @@
#include <cuda.h>
#include "compat.h"
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
@ -72,7 +74,7 @@ at::Tensor nhwc_bn_fwd_train(
const int C = x.size(3);
// generating new magic number and use that for sync
int* magic = magic_tensor.data<int>();
int* magic = magic_tensor.DATA_PTR<int>();
*magic = (*magic + 1) & 0xff;
// Allocate output tensor
@ -87,13 +89,13 @@ at::Tensor nhwc_bn_fwd_train(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
nullptr,
y.data<at::Half>(),
y.DATA_PTR<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -114,12 +116,12 @@ at::Tensor nhwc_bn_fwd_train(
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(minibatch_mean.DATA_PTR<float>());
workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
@ -165,13 +167,13 @@ at::Tensor nhwc_bn_fwd_eval(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
nullptr,
y.data<at::Half>(),
y.DATA_PTR<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -197,7 +199,7 @@ at::Tensor nhwc_bn_fwd_eval(
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
@ -244,7 +246,7 @@ std::vector<at::Tensor> nhwc_bn_bwd(
const int C = x.size(3);
// generating new magic number and use that for sync
int* magic = magic_tensor.data<int>();
int* magic = magic_tensor.DATA_PTR<int>();
*magic = (*magic + 1) & 0xff;
// outputs
@ -264,13 +266,13 @@ std::vector<at::Tensor> nhwc_bn_bwd(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
x_grad.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
x_grad.DATA_PTR<at::Half>(),
nullptr,
dy.data<at::Half>());
dy.DATA_PTR<at::Half>());
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -291,12 +293,12 @@ std::vector<at::Tensor> nhwc_bn_bwd(
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(minibatch_mean.DATA_PTR<float>());
workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[2];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);

View File

@ -8,6 +8,8 @@
#include <cuda.h>
#include "compat.h"
//FIXME move the common stuff to common h file
#define cudaCheckErrors(msg) \
do { \
@ -74,7 +76,7 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
const int C = x.size(3);
// generating new magic number and use that for sync
int* magic = magic_tensor.data<int>();
int* magic = magic_tensor.DATA_PTR<int>();
*magic = (*magic + 1) & 0xff;
// Allocate output tensor
@ -89,15 +91,15 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
nullptr,
y.data<at::Half>(),
y.DATA_PTR<at::Half>(),
nullptr,
z.data<at::Half>(),
z.DATA_PTR<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -118,13 +120,13 @@ at::Tensor nhwc_bn_addrelu_fwd_train(
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(bitmask.data<int32_t>());
workspace.push_back(minibatch_mean.DATA_PTR<float>());
workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
workspace.push_back(bitmask.DATA_PTR<int32_t>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
@ -171,15 +173,15 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
nullptr,
y.data<at::Half>(),
y.DATA_PTR<at::Half>(),
nullptr,
z.data<at::Half>(),
z.DATA_PTR<at::Half>(),
nullptr);
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {nullptr, nullptr});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -206,7 +208,7 @@ at::Tensor nhwc_bn_addrelu_fwd_eval(
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);
@ -253,7 +255,7 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
const int C = x.size(3);
// generating new magic number and use that for sync
int* magic = magic_tensor.data<int>();
int* magic = magic_tensor.DATA_PTR<int>();
*magic = (*magic + 1) & 0xff;
// outputs
@ -274,15 +276,15 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
bn->setConstants(momentum, epsilon);
// set pointers within the wrapper
bn->setInputOutputPointers(x.data<at::Half>(),
x_grad.data<at::Half>(),
bn->setInputOutputPointers(x.DATA_PTR<at::Half>(),
x_grad.DATA_PTR<at::Half>(),
nullptr,
dy.data<at::Half>(),
dy.DATA_PTR<at::Half>(),
nullptr,
z_grad.data<at::Half>());
z_grad.DATA_PTR<at::Half>());
bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {scale_grad.data<float>(), bias_grad.data<float>()});
bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});
bn->setWeightPointers({scale.DATA_PTR<float>(), bias.DATA_PTR<float>()}, {scale_grad.DATA_PTR<float>(), bias_grad.DATA_PTR<float>()});
bn->setParameterPointers({running_mean.DATA_PTR<float>(), running_inv_var.DATA_PTR<float>()});
// deal with workspace(s)
auto workspace_bytes = bn->numWorkspaceBytes();
@ -303,13 +305,13 @@ std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
Workspace ws(total_workspace_bytes);
std::vector<void *> workspace;
workspace.push_back(minibatch_mean.data<float>());
workspace.push_back(minibatch_inv_var.data<float>());
workspace.push_back(bitmask.data<int32_t>());
workspace.push_back(minibatch_mean.DATA_PTR<float>());
workspace.push_back(minibatch_inv_var.DATA_PTR<float>());
workspace.push_back(bitmask.DATA_PTR<int32_t>());
auto stream = at::cuda::getCurrentCUDAStream().stream();
const int retired_cta_bytes = workspace_bytes[3];
void* retired_ctas = ret_cta.data<uint8_t>();
void* retired_ctas = ret_cta.DATA_PTR<uint8_t>();
assert(ret_cta.size(0)>=retired_cta_bytes);
workspace.push_back(retired_ctas);

View File

@ -6,6 +6,8 @@
#include <cuda.h>
#include "compat.h"
#define cudaCheckErrors(msg) \
do { \
@ -114,17 +116,17 @@ int64_t get_buffer_size(const int bn_sync_steps) {
void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
cudaIpcMemHandle_t my_handle;
memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
return ipc_mem_registry.getPtr(my_handle, offset);
}
void close_remote_data(const at::Tensor& handle) {
cudaIpcMemHandle_t my_handle;
memcpy((unsigned char *)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
memcpy((unsigned char *)(&my_handle), handle.DATA_PTR<uint8_t>(), sizeof(my_handle));
ipc_mem_registry.releasePtr(my_handle);
}
void* get_data_ptr(
const at::Tensor& data) {
return data.data<uint8_t>();
return data.DATA_PTR<uint8_t>();
}

View File

@ -82,6 +82,7 @@
#include <THC/THCThrustAllocator.cuh>
#include "type_shim.h"
#include "compat.h"
using Tensor = at::Tensor;
using TensorList = at::TensorList;
@ -492,7 +493,7 @@ std::vector<Tensor> host_softmax_xentropy(
inner_size *= input.size(i);
// This kernel spawns a block per each element in the batch.
// XXX: it assumes that inner_size == 1
AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size);
@ -504,15 +505,15 @@ std::vector<Tensor> host_softmax_xentropy(
if (!half_to_float) {
cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
losses.data<accscalar_t>(), max_log_sum_exp.data<scalar_t_0>(),
input.data<scalar_t_0>(), labels_.data<int64_t>(),
losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
dim_size, smoothing
);
} else {
cunn_SoftMaxXEntropyForward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, 2 * block.x * sizeof(accscalar_t), stream>>>(
losses.data<accscalar_t>(), max_log_sum_exp.data<accscalar_t>(),
input.data<scalar_t_0>(), labels_.data<int64_t>(),
losses.DATA_PTR<accscalar_t>(), max_log_sum_exp.DATA_PTR<accscalar_t>(),
input.DATA_PTR<scalar_t_0>(), labels_.DATA_PTR<int64_t>(),
dim_size, smoothing
);
}
@ -561,7 +562,7 @@ Tensor host_softmax_xentropy_backward(
inner_size *= logits.size(i);
// See descriptions of kernels above.
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
AT_CHECK(inner_size == 1, "Currently only inner size 1 supported");
TORCH_CHECK(inner_size == 1, "Currently only inner size 1 supported");
const int ILP = 2;
dim3 grid(outer_size);
@ -572,17 +573,17 @@ Tensor host_softmax_xentropy_backward(
if (!half_to_float) {
cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, scalar_t_0, Epilogue>
<<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
max_log_sum_exp.data<scalar_t_0>(),
grad.data<scalar_t_0>(), labels.data<int64_t>(),
gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
max_log_sum_exp.DATA_PTR<scalar_t_0>(),
grad.DATA_PTR<scalar_t_0>(), labels.DATA_PTR<int64_t>(),
smoothing, dim_size
);
} else {
cunn_SoftMaxXEntropyBackward<ILP, scalar_t_0, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, block.x * sizeof(accscalar_t), stream>>>(
gI.data<scalar_t_0>(), logits.data<scalar_t_0>(),
max_log_sum_exp.data<accscalar_t>(),
grad.data<accscalar_t>(), labels.data<int64_t>(),
gI.DATA_PTR<scalar_t_0>(), logits.DATA_PTR<scalar_t_0>(),
max_log_sum_exp.DATA_PTR<accscalar_t>(),
grad.DATA_PTR<accscalar_t>(), labels.DATA_PTR<int64_t>(),
smoothing, dim_size
);
}

View File

@ -1,3 +1,9 @@
#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif
#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
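For context, a minimal sketch (not part of the commit) of how this new compat.h shim is consumed by the CUDA/C++ sources touched above. The helper name fill_ones and the CPU-only loop are hypothetical; the macro behavior assumes VERSION_GE_1_3 is defined by setup.py when building against PyTorch 1.3 or newer.
#include <ATen/ATen.h>
#include "compat.h"   // defines TORCH_CHECK (fallback to AT_CHECK) and DATA_PTR
// Hypothetical helper illustrating the version-agnostic accessor pattern
// used throughout this commit; assumes a contiguous CPU float tensor.
void fill_ones(at::Tensor t) {
  TORCH_CHECK(t.scalar_type() == at::kFloat, "expected a float tensor");
  float* p = t.DATA_PTR<float>();   // data_ptr<float>() on >= 1.3, data<float>() otherwise
  for (int64_t i = 0; i < t.numel(); ++i) {
    p[i] = 1.0f;
  }
}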

View File

@ -190,11 +190,11 @@ void fused_adam_cuda(
DISPATCH_FLOAT_AND_HALF(g.scalar_type(), 0, "adam_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
adam_cuda_kernel<accscalar_t, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data<accscalar_t>(),
p_copy.numel() ? p_copy.data<scalar_t_0>() : NULL,
m.data<accscalar_t>(),
v.data<accscalar_t>(),
g.data<scalar_t_0>(),
p.DATA_PTR<accscalar_t>(),
p_copy.numel() ? p_copy.DATA_PTR<scalar_t_0>() : NULL,
m.DATA_PTR<accscalar_t>(),
v.DATA_PTR<accscalar_t>(),
g.DATA_PTR<scalar_t_0>(),
beta1,
beta2,
eps,
@ -208,11 +208,11 @@ void fused_adam_cuda(
using namespace at;
DISPATCH_DOUBLE_AND_FLOAT(g.scalar_type(), 0, "adam_cuda_kernel",
adam_cuda_kernel<scalar_t_0, scalar_t_0><<<blocks,threadsPerBlock, 0, stream>>>(
p.data<scalar_t_0>(),
p.DATA_PTR<scalar_t_0>(),
NULL, //don't output p_copy for fp32, it's wasted write
m.data<scalar_t_0>(),
v.data<scalar_t_0>(),
g.data<scalar_t_0>(),
m.DATA_PTR<scalar_t_0>(),
v.DATA_PTR<scalar_t_0>(),
g.DATA_PTR<scalar_t_0>(),
beta1,
beta2,
eps,

View File

@ -687,14 +687,14 @@ void cuda_layer_norm(
DISPATCH_DOUBLE_FLOAT_AND_HALF(input->scalar_type(), 0, "layer_norm_cuda_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
HostApplyLayerNorm(
output->data<scalar_t_0>(),
mean->data<accscalar_t>(),
invvar->data<accscalar_t>(),
input->data<scalar_t_0>(),
output->DATA_PTR<scalar_t_0>(),
mean->DATA_PTR<accscalar_t>(),
invvar->DATA_PTR<accscalar_t>(),
input->DATA_PTR<scalar_t_0>(),
n1,n2,
epsilon,
gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
beta != NULL ? beta->data<scalar_t_0>() : NULL);
gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
beta != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL);
)
}
@ -728,20 +728,20 @@ void HostLayerNormGradient(
at::Tensor part_grad_beta = at::empty_like(part_grad_gamma);
cuComputePartGradGammaBeta<<<blocks2, threads2, nshared2, stream>>>(
dout,
input->data<T>(),
input->DATA_PTR<T>(),
n1,n2,
mean,
invvar,
U(epsilon),
part_grad_gamma.data<U>(),
part_grad_beta.data<U>());
part_grad_gamma.DATA_PTR<U>(),
part_grad_beta.DATA_PTR<U>());
const dim3 threads3(32,8,1);
const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1);
const int nshared3 = threads3.x * threads3.y * sizeof(U);
cuComputeGradGammaBeta<<<blocks3, threads3, nshared3, stream>>>(
part_grad_gamma.data<U>(),
part_grad_beta.data<U>(),
part_grad_gamma.DATA_PTR<U>(),
part_grad_beta.DATA_PTR<U>(),
part_size,
n1,n2,
grad_gamma,
@ -758,7 +758,7 @@ void HostLayerNormGradient(
0;
cuComputeGradInput<<<blocks1, threads1, nshared, stream>>>(
dout,
input->data<T>(),
input->DATA_PTR<T>(),
n1,n2,
mean,
invvar,
@ -790,18 +790,18 @@ void cuda_layer_norm_gradient(
DISPATCH_FLOAT_AND_HALF(input->scalar_type(), 0, "cuComputeGradInput",
using accscalar_t = at::acc_type<scalar_t_0, true>;
HostLayerNormGradient(
dout->data<scalar_t_0>(),
mean->data<accscalar_t>(),
invvar->data<accscalar_t>(),
dout->DATA_PTR<scalar_t_0>(),
mean->DATA_PTR<accscalar_t>(),
invvar->DATA_PTR<accscalar_t>(),
input,
n1,n2,
// TMJ pass NULL argument for gamma, beta, grad_gamma and grad_beta
// if gamma Tensor is NULL on input.
gamma != NULL ? gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? beta->data<scalar_t_0>() : NULL,
gamma != NULL ? gamma->DATA_PTR<scalar_t_0>() : NULL,
gamma != NULL ? beta->DATA_PTR<scalar_t_0>() : NULL,
epsilon,
grad_input->data<scalar_t_0>(),
gamma != NULL ? grad_gamma->data<scalar_t_0>() : NULL,
gamma != NULL ? grad_beta->data<scalar_t_0>() : NULL);
grad_input->DATA_PTR<scalar_t_0>(),
gamma != NULL ? grad_gamma->DATA_PTR<scalar_t_0>() : NULL,
gamma != NULL ? grad_beta->DATA_PTR<scalar_t_0>() : NULL);
)
}

View File

@ -96,7 +96,7 @@ void multi_tensor_apply(
// using accscalar_t = acc_type<scalar_t, true>;
multi_tensor_apply_kernel<<<loc_block_info, block_size, 0, stream>>>(
chunk_size,
noop_flag.data<int>(),
noop_flag.DATA_PTR<int>(),
tl,
callable,
args...);

View File

@ -274,8 +274,8 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>(),
per_tensor ? output_per_tensor.data<float>() : nullptr,
output.DATA_PTR<float>(),
per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
per_tensor,
max_chunks_per_tensor);)
@ -289,10 +289,10 @@ std::tuple<at::Tensor, at::Tensor> multi_tensor_l2norm_cuda(
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup<<<per_tensor ? ntensors : 1, 512, 0, stream>>>(
output.data<float>(),
per_tensor ? output_per_tensor.data<float>() : nullptr,
ret.data<float>(),
per_tensor ? ret_per_tensor.data<float>() : nullptr,
output.DATA_PTR<float>(),
per_tensor ? output_per_tensor.DATA_PTR<float>() : nullptr,
ret.DATA_PTR<float>(),
per_tensor ? ret_per_tensor.DATA_PTR<float>() : nullptr,
per_tensor,
max_chunks_per_tensor);
@ -344,8 +344,8 @@ void multi_tensor_norm_out_cuda(
noop_flag,
tensor_lists,
MaxNormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
output.DATA_PTR<float>(),
output_per_tensor.DATA_PTR<float>(),
true,
max_chunks_per_tensor);)
}
@ -358,8 +358,8 @@ void multi_tensor_norm_out_cuda(
noop_flag,
tensor_lists,
L2NormFunctor<scalar_t_0>(),
output.data<float>(),
output_per_tensor.data<float>(),
output.DATA_PTR<float>(),
output_per_tensor.DATA_PTR<float>(),
true,
max_chunks_per_tensor);)
}
@ -373,10 +373,10 @@ void multi_tensor_norm_out_cuda(
auto ret = at::empty({1}, output.options());
auto stream = at::cuda::getCurrentCUDAStream();
cleanup_v2<<<ntensors, 512, 0, stream>>>(
output.data<float>(),
output_per_tensor.data<float>(),
ret.data<float>(),
out.data<float>(),
output.DATA_PTR<float>(),
output_per_tensor.DATA_PTR<float>(),
ret.DATA_PTR<float>(),
out.DATA_PTR<float>(),
true,
max_chunks_per_tensor,
norm_type,

View File

@ -265,7 +265,7 @@ void multi_tensor_lamb_cuda(
epsilon,
(adamMode_t) mode,
weight_decay,
std::get<0>(grad_norm_tuple).data<float>(),
std::get<0>(grad_norm_tuple).DATA_PTR<float>(),
max_grad_norm); )
// Compute update norms
@ -280,8 +280,8 @@ void multi_tensor_lamb_cuda(
noop_flag,
grad_param_list,
LAMBStage2Functor<scalar_t_0>(),
std::get<1>(param_norm_tuple).data<float>(),
std::get<1>(update_norm_tuple).data<float>(),
std::get<1>(param_norm_tuple).DATA_PTR<float>(),
std::get<1>(update_norm_tuple).DATA_PTR<float>(),
lr); )
AT_CUDA_CHECK(cudaGetLastError());

View File

@ -136,7 +136,7 @@ void multi_tensor_lamb_stage1_cuda(
noop_flag,
tensor_lists,
LAMBStage1Functor<scalar_t_0, scalar_t_1, scalar_t_2>(),
per_tensor_decay.data<float>(),
per_tensor_decay.DATA_PTR<float>(),
beta1,
beta2,
beta1_correction,

View File

@ -99,8 +99,8 @@ void multi_tensor_lamb_stage2_cuda(
noop_flag,
tensor_lists,
LAMBStage2Functor<scalar_t_0, scalar_t_1>(),
per_tensor_param_norm.data<float>(),
per_tensor_update_norm.data<float>(),
per_tensor_param_norm.DATA_PTR<float>(),
per_tensor_update_norm.DATA_PTR<float>(),
learning_rate); ))
AT_CUDA_CHECK(cudaGetLastError());

View File

@ -181,7 +181,7 @@ void multi_tensor_novograd_cuda(
lr,
(momentMode_t) moment_mode,
weight_decay,
grad_norms.data<float>()); )
grad_norms.DATA_PTR<float>()); )
AT_CUDA_CHECK(cudaGetLastError());

View File

@ -1,4 +1,5 @@
#include <ATen/ATen.h>
#include "compat.h"
// Forward/backward compatibility hack around
// https://github.com/pytorch/pytorch/commit/3aeb78079bcd68282fe9117088e138b77318e288

View File

@ -904,9 +904,9 @@ std::vector<at::Tensor> welford_mean_var_CUDA(const at::Tensor input) {
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_kernel",
using accscalar_t = at::acc_type<scalar_t_0, true>;
welford_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
out_mean.data<accscalar_t>(),
out_var_biased.data<accscalar_t>(),
input.DATA_PTR<scalar_t_0>(),
out_mean.DATA_PTR<accscalar_t>(),
out_var_biased.DATA_PTR<accscalar_t>(),
batch_size,
feature_size,
space_size);
@ -943,12 +943,12 @@ at::Tensor batchnorm_forward_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_forward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
shift.has_value() ? shift.value().data<accscalar_t>() : NULL,
out.data<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<accscalar_t>() : NULL,
out.DATA_PTR<scalar_t_0>(),
space_size,
batch_size);
);
@ -961,12 +961,12 @@ at::Tensor batchnorm_forward_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_forward",
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_forward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().data<scalar_t_0>() : NULL,
out.data<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>() : NULL,
out.DATA_PTR<scalar_t_0>(),
space_size,
batch_size);
);
@ -1014,14 +1014,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
using accscalar_t = at::acc_type<scalar_t_0, true>;
reduce_bn_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
grad_output.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
weight.has_value() ? grad_bias.data<accscalar_t>() : NULL,
input.DATA_PTR<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
weight.has_value() ? grad_bias.DATA_PTR<accscalar_t>() : NULL,
batch_size,
feature_size,
space_size);
@ -1035,14 +1035,14 @@ std::vector<at::Tensor> reduce_bn_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
using accscalar_t = at::acc_type<scalar_t_0, true>;
reduce_bn_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
grad_output.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
weight.has_value() ? grad_bias.data<scalar_t_0>() : NULL,
input.DATA_PTR<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
weight.has_value() ? grad_bias.DATA_PTR<scalar_t_0>() : NULL,
batch_size,
feature_size,
space_size);
@ -1083,14 +1083,14 @@ at::Tensor batchnorm_backward_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_backward_kernel<scalar_t_0, accscalar_t, accscalar_t><<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
grad_input.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
grad_input.DATA_PTR<scalar_t_0>(),
space_size,
batch_size);
);
@ -1103,14 +1103,14 @@ at::Tensor batchnorm_backward_CUDA(
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward",
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_backward_kernel<scalar_t_0, accscalar_t, scalar_t_0><<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
grad_input.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
grad_input.DATA_PTR<scalar_t_0>(),
space_size,
batch_size);
);
@ -1140,11 +1140,11 @@ std::vector<at::Tensor> welford_parallel_CUDA(const at::Tensor mean_feature_node
using namespace at;
DISPATCH_FLOAT_AND_HALF(mean_feature_nodes.scalar_type(), 0, "welford_parallel_kernel",
welford_kernel_parallel<scalar_t_0><<<grid, block, 0, stream>>>(
mean_feature_nodes.data<scalar_t_0>(),
var_biased.data<scalar_t_0>(),
out_mean.data<scalar_t_0>(),
out_var.data<scalar_t_0>(),
inv_std.data<scalar_t_0>(),
mean_feature_nodes.DATA_PTR<scalar_t_0>(),
var_biased.DATA_PTR<scalar_t_0>(),
out_mean.DATA_PTR<scalar_t_0>(),
out_var.DATA_PTR<scalar_t_0>(),
inv_std.DATA_PTR<scalar_t_0>(),
world_size,
feature_size,
eps,
@ -1182,13 +1182,13 @@ std::vector<at::Tensor> welford_mean_var_c_last_CUDA(const at::Tensor input) {
using namespace at;
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "welford_mean_var_c_last",
using accscalar_t = at::acc_type<scalar_t_0, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
welford_kernel_c_last<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
out_mean.data<accscalar_t>(),
out_var_biased.data<accscalar_t>(),
input.DATA_PTR<scalar_t_0>(),
out_mean.DATA_PTR<accscalar_t>(),
out_var_biased.DATA_PTR<accscalar_t>(),
staging_data_ptr,
semaphores_ptr,
reduction_size,
@ -1225,13 +1225,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
z.has_value() ? z.value().data<scalar_t_0>() : NULL,
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
out.data<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
out.DATA_PTR<scalar_t_0>(),
reduction_size,
stride,
fuse_relu);
@ -1246,13 +1246,13 @@ at::Tensor batchnorm_forward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_forward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
z.has_value() ? z.value().data<scalar_t_0>() : NULL,
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
out.data<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
out.DATA_PTR<scalar_t_0>(),
reduction_size,
stride,
fuse_relu);
@ -1302,18 +1302,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
using namespace at;
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
using accscalar_t = at::acc_type<scalar_t_0, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
grad_output.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
weight.has_value() ? grad_weight.data<accscalar_t>() : NULL,
weight.has_value() ?grad_bias.data<accscalar_t>() : NULL,
input.DATA_PTR<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
weight.has_value() ? grad_weight.DATA_PTR<accscalar_t>() : NULL,
weight.has_value() ?grad_bias.DATA_PTR<accscalar_t>() : NULL,
staging_data_ptr,
semaphores_ptr,
reduction_size,
@ -1327,18 +1327,18 @@ std::vector<at::Tensor> reduce_bn_c_last_CUDA(
using namespace at;
DISPATCH_FLOAT_AND_HALF(input.scalar_type(), 0, "batchnorm_backward_reduce",
using accscalar_t = at::acc_type<scalar_t_0, true>;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.data<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.data<int>() : nullptr;
accscalar_t* staging_data_ptr = grid.y > 1 ? staging_data.DATA_PTR<accscalar_t>() : nullptr;
int* semaphores_ptr = grid.y > 1 ? semaphores.DATA_PTR<int>() : nullptr;
reduce_bn_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
input.data<scalar_t_0>(),
grad_output.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
weight.has_value() ? grad_weight.data<scalar_t_0>() : NULL,
weight.has_value() ?grad_bias.data<scalar_t_0>() : NULL,
input.DATA_PTR<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
weight.has_value() ? grad_weight.DATA_PTR<scalar_t_0>() : NULL,
weight.has_value() ?grad_bias.DATA_PTR<scalar_t_0>() : NULL,
staging_data_ptr,
semaphores_ptr,
reduction_size,
@ -1375,14 +1375,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
grad_input.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
grad_input.DATA_PTR<scalar_t_0>(),
reduction_size,
stride);
);
@ -1396,14 +1396,14 @@ at::Tensor batchnorm_backward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
batchnorm_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
mean_dy.data<accscalar_t>(),
mean_dy_xmu.data<accscalar_t>(),
grad_input.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
mean_dy.DATA_PTR<accscalar_t>(),
mean_dy_xmu.DATA_PTR<accscalar_t>(),
grad_input.DATA_PTR<scalar_t_0>(),
reduction_size,
stride);
);
@ -1439,14 +1439,14 @@ at::Tensor relu_backward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
relu_backward_c_last_kernel<scalar_t_0, accscalar_t, accscalar_t, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
z.has_value() ? z.value().data<scalar_t_0>() : NULL,
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<accscalar_t>() : NULL,
shift.has_value() ? shift.value().data<accscalar_t>(): NULL,
out.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<accscalar_t>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<accscalar_t>(): NULL,
out.DATA_PTR<scalar_t_0>(),
reduction_size,
stride);
);
@ -1460,14 +1460,14 @@ at::Tensor relu_backward_c_last_CUDA(
using accscalar_t = at::acc_type<scalar_t_0, true>;
relu_backward_c_last_kernel<scalar_t_0, accscalar_t, scalar_t_0, ELEMENTS_PER_ITER>
<<<grid, block, 0, stream>>>(
grad_output.data<scalar_t_0>(),
input.data<scalar_t_0>(),
z.has_value() ? z.value().data<scalar_t_0>() : NULL,
mean.data<accscalar_t>(),
inv_std.data<accscalar_t>(),
weight.has_value() ? weight.value().data<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().data<scalar_t_0>(): NULL,
out.data<scalar_t_0>(),
grad_output.DATA_PTR<scalar_t_0>(),
input.DATA_PTR<scalar_t_0>(),
z.has_value() ? z.value().DATA_PTR<scalar_t_0>() : NULL,
mean.DATA_PTR<accscalar_t>(),
inv_std.DATA_PTR<accscalar_t>(),
weight.has_value() ? weight.value().DATA_PTR<scalar_t_0>() : NULL,
shift.has_value() ? shift.value().DATA_PTR<scalar_t_0>(): NULL,
out.DATA_PTR<scalar_t_0>(),
reduction_size,
stride);
);

View File

@ -67,6 +67,19 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
"You can try commenting out this check (at your own risk).")
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
# and
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
version_ge_1_3 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
version_ge_1_3 = ['-DVERSION_GE_1_3']
version_dependent_macros = version_ge_1_1 + version_ge_1_3
if "--cuda_ext" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
sys.argv.remove("--cuda_ext")
@ -76,12 +89,6 @@ if "--cuda_ext" in sys.argv:
else:
check_cuda_torch_binary_vs_bare_metal(torch.utils.cpp_extension.CUDA_HOME)
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
ext_modules.append(
CUDAExtension(name='amp_C',
sources=['csrc/amp_C_frontend.cpp',
@ -94,30 +101,33 @@ if "--cuda_ext" in sys.argv:
'csrc/multi_tensor_adam.cu',
'csrc/multi_tensor_novograd.cu',
'csrc/multi_tensor_lamb.cu'],
extra_compile_args={'cxx': ['-O3'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-lineinfo',
'-O3',
# '--resource-usage',
'--use_fast_math']}))
'--use_fast_math'] + version_dependent_macros}))
ext_modules.append(
CUDAExtension(name='fused_adam_cuda',
sources=['csrc/fused_adam_cuda.cpp',
'csrc/fused_adam_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3',],
extra_compile_args={'cxx': ['-O3',] + version_dependent_macros,
'nvcc':['-O3',
'--use_fast_math']}))
'--use_fast_math'] + version_dependent_macros}))
ext_modules.append(
CUDAExtension(name='syncbn',
sources=['csrc/syncbn.cpp',
'csrc/welford.cu']))
'csrc/welford.cu'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-O3'] + version_dependent_macros}))
ext_modules.append(
CUDAExtension(name='fused_layer_norm_cuda',
sources=['csrc/layer_norm_cuda.cpp',
'csrc/layer_norm_cuda_kernel.cu'],
extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-maxrregcount=50',
'-O3',
'--use_fast_math'] + version_ge_1_1}))
'--use_fast_math'] + version_dependent_macros}))
if "--bnp" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
@ -129,24 +139,20 @@ if "--bnp" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--bnp was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
ext_modules.append(
CUDAExtension(name='bnp',
sources=['apex/contrib/csrc/groupbn/batch_norm.cu',
'apex/contrib/csrc/groupbn/ipc.cu',
'apex/contrib/csrc/groupbn/interface.cpp',
'apex/contrib/csrc/groupbn/batch_norm_add_relu.cu'],
extra_compile_args={'cxx': [] + version_ge_1_1,
include_dirs=['csrc'],
extra_compile_args={'cxx': [] + version_dependent_macros,
'nvcc':['-DCUDA_HAS_FP16=1',
'-D__CUDA_NO_HALF_OPERATORS__',
'-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
'-gencode',
'arch=compute_70,code=sm_70'] + version_ge_1_1}))
'arch=compute_70,code=sm_70'] + version_dependent_macros}))
if "--xentropy" in sys.argv:
from torch.utils.cpp_extension import CUDAExtension
@ -158,18 +164,13 @@ if "--xentropy" in sys.argv:
if torch.utils.cpp_extension.CUDA_HOME is None:
raise RuntimeError("--xentropy was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
ext_modules.append(
CUDAExtension(name='xentropy_cuda',
sources=['apex/contrib/csrc/xentropy/interface.cpp',
'apex/contrib/csrc/xentropy/xentropy_kernel.cu'],
include_dirs=['csrc'],
extra_compile_args={'cxx': ['-O3'] + version_ge_1_1,
'nvcc':['-O3'] + version_ge_1_1}))
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc':['-O3'] + version_dependent_macros}))
setup(
name='apex',
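A hedged sketch of what the new version_dependent_macros flags do once they reach the compiler (illustrative only; the raw() helper below is hypothetical): with -DVERSION_GE_1_3 on the command line, the compat.h shim added in this commit resolves DATA_PTR to the new data_ptr accessor; without it, the deprecated data accessor is kept.
#include <ATen/ATen.h>
#include "compat.h"   // DATA_PTR chosen according to -DVERSION_GE_1_3
// Hypothetical illustration of the flag's compile-time effect.
float* raw(at::Tensor& t) {
#ifdef VERSION_GE_1_3
  return t.data_ptr<float>();   // PyTorch >= 1.3: what DATA_PTR expands to
#else
  return t.data<float>();       // older PyTorch: deprecated accessor
#endif
}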

View File

@ -13,7 +13,7 @@ print_red() {
}
images=(
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.07-py3-devel"
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:19.08-py3-devel"
"gitlab-master.nvidia.com:5005/dl/dgx/pytorch:master-py3-devel"
"pytorch/pytorch:nightly-devel-cuda10.0-cudnn7"
"pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel"