зеркало из https://github.com/microsoft/DeepSpeed.git
CPU-Adam: Fix compile issue (#1537)
* Fix the softmax masking when using triangular masking
* Move the TILE declaration outside of the SIMD loop
* Remove unrelated changes
* Fix Adagrad compile issue
This commit is contained in:
Родитель
f0122007df
Коммит
af443f63f4
|
@ -32,6 +32,7 @@ public:
|
|||
cudaFreeHost(_doubled_buffer[0]);
|
||||
cudaFreeHost(_doubled_buffer[1]);
|
||||
}
|
||||
#if defined(__AVX512__) or defined(__AVX256__)
|
||||
template <int span>
|
||||
void Step_AVX(size_t* rounded_size,
|
||||
float* _params,
|
||||
|
@ -40,11 +41,10 @@ public:
|
|||
size_t param_size,
|
||||
__half* dev_param = nullptr,
|
||||
bool half_precision = false);
|
||||
#if defined(__AVX512__) or defined(__AVX256__)
|
||||
#endif
|
||||
STEP(1)
|
||||
STEP(4)
|
||||
STEP(8)
|
||||
#endif
|
||||
inline void SynchronizeStreams()
|
||||
{
|
||||
for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
|
||||
|
|
|
@ -5,12 +5,11 @@
|
|||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#define TILE (128 * 1024 * 1024)
|
||||
#if defined(__AVX512__) or defined(__AVX256__)
|
||||
|
||||
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
|
||||
|
||||
#define TILE (128 * 1024 * 1024)
|
||||
|
||||
#if defined(__AVX512__)
|
||||
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
|
||||
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
|
||||
|
|
Загрузка…
Ссылка в новой задаче