[CPU] SHM based allreduce improvement for small message size (#5571)

On CPU servers, when running SHM-based allreduce for small messages, the
performance is largely dominated by synchronization latency. This latency
comes from two sources:
1. Waiting for status changes from other ranks.
2. Using `#pragma omp parallel for` to accelerate memory-bandwidth-bound
operations such as `parallel_memcpy` or `reduce` (see the sketch below).
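
To make the second point concrete, here is a minimal sketch (not the code from
this PR) of what an OpenMP-parallelized copy helper can look like; the name
`parallel_memcpy_sketch` and the block size are illustrative assumptions. The
implicit barrier at the end of each `parallel for` region is the per-call
synchronization cost referred to above.

```cpp
#include <cstddef>
#include <cstring>

// Illustrative sketch only; the actual parallel_memcpy in DeepSpeed may differ.
static void parallel_memcpy_sketch(void* dst, const void* src, size_t nbytes)
{
    const size_t block = 4096;  // per-iteration copy granularity (assumed)
    const size_t nblocks = (nbytes + block - 1) / block;
#pragma omp parallel for
    for (long long i = 0; i < (long long)nblocks; i++) {
        size_t offset = (size_t)i * block;
        size_t len = offset + block <= nbytes ? block : nbytes - offset;
        std::memcpy((char*)dst + offset, (const char*)src + offset, len);
    }
    // Implicit barrier here: all threads join before the call returns,
    // which is pure latency overhead when nbytes is small.
}
```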

Each synchronization adds a little time to the allreduce latency. In the
current implementation, small messages need 5 syncs on rank 0: 1) copy-in;
2) wait for the other ranks to finish copy-in; 3) reduce; 4) copy-out;
5) wait for the other ranks to finish copy-out.

We redesigned the small-message allreduce algorithm (called
`symmetric_naive_allreduce`) to need only three syncs, with every rank
performing exactly the same steps: 1) copy-in; 2) wait for the other ranks
to finish copy-in; 3) reduce directly into the output buffer. We use double
buffering so we can skip the final wait and go directly to the next call
using the other buffer. A carefully designed state check avoids a global
barrier among ranks.
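
As a rough illustration of the new flow, here is a self-contained sketch under
simplifying assumptions: ranks are modeled as threads in one process, the
shared-memory workspace is a pair of static arrays (`shm_state`, `shm_buffer`),
and a monotonically increasing generation number stands in for the PR's
rotation through three named states. It is not the PR's implementation, only
the shape of the 3-sync algorithm.

```cpp
#include <atomic>
#include <cstddef>
#include <cstring>

// Sketch only: hypothetical names, fp32 only, threads standing in for ranks.
constexpr int NRANKS = 4;
constexpr size_t BUF_BYTES = 1 << 20;            // per-rank slot size (assumed)
static std::atomic<int> shm_state[2][NRANKS];    // per-buffer "copy done" generation
static char shm_buffer[2][NRANKS][BUF_BYTES];    // double buffer, one slot per rank

// Call with generation = 1, 2, 3, ... on every rank for successive allreduces.
void symmetric_naive_allreduce_sketch(int my_rank, float* data, size_t nelem, int generation)
{
    const int cur = generation & 1;              // pick one side of the double buffer

    // 1) copy-in: publish my data, then my state
    std::memcpy(shm_buffer[cur][my_rank], data, nelem * sizeof(float));
    shm_state[cur][my_rank].store(generation, std::memory_order_release);

    // 2) wait until every other rank has copied in for this generation (or a later one)
    for (int r = 0; r < NRANKS; r++)
        if (r != my_rank)
            while (shm_state[cur][r].load(std::memory_order_acquire) < generation) { /* spin */ }

    // 3) reduce directly into the output, walking ranks in the same order on every rank
    for (size_t i = 0; i < nelem; i++) {
        float acc = 0.0f;
        for (int r = 0; r < NRANKS; r++) acc += ((float*)shm_buffer[cur][r])[i];
        data[i] = acc;
    }
    // No trailing barrier: the next call uses the other buffer, so a fast rank
    // cannot overwrite a slot that a slower rank is still reading.
}
```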

Tests show that for message sizes < 1MB, allreduce latency is reduced by
30% to 50%. This is especially helpful for tensor-parallel decoding with
small batch sizes, where the tensor size is usually a few tens of kilobytes.

|message size (bytes)|new method latency (us)|old method latency (us)|
|---|---|---|
|2|13.34|20.39|
|4|13.44|19.57|
|8|13.70|19.76|
|16|13.27|20.43|
|32|13.42|19.75|
|64|13.38|19.80|
|128|13.70|19.44|
|256|13.99|20.33|
|512|13.91|20.28|
|1024|15.00|22.86|
|2048|15.82|20.93|
|4096|16.00|21.08|
|8192|16.31|21.50|
|16384|16.27|22.95|
|32768|16.13|25.17|
|65536|18.92|25.90|
|131072|21.12|27.42|
|262144|23.09|32.36|
|524288|32.78|42.80|

Because the new method computes the reduce result independently on every
rank, care must be taken to ensure the result is identical across ranks.
We use the test at
https://github.com/delock/ds_allreduce_bench/blob/main/ds_comm_bench.py#L70
to verify that the implementation is correct.
https://github.com/delock/ds_allreduce_bench/blob/main/validate.sh is a
test script for broader coverage.
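
The reason identical results are not automatic is that floating-point addition
is not associative: reducing the same per-rank values in a different order can
round differently. The PR's reduce kernels walk the per-rank buffers in the
same fixed order on every rank, so all ranks round identically. A toy example
of the underlying effect:

```cpp
#include <cstdio>

// Floating-point addition is not associative, so summation order matters.
int main()
{
    float a = 1e8f, b = -1e8f, c = 1.0f;
    float left  = (a + b) + c;   // 0 + 1 = 1.0f
    float right = a + (b + c);   // b + c rounds back to -1e8f, so the sum is 0.0f
    std::printf("(a+b)+c = %.1f, a+(b+c) = %.1f\n", left, right);
    return 0;
}
```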

---------

Co-authored-by: Olatunji Ruwase <olruwase@microsoft.com>
Co-authored-by: Abhishek Kulkarni <11399+adk9@users.noreply.github.com>
This commit is contained in:
Ma, Guokai 2024-06-13 05:00:20 +08:00 committed by GitHub
Parent dfcade2414
Commit eda5075b88
No key found matching this signature
GPG key ID: B5690EEEBB952194
1 changed file with 279 additions and 265 deletions

@@ -21,9 +21,13 @@
// states for collectives
enum coll_state {
coll_begin = 0,
coll_allreduce_naive__copy_in_done, // this state is for rank != 0
coll_allreduce_naive__reduce_done, // this state is for rank == 0
coll_allreduce_naive__copy_out_done, // this state is for rank != 0
coll_allreduce_naive__copy_in_done,
coll_allreduce_naive__reduce_done,
// alternative state when allreduce is working on alternative buffer
// of the double buffer.
coll_alt1_allreduce_naive__copy_in_done,
coll_alt2_allreduce_naive__copy_in_done,
coll_alt1_allreduce_naive__reduce_done,
};
// SHM building blocks
@@ -71,6 +75,8 @@ void shared_close(SharedData* data)
}
}
static int world_size;
// SHM based allreduce helper functions
// buffer that holds shm name
#define NAME_BUF_SIZE 1000
@@ -78,64 +84,37 @@ void shared_close(SharedData* data)
#define NAIVE_ALLREDUCE_THRESHOLD 1048576
#define SHM_BUFFER_NAME "deepspeed_allreduce_buffer"
struct allreduce_workspace {
enum coll_state state;
sem_t mutex;
sem_t turnstile1;
sem_t turnstile2;
int counter;
char buffer[MAX_BUF_SIZE];
enum coll_state states[2]; // idx=0 -- state for symmetric_naive_all_reduce
// idx=1 -- state for distributed_naive_all_reduce
// double buffer to avoid syncing between rounds
// offset=0 -- 2*NAIVE_ALLREDUCE_THRESHOLD : buffer for symmetric_naive_all_reduce
// after that : buffer for distributed_naive_all_reduce
char buffer[2 * NAIVE_ALLREDUCE_THRESHOLD + 2 * MAX_BUF_SIZE];
};
#define BUFFER0_OFFSET(current_buffer) current_buffer* NAIVE_ALLREDUCE_THRESHOLD
#define BUFFER1_OFFSET(current_buffer) 2 * NAIVE_ALLREDUCE_THRESHOLD + current_buffer* MAX_BUF_SIZE
struct allreduce_workspace** workspace;
void wait_buffer_state_until(int index, enum coll_state state)
{
volatile enum coll_state* state_ptr = &(workspace[index]->state);
// buffer for small messages, double buffer
char** symmetric_buffer[2];
// buffer for large messages, double buffer
char** distributed_buffer[2];
while (*state_ptr != state)
;
}
void wait_buffer_state_until_range(int index, enum coll_state start, int size)
void wait_buffer_state_until_2(int index,
enum coll_state state0,
enum coll_state state1,
int state_group)
{
volatile enum coll_state* state_ptr = &(workspace[index]->state);
enum coll_state end = (enum coll_state)(start + size);
volatile enum coll_state* state_ptr = &(workspace[index]->states[state_group]);
while (1) {
volatile enum coll_state cur_state = *state_ptr;
if (cur_state >= start and cur_state < end) break;
if (cur_state == state0 || cur_state == state1) break;
}
}
void wait_buffer_state_until_not(int index, enum coll_state state)
{
volatile enum coll_state* state_ptr = &(workspace[index]->state);
while (*state_ptr == state)
;
}
void barrier_wait(int root_idx, int num_ranks)
{
// Phase 1: Wait for all threads to enter the barrier
auto shared = workspace[root_idx];
sem_wait(&shared->mutex);
shared->counter++;
if (shared->counter == num_ranks) {
for (int i = 0; i < num_ranks; ++i) { sem_post(&shared->turnstile1); }
}
sem_post(&shared->mutex);
sem_wait(&shared->turnstile1);
// Phase 2: Wait for all threads to exit the barrier
sem_wait(&shared->mutex);
shared->counter--;
if (shared->counter == 0) {
for (int i = 0; i < num_ranks; ++i) { sem_post(&shared->turnstile2); }
}
sem_post(&shared->mutex);
sem_wait(&shared->turnstile2);
}
__m512 cvt_bf16_to_fp32(const __m256i src) __attribute__((target("avx512bw")));
inline __m512 cvt_bf16_to_fp32(const __m256i src)
{
@@ -167,123 +146,53 @@ inline __m256i cvt_fp32_to_bf16(const __m512 src)
void reduce_2_bf16_buffers_iio(int num_elements, void* in0, void* in1, void* out)
__attribute__((target("avx512bw")));
void reduce_bf16_buffers(int start_elements,
int num_elements,
int num_buffers,
int to_buffer_idx,
struct allreduce_workspace** workspace)
void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
__attribute__((target("avx512bw")));
void reduce_2_fp32_buffers_iio(int num_elements, void* in0, void* in1, void* out)
__attribute__((target("avx512bw")));
void reduce_fp32_buffers(int start_elements,
int num_elements,
int num_buffers,
int to_buffer_idx,
struct allreduce_workspace** workspace)
void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
__attribute__((target("avx512bw")));
// N_REDUCE_LIMIT is the number of buffers that can be reduced together in one shot.
// Compared with doing N-1 two-way reduces, which need 2*(N-1) reads and N-1 writes,
// an N-way reduce only needs N reads and 1 write; this saves about 2/3 of the memory bandwidth.
// When increasing N_REDUCE_LIMIT to a bigger number, do the following steps
// 1. Extend the REPEAT_<X> macros list down below
// 2. Extend the switch cases which call "REPEAT(X, ...)" down below
#define N_REDUCE_LIMIT 16
void reduce_all_buffers(struct allreduce_workspace** workspace,
int start_elements,
void reduce_all_buffers(int start_elements,
int num_elements,
c10::ScalarType scalar_type,
int num_buffers,
int to_buffer_idx)
int to_buffer_idx,
char* to_buffer,
char** buffers)
{
switch (scalar_type) {
case c10::ScalarType::BFloat16:
if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) {
reduce_bf16_buffers(
start_elements, num_elements, num_buffers, to_buffer_idx, workspace);
if (world_size == 2) {
// add the other buffer to to_buffer
reduce_2_bf16_buffers_iio(num_elements,
buffers[1 - to_buffer_idx] + start_elements * 2,
to_buffer + start_elements * 2,
to_buffer + start_elements * 2);
} else {
for (int i = 0; i < num_buffers; i++) {
if (i == to_buffer_idx) continue;
reduce_2_bf16_buffers_iio(
num_elements,
workspace[i]->buffer + start_elements * 2,
workspace[to_buffer_idx]->buffer + start_elements * 2,
workspace[to_buffer_idx]->buffer + start_elements * 2);
}
reduce_bf16_buffers(start_elements, num_elements, to_buffer, buffers);
}
break;
case c10::ScalarType::Float:
if (num_buffers > 2 && num_buffers <= N_REDUCE_LIMIT) {
reduce_fp32_buffers(
start_elements, num_elements, num_buffers, to_buffer_idx, workspace);
if (world_size == 2) {
reduce_2_fp32_buffers_iio(num_elements,
buffers[1 - to_buffer_idx] + start_elements * 4,
to_buffer + start_elements * 4,
to_buffer + start_elements * 4);
} else {
for (int i = 0; i < num_buffers; i++) {
if (i == to_buffer_idx) continue;
reduce_2_fp32_buffers_iio(
num_elements,
workspace[i]->buffer + start_elements * 4,
workspace[to_buffer_idx]->buffer + start_elements * 4,
workspace[to_buffer_idx]->buffer + start_elements * 4);
}
assert(world_size > 2);
reduce_fp32_buffers(start_elements, num_elements, to_buffer, buffers);
}
break;
default: assert(!"Should not get here");
}
}
#define REPEAT(N, x) REPEAT_##N(x)
#define REPEAT_1(x) x(1)
#define REPEAT_2(x) \
REPEAT_1(x); \
x(2)
#define REPEAT_3(x) \
REPEAT_2(x); \
x(3)
#define REPEAT_4(x) \
REPEAT_3(x); \
x(4)
#define REPEAT_5(x) \
REPEAT_4(x); \
x(5)
#define REPEAT_6(x) \
REPEAT_5(x); \
x(6)
#define REPEAT_7(x) \
REPEAT_6(x); \
x(7)
#define REPEAT_8(x) \
REPEAT_7(x); \
x(8)
#define REPEAT_9(x) \
REPEAT_8(x); \
x(9)
#define REPEAT_10(x) \
REPEAT_9(x); \
x(10)
#define REPEAT_11(x) \
REPEAT_10(x); \
x(11)
#define REPEAT_12(x) \
REPEAT_11(x); \
x(12)
#define REPEAT_13(x) \
REPEAT_12(x); \
x(13)
#define REPEAT_14(x) \
REPEAT_13(x); \
x(14)
#define REPEAT_15(x) \
REPEAT_14(x); \
x(15)
#define CVT_ADD_BF16(x) \
do { \
auto in##x##_val = \
cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[x]->buffer + i))); \
inout_val = _mm512_add_ps(inout_val, in##x##_val); \
#define CVT_ADD_BF16(x) \
do { \
auto in##x##_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[x] + i))); \
inout_val = _mm512_add_ps(inout_val, in##x##_val); \
} while (0)
// Reduce functions down below use vectorized algorithm, the number of bytes processed each
@@ -292,11 +201,7 @@ void reduce_all_buffers(struct allreduce_workspace** workspace,
// whether this number needs to be changed
#define VECTOR_LENGTH_IN_BYTES 32
void reduce_bf16_buffers(int start_elements,
int num_elements,
int num_buffers,
int to_buffer_idx,
struct allreduce_workspace** workspace)
void reduce_bf16_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
{
const int element_size = 2;
const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size;
@@ -307,34 +212,40 @@ void reduce_bf16_buffers(int start_elements,
#pragma omp parallel for
for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size;
i += VECTOR_LENGTH_IN_BYTES) {
auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(workspace[0]->buffer + i)));
switch (num_buffers) {
case 16: REPEAT(15, CVT_ADD_BF16); break;
case 15: REPEAT(14, CVT_ADD_BF16); break;
case 14: REPEAT(13, CVT_ADD_BF16); break;
case 13: REPEAT(12, CVT_ADD_BF16); break;
case 12: REPEAT(11, CVT_ADD_BF16); break;
case 11: REPEAT(10, CVT_ADD_BF16); break;
case 10: REPEAT(9, CVT_ADD_BF16); break;
case 9: REPEAT(8, CVT_ADD_BF16); break;
case 8: REPEAT(7, CVT_ADD_BF16); break;
case 7: REPEAT(6, CVT_ADD_BF16); break;
case 6: REPEAT(5, CVT_ADD_BF16); break;
case 5: REPEAT(4, CVT_ADD_BF16); break;
case 4: REPEAT(3, CVT_ADD_BF16); break;
case 3: REPEAT(2, CVT_ADD_BF16); break;
default: assert(!"Should not get here.");
auto inout_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[0] + i)));
switch (world_size) {
case 16: CVT_ADD_BF16(15);
case 15: CVT_ADD_BF16(14);
case 14: CVT_ADD_BF16(13);
case 13: CVT_ADD_BF16(12);
case 12: CVT_ADD_BF16(11);
case 11: CVT_ADD_BF16(10);
case 10: CVT_ADD_BF16(9);
case 9: CVT_ADD_BF16(8);
case 8: CVT_ADD_BF16(7);
case 7: CVT_ADD_BF16(6);
case 6: CVT_ADD_BF16(5);
case 5: CVT_ADD_BF16(4);
case 4: CVT_ADD_BF16(3);
case 3:
CVT_ADD_BF16(2);
CVT_ADD_BF16(1);
break;
default:
for (int j = 1; j < world_size; j++) {
auto in_val = cvt_bf16_to_fp32(_mm256_loadu_si256((__m256i*)(buffers[j] + i)));
inout_val = _mm512_add_ps(inout_val, in_val);
}
}
_mm256_storeu_si256((__m256i*)(workspace[to_buffer_idx]->buffer + i),
cvt_fp32_to_bf16(inout_val));
_mm256_storeu_si256((__m256i*)(to_buffer + i), cvt_fp32_to_bf16(inout_val));
}
// process remaining part
int i = (start_elements + main_elements) * element_size;
while (remain_elements > 0) {
float val = 0.0f;
for (int j = 0; j < num_buffers; j++) { val += *(at::BFloat16*)(workspace[j]->buffer + i); }
*(at::BFloat16*)(workspace[to_buffer_idx]->buffer + i) = val;
for (int j = 0; j < world_size; j++) { val += *(at::BFloat16*)(buffers[j] + i); }
*(at::BFloat16*)(to_buffer + i) = val;
remain_elements--;
i += element_size;
}
@@ -367,17 +278,13 @@ void reduce_2_bf16_buffers_iio(int num_elements, void* in0, void* in1, void* out
}
}
#define CVT_ADD_F32(x) \
do { \
auto in##x##_val = _mm256_loadu_ps((float*)(workspace[x]->buffer + i)); \
inout_val = _mm256_add_ps(inout_val, in##x##_val); \
#define CVT_ADD_F32(x) \
do { \
auto in##x##_val = _mm256_loadu_ps((float*)(buffers[x] + i)); \
inout_val = _mm256_add_ps(inout_val, in##x##_val); \
} while (0)
void reduce_fp32_buffers(int start_elements,
int num_elements,
int num_buffers,
int to_buffer_idx,
struct allreduce_workspace** workspace)
void reduce_fp32_buffers(int start_elements, int num_elements, char* to_buffer, char** buffers)
{
const int element_size = 4;
const int vector_length = VECTOR_LENGTH_IN_BYTES / element_size;
@@ -388,33 +295,40 @@ void reduce_fp32_buffers(int start_elements,
#pragma omp parallel for
for (int i = start_elements * element_size; i < (start_elements + main_elements) * element_size;
i += VECTOR_LENGTH_IN_BYTES) {
auto inout_val = _mm256_loadu_ps((float*)(workspace[0]->buffer + i));
switch (num_buffers) {
case 16: REPEAT(15, CVT_ADD_F32); break;
case 15: REPEAT(14, CVT_ADD_F32); break;
case 14: REPEAT(13, CVT_ADD_F32); break;
case 13: REPEAT(12, CVT_ADD_F32); break;
case 12: REPEAT(11, CVT_ADD_F32); break;
case 11: REPEAT(10, CVT_ADD_F32); break;
case 10: REPEAT(9, CVT_ADD_F32); break;
case 9: REPEAT(8, CVT_ADD_F32); break;
case 8: REPEAT(7, CVT_ADD_F32); break;
case 7: REPEAT(6, CVT_ADD_F32); break;
case 6: REPEAT(5, CVT_ADD_F32); break;
case 5: REPEAT(4, CVT_ADD_F32); break;
case 4: REPEAT(3, CVT_ADD_F32); break;
case 3: REPEAT(2, CVT_ADD_F32); break;
default: assert(!"Should not get here.");
auto inout_val = _mm256_loadu_ps((float*)(buffers[0] + i));
switch (world_size) {
case 16: CVT_ADD_F32(15);
case 15: CVT_ADD_F32(14);
case 14: CVT_ADD_F32(13);
case 13: CVT_ADD_F32(12);
case 12: CVT_ADD_F32(11);
case 11: CVT_ADD_F32(10);
case 10: CVT_ADD_F32(9);
case 9: CVT_ADD_F32(8);
case 8: CVT_ADD_F32(7);
case 7: CVT_ADD_F32(6);
case 6: CVT_ADD_F32(5);
case 5: CVT_ADD_F32(4);
case 4: CVT_ADD_F32(3);
case 3:
CVT_ADD_F32(2);
CVT_ADD_F32(1);
break;
default:
for (int j = 1; j < world_size; j++) {
auto in_val = _mm256_loadu_ps((float*)(buffers[j] + i));
inout_val = _mm256_add_ps(inout_val, in_val);
}
}
_mm256_storeu_ps((float*)(workspace[to_buffer_idx]->buffer + i), inout_val);
_mm256_storeu_ps((float*)(to_buffer + i), inout_val);
}
// process remaining part
int i = (start_elements + main_elements) * element_size;
while (remain_elements > 0) {
float val = 0.0f;
for (int j = 0; j < num_buffers; j++) { val += *(float*)(workspace[j]->buffer + i); }
*(float*)(workspace[to_buffer_idx]->buffer + i) = val;
for (int j = 0; j < world_size; j++) { val += *(float*)(buffers[j] + i); }
*(float*)(to_buffer + i) = val;
remain_elements--;
i += element_size;
}
@@ -448,7 +362,6 @@ void reduce_2_fp32_buffers_iio(int num_elements, void* in0, void* in1, void* out
}
static bool is_initialized = 0;
static int world_size;
static int world_rank;
void shm_initialize(int size, int rank, char* addr_string, char* port_string)
@@ -477,10 +390,15 @@ void shm_initialize(int size, int rank, char* addr_string, char* port_string)
snprintf(shm_name, NAME_BUF_SIZE, "%s_%d", shm_name_prefix, rank);
shared_create(&allreduce_buffer, shm_name, workspace_buf, sizeof(struct allreduce_workspace));
workspace_buf = (struct allreduce_workspace*)allreduce_buffer.bytes;
workspace_buf->state = coll_begin;
workspace_buf->states[0] = coll_alt2_allreduce_naive__copy_in_done;
workspace_buf->states[1] = coll_begin;
// create the workspace pointer list
workspace = (struct allreduce_workspace**)malloc(size * sizeof(struct allreduce_workspace*));
symmetric_buffer[0] = (char**)malloc(size * sizeof(char**));
symmetric_buffer[1] = (char**)malloc(size * sizeof(char**));
distributed_buffer[0] = (char**)malloc(size * sizeof(char**));
distributed_buffer[1] = (char**)malloc(size * sizeof(char**));
// map shm of all ranks
for (int i = 0; i < size; i++) {
@@ -494,11 +412,11 @@ void shm_initialize(int size, int rank, char* addr_string, char* port_string)
workspace[i] = workspace_buf_other;
} else {
workspace[i] = workspace_buf;
workspace_buf->counter = 0;
sem_init(&workspace_buf->mutex, 1, 1);
sem_init(&workspace_buf->turnstile1, 1, 0);
sem_init(&workspace_buf->turnstile2, 1, 0);
}
symmetric_buffer[0][i] = workspace[i]->buffer + BUFFER0_OFFSET(0);
symmetric_buffer[1][i] = workspace[i]->buffer + BUFFER0_OFFSET(1);
distributed_buffer[0][i] = workspace[i]->buffer + BUFFER1_OFFSET(0);
distributed_buffer[1][i] = workspace[i]->buffer + BUFFER1_OFFSET(1);
}
}
@@ -539,46 +457,122 @@ size_t slice_el_start(size_t chunk_el, int slice_idx)
return slice_size * slice_idx;
}
void naive_all_reduce(char* data_ptr,
c10::ScalarType scalar_type,
size_t chunk_size,
size_t chunk_el)
/*
Symmetrical naive all_reduce
step 0: before entering the function for the i-th time, the state is copy(i-1)
step 1: each rank copies data from the input (data_ptr) into SHM buffer[i]
step 2: set own state to copy(i)
step 3: wait until every other rank's state is copy(i) or later
step 4: reduce across SHM buffer[i] directly into the output (data_ptr)
*/
void symmetric_naive_all_reduce(char* data_ptr,
c10::ScalarType scalar_type,
size_t chunk_size,
size_t chunk_el)
{
parallel_memcpy(workspace[world_rank]->buffer, data_ptr, chunk_size);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__copy_in_done;
#ifdef DO_PROFILE
static double total_t1_t0 = 0.0;
static double total_t2_t1 = 0.0;
static double total_t3_t2 = 0.0;
static int count = -16; // warmup
auto t0 = std::chrono::system_clock::now();
#endif
if (world_rank == 0) {
// compute allreduce result on rank 0
for (int i = 1; i < world_size; i++) {
// wait until the other rank copy the buffer
wait_buffer_state_until(i, coll_allreduce_naive__copy_in_done);
/*
We can't have infinite number of buffers and states. 2 sets of buffer
and 3 sets of states is just enough. Consider current rank is in step 3,
with it's own state set to copy(i), the other rank will them have the
following situations:
------------------------------------------------
my state | can I proceed? | the other rank state
================================================
| N | copy(i-1)
|----------------|---------------------
copy(i) | Y | copy(i)
|----------------|---------------------
| Y | copy(i+1)
------------------------------------------------
* When my state is copy(i), no other rank can be at state copy(i-2) or
earlier; otherwise I would still be in state copy(i-1) and could not have
proceeded to copy(i).
* No other rank can be at state copy(i+2) or beyond, because my state is
still copy(i); copy(i+1) is as far as any other rank can go.
* From a rank's POV, all the other ranks can be divided into three sets:
- Lagging ranks: ranks that are still working on previous iteration
- Syncing ranks: ranks that are working on current iteration
- Leading ranks: ranks that are working on next iteration
* We can have 3 sets of states: one set for syncing ranks, one set for
lagging ranks, and one set for leading ranks. With 3 sets of states, we can
distinguish between lagging and leading ranks.
* Note that from any rank's POV, leading ranks and lagging ranks do not
appear at the same time. Either all other ranks are syncing or lagging,
or all other ranks are syncing or leading. Otherwise leading and lagging
ranks would be 2 iterations apart, which cannot happen.
* So we have 2 sets of buffers: one buffer is used by the current iteration;
the other buffer is used by either lagging ranks or leading ranks.
*/
const int state_group = 0;
static int current_buffer = 0;
static int state_idx = 0;
enum coll_state copy_current, copy_next;
switch (state_idx) {
case 0:
copy_current = coll_allreduce_naive__copy_in_done;
copy_next = coll_alt1_allreduce_naive__copy_in_done;
break;
case 1:
copy_current = coll_alt1_allreduce_naive__copy_in_done;
copy_next = coll_alt2_allreduce_naive__copy_in_done;
break;
case 2:
copy_current = coll_alt2_allreduce_naive__copy_in_done;
copy_next = coll_allreduce_naive__copy_in_done;
break;
default: assert(!"Should not get here.");
}
state_idx = (state_idx + 1) % 3;
parallel_memcpy(symmetric_buffer[current_buffer][world_rank], data_ptr, chunk_size);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->states[state_group] = copy_current;
#ifdef DO_PROFILE
auto t1 = std::chrono::system_clock::now();
#endif
for (int i = 0; i < world_size; i++) {
// wait until the other ranks have copied in their buffers
if (i != world_rank) { wait_buffer_state_until_2(i, copy_current, copy_next, state_group); }
}
#ifdef DO_PROFILE
auto t2 = std::chrono::system_clock::now();
#endif
// each rank reduces the buffers independently, so there is no need for synchronization afterward
reduce_all_buffers(
0, chunk_el, scalar_type, world_rank, data_ptr, symmetric_buffer[current_buffer]);
// switch buffer
current_buffer = 1 - current_buffer;
#ifdef DO_PROFILE
auto t3 = std::chrono::system_clock::now();
count++;
if (count > 0) {
total_t1_t0 += std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
total_t2_t1 += std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count();
total_t3_t2 += std::chrono::duration_cast<std::chrono::microseconds>(t3 - t2).count();
if (world_rank == 0 && count == 1000) {
printf("symmetric_naive_all_reduce time breakdown:\n");
printf("\tcopy input buffer: %.2f\n", total_t1_t0 / count);
printf("\twait for copy: %.2f\n", total_t2_t1 / count);
printf("\treduce: %.2f\n", total_t3_t2 / count);
}
reduce_all_buffers(workspace, 0, chunk_el, scalar_type, world_size, 0);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__reduce_done;
parallel_memcpy(data_ptr, workspace[0]->buffer, chunk_size);
}
if (world_rank != 0) {
wait_buffer_state_until(0, coll_allreduce_naive__reduce_done);
parallel_memcpy(data_ptr, workspace[0]->buffer, chunk_size);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__copy_out_done;
}
if (world_rank == 0) {
for (int i = 1; i < world_size; i++) {
wait_buffer_state_until(i, coll_allreduce_naive__copy_out_done);
}
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_begin;
}
if (world_rank != 0) {
// if rank 0 spin too fast it could be in state 1 of next allreduce
// in this case wait_buffer_state_until(0, 0) may cause deadlock
// what we are certain is when rank 0 finishes the state won't be 2
wait_buffer_state_until_not(0, coll_allreduce_naive__reduce_done);
workspace[world_rank]->state = coll_begin;
}
#endif
}
// distributed naive allreduce: each rank does a naive reduce on its own slice
@@ -597,10 +591,33 @@ void distributed_naive_reduce(char* data_ptr,
auto t0 = std::chrono::system_clock::now();
#endif
const int state_group = 1;
static int current_buffer = 0;
static int state_idx = 0;
enum coll_state copy_current, copy_next, reduce_current;
// similar to symmetric_naive_allreduce, but here we only need two sets of
// states, because distributed naive reduce has two barriers in the algorithm
switch (state_idx) {
case 0:
copy_current = coll_allreduce_naive__copy_in_done;
reduce_current = coll_allreduce_naive__reduce_done;
copy_next = coll_alt1_allreduce_naive__copy_in_done;
break;
case 1:
copy_current = coll_alt1_allreduce_naive__copy_in_done;
reduce_current = coll_alt1_allreduce_naive__reduce_done;
copy_next = coll_allreduce_naive__copy_in_done;
break;
default: assert(!"Should not get here.");
}
state_idx = (state_idx + 1) % 2;
int data_size = chunk_size / chunk_el;
parallel_memcpy(workspace[world_rank]->buffer, data_ptr, chunk_size);
parallel_memcpy(distributed_buffer[current_buffer][world_rank], data_ptr, chunk_size);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__copy_in_done;
workspace[world_rank]->states[state_group] = copy_current;
#ifdef DO_PROFILE
auto t1 = std::chrono::system_clock::now();
@@ -608,7 +625,8 @@ void distributed_naive_reduce(char* data_ptr,
for (int i = 0; i < world_size; i++) {
// wait until all the other ranks have copied in their buffers
wait_buffer_state_until_range(i, coll_allreduce_naive__copy_in_done, 2);
if (i != world_rank)
wait_buffer_state_until_2(i, copy_current, reduce_current, state_group);
}
#ifdef DO_PROFILE
@@ -616,40 +634,36 @@ void distributed_naive_reduce(char* data_ptr,
#endif
// reduce scatter
reduce_all_buffers(workspace,
slice_el_start(chunk_el, world_rank),
reduce_all_buffers(slice_el_start(chunk_el, world_rank),
slice_size(chunk_el, world_rank),
scalar_type,
world_size,
world_rank);
world_rank,
distributed_buffer[current_buffer][world_rank],
distributed_buffer[current_buffer]);
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__reduce_done;
workspace[world_rank]->states[state_group] = reduce_current;
#ifdef DO_PROFILE
auto t3 = std::chrono::system_clock::now();
#endif
for (int i = 0; i < world_size; i++) {
int rank = (i + world_rank) % world_size;
// wait until the other rank reduce the buffer
wait_buffer_state_until_range(rank, coll_allreduce_naive__reduce_done, 2);
parallel_memcpy(slice_data(data_ptr, chunk_el, data_size, rank),
slice_data(workspace[rank]->buffer, chunk_el, chunk_size / chunk_el, rank),
slice_size(chunk_el, rank) * data_size);
// wait until all the other ranks have finished reducing their slices
if (i != world_rank) wait_buffer_state_until_2(i, reduce_current, copy_next, state_group);
}
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_allreduce_naive__copy_out_done;
#ifdef DO_PROFILE
auto t4 = std::chrono::system_clock::now();
#endif
for (int i = 0; i < world_size; i++) {
wait_buffer_state_until_not(i, coll_allreduce_naive__reduce_done);
int rank = (i + world_rank) % world_size;
parallel_memcpy(
slice_data(data_ptr, chunk_el, data_size, rank),
slice_data(
distributed_buffer[current_buffer][rank], chunk_el, chunk_size / chunk_el, rank),
slice_size(chunk_el, rank) * data_size);
}
std::atomic_thread_fence(std::memory_order_release);
workspace[world_rank]->state = coll_begin;
current_buffer = 1 - current_buffer;
#ifdef DO_PROFILE
auto t5 = std::chrono::system_clock::now();
@@ -665,8 +679,8 @@ void distributed_naive_reduce(char* data_ptr,
printf("\tcopy input buffer: %.2f\n", total_t1_t0 / count);
printf("\twait for copy: %.2f\n", total_t2_t1 / count);
printf("\treduce: %.2f\n", total_t3_t2 / count);
printf("\tcopy buffer to output: %.2f\n", total_t4_t3 / count);
printf("\twait finish: %.2f\n", total_t5_t4 / count);
printf("\twait for reduce finish: %.2f\n", total_t4_t3 / count);
printf("\tcopy out: %.2f\n", total_t5_t4 / count);
}
}
#endif
@@ -679,7 +693,7 @@ void all_reduce_outer_loop(torch::Tensor& data, size_t numel, int data_size)
size_t chunk_size = data_size - offset > MAX_BUF_SIZE ? MAX_BUF_SIZE : data_size - offset;
size_t chunk_el = chunk_size / (data_size / numel);
if (chunk_size < NAIVE_ALLREDUCE_THRESHOLD)
naive_all_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el);
symmetric_naive_all_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el);
else
distributed_naive_reduce(data_ptr, data.scalar_type(), chunk_size, chunk_el);
}