Ab debug (#235)

* rebase evaluation inputs to reduce precision errors * add AB_DEBUG option for displaying output values
2021-04-14 09:09:22 +00:00 · 2021-04-14 09:09:22 +00:00 · fe0098a855
--- a/3
+++ b/3
@ -10,6 +10,7 @@ DEVICE_NAME ?=
 HOST_MODE ?= 0
 EXPECTED_TIMEOUT ?= inf
 BATCH ?=
+AB_DEBUG ?= 0

 CPU_THREADS ?=
 INNER_CMD = ./antares/run.sh
@ -18,7 +19,7 @@ BACKEND = $(shell ./antares/get_backend.sh)
 PARAMS ?=  docker run -v $(shell pwd):/antares -w /antares --privileged -v /:/host \
 	--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
 	-v $(shell dirname `find /usr/lib/ -name libnvidia-ptxjitcompiler.so` 2>/dev/null | tail -n 1):/usr/local/nvidia/lib64 \
-	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) \
+	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) -e AB_DEBUG=$(AB_DEBUG) \
 	-e STEP=$(STEP) -e AGENT_URL=$(value AGENT_URL) -e TUNER=$(TUNER) -e CONFIG='$(value CONFIG)' -e BACKEND=$(BACKEND) -e COMPUTE_V1='$(value COMPUTE_V1)' \
 	-e COMMIT=$(COMMIT) -e HARDWARE_CONFIG=$(HARDWARE_CONFIG) -e DEVICE_NAME='$(value DEVICE_NAME)' -e EXPECTED_TIMEOUT=$(EXPECTED_TIMEOUT)

--- a/graph_evaluator/execute_module.hpp
+++ b/graph_evaluator/execute_module.hpp
@ -171,8 +171,11 @@ struct ExecutionModule {
  std::string backend;

  void *hModule;
+  bool debug_output;

  ExecutionModule(std::string source) {
+    debug_output = getenv("AB_DEBUG") && *getenv("AB_DEBUG") ? atoi(getenv("AB_DEBUG")) : 0;
+
    static const char file_proto[] = "file://";

    if (0 == strncmp(source.c_str(), file_proto, sizeof(file_proto) - 1)) {
@ -264,7 +267,26 @@ struct ExecutionModule {
        if (--tensor_used[it->in_args[i]] == 0) {
          ab::release(tensor_memory[it->in_args[i]], local_tensors[it->in_args[i]].mem_size());
        }
+
+      if (debug_output) {
+        for (auto &arg: it->out_args) {
+          char d[32];
+          ab::memcpyDtoH(d, tensor_memory[arg], sizeof(d));
+          ab::synchronize();
+          if (local_tensors[arg].dtype == "float32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((float*)d)[0], ((float*)d)[1], ((float*)d)[2], ((float*)d)[3]);
+          else if (local_tensors[arg].dtype == "float64")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((double*)d)[0], ((double*)d)[1], ((double*)d)[2], ((double*)d)[3]);
+          else if (local_tensors[arg].dtype == "int32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %d, %d, %d, %d ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+          else
+            fprintf(stderr, "[DEBUG] %s(%s) = %016x, %016x, %016x, %016x ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+        }
+      }
    }
+    if (debug_output)
+      fprintf(stderr, "[DEBUG] =======================\n");
+
    return 0;
  }
 };
--- a/graph_evaluator/run_graph.cpp
+++ b/graph_evaluator/run_graph.cpp
@ -40,7 +40,10 @@ int main(int argc, char** argv)
          ((int*)hptr.data())[x] = (x + i + 1) % 71;
      } else if (it.dtype == "float32") {
        for (size_t x = 0; x < size; ++x)
-          ((float*)hptr.data())[x] = (x + i + 1) % 71;
+          ((float*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
+      } else if (it.dtype == "float64") {
+        for (size_t x = 0; x < size; ++x)
+          ((double*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
      } else {
        size_t byte_size = size * it.type_size();
        for (size_t x = 0; x < byte_size / sizeof(int); ++x)
@ -71,6 +74,12 @@ int main(int argc, char** argv)
      if (it.dtype == "int32") {
        for (size_t x = 0; x < byte_size / sizeof(int); ++x)
          digest += (x + 1) % 83 * ((int*)hptr.data())[x];
+      } else if (it.dtype == "float32") {
+        for (size_t x = 0; x < byte_size / sizeof(float); ++x)
+          digest += (x + 1) % 83 * ((float*)hptr.data())[x];
+      } else if (it.dtype == "float64") {
+        for (size_t x = 0; x < byte_size / sizeof(double); ++x)
+          digest += (x + 1) % 83 * ((double*)hptr.data())[x];
      } else {
        for (size_t x = 0; x < byte_size / sizeof(float); ++x)
          digest += (x + 1) % 83 * ((float*)hptr.data())[x];
--- a/lang/einstein_v2.py
+++ b/lang/einstein_v2.py
@ -65,7 +65,7 @@ class OpTensor:
            return self.cast(output_dtype)
        if self._op == 'const' and self._value == 1:
            return other.cast(output_dtype)
-        return OpTensor('op', {"name": "*", "inputs": [self, other]}, output_dtype)
+        return OpTensor('op', {"name": "*", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)

    def __rmul__(self, other):
        other = OpTensor.parse(other)
@ -114,7 +114,7 @@ class OpTensor:
            return self.cast(output_dtype)
        if self._op == 'const' and self._value == 0:
            return other.cast(output_dtype)
-        return OpTensor('op', {"name": "+", "inputs": [self, other]}, output_dtype)
+        return OpTensor('op', {"name": "+", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)

    def __radd__(self, other):
        other = OpTensor.parse(other)
@ -125,7 +125,7 @@ class OpTensor:
        output_dtype = OpTensor.merge_dtype(self, other)
        if other._op == 'const' and other._value == 0:
            return self.cast(output_dtype)
-        return OpTensor('op', {"name": "-", "inputs": [self, other]}, output_dtype)
+        return OpTensor('op', {"name": "-", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)

    def __rsub__(self, other):
        other = OpTensor.parse(other)