diff --git a/Makefile b/Makefile
index 997ad4d..6c1e6ae 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,7 @@ DEVICE_NAME ?=
 HOST_MODE ?= 0
 EXPECTED_TIMEOUT ?= inf
 BATCH ?=
+AB_DEBUG ?= 0
 CPU_THREADS ?=
 
 INNER_CMD = ./antares/run.sh
@@ -18,7 +19,7 @@ BACKEND = $(shell ./antares/get_backend.sh)
 
 PARAMS ?= docker run -v $(shell pwd):/antares -w /antares --privileged -v /:/host \
 	--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
 	-v $(shell dirname `find /usr/lib/ -name libnvidia-ptxjitcompiler.so` 2>/dev/null | tail -n 1):/usr/local/nvidia/lib64 \
-	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) \
+	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) -e AB_DEBUG=$(AB_DEBUG) \
 	-e STEP=$(STEP) -e AGENT_URL=$(value AGENT_URL) -e TUNER=$(TUNER) -e CONFIG='$(value CONFIG)' -e BACKEND=$(BACKEND) -e COMPUTE_V1='$(value COMPUTE_V1)' \
 	-e COMMIT=$(COMMIT) -e HARDWARE_CONFIG=$(HARDWARE_CONFIG) -e DEVICE_NAME='$(value DEVICE_NAME)' -e EXPECTED_TIMEOUT=$(EXPECTED_TIMEOUT)
diff --git a/graph_evaluator/execute_module.hpp b/graph_evaluator/execute_module.hpp
index 57b0708..732f4a2 100644
--- a/graph_evaluator/execute_module.hpp
+++ b/graph_evaluator/execute_module.hpp
@@ -171,8 +171,11 @@
 struct ExecutionModule {
   std::string backend;
   void *hModule;
+  bool debug_output;
 
   ExecutionModule(std::string source) {
+    debug_output = getenv("AB_DEBUG") && *getenv("AB_DEBUG") ? atoi(getenv("AB_DEBUG")) : 0;
+
     static const char file_proto[] = "file://";
 
     if (0 == strncmp(source.c_str(), file_proto, sizeof(file_proto) - 1)) {
@@ -264,7 +267,26 @@
         if (--tensor_used[it->in_args[i]] == 0) {
           ab::release(tensor_memory[it->in_args[i]], local_tensors[it->in_args[i]].mem_size());
         }
+
+      if (debug_output) {
+        for (auto &arg: it->out_args) {
+          char d[32];
+          ab::memcpyDtoH(d, tensor_memory[arg], sizeof(d));
+          ab::synchronize();
+          if (local_tensors[arg].dtype == "float32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((float*)d)[0], ((float*)d)[1], ((float*)d)[2], ((float*)d)[3]);
+          else if (local_tensors[arg].dtype == "float64")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((double*)d)[0], ((double*)d)[1], ((double*)d)[2], ((double*)d)[3]);
+          else if (local_tensors[arg].dtype == "int32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %d, %d, %d, %d ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+          else
+            fprintf(stderr, "[DEBUG] %s(%s) = %016x, %016x, %016x, %016x ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+        }
+      }
     }
+    if (debug_output)
+      fprintf(stderr, "[DEBUG] =======================\n");
+
     return 0;
   }
 };
diff --git a/graph_evaluator/run_graph.cpp b/graph_evaluator/run_graph.cpp
index 5ab3799..f729727 100644
--- a/graph_evaluator/run_graph.cpp
+++ b/graph_evaluator/run_graph.cpp
@@ -40,7 +40,10 @@ int main(int argc, char** argv)
         ((int*)hptr.data())[x] = (x + i + 1) % 71;
     } else if (it.dtype == "float32") {
       for (size_t x = 0; x < size; ++x)
-        ((float*)hptr.data())[x] = (x + i + 1) % 71;
+        ((float*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
+    } else if (it.dtype == "float64") {
+      for (size_t x = 0; x < size; ++x)
+        ((double*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
     } else {
       size_t byte_size = size * it.type_size();
       for (size_t x = 0; x < byte_size / sizeof(int); ++x)
@@ -71,6 +74,12 @@ int main(int argc, char** argv)
     if (it.dtype == "int32") {
       for (size_t x = 0; x < byte_size / sizeof(int); ++x)
         digest += (x + 1) % 83 * ((int*)hptr.data())[x];
+    } else if (it.dtype == "float32") {
+      for (size_t x = 0; x < byte_size / sizeof(float); ++x)
+        digest += (x + 1) % 83 * ((float*)hptr.data())[x];
+    } else if (it.dtype == "float64") {
+      for (size_t x = 0; x < byte_size / sizeof(double); ++x)
+        digest += (x + 1) % 83 * ((double*)hptr.data())[x];
     } else {
       for (size_t x = 0; x < byte_size / sizeof(float); ++x)
         digest += (x + 1) % 83 * ((float*)hptr.data())[x];
diff --git a/lang/einstein_v2.py b/lang/einstein_v2.py
index 29a2c8a..a5a29bf 100644
--- a/lang/einstein_v2.py
+++ b/lang/einstein_v2.py
@@ -65,7 +65,7 @@ class OpTensor:
       return self.cast(output_dtype)
     if self._op == 'const' and self._value == 1:
       return other.cast(output_dtype)
-    return OpTensor('op', {"name": "*", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "*", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __rmul__(self, other):
     other = OpTensor.parse(other)
@@ -114,7 +114,7 @@ class OpTensor:
       return self.cast(output_dtype)
     if self._op == 'const' and self._value == 0:
       return other.cast(output_dtype)
-    return OpTensor('op', {"name": "+", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "+", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __radd__(self, other):
     other = OpTensor.parse(other)
@@ -125,7 +125,7 @@ class OpTensor:
     output_dtype = OpTensor.merge_dtype(self, other)
     if other._op == 'const' and other._value == 0:
       return self.cast(output_dtype)
-    return OpTensor('op', {"name": "-", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "-", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __rsub__(self, other):
     other = OpTensor.parse(other)