diff --git a/Makefile b/Makefile
index 997ad4d..6c1e6ae 100644
--- a/Makefile
+++ b/Makefile
@@ -10,6 +10,7 @@ DEVICE_NAME ?=
 HOST_MODE ?= 0
 EXPECTED_TIMEOUT ?= inf
 BATCH ?=
+AB_DEBUG ?= 0
 CPU_THREADS ?=
 
 INNER_CMD = ./antares/run.sh
@@ -18,7 +19,7 @@ BACKEND = $(shell ./antares/get_backend.sh)
 
 PARAMS ?= docker run -v $(shell pwd):/antares -w /antares --privileged -v /:/host \
 	--shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 \
 	-v $(shell dirname `find /usr/lib/ -name libnvidia-ptxjitcompiler.so` 2>/dev/null | tail -n 1):/usr/local/nvidia/lib64 \
-	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) \
+	-v $(shell pwd)/public/roc_prof:/usr/local/bin/rp -e CPU_THREADS=$(CPU_THREADS) -e RECORD=$(RECORD) -e BATCH=$(BATCH) -e AB_DEBUG=$(AB_DEBUG) \
 	-e STEP=$(STEP) -e AGENT_URL=$(value AGENT_URL) -e TUNER=$(TUNER) -e CONFIG='$(value CONFIG)' -e BACKEND=$(BACKEND) -e COMPUTE_V1='$(value COMPUTE_V1)' \
 	-e COMMIT=$(COMMIT) -e HARDWARE_CONFIG=$(HARDWARE_CONFIG) -e DEVICE_NAME='$(value DEVICE_NAME)' -e EXPECTED_TIMEOUT=$(EXPECTED_TIMEOUT)
diff --git a/graph_evaluator/execute_module.hpp b/graph_evaluator/execute_module.hpp
index 57b0708..732f4a2 100644
--- a/graph_evaluator/execute_module.hpp
+++ b/graph_evaluator/execute_module.hpp
@@ -171,8 +171,11 @@
 struct ExecutionModule {
   std::string backend;
   void *hModule;
+  bool debug_output;
 
   ExecutionModule(std::string source) {
+    debug_output = getenv("AB_DEBUG") && *getenv("AB_DEBUG") ? atoi(getenv("AB_DEBUG")) : 0;
+
     static const char file_proto[] = "file://";
 
     if (0 == strncmp(source.c_str(), file_proto, sizeof(file_proto) - 1)) {
@@ -264,7 +267,26 @@
         if (--tensor_used[it->in_args[i]] == 0) {
           ab::release(tensor_memory[it->in_args[i]], local_tensors[it->in_args[i]].mem_size());
         }
+
+      if (debug_output) {
+        for (auto &arg: it->out_args) {
+          char d[32];
+          ab::memcpyDtoH(d, tensor_memory[arg], sizeof(d));
+          ab::synchronize();
+          if (local_tensors[arg].dtype == "float32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((float*)d)[0], ((float*)d)[1], ((float*)d)[2], ((float*)d)[3]);
+          else if (local_tensors[arg].dtype == "float64")
+            fprintf(stderr, "[DEBUG] %s(%s) = %g, %g, %g, %g ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((double*)d)[0], ((double*)d)[1], ((double*)d)[2], ((double*)d)[3]);
+          else if (local_tensors[arg].dtype == "int32")
+            fprintf(stderr, "[DEBUG] %s(%s) = %d, %d, %d, %d ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+          else
+            fprintf(stderr, "[DEBUG] %s(%s) = %016x, %016x, %016x, %016x ..\n", arg.c_str(), local_tensors[arg].dtype.c_str(), ((int*)d)[0], ((int*)d)[1], ((int*)d)[2], ((int*)d)[3]);
+        }
+      }
     }
+    if (debug_output)
+      fprintf(stderr, "[DEBUG] =======================\n");
+
     return 0;
   }
 };
diff --git a/graph_evaluator/run_graph.cpp b/graph_evaluator/run_graph.cpp
index 5ab3799..f729727 100644
--- a/graph_evaluator/run_graph.cpp
+++ b/graph_evaluator/run_graph.cpp
@@ -40,7 +40,10 @@ int main(int argc, char** argv)
         ((int*)hptr.data())[x] = (x + i + 1) % 71;
     } else if (it.dtype == "float32") {
       for (size_t x = 0; x < size; ++x)
-        ((float*)hptr.data())[x] = (x + i + 1) % 71;
+        ((float*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
+    } else if (it.dtype == "float64") {
+      for (size_t x = 0; x < size; ++x)
+        ((double*)hptr.data())[x] = ((x + i + 1) % 71 - 35.5) * 0.00001;
     } else {
       size_t byte_size = size * it.type_size();
       for (size_t x = 0; x < byte_size / sizeof(int); ++x)
@@ -71,6 +74,12 @@ int main(int argc, char** argv)
     if (it.dtype == "int32") {
       for (size_t x = 0; x < byte_size / sizeof(int); ++x)
         digest += (x + 1) % 83 * ((int*)hptr.data())[x];
+    } else if (it.dtype == "float32") {
+      for (size_t x = 0; x < byte_size / sizeof(float); ++x)
+        digest += (x + 1) % 83 * ((float*)hptr.data())[x];
+    } else if (it.dtype == "float64") {
+      for (size_t x = 0; x < byte_size / sizeof(double); ++x)
+        digest += (x + 1) % 83 * ((double*)hptr.data())[x];
     } else {
       for (size_t x = 0; x < byte_size / sizeof(float); ++x)
         digest += (x + 1) % 83 * ((float*)hptr.data())[x];
diff --git a/lang/einstein_v2.py b/lang/einstein_v2.py
index 29a2c8a..a5a29bf 100644
--- a/lang/einstein_v2.py
+++ b/lang/einstein_v2.py
@@ -65,7 +65,7 @@ class OpTensor:
       return self.cast(output_dtype)
     if self._op == 'const' and self._value == 1:
       return other.cast(output_dtype)
-    return OpTensor('op', {"name": "*", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "*", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __rmul__(self, other):
     other = OpTensor.parse(other)
@@ -114,7 +114,7 @@ class OpTensor:
       return self.cast(output_dtype)
     if self._op == 'const' and self._value == 0:
       return other.cast(output_dtype)
-    return OpTensor('op', {"name": "+", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "+", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __radd__(self, other):
     other = OpTensor.parse(other)
@@ -125,7 +125,7 @@ class OpTensor:
     output_dtype = OpTensor.merge_dtype(self, other)
     if other._op == 'const' and other._value == 0:
       return self.cast(output_dtype)
-    return OpTensor('op', {"name": "-", "inputs": [self, other]}, output_dtype)
+    return OpTensor('op', {"name": "-", "inputs": [self.cast(output_dtype), other.cast(output_dtype)]}, output_dtype)
 
   def __rsub__(self, other):
     other = OpTensor.parse(other)