Roman Grundkiewicz 2018-08-01 16:43:05 +01:00
Parent c3807da737
Commit c73a2cf6fb
48 changed files: 194 additions and 197 deletions

View file

@ -27,7 +27,7 @@ message(STATUS "Project version: ${PROJECT_VERSION_STRING_FULL}")
set(CMAKE_CXX_FLAGS_RELEASE " -std=c++11 -O3 -Ofast -m64 -pthread -march=native -Wl,--no-as-needed -funroll-loops -ffinite-math-only -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_DEBUG " -std=c++11 -g -O0 -pthread -fPIC -Wno-unused-result -Wno-deprecated -Wno-deprecated-gpu-targets")
set(CMAKE_CXX_FLAGS_ST "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG")
set(CMAKE_CXX_FLAGS_PROFILE "${CMAKE_CXX_FLAGS_RELEASE} -pg -g")
set(CMAKE_CXX_FLAGS_PROFGEN "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-generate -fprofile-correction")
set(CMAKE_CXX_FLAGS_PROFUSE "${CMAKE_CXX_FLAGS_RELEASE} -fprofile-use -fprofile-correction")
set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})

View file

@ -132,6 +132,9 @@ static void processPaths(
processPaths(sub.second, TransformPath, PATHS.count(key) > 0);
}
break;
default:
// nothing to do for other node types
break;
}
}
}
@ -1132,8 +1135,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
"(--config option)");
auto configDir = boost::filesystem::path{configPaths.front()}.parent_path();
for(const auto& configPath : configPaths)
ABORT_IF(boost::filesystem::path{configPaths.front()}.parent_path()
!= configDir,
ABORT_IF(boost::filesystem::path{configPath}.parent_path() != configDir,
"relative-paths option requires all config files to be in the "
"same directory");
processPaths(config_, [&](const std::string& nodePath) -> std::string {
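The hunk above fixes a classic copy-paste bug: the loop checked boost::filesystem::path{configPaths.front()}.parent_path() instead of the loop variable, so the same-directory check compared the first config file against itself and always passed. A minimal standalone sketch of the corrected check (hypothetical paths, not Marian code, assuming Boost.Filesystem as in the diff):

#include <boost/filesystem.hpp>
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> configPaths = {"cfg/a.yml", "cfg/b.yml"};
  auto configDir = boost::filesystem::path{configPaths.front()}.parent_path();
  for(const auto& configPath : configPaths) {
    // The buggy version tested configPaths.front() here, which trivially
    // equals configDir; the loop variable is what must be checked.
    assert(boost::filesystem::path{configPath}.parent_path() == configDir);
  }
  return 0;
}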

View file

@ -41,7 +41,7 @@ public:
inline int& dim(int i) {
if(i >= 0) {
ABORT_IF(i >= size(),
ABORT_IF(i >= (int)size(),
"Index {} is out of bounds, shape has {} dimensions",
i,
size());
@ -93,7 +93,7 @@ public:
for(int j = shape_.size() - 2; j >= 0; --j)
stride[j] = stride[j + 1] * shape_[j + 1];
for(int j = 0; j < d.size(); ++j)
for(size_t j = 0; j < d.size(); ++j)
d[j] = (i / stride[j]) % shape_[j];
}
@ -118,7 +118,7 @@ public:
std::string toString() const {
std::stringstream strm;
strm << "shape=" << (*this)[0];
for(int i = 1; i < size(); ++i)
for(size_t i = 1; i < size(); ++i)
strm << "x" << (*this)[i];
strm << " size=" << elements();
return strm.str();
@ -143,7 +143,7 @@ public:
}
static Shape broadcast(const std::vector<Shape>& shapes) {
int maxDims = 0;
size_t maxDims = 0;
for(auto& s : shapes)
if(s.size() > maxDims)
maxDims = s.size();
@ -152,7 +152,7 @@ public:
shape.resize(maxDims);
for(auto& s : shapes) {
for(int i = 0; i < s.size(); ++i) {
for(int i = 0; i < (int)s.size(); ++i) {
ABORT_IF(shape[-i] != s[-i] && shape[-i] != 1 && s[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
@ -170,7 +170,7 @@ public:
template <typename T>
static Shape broadcast(const std::vector<T>& nodes) {
int maxDims = 0;
size_t maxDims = 0;
for(auto& n : nodes)
if(n->shape().size() > maxDims)
maxDims = n->shape().size();
@ -180,7 +180,7 @@ public:
for(auto& node : nodes) {
const Shape& shapen = node->shape();
for(int i = 1; i <= shapen.size(); ++i) {
for(int i = 1; i <= (int)shapen.size(); ++i) {
ABORT_IF(shape[-i] != shapen[-i] && shape[-i] != 1 && shapen[-i] != 1,
"Shapes {} and {} cannot be broadcasted",
(std::string)shape,
@ -193,7 +193,7 @@ public:
size_t hash() const {
size_t seed = boost::hash<int>()(shape_[0]);
for(int i = 1; i < shape_.size(); ++i)
for(size_t i = 1; i < shape_.size(); ++i)
boost::hash_combine(seed, shape_[i]);
return seed;
}
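Most changes in this file, as in the rest of the commit, silence signed/unsigned comparison warnings (-Wsign-compare): Shape::size() and std::vector::size() return unsigned types, so comparing them against a signed int index mixes signednesses. Two idioms appear: switch the index to size_t where it is never negative, or cast the size to int where signed arithmetic (for example negative axis indices) is needed. A minimal sketch of both, not Marian code:

#include <cstddef>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  // for(int i = 0; i < v.size(); ++i)   // warns with -Wsign-compare
  for(size_t i = 0; i < v.size(); ++i)   // idiom 1: unsigned index
    v[i] += 1;
  for(int i = 0; i < (int)v.size(); ++i) // idiom 2: cast, keeps signed index
    v[i] -= 1;
  return 0;
}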

View file

@ -77,8 +77,8 @@ private:
maxiBatch.reset(new sample_queue(cmpNone));
}
int maxBatchSize = options_->get<int>("mini-batch");
int maxSize = maxBatchSize * options_->get<int>("maxi-batch");
size_t maxBatchSize = options_->get<int>("mini-batch");
size_t maxSize = maxBatchSize * options_->get<int>("maxi-batch");
// consume data from corpus into maxi-batch (single sentences)
// sorted into specified order (due to queue)
@ -185,7 +185,7 @@ public:
currentBatch_ = bufferedBatches_.front();
if(loadReady_
&& bufferedBatches_.size()
&& (int)bufferedBatches_.size()
<= std::max(options_->get<int>("maxi-batch") / 5, 1)) {
{
std::unique_lock<std::mutex> lock(loadMutex_);
@ -239,7 +239,7 @@ public:
}
prepare(shuffle);
for(int i = 0; i < state->batchesEpoch; ++i)
for(size_t i = 0; i < state->batchesEpoch; ++i)
next();
return true;

View file

@ -18,7 +18,7 @@ private:
public:
size_t getBatchSize(const std::vector<size_t>& lengths) {
auto it = map_.lower_bound(lengths);
for(int i = 0; i < lengths.size(); ++i)
for(size_t i = 0; i < lengths.size(); ++i)
while(it != map_.end() && it->first[i] < lengths[i])
it++;
@ -28,7 +28,7 @@ public:
void add(Ptr<data::CorpusBatch> batch, size_t multiplier = 1) {
std::vector<size_t> lengths;
for(int i = 0; i < batch->sets(); ++i)
for(size_t i = 0; i < batch->sets(); ++i)
lengths.push_back((*batch)[i]->batchWidth());
size_t batchSize = batch->size() * multiplier;
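getBatchSize builds on ordered lookup in a std::map keyed by per-stream length vectors, which std::map orders lexicographically; lower_bound returns the first recorded length profile not less than the query. A simplified sketch with hypothetical numbers (the real method additionally advances the iterator until every dimension fits):

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

int main() {
  // Keys: per-stream sentence lengths; values: batch sizes known to fit.
  std::map<std::vector<size_t>, size_t> stats;
  stats[{10, 12}] = 64;
  stats[{20, 24}] = 32;
  auto it = stats.lower_bound({15, 15});
  std::cout << it->second << "\n";  // prints 32, from the {20, 24} entry
  return 0;
}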

View file

@ -57,7 +57,7 @@ public:
std::vector<Ptr<Vocab>>& getVocabs() { return vocabs_; }
batch_ptr toBatch(const std::vector<sample>& batchVector) {
int batchSize = batchVector.size();
size_t batchSize = batchVector.size();
std::vector<size_t> sentenceIds;
@ -73,14 +73,14 @@ public:
}
std::vector<Ptr<SubBatch>> subBatches;
for(int j = 0; j < maxDims.size(); ++j) {
for(size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(New<SubBatch>(batchSize, maxDims[j], vocabs_[j]));
}
std::vector<size_t> words(maxDims.size(), 0);
for(int i = 0; i < batchSize; ++i) {
for(int j = 0; j < maxDims.size(); ++j) {
for(int k = 0; k < batchVector[i][j].size(); ++k) {
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
words[j]++;

View file

@ -107,13 +107,14 @@ class SubBatch {
private:
std::vector<Word> indices_;
std::vector<float> mask_;
Ptr<Vocab> vocab_;
// ... TODO: add the length information (remember it)
size_t size_;
size_t width_;
size_t words_;
Ptr<Vocab> vocab_;
// ... TODO: add the length information (remember it)
public:
/**
* @brief Creates an empty subbatch of specified size.
@ -178,15 +179,15 @@ public:
size_t subSize = std::ceil(size_ / (float)n);
size_t restSize = size_;
int pos = 0;
for(int k = 0; k < n; ++k) {
size_t pos = 0;
for(size_t k = 0; k < n; ++k) {
size_t __size__ = std::min(subSize, restSize);
if(__size__ > 0) {
auto sb = New<SubBatch>(__size__, width_, vocab_);
size_t __words__ = 0;
for(int j = 0; j < width_; ++j) {
for(int i = 0; i < __size__; ++i) {
for(size_t j = 0; j < width_; ++j) {
for(size_t i = 0; i < __size__; ++i) {
sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];
@ -367,7 +368,7 @@ public:
size_t pos = 0;
for(auto split : splits) {
std::vector<size_t> ids;
for(int i = pos; i < pos + split->size(); ++i)
for(size_t i = pos; i < pos + split->size(); ++i)
ids.push_back(sentenceIds_[i]);
split->setSentenceIds(ids);
pos += split->size();
@ -394,8 +395,8 @@ public:
// this needs to be split along the batch dimension
// which here is the innermost dimension.
// Should work for sentence-based weights, too.
for(int j = 0; j < width; ++j) {
for(int i = 0; i < split->size(); ++i) {
for(size_t j = 0; j < width; ++j) {
for(size_t i = 0; i < split->size(); ++i) {
ws[j * split->size() + i] = dataWeights_[j * oldSize + i + pos];
}
}
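The split above distributes size_ sentences over n sub-batches of at most ceil(size_/n) each; the cast in std::ceil(size_ / (float)n) is what keeps integer division from truncating before the ceiling is taken. A small worked sketch with hypothetical sizes:

#include <cmath>
#include <cstddef>
#include <iostream>

int main() {
  size_t size = 10, n = 3;
  size_t truncated = std::ceil(size / n);       // 10/3 is already 3 here
  size_t subSize = std::ceil(size / (float)n);  // 10/3.f = 3.33..., so 4
  std::cout << truncated << " vs " << subSize << "\n";  // prints "3 vs 4"
  return 0;
}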

View file

@ -48,7 +48,7 @@ public:
std::vector<Ptr<Vocab>>& getVocabs() { return vocabs_; }
batch_ptr toBatch(const std::vector<sample>& batchVector) {
int batchSize = batchVector.size();
size_t batchSize = batchVector.size();
std::vector<size_t> sentenceIds;
@ -64,14 +64,14 @@ public:
}
std::vector<Ptr<SubBatch>> subBatches;
for(int j = 0; j < maxDims.size(); ++j) {
for(size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(New<SubBatch>(batchSize, maxDims[j], vocabs_[j]));
}
std::vector<size_t> words(maxDims.size(), 0);
for(int i = 0; i < batchSize; ++i) {
for(int j = 0; j < maxDims.size(); ++j) {
for(int k = 0; k < batchVector[i][j].size(); ++k) {
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
words[j]++;

View file

@ -59,7 +59,7 @@ void CorpusSQLite::fillSQLite() {
if(fill) {
std::string createStr = "create table lines (_id integer";
std::string insertStr = "insert into lines values (?";
for(int i = 0; i < files_.size(); ++i) {
for(size_t i = 0; i < files_.size(); ++i) {
createStr += ", line" + std::to_string(i) + " text";
insertStr += ", ?";
}
@ -79,7 +79,7 @@ void CorpusSQLite::fillSQLite() {
ps.bind(1, (int)lines);
std::string line;
for(int i = 0; i < files_.size(); ++i) {
for(size_t i = 0; i < files_.size(); ++i) {
cont = cont && GetLine((std::istream&)*files_[i], line);
if(cont)
ps.bind(i + 2, line);

View file

@ -67,7 +67,7 @@ public:
std::vector<Ptr<Vocab>>& getVocabs() { return vocabs_; }
batch_ptr toBatch(const std::vector<sample>& batchVector) {
int batchSize = batchVector.size();
size_t batchSize = batchVector.size();
std::vector<size_t> sentenceIds;
@ -83,14 +83,14 @@ public:
}
std::vector<Ptr<SubBatch>> subBatches;
for(int j = 0; j < maxDims.size(); ++j) {
for(size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(New<SubBatch>(batchSize, maxDims[j], vocabs_[j]));
}
std::vector<size_t> words(maxDims.size(), 0);
for(int i = 0; i < batchSize; ++i) {
for(int j = 0; j < maxDims.size(); ++j) {
for(int k = 0; k < batchVector[i][j].size(); ++k) {
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
words[j]++;

View file

@ -34,10 +34,9 @@ private:
class TextInput : public DatasetBase<SentenceTuple, TextIterator, CorpusBatch> {
private:
Ptr<Config> options_;
std::vector<UPtr<std::istringstream>> files_;
std::vector<Ptr<Vocab>> vocabs_;
Ptr<Config> options_;
size_t pos_{0};
@ -57,7 +56,7 @@ public:
// TODO: There are half a dozen functions called toBatch(), which are very
// similar. Factor them.
batch_ptr toBatch(const std::vector<sample>& batchVector) {
int batchSize = batchVector.size();
size_t batchSize = batchVector.size();
std::vector<size_t> sentenceIds;
@ -73,14 +72,14 @@ public:
}
std::vector<Ptr<SubBatch>> subBatches;
for(int j = 0; j < maxDims.size(); ++j) {
for(size_t j = 0; j < maxDims.size(); ++j) {
subBatches.emplace_back(New<SubBatch>(batchSize, maxDims[j], vocabs_[j]));
}
std::vector<size_t> words(maxDims.size(), 0);
for(int i = 0; i < batchSize; ++i) {
for(int j = 0; j < maxDims.size(); ++j) {
for(int k = 0; k < batchVector[i][j].size(); ++k) {
for(size_t i = 0; i < batchSize; ++i) {
for(size_t j = 0; j < maxDims.size(); ++j) {
for(size_t k = 0; k < batchVector[i][j].size(); ++k) {
subBatches[j]->data()[k * batchSize + i] = batchVector[i][j][k];
subBatches[j]->mask()[k * batchSize + i] = 1.f;
words[j]++;

View file

@ -109,7 +109,7 @@ struct Loop {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - n]; ++i) {
for(int j = 0; j < K; ++j) {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - n] + i) * in[j].shape().bstride(N - n);
}
sum += Loop<n - 1, N, K>::result(functor, in, acc, length, dim);
@ -130,7 +130,7 @@ struct Loop<1, N, K> {
float sum = 0;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - 1]; ++i) {
for(int j = 0; j < K; ++j) {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1);
}
sum += apply<K>(functor, in, acc);

View file

@ -7,9 +7,7 @@
#include "3rd_party/exception.h"
#include "common/definitions.h"
/**
* @brief Parent namespace for the Marian project
*/
// Parent namespace for the Marian project
namespace marian {
#define NodeOp(op) [=]() { op; }
@ -19,8 +17,10 @@ class AutoTunerRecorder;
template <class DataType>
class Chainable;
/** @brief Defines a convenience type to represent a shared pointer to a
* Chainable<Tensor> object. */
/**
* A convenience type to represent a shared pointer to a Chainable<Tensor>
* object.
*/
typedef Ptr<Chainable<Tensor>> Expr;
typedef Weak<Chainable<Tensor>> WExpr;

View file

@ -119,12 +119,11 @@ private:
Ptr<Parameters> params_;
Ptr<Tensors> tensors_;
Ptr<Backend> backend_;
std::unordered_map<size_t, std::vector<Expr>> memoized_;
bool inferenceOnly_{false};
bool optimized_{false};
Ptr<Backend> backend_;
bool reloaded_{false};
std::string namespace_;
@ -439,7 +438,7 @@ public:
shape.set(1, it.second->shape[0]);
} else {
shape.resize(it.second->shape.size());
for(int i = 0; i < it.second->shape.size(); ++i)
for(size_t i = 0; i < it.second->shape.size(); ++i)
shape.set(i, it.second->shape[i]);
}

View file

@ -181,7 +181,7 @@ Expr atleast_nd(Expr a, size_t dims) {
Shape nShape;
nShape.resize(dims);
for(int i = 1; i <= a->shape().size(); ++i)
for(int i = 1; i <= (int)a->shape().size(); ++i)
nShape.set(-i, a->shape()[-i]);
return reshape(a, nShape);
@ -267,7 +267,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// lower precision for shapes, reduces data sparsity
auto sh = [](Shape sh) {
for(int i = 0; i < sh.size(); ++i)
for(size_t i = 0; i < sh.size(); ++i)
sh.set(i, sh[i] / 4);
return sh;
};
@ -353,7 +353,7 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// swap the last two axes
Expr transpose(Expr a) {
std::vector<int> axes(a->shape().size());
for(int i = 0; i < axes.size(); ++i) {
for(size_t i = 0; i < axes.size(); ++i) {
axes[i] = i;
}
if(axes.size() > 1) {

View file

@ -163,7 +163,7 @@ struct NaryNodeOp : public Node {
Type value_type = Type::float32)
: Node(nodes.front()->graph(), shape, value_type) {
children_.resize(nodes.size());
for(int i = 0; i < nodes.size(); ++i)
for(size_t i = 0; i < nodes.size(); ++i)
children_[i] = nodes[i];
setTrainable(std::any_of(
@ -187,7 +187,7 @@ struct NaryNodeOp : public Node {
if(!hash_) {
std::size_t seed = boost::hash<std::string>()(name());
boost::hash_combine(seed, type());
for(int i = 0; i < children_.size(); ++i)
for(size_t i = 0; i < children_.size(); ++i)
boost::hash_combine(seed, child(i)->hash());
hash_ = seed;
}
@ -201,7 +201,7 @@ struct NaryNodeOp : public Node {
return false;
if(children().size() != node->children().size())
return false;
for(int i = 0; i < children().size(); ++i)
for(size_t i = 0; i < children().size(); ++i)
if(children()[i]->getId() != node->children()[i]->getId())
return false;
return true;

View file

@ -687,14 +687,14 @@ struct ConcatenateNodeOp : public NaryNodeOp {
void forward() {
std::vector<Tensor> concatenees;
for(int i = 0; i < children_.size(); ++i)
for(size_t i = 0; i < children_.size(); ++i)
concatenees.push_back(child(i)->val());
Concatenate(val_, concatenees, ax_);
}
void backward() {
std::vector<Tensor> deconcatenees;
for(int i = 0; i < children_.size(); ++i) {
for(size_t i = 0; i < children_.size(); ++i) {
auto childPtr = child(i);
childPtr
->set_zero_adjoint(); // @TODO: this is a hotfix, do this properly

View file

@ -227,7 +227,7 @@ struct TanhNodeOp : public NaryNodeOp {
child(0)->val(),
child(1)->val(),
child(2)->val());
for(int i = 3; i < children_.size(); ++i)
for(size_t i = 3; i < children_.size(); ++i)
Element(_1 = _1 + _2, val_, child(i)->val());
Element(_1 = tanh(_1), val_);)
};
@ -237,7 +237,7 @@ struct TanhNodeOp : public NaryNodeOp {
NodeOps backwardOps() {
using namespace functional;
NodeOps ops;
for(int i = 0; i < children_.size(); i++) {
for(size_t i = 0; i < children_.size(); i++) {
ops.push_back(
NodeOp(Add(_1 * (1.0f - (_2 * _2)), child(i)->grad(), adj_, val_)));
}
@ -828,7 +828,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
ABORT_IF(shape.size() != axes.size(),
"Shape and transpose axes have different numbers of dimensions");
for(int i = 0; i < shape.size(); ++i)
for(size_t i = 0; i < shape.size(); ++i)
shape.set(i, a->shape()[axes[i]]);
return shape;

View file

@ -117,7 +117,7 @@ public:
else
output = layers_[0]->apply(av);
for(int i = 1; i < layers_.size(); ++i)
for(size_t i = 1; i < layers_.size(); ++i)
output = layers_[i]->apply(output);
return output;

View file

@ -5,6 +5,7 @@ namespace marian {
Ptr<WeightingBase> WeightingFactory(Ptr<Options> options) {
if(options->has("data-weighting"))
return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
return nullptr;
}
Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
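The added return nullptr closes a control path: without it, WeightingFactory falls off the end of a non-void function whenever no data-weighting option is set, which is undefined behavior and what -Wreturn-type warns about. A minimal sketch of the pattern with hypothetical stand-in types:

#include <memory>

struct WeightingBase {};  // stand-in for the real base class

std::shared_ptr<WeightingBase> makeWeighting(bool hasOption) {
  if(hasOption)
    return std::make_shared<WeightingBase>();
  return nullptr;  // without this line, falling off the end is UB
}

int main() {
  auto w = makeWeighting(false);
  return w ? 1 : 0;
}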

View file

@ -49,16 +49,11 @@ public:
auto state = encdec->stepAll(graph, corpusBatch, clearGraph);
float ls = inference_ ? 0.f : options_->get<float>("label-smoothing");
Expr weights;
Expr cost;
bool sentenceWeighting = false;
if(toBeWeighted_) {
if(toBeWeighted_)
weights = weighter_->getWeights(graph, corpusBatch);
}
Expr cost;
cost = loss_->getCost(state->getProbs(),
state->getTargetIndices(),
state->getTargetMask(),

View file

@ -49,7 +49,7 @@ public:
virtual void blacklist(Expr totalCosts, Ptr<data::CorpusBatch> batch) {
auto attentionIdx = getAttentionIndices();
int dimVoc = totalCosts->shape()[-1];
for(int i = 0; i < attentionIdx.size(); i++) {
for(size_t i = 0; i < attentionIdx.size(); i++) {
if(batch->front()->data()[attentionIdx[i]] != 0) {
totalCosts->val()->set(
i * dimVoc + DEFAULT_EOS_ID, // this is checked at vocab-load time
@ -167,7 +167,7 @@ public:
auto attCell = rnn::stacked_cell(graph) //
.push_back(rnn::cell(graph) //
("prefix", prefix_ + "_cell1"));
for(int i = 0; i < state->getEncoderStates().size(); ++i) {
for(size_t i = 0; i < state->getEncoderStates().size(); ++i) {
std::string prefix = prefix_;
if(state->getEncoderStates().size() > 1)
prefix += "_att" + std::to_string(i + 1);
@ -185,7 +185,7 @@ public:
rnn.push_back(rnn::cell(graph)("prefix", prefix_));
}
for(int i = 0; i < decoderLayers - 1; ++i)
for(size_t i = 0; i < decoderLayers - 1; ++i)
rnn.push_back(rnn::cell(graph) //
("prefix", prefix_ + "_l" + std::to_string(i)));
@ -209,7 +209,7 @@ public:
Expr logits;
if(type == "hard-soft-att") {
std::vector<Expr> alignedContexts;
for(int k = 0; k < state->getEncoderStates().size(); ++k) {
for(size_t k = 0; k < state->getEncoderStates().size(); ++k) {
// retrieve all the aligned contexts computed by the attention mechanism
auto att = rnn_->at(0)
->as<rnn::StackedCell>()
@ -279,7 +279,7 @@ public:
auto stateHardAtt = std::dynamic_pointer_cast<DecoderStateHardAtt>(state);
int dimSrcWords = state->getEncoderStates()[0]->getContext()->shape()[-3];
size_t dimSrcWords = state->getEncoderStates()[0]->getContext()->shape()[-3];
if(embIdx.empty()) {
stateHardAtt->setAttentionIndices({0});

View file

@ -200,7 +200,7 @@ private:
// setting up conditional (transitional) cell
auto baseCell = rnn::stacked_cell(graph);
for(int i = 1; i <= decoderBaseDepth; ++i) {
for(size_t i = 1; i <= decoderBaseDepth; ++i) {
bool transition = (i > 2);
auto paramPrefix = prefix_ + "_cell" + std::to_string(i);
baseCell.push_back(rnn::cell(graph) //
@ -208,7 +208,7 @@ private:
("final", i > 1) //
("transition", transition));
if(i == 1) {
for(int k = 0; k < state->getEncoderStates().size(); ++k) {
for(size_t k = 0; k < state->getEncoderStates().size(); ++k) {
auto attPrefix = prefix_;
if(state->getEncoderStates().size() > 1)
attPrefix += "_att" + std::to_string(k + 1);
@ -224,11 +224,11 @@ private:
rnn.push_back(baseCell);
// Add more cells to RNN (stacked RNN)
for(int i = 2; i <= decoderLayers; ++i) {
for(size_t i = 2; i <= decoderLayers; ++i) {
// deep transition
auto highCell = rnn::stacked_cell(graph);
for(int j = 1; j <= decoderHighDepth; j++) {
for(size_t j = 1; j <= decoderHighDepth; j++) {
auto paramPrefix
= prefix_ + "_l" + std::to_string(i) + "_cell" + std::to_string(j);
highCell.push_back(rnn::cell(graph)("prefix", paramPrefix));
@ -309,7 +309,7 @@ public:
rnn::States decoderStates = rnn_->lastCellStates();
std::vector<Expr> alignedContexts;
for(int k = 0; k < state->getEncoderStates().size(); ++k) {
for(size_t k = 0; k < state->getEncoderStates().size(); ++k) {
// retrieve all the aligned contexts computed by the attention mechanism
auto att = rnn_->at(0)
->as<rnn::StackedCell>()

View file

@ -28,16 +28,15 @@ public:
class DecoderState {
protected:
rnn::States states_;
Expr probs_;
std::vector<Ptr<EncoderState>> encStates_;
Ptr<data::CorpusBatch> batch_;
Expr targetEmbeddings_;
Expr targetMask_;
Expr targetIndices_;
Expr probs_;
rnn::States states_;
Ptr<data::CorpusBatch> batch_;
// Keep track of current target token position during translation
size_t position_{0};

View file

@ -699,7 +699,7 @@ public:
int dimSrcWords = encoderContext->shape()[-2];
int dims = encoderMask->shape().size();
//int dims = encoderMask->shape().size();
encoderMask = atleast_nd(encoderMask, 4);
encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
@ -748,7 +748,7 @@ public:
// Iterate over multiple encoders and simply stack the attention blocks
if(encoderContexts.size() > 0) {
// multiple encoders are applied one after another
for(int j = 0; j < encoderContexts.size(); ++j) {
for(size_t j = 0; j < encoderContexts.size(); ++j) {
std::string prefix = prefix_ + "_l" + std::to_string(i) + "_context";
if(j > 0)
prefix += "_enc" + std::to_string(j + 1);
@ -775,7 +775,7 @@ public:
// [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
Expr logits = output_->apply(decoderContext);
int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
//int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
// return unnormalized(!) probabilities
auto nextState = New<TransformerState>(

View file

@ -68,7 +68,7 @@ public:
models_.resize(graphs_.size());
ThreadPool pool(graphs_.size(), graphs_.size());
for(int i = 0; i < graphs_.size(); ++i) {
for(size_t i = 0; i < graphs_.size(); ++i) {
pool.enqueue(
[=](int j) {
models_[j] = New<Model>(temp);

View file

@ -119,12 +119,14 @@ public:
recState = dropout(recState, dropMaskState_);
auto mappedState = dot(recState, Wa_);
if(layerNorm_)
if(nematusNorm_)
if(layerNorm_) {
if(nematusNorm_) {
mappedState = layerNorm(
mappedState, W_comb_att_lns_, W_comb_att_lnb_, NEMATUS_LN_EPS);
else
} else {
mappedState = layerNorm(mappedState, gammaState_);
}
}
auto attReduce = attOps(va_, mappedContext_, mappedState);
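The braces added around the nested if(nematusNorm_) guard against the dangling-else hazard: an else binds to the nearest unmatched if, regardless of indentation, so the unbraced version can silently change which condition the else belongs to. A minimal sketch with hypothetical flags:

#include <iostream>

int main() {
  bool layerNorm = false, nematusNorm = false;
  // Unbraced, the else would bind to if(nematusNorm), i.e. it would run
  // when layerNorm && !nematusNorm -- not when !layerNorm as the
  // indentation might suggest. Braces make the intent explicit:
  if(layerNorm) {
    if(nematusNorm) {
      std::cout << "nematus layer norm\n";
    } else {
      std::cout << "plain layer norm\n";
    }
  }
  return 0;
}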

View file

@ -14,7 +14,7 @@ struct GRUFastNodeOp : public NaryNodeOp {
NodeOps forwardOps() {
std::vector<Tensor> inputs;
for(int i = 0; i < children_.size(); ++i)
for(size_t i = 0; i < children_.size(); ++i)
inputs.push_back(child(i)->val());
return {NodeOp(GRUFastForward(val_, inputs, final_))};
@ -56,7 +56,7 @@ struct LSTMCellNodeOp : public NaryNodeOp {
NodeOps forwardOps() {
std::vector<Tensor> inputs;
for(int i = 0; i < children_.size(); ++i)
for(size_t i = 0; i < children_.size(); ++i)
inputs.push_back(child(i)->val());
return {NodeOp(LSTMCellForward(val_, inputs))};
@ -92,7 +92,7 @@ struct LSTMOutputNodeOp : public NaryNodeOp {
NodeOps forwardOps() {
std::vector<Tensor> inputs;
for(int i = 0; i < children_.size(); ++i)
for(size_t i = 0; i < children_.size(); ++i)
inputs.push_back(child(i)->val());
return {NodeOp(LSTMOutputForward(val_, inputs))};

View file

@ -98,7 +98,7 @@ public:
int lastDimInput = options_->get<int>("dimInput");
for(int i = 0; i < stackableFactories_.size(); ++i) {
for(size_t i = 0; i < stackableFactories_.size(); ++i) {
auto sf = stackableFactories_[i];
if(sf->is<CellFactory>()) {
@ -142,7 +142,7 @@ public:
Ptr<RNN> construct() {
auto rnn = New<RNN>(graph_, options_);
for(int i = 0; i < layerFactories_.size(); ++i) {
for(size_t i = 0; i < layerFactories_.size(); ++i) {
auto lf = layerFactories_[i];
lf->getOptions()->merge(options_);

View file

@ -161,7 +161,7 @@ public:
Expr output;
Expr layerInput = input;
for(int i = 0; i < rnns_.size(); ++i) {
for(size_t i = 0; i < rnns_.size(); ++i) {
auto lazyInput = layerInput;
auto cell = rnns_[i]->at(0);
@ -188,7 +188,7 @@ public:
Expr output;
Expr layerInput = input;
for(int i = 0; i < rnns_.size(); ++i) {
for(size_t i = 0; i < rnns_.size(); ++i) {
Expr lazyInput;
auto cell = rnns_[i]->at(0);
auto lazyInputs = cell->getLazyInputs(shared_from_this());
@ -217,7 +217,7 @@ public:
Expr output;
Expr layerInput = input;
for(int i = 0; i < rnns_.size(); ++i) {
for(size_t i = 0; i < rnns_.size(); ++i) {
auto lazyInput = layerInput;
auto cell = rnns_[i]->at(0);

View file

@ -221,7 +221,7 @@ public:
= stackables_[0]->as<Cell>()->applyState(mappedInputs, state, mask);
for(int i = 1; i < stackables_.size(); ++i) {
for(size_t i = 1; i < stackables_.size(); ++i) {
if(stackables_[i]->is<Cell>()) {
auto hiddenNext
= stackables_[i]->as<Cell>()->apply(lastInputs_, hidden, mask);

View file

@ -84,6 +84,7 @@ private:
size_t available_{0};
size_t step_{128 * 1024 * 1024};
size_t alignment_{256};
bool throw_{false};
std::set<Gap> gaps_;
@ -161,8 +162,8 @@ public:
size_t step,
size_t alignment = 256)
: device_(DispatchDevice(deviceId, alignment)),
step_(step),
available_(0),
step_(step),
alignment_(alignment) {
reserve(bytes);
}
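Swapping step_(step) and available_(0) makes the initializer list follow the members' declaration order. C++ initializes members in declaration order no matter how the list is written, so a mismatched list triggers -Wreorder and can mask reads of not-yet-initialized members. A minimal sketch:

#include <cstddef>

struct Alloc {
  size_t available_;
  size_t step_;
  // available_ is constructed before step_ regardless of list order;
  // writing the list in declaration order silences -Wreorder.
  Alloc(size_t step) : available_(0), step_(step) {}
};

int main() {
  Alloc a(128);
  return (int)a.available_;
}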

View file

@ -23,12 +23,12 @@ void gAddGeneric(Functor functor,
float scale = 1.0) {
int outLength = out.shape().elements();
bool same = outLength == full.elements();
for(int i = 0; i < K; ++i)
for(size_t i = 0; i < K; ++i)
same = same && outLength == ins[i].shape().elements();
constexpr size_t N = functional::Shape::size();
functional::Array<int, N> len;
for(int i = 0; i < N; ++i)
for(size_t i = 0; i < N; ++i)
len[i] = full[i] / out.shape()[i];
functional::Array<int, N> dims;
@ -75,7 +75,7 @@ void gAddReduce(Functor functor,
int cols = full.back();
bool same = true;
for(int i = 0; i < K; ++i)
for(size_t i = 0; i < K; ++i)
same = same && ins[i].shape().elements() == full.elements();
for(int j = 0; j < rows; ++j) {
@ -88,7 +88,7 @@ void gAddReduce(Functor functor,
for(int id = 0; id < cols; ++id) {
full.dims(j * cols + id, dims);
functional::Array<int, K> indices;
for(int i = 0; i < K; ++i)
for(size_t i = 0; i < K; ++i)
indices[i] = ins[i].shape().bindex(dims);
sum += functional::apply(functor, ins, indices);
}
@ -114,7 +114,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
cpu::gAddReduce(functor, full, gOut, gIns, scale);
} else if(out->shape() == full) {
bool broadcast = false;
for(int i = 0; i < K; ++i)
for(size_t i = 0; i < K; ++i)
broadcast = broadcast || gOut.shape() != gIns[i].shape();
cpu::gAddEqual(functor, gOut, gIns, scale, broadcast);
} else {

View file

@ -34,7 +34,7 @@ struct E {
// increase index for current dimension by stride or 0 if broadcasting.
// bstride(i) is a look-up value: equal to the stride if the
// corresponding dim is larger than 1, or 0 if the dim is 1.
for(int k = 0; k < K; ++k)
for(size_t k = 0; k < K; ++k)
indices[k] += tensors[k].shape().bstride(I);
}
}
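The comment above is the heart of the broadcasting scheme: a dimension of size 1 gets a broadcast stride (bstride) of 0, so the flat index does not advance along that axis and the single element is reused. A toy sketch of the idea with hardcoded strides (hypothetical, not the functional::Shape API):

#include <iostream>

int main() {
  // A 1x3 row broadcast against a 4x3 output: dim 0 has size 1, so its
  // broadcast stride is 0 and every output row reads the same input row.
  float row[3] = {1.f, 2.f, 3.f};
  int bstride0 = 0;  // size-1 dim: stride replaced by 0
  int bstride1 = 1;
  for(int i = 0; i < 4; ++i)
    for(int j = 0; j < 3; ++j)
      std::cout << row[i * bstride0 + j * bstride1] << (j == 2 ? "\n" : " ");
  return 0;
}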

View file

@ -52,10 +52,10 @@ inline void gInsertCols(float* out,
size_t offset_out,
size_t offset_in,
float beta) {
for(int j = 0; j < rows; ++j) {
for(size_t j = 0; j < rows; ++j) {
float* rowOut = out + j * cols_out + offset_out;
const float* rowIn = in + j * cols_in + offset_in;
for(int i = 0; i < cols; ++i) {
for(size_t i = 0; i < cols; ++i) {
rowOut[i] = rowIn[i] + beta * rowOut[i];
}
}
@ -85,7 +85,7 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
}
void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) {
if(ax == out->shape().size() - 1)
if(ax == (int)out->shape().size() - 1)
Concatenate1(out, inputs);
else
ConcatCont(out, inputs, ax);
@ -144,7 +144,7 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
}
void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
if(ax == in->shape().size() - 1)
if(ax == (int)in->shape().size() - 1)
Split1(outputs, in);
else
SplitCont(outputs, in, ax);
@ -228,8 +228,8 @@ template <bool add>
void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
functional::Array<int, functional::Shape::size()> permute;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < permute.size(); ++i)
if(i < diff)
for(size_t i = 0; i < permute.size(); ++i)
if((int)i < diff)
permute[i] = i;
else
permute[i] = vAxis[i - diff] + diff;
@ -244,7 +244,7 @@ void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
for(int index = 0; index < length; ++index) {
gOut.shape().dims(index, oDims);
for(int i = 0; i < N; ++i)
for(size_t i = 0; i < N; ++i)
pDims[permute[i]] = oDims[i];
if(add)
gOut[index] += gIn[pDims];
@ -339,17 +339,17 @@ void SoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
const float* adj = adj_->data();
const float* val = val_->data();
for(size_t j = 0; j < rows; ++j) {
for(int j = 0; j < rows; ++j) {
float* gradRow = grad + j * cols;
const float* adjRow = adj + j * cols;
const float* valRow = val + j * cols;
float sum = 0.f;
for(size_t i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
sum += valRow[i] * adjRow[i];
}
for(size_t i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
gradRow[i] += valRow[i] * (adjRow[i] - sum);
}
}
@ -389,7 +389,7 @@ void CopyRows(Tensor out_,
const float* in = in_->data();
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
for(size_t j = 0; j < rows; ++j) {
size_t dst = j;
size_t src = indices[j];
@ -409,14 +409,14 @@ void PasteRows(Tensor out_,
float* out = out_->data();
const float* in = in_->data();
for(int j = 0; j < rows; ++j) {
for(size_t j = 0; j < rows; ++j) {
size_t dst = indices[j]; // not a permutation - may alias, unlike PasteCols
size_t src = j;
float* rowOut = out + dst * cols;
const float* rowIn = in + src * cols;
for(int i = 0; i < cols; ++i) {
for(size_t i = 0; i < cols; ++i) {
rowOut[i] += rowIn[i];
}
}
@ -433,11 +433,11 @@ void CopyCols(Tensor out_,
const float* in = in_->data();
#pragma omp parallel for
for(int j = 0; j < rows; ++j) {
for(size_t j = 0; j < rows; ++j) {
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;
for(int i = 0; i < colsOut; ++i) {
for(size_t i = 0; i < colsOut; ++i) {
rowOut[i] = rowIn[indices[i]];
}
}
@ -456,11 +456,11 @@ void PasteCols(Tensor out_,
/* n.b. Unlike PasteRows, currently appears safe to assume indices[i] is a
* permutation i.e. no racy aliases, and no need to sum vs. just assign.
*/
for(int j = 0; j < rows; ++j) {
for(size_t j = 0; j < rows; ++j) {
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;
for(int i = 0; i < colsIn; ++i) {
for(size_t i = 0; i < colsIn; ++i) {
rowOut[indices[i]] += rowIn[i];
}
}
@ -606,18 +606,19 @@ void GRUFastBackward(std::vector<Tensor> outputs,
rowOutXW[l] += dfdxW_x;
if(outSU)
rowOutSU[l] += dfdxW_x * r;
if(outB)
if(outB) {
if(final)
outB[l] += dfdxW_x * r;
else
outB[l] += dfdxW_x;
}
}
}
}
void CrossEntropyPick(Tensor out_, Tensor in_, Tensor pick_) {
float* out = out_->data();
Shape& outShape = out_->shape();
//Shape& outShape = out_->shape();
const float* in = in_->data();
Shape& inShape = in_->shape();
float* pick = pick_->data();
@ -709,14 +710,14 @@ void Att(Tensor out_, Tensor va_, Tensor context_, Tensor state_) {
int cols = k;
#pragma omp parallel for
for(size_t j = 0; j < rows; ++j) {
for(int j = 0; j < rows; ++j) {
const float* vaRow = va;
const float* ctxRow = ctx + (j % (b * t)) * cols;
const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols;
float sum = 0.f;
#pragma omp simd reduction(+ : sum)
for(size_t i = 0; i < cols; ++i) {
for(int i = 0; i < cols; ++i) {
float z = ctxRow[i] + stateRow[i];
sum += std::tanh(z) * vaRow[i];
}
@ -930,7 +931,7 @@ void Shift(Tensor out_,
float padValue,
bool invert) {
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
for(size_t i = 0; i < shift.size(); ++i)
offset += in_->shape().stride(i) * shift[i];
if(invert)
@ -953,7 +954,7 @@ void Shift(Tensor out_,
void ShiftGrad(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
for(size_t i = 0; i < shift.size(); ++i)
offset += in_->shape().stride(i) * shift[i];
if(invert)
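Shift turns a per-dimension shift into a single flat offset by summing stride(i) * shift[i], exactly as the loops above do. A small sketch with a hypothetical row-major 2x3 tensor:

#include <iostream>

int main() {
  // Row-major strides of a 2x3 tensor are {3, 1}; shifting by {1, 0}
  // (one step along dim 0) is a flat offset of 1*3 + 0*1 = 3.
  int stride[2] = {3, 1};
  int shift[2] = {1, 0};
  int offset = 0;
  for(int i = 0; i < 2; ++i)
    offset += stride[i] * shift[i];
  std::cout << offset << "\n";  // prints 3
  return 0;
}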

View file

@ -21,10 +21,9 @@ namespace marian {
class TensorBase : public std::enable_shared_from_this<TensorBase> {
private:
Type type_{Type::float32};
Shape shape_;
Ptr<MemoryPiece> memory_;
Shape shape_;
Type type_{Type::float32};
Ptr<Backend> backend_;
public:
@ -221,7 +220,7 @@ public:
type_);
if(backend_->getDevice().type == DeviceType::cpu) {
for(int i = 0; i < k.size(); ++i)
for(size_t i = 0; i < k.size(); ++i)
data()[k[i]] = v[i];
}
#ifdef CUDA_FOUND
@ -270,18 +269,18 @@ public:
std::vector<T> values(totSize);
get(values);
size_t dispCols = 5;
int dispCols = 5;
if(isFloat(type_))
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
else
strm << std::fixed << std::setprecision(0) << std::setfill(' ');
for(int i = 0; i < values.size(); ++i) {
for(size_t i = 0; i < values.size(); ++i) {
std::vector<int> dims;
shape().dims(i, dims);
bool disp = true;
for(int j = 0; j < dims.size(); ++j)
for(size_t j = 0; j < dims.size(); ++j)
disp = disp && (dims[j] < dispCols || dims[j] >= shape()[j] - dispCols);
if(disp) {
@ -320,14 +319,14 @@ public:
bool prev = true;
for(int j = dims.size() - 1; j >= 0; --j) {
if(j < dims.size() - 1)
if(j < (int)dims.size() - 1)
prev = prev && dims[j + 1] + 1 == shape()[j + 1];
if(prev && dims[j] + 1 == dispCols && shape()[j] > 2 * dispCols) {
if(j < dims.size() - 1)
if(j < (int)dims.size() - 1)
for(int k = 0; k <= j; ++k)
strm << " ";
strm << "... ";
if(j < dims.size() - 1)
if(j < (int)dims.size() - 1)
strm << std::endl;
break;
}

View file

@ -23,7 +23,7 @@ public:
int pos = 0;
std::vector<std::thread> group;
// iterate over all shards
for(int idx = 0; idx < graphs_.size(); ++idx) {
for(size_t idx = 0; idx < graphs_.size(); ++idx) {
int size = std::min(shardSize, totalSize);
group.emplace_back(func, idx, pos);
@ -160,7 +160,7 @@ public:
auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph, apart from last graph
for(int i = 0; i < graphs_.size() - 1; ++i) {
for(int i = 0; i < (int)graphs_.size() - 1; ++i) {
auto subParam
= graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);

View file

@ -21,7 +21,7 @@ void AsyncGraphGroup::fetchParams(Tensor oldParams,
int pos = 0;
std::vector<std::thread> threads;
for(int idx = 0; idx < devices_.size(); idx++) {
for(size_t idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[&](int idx, int pos) {
// individual mutex per-shard
@ -44,7 +44,7 @@ void AsyncGraphGroup::pushGradients(Tensor newGrads,
// add instead of copy?
std::vector<std::thread> threads;
int pos = 0;
for(int idx = 0; idx < devices_.size(); idx++) {
for(size_t idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[&](int idx, int pos) {
// individual mutex per-shard

View file

@ -132,8 +132,8 @@ public:
}
void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
size_t idx = 0;
for(size_t i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;

View file

@ -99,7 +99,7 @@ void MultiNodeGraphGroup::setupClients(Ptr<data::Batch> batch) {
* batch.
*/
void MultiNodeGraphGroup::runBatchThroughClientGraphs(Ptr<data::Batch> batch) {
for(int i = 0; i < devices_.size(); i++) {
for(size_t i = 0; i < devices_.size(); i++) {
THREAD_GUARD(clientBuilders_[i]->build(clientGraphs_[i], batch);
clientGraphs_[i]->forward();
clientGraphs_[i]->getBackend()->synchronize(););
@ -130,7 +130,7 @@ void MultiNodeGraphGroup::calculateNodeSizes() {
void MultiNodeGraphGroup::initClientCpuBuffers() {
// Initialize CPU buffers used to send GPU data through MPI (can't send
// directly from GPUs)
for(int i = 0; i < devices_.size(); i++) {
for(size_t i = 0; i < devices_.size(); i++) {
// @TODO Optimization: Use full size to copy in one go, then send gradients
// and receive parameters in parallel
size_t size = nodeSizes_[mpi_my_rank_];
@ -163,7 +163,7 @@ void MultiNodeGraphGroup::initClientCommOverlapVars() {
*/
void MultiNodeGraphGroup::initClientCommOverlapGpuTensors() {
size_t modelSize = clientGraphs_[0]->params()->vals()->size();
for(int client = 0; client < devices_.size(); client++) {
for(size_t client = 0; client < devices_.size(); client++) {
// Communication overlap buffer (for grads + params)
Tensor commOverlapBuffer
= newTensor(modelSize, clientGraphs_[client]->getBackend());
@ -193,7 +193,7 @@ void MultiNodeGraphGroup::setupServerShards() {
// CPU buffer for receiving/sending grads/params
serverShardBufferCPU_ = std::vector<float>(nodeSizes_[mpi_my_rank_]);
// Shard optimizers
for(int shard = 0; shard < devices_.size(); shard++) {
for(size_t shard = 0; shard < devices_.size(); shard++) {
shardOptimizers_.push_back(Optimizer(options_));
}
// Mutexes to prevent simultaneous access to tensors and/or optimizers
@ -208,7 +208,7 @@ void MultiNodeGraphGroup::setupServerShards() {
void MultiNodeGraphGroup::calculateShardSizes() {
size_t nodeSize = nodeSizes_[mpi_my_rank_];
size_t shardSize = ceilf(((float)nodeSize) / devices_.size());
for(int shard = 0; shard < devices_.size(); shard++) {
for(size_t shard = 0; shard < devices_.size(); shard++) {
size_t remainingNodeSize = nodeSize - (shardSize * shard);
// Takes care of edge case where last shard is smaller than the others
shardSizes_.push_back(std::min(shardSize, remainingNodeSize));
@ -224,7 +224,7 @@ void MultiNodeGraphGroup::initShardGpuTensors() {
for(int i = 0; i < mpi_my_rank_; i++) {
offset += nodeSizes_[i];
}
for(int shard = 0; shard < devices_.size(); shard++) {
for(size_t shard = 0; shard < devices_.size(); shard++) {
Tensor gpuParams
= newTensor(shardSizes_[shard], clientGraphs_[shard]->getBackend());
gpuParams->copyFrom(clientGraphs_[0]->params()->vals()->subtensor(
@ -379,7 +379,7 @@ void MultiNodeGraphGroup::launchCommOverlapThreads() {
*/
void MultiNodeGraphGroup::shutDownCommOverlapThreads() {
stopClientCommThreads_ = true;
for(int gpu = 0; gpu < devices_.size(); gpu++) {
for(size_t gpu = 0; gpu < devices_.size(); gpu++) {
clientCommOverlapBuffersFilled_[gpu] = true;
cvClientCommOverlapBuffersFilled_[gpu]
.notify_one(); // Unblock thread from lock, then join it

View file

@ -382,7 +382,9 @@ protected:
* number of GPUs on the other nodes.
*/
void loadDeviceConfig(std::vector<size_t> deviceConfig) {
size_t index = 0, node = 0, nClientsSeen = 0;
size_t index = 0;
int node = 0;
int nClientsSeen = 0;
numberClientsOfNodes_ = std::vector<int>(mpi_comm_world_size_, 0);
while(index < deviceConfig.size()) {
if(numberClientsOfNodes_[node] == 0) {
@ -407,9 +409,8 @@ public:
*/
MultiNodeGraphGroup(Ptr<Config> options)
: GraphGroup(options),
tau_{options_->get<size_t>("optimizer-delay")},
// useLocalOpt_{options_->get<bool>("multi-node-local-optimizers")},
clientCommOverlap{options_->get<bool>("multi-node-overlap")} {
clientCommOverlap{options_->get<bool>("multi-node-overlap")},
tau_{options_->get<size_t>("optimizer-delay")} {
// Set up devices for this node
setupMPI(); // Setup MPI before creating device vectors
std::vector<size_t> devices;
@ -448,8 +449,8 @@ public:
*/
void update(Ptr<data::Batch> batch) {
ABORT_IF(finalized_, "Training has already finished.");
if(batchIter_ % mpi_comm_world_size_
== mpi_my_rank_) { // Only take batch assigned to this node
// Only take batch assigned to this node
if(batchIter_ % mpi_comm_world_size_ == (size_t)mpi_my_rank_) {
execute(batch);
}
batchIter_++;
@ -489,8 +490,8 @@ public:
* Save model of given graph to disk.
*/
void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < clientGraphs_.size(); ++i) {
size_t idx = 0;
for(size_t i = 0; i < clientGraphs_.size(); ++i) {
if(graph == clientGraphs_[i]) {
idx = i;
break;
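The rewritten condition in update keeps the same round-robin scheme, now with the MPI rank cast to size_t to match batchIter_: of W nodes, node r processes exactly the batches whose counter is congruent to r modulo W. A minimal sketch with hypothetical values:

#include <cstddef>
#include <iostream>

int main() {
  size_t worldSize = 3;  // number of MPI nodes (hypothetical)
  int myRank = 1;        // MPI ranks are signed ints, hence the cast below
  for(size_t batchIter = 0; batchIter < 9; ++batchIter)
    if(batchIter % worldSize == (size_t)myRank)  // takes batches 1, 4, 7
      std::cout << "node " << myRank << " takes batch " << batchIter << "\n";
  return 0;
}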

View file

@ -101,10 +101,10 @@ void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
std::vector<std::vector<Ptr<data::Batch>>> delayedBatches;
for(int i = 0; i < delay_; ++i) {
for(size_t i = 0; i < delay_; ++i) {
if(i * devs < batches.size()) {
delayedBatches.emplace_back();
for(int j = 0; j < devs; ++j) {
for(size_t j = 0; j < devs; ++j) {
size_t index = i * devs + j;
if(index < batches.size())
delayedBatches.back().push_back(batches[i * devs + j]);
@ -249,8 +249,8 @@ void SyncGraphGroup::save(bool final) {
}
void SyncGraphGroup::save(Ptr<ExpressionGraph> graph, bool final) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
size_t idx = 0;
for(size_t i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;

View file

@ -52,17 +52,17 @@ public:
bool keepGoing() {
// stop if it reached the maximum number of epochs
int stopAfterEpochs = options_->get<size_t>("after-epochs");
size_t stopAfterEpochs = options_->get<size_t>("after-epochs");
if(stopAfterEpochs > 0 && state_->epochs > stopAfterEpochs)
return false;
// stop if it reached the maximum number of batch updates
int stopAfterBatches = options_->get<size_t>("after-batches");
size_t stopAfterBatches = options_->get<size_t>("after-batches");
if(stopAfterBatches > 0 && state_->batches >= stopAfterBatches)
return false;
// stop if the first validator did not improve for a given number of checks
int stopAfterStalled = options_->get<size_t>("early-stopping");
size_t stopAfterStalled = options_->get<size_t>("early-stopping");
if(stopAfterStalled > 0 && !validators_.empty()
&& stalled() >= stopAfterStalled)
return false;
@ -313,20 +313,20 @@ public:
if(strategy == "epoch" || strategy == "epoch+batches"
|| strategy == "epoch+stalled") {
int startEpoch
size_t startEpoch
= options_->get<std::vector<size_t>>("lr-decay-start").front();
if(startEpoch && state.epochs >= startEpoch)
decay = true;
}
if(strategy == "epoch+batches") {
int startBatches
size_t startBatches
= options_->get<std::vector<size_t>>("lr-decay-start")[1];
if(startBatches && state.batches >= startBatches)
decay = true;
}
if(strategy == "epoch+stalled") {
int startStalled
size_t startStalled
= options_->get<std::vector<size_t>>("lr-decay-start")[1];
if(startStalled && state.maxStalled >= startStalled)
decay = true;
@ -361,7 +361,7 @@ public:
if(factor > 0.0) {
if("batches" == options_->get<std::string>("lr-decay-strategy")) {
int start
size_t start
= options_->get<std::vector<size_t>>("lr-decay-start").front();
int freq = options_->get<size_t>("lr-decay-freq");

View file

@ -92,7 +92,7 @@ public:
observer->actAfterBatches(*this);
}
void newStalled(int num) {
void newStalled(size_t num) {
stalled = num;
if(num > maxStalled)
++maxStalled;

View file

@ -58,7 +58,7 @@ public:
Validator(std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options,
bool lowerIsBetter = true)
: ValidatorBase(lowerIsBetter), options_(options), vocabs_(vocabs) {}
: ValidatorBase(lowerIsBetter), vocabs_(vocabs), options_(options) {}
virtual float validate(const std::vector<Ptr<ExpressionGraph>>& graphs) {
using namespace data;
@ -508,7 +508,7 @@ protected:
size_t width = subBatch->batchWidth();
Words ref; // fill ref
for(int i = 0; i < width; ++i) {
for(size_t i = 0; i < width; ++i) {
Word w = subBatch->data()[i * size + no];
if(w == eos)
break;

View file

@ -46,7 +46,7 @@ public:
// Use alignments from the first scorer, even if ensemble
alignments = scorers_[0]->getAlignment();
for(int i = 0; i < keys.size(); ++i) {
for(size_t i = 0; i < keys.size(); ++i) {
// Keys contains indices to vocab items in the entire beam.
// Values can be between 0 and beamSize * vocabSize.
int embIdx = keys[i] % vocabSize;
@ -72,7 +72,7 @@ public:
hypIdxTrans = hypIdx;
int beamHypIdx = hypIdx % beamSize;
if(beamHypIdx >= beam.size())
if(beamHypIdx >= (int)beam.size())
beamHypIdx = beamHypIdx % beam.size();
if(first)
@ -84,7 +84,7 @@ public:
if(options_->get<bool>("n-best")) {
std::vector<float> breakDown(states.size(), 0);
beam[beamHypIdx]->GetCostBreakdown().resize(states.size(), 0);
for(int j = 0; j < states.size(); ++j) {
for(size_t j = 0; j < states.size(); ++j) {
int key = embIdx + hypIdxTrans * vocabSize;
breakDown[j] = states[j]->breakDown(key)
+ beam[beamHypIdx]->GetCostBreakdown()[j];
@ -213,8 +213,8 @@ public:
int dimBatch = batch->size();
for(int i = 0; i < localBeamSize; ++i) {
for(int j = 0; j < beams.size(); ++j) {
for(size_t i = 0; i < localBeamSize; ++i) {
for(size_t j = 0; j < beams.size(); ++j) {
auto& beam = beams[j];
if(i < beam.size()) {
auto hyp = beam[i];
@ -238,7 +238,7 @@ public:
auto totalCosts = prevCosts;
// BUGBUG: it's not cost but score (higher=better)
for(int i = 0; i < scorers_.size(); ++i) {
for(size_t i = 0; i < scorers_.size(); ++i) {
states[i] = scorers_[i]->step(
graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);

View file

@ -63,14 +63,12 @@ public:
}
protected:
UPtr<OutputFileStream> outStrm_;
boost::mutex mutex_;
long nextId_;
typedef std::map<long, std::pair<std::string, std::string>> Outputs;
Outputs outputs_;
long nextId_;
UPtr<OutputFileStream> outStrm_;
Ptr<PrintingStrategy> printing_;
boost::mutex mutex_;
};
class StringCollector {

View file

@ -31,13 +31,12 @@ std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options) {
std::vector<Ptr<Scorer>> scorers;
auto models = options->get<std::vector<std::string>>("models");
int dimVocab = options->get<std::vector<int>>("dim-vocabs").back();
std::vector<float> weights(models.size(), 1.f);
if(options->has("weights"))
weights = options->get<std::vector<float>>("weights");
int i = 0;
size_t i = 0;
for(auto model : models) {
std::string fname = "F" + std::to_string(i);
auto modelOptions = New<Config>(*options);