Fixed bugs; added C++ speedups.

This commit is contained in:
Jonas Finnemann Jensen 2013-11-25 15:12:50 -08:00
Родитель e238dcbbfa
Коммит 76bfc18e40
28 изменённых файлов: 1308 добавлений и 14 удалений

10
.gitignore поставляемый
Просмотреть файл

@ -1,10 +1,2 @@
*.pyc
dashboard.zip
histogram_tools.py
histogram_specs.py
specs.py
histogram_specs.json
validation/
html/data
out.txt
Histograms.json
build/

18
cmake/externals.cmake Normal file
Просмотреть файл

@ -0,0 +1,18 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -isystem ${RAPIDJSON_INCLUDE_DIRS}")

Просмотреть файл

@ -27,8 +27,8 @@ class HistogramAggregator:
aggregator1.merge(**aggregator2.dump())
"""
def __init__(self, values = [], buildId = "", revision = None):
replace_nan_inf(values)
self.values = values
self.values = list(values)
replace_nan_inf(self.values)
self.buildId = buildId
self.revision = revision
@ -37,13 +37,13 @@ class HistogramAggregator:
if len(self.values) != len(values):
# Choose the histogram with highest buildId
if self.buildId < buildId:
self.values = values
self.values = list(values)
self.buildId = buildId
self.revision = revision
else:
if self.buildId < buildId:
self.values = values
self.buildId = buildId
self.revision = revision
for i in xrange(0, len(values) - 6):
self.values[i] += values[i]
# Entries [-6:-1] may have -1 indicating missing entry
@ -76,4 +76,4 @@ def replace_nan_inf(values):
elif math.isnan(val):
# this isn't good... but we can't handle all possible corner cases
# NaN shouldn't be possible... besides it's not known to happen
values[i] = null
values[i] = None

59
src/Aggregator.cpp Normal file
Просмотреть файл

@ -0,0 +1,59 @@
#include "cache/ResultSet.h"
#include "cache/MeasureFile.h"
#include "cache/Aggregate.h"
#include <unistd.h>
#include <iostream>
/** Print command line usage for the aggregate tool */
void usage() {
  fputs("Usage: aggregate -o [FILE]\n", stdout);
}
using namespace std;
/**
 * Entry point for the aggregate tool.
 *
 * Parses command line options (-o output file, -h help). The aggregation loop
 * that reads input file names from stdin is still commented out, so currently
 * only an empty ResultSet is constructed.
 */
int main(int argc, char *argv[]) {
  FILE* output = stdout;
  // Parse arguments
  int c;
  while ((c = getopt(argc, argv, "ho:")) != -1) {
    switch (c) {
      case 'o':
        output = fopen(optarg, "w");
        // Bug fix: fopen() was unchecked; a bad path caused fclose(NULL)
        if (!output) {
          fprintf(stderr, "aggregate: cannot open '%s' for writing\n", optarg);
          return 1;
        }
        break;
      case 'h':
        usage();
        exit(0);
        break;
      case '?':
        usage();
        return 1;
        break;
      default:
        abort();
    }
  }
  // Aggregated result set
  ResultSet set;
  /*
  // Read input file names from stdin
  cin.sync_with_stdio(false);
  string filename;
  while(getline(cin, filename)) {
    set.aggregate(filename.data());
  }
  set.output(output);
  */
  // Close output file
  fclose(output);
  return 0;
}

24
src/CMakeLists.txt Normal file
Просмотреть файл

@ -0,0 +1,24 @@
# mergeresults: merges result-set lines (see MergeResult.cpp)
set(MERGERESULTS_SRC
  cache/InternedString.cpp
  cache/Aggregate.cpp
  cache/MeasureFile.cpp
  cache/ResultSet.cpp
  MergeResult.cpp
)
add_executable(mergeresults ${MERGERESULTS_SRC})
# rapidjson is header-only; the dependency only orders the SVN checkout first
add_dependencies(mergeresults rapidjson)
target_link_libraries(mergeresults)
# aggregator: aggregates compressed input files (work in progress)
set(AGGREGATOR_SRC
  cache/InternedString.cpp
  cache/Aggregate.cpp
  cache/MeasureFile.cpp
  cache/ResultSet.cpp
  Aggregator.cpp
)
include_directories(${LIBLZMA_INCLUDE_DIRS})
add_executable(aggregator ${AGGREGATOR_SRC})
add_dependencies(aggregator rapidjson)
target_link_libraries(aggregator ${LIBLZMA_LIBRARIES})

Просмотреть файл

@ -0,0 +1,184 @@
#include "CompressedFileReader.h"
#include <errno.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#define INBUF_SIZE (4 * 4096)
/**
 * Create a reader decoding `input` via liblzma. lzma_auto_decoder() handles
 * both .xz and legacy .lzma input; LZMA_CONCATENATED keeps decoding
 * concatenated streams until end of input. On initialization failure an error
 * is printed to stderr and _stream stays null, so nextLine() returns null
 * immediately. The FILE* is borrowed; the caller closes it.
 */
CompressedFileReader::CompressedFileReader(FILE* input)
  : _input(input), _stream(nullptr), _inbuf(nullptr), _outbuf(nullptr),
    _size(4096), _nextLine(nullptr) {
  // Allocate buffers
  // NOTE(review): malloc() result is unchecked - confirm crashing on OOM is
  // acceptable here
  _inbuf = new uint8_t[INBUF_SIZE];
  _outbuf = (uint8_t*)malloc(_size);
  _nextLine = _outbuf;
  // Allocate and initialize the decoder stream
  _stream = new lzma_stream;
  *_stream = LZMA_STREAM_INIT;
  // Initialize the decoding stream
  lzma_ret ret = lzma_auto_decoder(_stream, UINT64_MAX, LZMA_CONCATENATED);
  if (ret != LZMA_OK) {
    // Translate the error code to a human readable message
    const char* msg;
    switch (ret) {
      case LZMA_MEM_ERROR:
        msg = "unable to allocate memory";
        break;
      case LZMA_OPTIONS_ERROR:
        msg = "invalid options";
        break;
      case LZMA_PROG_ERROR:
        msg = "unknown error";
        break;
      default:
        assert(false);
        msg = "invalid error code";
        break;
    }
    // Print error message
    fprintf(
      stderr,
      "CompressedFileReader: lzma_auto_decoder() failed, %s\n",
      msg
    );
    // Delete stream; nextLine() treats a null stream as end-of-stream
    delete _stream;
    _stream = nullptr;
    return;
  }
  // Setup stream: no input yet, decode into the whole output buffer
  _stream->next_in = NULL;
  _stream->avail_in = 0;
  _stream->next_out = _outbuf;
  _stream->avail_out = _size;
}
/**
 * Get next line, null, if at end of stream or error (errors are printed to
 * stderr). The returned pointer is valid until the next invocation or
 * destruction of the reader.
 *
 * Fixes over the previous version:
 *  - removed leftover debug fprintf()s (which also used wrong %i specifiers)
 *  - at LZMA_STREAM_END with a non-full buffer, an assert fired (and release
 *    builds then dereferenced the freed/null _stream on the next iteration)
 *  - a final line without a trailing '\n' was lost; it is now returned
 *  - action is set to LZMA_FINISH whenever the file is exhausted, so a call
 *    entered after EOF cannot spin forever with LZMA_RUN and no input
 *  - realloc() and non-OK lzma_code() results are now handled
 */
char* CompressedFileReader::nextLine() {
  // No stream means end-of-stream was reached or an error occurred earlier
  if (!_stream) {
    return nullptr;
  }
  lzma_action action = LZMA_RUN;
  // Compact the buffer: move the unconsumed tail [_nextLine, next_out) to the
  // front of _outbuf so the line we return always starts at _outbuf
  size_t pending = (size_t)(_stream->next_out - _nextLine);
  memmove(_outbuf, _nextLine, pending);
  _stream->next_out = _outbuf + pending;
  _stream->avail_out = _size - pending;
  _nextLine = _outbuf;
  // Scan data decoded on a previous call first
  while (_nextLine < _stream->next_out) {
    if (*_nextLine == '\n') {
      *_nextLine = '\0';
      _nextLine += 1;
      return (char*)_outbuf;
    }
    _nextLine++;
  }
  // Decode until a line break is found or the stream ends
  while (true) {
    // If there is no available input, read from file
    if (_stream->avail_in == 0 && !feof(_input)) {
      _stream->next_in = _inbuf;
      _stream->avail_in = fread(_inbuf, 1, INBUF_SIZE, _input);
      if (ferror(_input)) {
        fprintf(
          stderr,
          "CompressedFileReader: fread() failed, %s\n",
          strerror(errno)
        );
        // Free streaming context, we can't continue after an error
        lzma_end(_stream);
        delete _stream;
        _stream = nullptr;
        return nullptr;
      }
    }
    // Once the file is exhausted, ask liblzma to finish, flushing buffers
    if (feof(_input)) {
      action = LZMA_FINISH;
    }
    // Decode LZMA stream
    lzma_ret ret = lzma_code(_stream, action);
    if (ret != LZMA_OK && ret != LZMA_STREAM_END) {
      fprintf(stderr, "CompressedFileReader: lzma_code() failed, code %d\n",
              (int)ret);
      lzma_end(_stream);
      delete _stream;
      _stream = nullptr;
      return nullptr;
    }
    // Scan newly decoded bytes for a line break
    while (_nextLine < _stream->next_out) {
      if (*_nextLine == '\n') {
        *_nextLine = '\0';
        _nextLine += 1;
        return (char*)_outbuf;
      }
      _nextLine++;
    }
    // Grow the output buffer when it is full; doubling gives amortized O(1)
    // and also guarantees at least one spare byte for the terminator below
    if (_stream->avail_out == 0) {
      uint8_t* grown = (uint8_t*)realloc(_outbuf, _size * 2);
      if (!grown) {
        fprintf(stderr, "CompressedFileReader: realloc() failed\n");
        lzma_end(_stream);
        delete _stream;
        _stream = nullptr;
        return nullptr;
      }
      _outbuf = grown;
      _nextLine = _outbuf + _size;
      _stream->next_out = _outbuf + _size;
      _stream->avail_out = _size;
      _size *= 2;
    }
    // Stream is done: free the decoder and hand out any trailing partial line
    if (ret == LZMA_STREAM_END) {
      lzma_end(_stream);
      delete _stream;
      _stream = nullptr;
      if (_nextLine > _outbuf) {
        // Data without a final '\n' is returned as the last line
        *_nextLine = '\0';
        return (char*)_outbuf;
      }
      return nullptr;
    }
  }
}
/**
 * Destroy the reader, releasing the decoder stream (if the stream did not
 * already end or fail) and both buffers. The input FILE* is not closed.
 */
CompressedFileReader::~CompressedFileReader() {
  // Free stream if not at end
  if (_stream) {
    lzma_end(_stream);
    delete _stream;
    _stream = nullptr;
  }
  // Release other buffers
  free(_outbuf);
  delete[] _inbuf;
}

Просмотреть файл

@ -0,0 +1,42 @@
#ifndef COMPRESSEDFILEREADER_H
#define COMPRESSEDFILEREADER_H
#include <stdio.h>
#include <lzma.h>
/**
 * Read compressed files line by line.
 *
 * Wraps a FILE* with a liblzma decoder and hands out NUL-terminated lines one
 * at a time. The FILE* is borrowed: the caller opens and closes it.
 */
class CompressedFileReader {
  /** Input file object (borrowed, never closed by this class) */
  FILE* _input;
  /** lzma decoder stream; nullptr after end of stream or an error */
  lzma_stream* _stream;
  /** Input buffer, buffering data from input file to decoder */
  uint8_t* _inbuf;
  /** Output buffer, returned when a line is read */
  uint8_t* _outbuf;
  /** Size of bytes allocated for _outbuf, doubled when more space is needed */
  size_t _size;
  /** Position where next line starts, _outbuf, if no line available */
  uint8_t* _nextLine;
public:
  /** Create a compressed file reader */
  CompressedFileReader(FILE* input);
  /**
   * Get next line, null, if at end of stream or error, errors are also printed
   * to stderr. The returned pointer is valid until next invocation or
   * destruction of the CompressedFileReader.
   */
  char* nextLine();
  /** Destroy compressed file reader, freeing all allocated memory */
  ~CompressedFileReader();
};
#endif // COMPRESSEDFILEREADER_H

128
src/MergeResult.cpp Normal file
Просмотреть файл

@ -0,0 +1,128 @@
#include "cache/ResultSet.h"
#include "cache/MeasureFile.h"
#include "cache/Aggregate.h"
#include "rapidjson/document.h"
#include <unistd.h>
#include <string>
#include <fstream>
#include <iostream>
#include <vector>
/** Print command line usage for the mergeresults tool */
void usage() {
  fputs("Usage: mergeresults -i [FILE] -o [FILE]\n", stdout);
}
using namespace std;
using namespace rapidjson;
/**
 * Entry point for the mergeresults tool.
 *
 * With -i options, each given result file is merged into a ResultSet and the
 * whole set is written to the output. Without -i, lines are read from stdin
 * and a MeasureFile is flushed whenever the file path (text before the tab)
 * changes; this merges sorted input correctly, e.g. piped from GNU sort.
 */
int main(int argc, char *argv[]) {
  vector<char*> inputs;
  FILE* output = stdout;
  // Parse arguments
  int c;
  while ((c = getopt(argc, argv, "hi:o:")) != -1) {
    switch (c) {
      case 'i':
        inputs.push_back(optarg);
        break;
      case 'o':
        output = fopen(optarg, "w");
        // Bug fix: fopen() was unchecked; a bad path caused fclose(NULL)
        if (!output) {
          fprintf(stderr, "mergeresults: cannot open '%s' for writing\n",
                  optarg);
          return 1;
        }
        break;
      case 'h':
        usage();
        exit(0);
        break;
      case '?':
        usage();
        return 1;
        break;
      default:
        abort();
    }
  }
  // If input files are given read them
  if (!inputs.empty()) {
    // Input one by one
    ResultSet set;
    for (auto file : inputs) {
      ifstream stream(file);
      // Bug fix: an unreadable input silently merged nothing
      if (!stream) {
        fprintf(stderr, "mergeresults: cannot open '%s' for reading\n", file);
        return 1;
      }
      set.mergeStream(stream);
    }
    set.output(output);
  } else {
    // if no input files are given, we read from cin and output whenever the
    // the filePath changes. This will mergeresult of sorted input, hence,
    // perfect when piping in from GNU sort, which can efficiently merge sorted
    // files
    cin.sync_with_stdio(false);
    // filePath and measureFile currently aggregated
    string filePath;
    MeasureFile* measureFile = nullptr;
    string line;
    int nb_line = 0;
    while (getline(cin, line)) {
      nb_line++;
      // Find delimiter
      size_t del = line.find('\t');
      if (del == string::npos) {
        fprintf(stderr, "No tab on line %i\n", nb_line);
        continue;
      }
      // Find current file path
      string currentFilePath = line.substr(0, del);
      // If we've reached a new filePath, output the old one
      if (filePath != currentFilePath) {
        if (measureFile) {
          measureFile->output(output, filePath);
          delete measureFile;
          measureFile = nullptr;
        }
        filePath = currentFilePath;
      }
      // Parse JSON document
      Document d;
      d.Parse<0>(line.data() + del + 1);
      // Check that we have an object
      if (!d.IsObject()) {
        fprintf(stderr, "JSON root is not an object on line %i\n", nb_line);
        continue;
      }
      // Allocate MeasureFile if not already aggregated
      if (!measureFile) {
        measureFile = new MeasureFile();
      }
      // Merge in JSON
      measureFile->mergeJSON(d);
    }
    // Output last MeasureFile, if there was ever one
    if (measureFile) {
      measureFile->output(output, filePath);
      delete measureFile;
      measureFile = nullptr;
    }
  }
  // Close output file
  fclose(output);
  return 0;
}

113
src/cache/Aggregate.cpp поставляемый Normal file
Просмотреть файл

@ -0,0 +1,113 @@
#include "Aggregate.h"
#include <stdio.h>
using namespace std;
using namespace rapidjson;
InternedStringContext Aggregate::_buildIdStringCtx;
InternedStringContext Aggregate::_revisionStringCtx;
/**
 * Merge a histogram dump {"values": [...], "revision": ..., "buildId": ...}
 * into this aggregate.
 *
 * If the incoming array length differs from the current one, the entry with
 * the highest buildId wins and replaces the other. Otherwise entries are
 * summed element-wise; the 5 entries before the last may be -1, indicating a
 * missing entry (mirrors the Python HistogramAggregator), and -1 + -1 is kept
 * as -1 rather than accumulated.
 */
void Aggregate::mergeJSON(const Value& dump) {
  const Value::Member* jvalues = dump.FindMember("values");
  const Value::Member* jrevision = dump.FindMember("revision");
  const Value::Member* jbuildId = dump.FindMember("buildId");
  if (!jvalues || !jvalues->value.IsArray()) {
    fprintf(stderr, "'values' in dump isn't an array\n");
    return;
  }
  if (!jrevision || !jrevision->value.IsString()) {
    fprintf(stderr, "'revision' in dump isn't a string\n");
    return;
  }
  if (!jbuildId || !jbuildId->value.IsString()) {
    fprintf(stderr, "'buildId' in dump isn't a string\n");
    return;
  }
  const char* buildId = jbuildId->value.GetString();
  const char* revision = jrevision->value.GetString();
  const Value& values = jvalues->value;
  size_t length = values.Size();
  // Check length of values. Bug fix: this used to only warn and fall through,
  // after which the summing code below indexed out of bounds (and `length - 6`
  // underflowed as size_t)
  if (length == 0) {
    fprintf(stderr, "Empty 'values' array in dump!\n");
    return;
  }
  // Check that we have doubles
  for (size_t i = 0; i < length; i++) {
    if (!values[i].IsNumber()) {
      fprintf(stderr, "Array contains non-double value!\n");
      return;
    }
  }
  // Check if length matches
  if (_length != length) {
    // Replace if we have newer buildId or current length is zero
    if (_length == 0 || _buildId < buildId) {
      // Set buildId and revision
      _buildId = _buildIdStringCtx.createString(buildId);
      _revision = _revisionStringCtx.createString(revision);
      _length = length;
      // Free old values
      if (_values) {
        delete[] _values;
      }
      _values = new double[length];
      for (size_t i = 0; i < length; i++) {
        _values[i] = values[i].GetDouble();
      }
    }
  } else {
    // Update revision and buildId if we have a newer one
    if (_buildId < buildId) {
      _buildId = _buildIdStringCtx.createString(buildId);
      _revision = _revisionStringCtx.createString(revision);
    }
    // Plain sum for all entries before the trailing 6. Bug fix: guard the
    // subtraction; for length < 7 the old `length - 6` wrapped around as
    // size_t and the loop ran off the end of the array
    size_t i = 0;
    size_t plain = (length > 6) ? (length - 6) : 0;
    for (; i < plain; i++) {
      _values[i] += values[i].GetDouble();
    }
    // Entries between `plain` and the last may be -1 (missing entry)
    for (; i + 1 < length; i++) {
      double val = values[i].GetDouble();
      // Do not accumulate -1 (this indicates missing entry)
      if (val == -1 && _values[i] == -1) {
        continue;
      }
      _values[i] += val;
    }
    // The last entry is always accumulated
    _values[i] += values[i].GetDouble();
  }
}
/**
 * Write this aggregate as a JSON fragment:
 *   "<path-of-owner>":{"values": [...],"buildId":"...","revision":"..."}
 * ctx.comma tracks whether a separating comma is needed between fragments.
 */
void Aggregate::output(OutputContext& ctx, PathNode<Aggregate>* owner) {
  if (ctx.comma) {
    fputc(',', ctx.file);
  }
  ctx.comma = true;
  fputc('\"', ctx.file);
  owner->output(ctx.file);
  fputs("\":{\"values\": [", ctx.file);
  if (_length > 0) {
    // %.20g preserves full double precision in the serialized output
    fprintf(ctx.file, "%.20g", _values[0]);
    for (size_t i = 1; i < _length; i++) {
      fprintf(ctx.file, ",%.20g", _values[i]);
    }
  }
  fprintf(ctx.file, "],\"buildId\":\"%s\",\"revision\":\"%s\"}",
          _buildId.data(), _revision.data());
}

48
src/cache/Aggregate.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,48 @@
#ifndef AGGREGATE_H
#define AGGREGATE_H
#include "PathNode.h"
#include "InternedString.h"
#include "rapidjson/document.h"
/** Representation of histogram aggregate */
class Aggregate {
  /** Latest revision of aggregated histograms */
  InternedString _revision;
  /**
   * Highest BuildId of aggregated histograms, used to find newest BuildId when
   * merging data into Aggregate.
   */
  InternedString _buildId;
  /** Aggregated values, null, if _length is zero */
  double* _values;
  /** Length of _values, zero implies _values == null */
  size_t _length;
  /** String context for interning BuildIds */
  static InternedStringContext _buildIdStringCtx;
  /** String context for interning revision string */
  static InternedStringContext _revisionStringCtx;
public:
  Aggregate()
    : _values(nullptr), _length(0) {}
  // Aggregates own _values and are only handled by pointer; forbid copies,
  // which would double-free _values
  Aggregate(const Aggregate&) = delete;
  Aggregate& operator=(const Aggregate&) = delete;
  /** Free aggregated values (bug fix: _values previously leaked) */
  ~Aggregate() {
    delete[] _values;
  }
  /** Merge aggregated values from JSON dump */
  void mergeJSON(const rapidjson::Value& dump);
  /** Output context */
  struct OutputContext {
    FILE* file;
    bool comma;
  };
  /** Output to file */
  void output(OutputContext& ctx, PathNode<Aggregate>* owner);
};
#endif // AGGREGATE_H

72
src/cache/InternedString.cpp поставляемый Normal file
Просмотреть файл

@ -0,0 +1,72 @@
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "InternedString.h"
#ifdef LOG_INTERNEDSTRING
#define log(...) fprintf(stderr, __VA_ARGS__);
#else
#define log(...)
#endif
/**
 * Release the buffer backing this string: decrement its refCount and, when it
 * drops to zero, remove it from the owning context's cache (if the context is
 * still alive) and free it.
 */
void InternedString::releaseBuffer() {
  // If we have a buffer
  if (_buffer) {
    // Decrement reference count
    _buffer->refCount--;
    // NOTE(review): refCount is size_t, so this assert can never fire; an
    // underflow wraps to a huge value instead - consider checking > 0 before
    // the decrement
    assert(_buffer->refCount >= 0);
    log("DEC: to %i of '%s'\n", _buffer->refCount, _buffer->payload.data());
    // If there are no more references
    if (_buffer->refCount == 0) {
      // Erase from owners cache, if owner is still alive
      if (_buffer->owner) {
        size_t count = _buffer->owner->_cache.erase(_buffer->payload.data());
        assert(count == 1);
      }
      log("DEL: '%s'\n", _buffer->payload.data());
      // Free buffer
      delete _buffer;
    }
    // Remove pointer to buffer
    _buffer = nullptr;
  }
}
// Backing storage for the empty-string case (no Buffer is allocated)
const char* InternedString::_emptyString = "";
/**
 * Intern `s`: return an InternedString sharing the cached buffer for this
 * content, creating a new buffer (refCount == 1) or referencing an existing
 * one (refCount++) as needed.
 */
InternedString InternedStringContext::createString(const char* s) {
  // Empty InternedStrings are a special case
  if(strlen(s) == 0) {
    return InternedString();
  }
  // Find buffer
  InternedString::Buffer* buf = nullptr;
  auto it = _cache.find(s);
  // If a buffer doesn't exist create a new buffer
  if (it == _cache.end()) {
    buf = new InternedString::Buffer(s, this);
    // The cache key points into the buffer's own payload, so key and buffer
    // share lifetime
    _cache.insert(std::make_pair<const char*, InternedString::Buffer*>(
      buf->payload.data(), (InternedString::Buffer*)buf));
    assert(buf->refCount == 1);
  } else {
    buf = it->second;
    buf->refCount++;
  }
  assert(buf);
  assert(buf->payload == s);
  return InternedString(buf);
}
/**
 * Destroy the context. Surviving InternedStrings keep their buffers alive,
 * but each buffer's owner pointer is cleared so releaseBuffer() won't touch
 * this (destroyed) cache later.
 */
InternedStringContext::~InternedStringContext() {
  for(auto item : _cache) {
    assert(item.second->owner == this);
    item.second->owner = nullptr;
  }
  _cache.clear();
}

167
src/cache/InternedString.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,167 @@
#ifndef INTERNED_STRING_H
#define INTERNED_STRING_H
#include <string.h>
#include <string>
#include <unordered_map>
class InternedStringContext;
/**
 * Interned immutable string, used to reduce memory allocations when dealing
 * with a lot of instances of the same string.
 */
class InternedString {
  /** Buffer storing the contents of an interned string */
  struct Buffer {
    Buffer(const char* s, InternedStringContext* owner)
      : refCount(1), payload(s), owner(owner) {}
    size_t refCount;
    std::string payload;
    InternedStringContext* owner;
  };
  /** Internal Buffer, nullptr, for empty strings */
  Buffer* _buffer;
  /**
   * Initialized InternedString from buffer
   * Note, buffers are always allocated by instances of InternedStringContext.
   */
  explicit InternedString(Buffer* buffer)
    : _buffer(buffer) {}
  /** Release current buffer, decrementing refCount and freeing it if needed */
  void releaseBuffer();
  /** Empty string to return when _buffer is null */
  static const char* _emptyString;
public:
  /** Initialized empty InternedString */
  InternedString()
    : _buffer(nullptr) {}
  /** Copy-construct InternedString */
  InternedString(const InternedString& s) {
    _buffer = s._buffer;
    if(_buffer) {
      _buffer->refCount++;
    }
  }
  /**
   * Assignment operator.
   * Bug fixes: the source refCount is taken before releasing our own buffer,
   * making self-assignment safe (the old order could free a shared buffer
   * before copying it), and `return *this` was missing entirely, which is
   * undefined behavior for a value-returning function.
   */
  InternedString& operator= (const InternedString& s) {
    Buffer* buffer = s._buffer;
    if(buffer) {
      buffer->refCount++;
    }
    releaseBuffer();
    _buffer = buffer;
    return *this;
  }
  /** Compare two InternedStrings */
  bool operator==(const InternedString& s) const {
    if(_buffer && s._buffer) {
      if(_buffer->owner != s._buffer->owner || !_buffer->owner) {
        return _buffer->payload == s._buffer->payload;
      }
    }
    return _buffer == s._buffer;
  }
  /** Compare to C string */
  bool operator==(const char* s) const {
    if(!_buffer) {
      return *s == '\0';
    }
    return _buffer->payload == s;
  }
  /** Compare to std::string */
  bool operator==(const std::string& s) const {
    if(!_buffer) {
      return s.empty();
    }
    return _buffer->payload == s;
  }
  /** Compare strings */
  bool operator<(const char* s) const {
    if(!_buffer) {
      return *s != '\0';
    }
    return _buffer->payload < s;
  }
  /** Get string as const char* */
  const char* data() const {
    if (_buffer)
      return _buffer->payload.data();
    return _emptyString;
  }
  /** Write to FILE */
  void output(FILE* f) {
    if(_buffer) {
      fputs(_buffer->payload.data(), f);
    }
  }
  /**
   * Check if this is an empty string.
   * Bug fix: this returned `_buffer`, i.e. true for NON-empty strings.
   */
  bool empty() const {
    return _buffer == nullptr;
  }
  /** Destroy and deref InternedString */
  ~InternedString() {
    releaseBuffer();
  }
  friend class InternedStringContext;
};
/** Context that owns a collection of interned strings */
class InternedStringContext {
  /** Hash for C strings (simple shift-xor over the bytes) */
  struct StrHash {
    size_t operator()(const char* s) const {
      size_t hash = 0;
      while(*s != '\0') {
        hash = (hash << 6) ^ *(s++);
      }
      return hash;
    }
  };
  /** Comparison operator for C strings */
  struct StrCmp {
    bool operator()(const char* s1, const char* s2) const {
      return strcmp(s1, s2) == 0;
    }
  };
  /** Buffer Cache type; keys point into each cached Buffer's own payload */
  typedef std::unordered_map<const char*, InternedString::Buffer*, StrHash, StrCmp>
    BufferCache;
  /** Buffer Cache */
  BufferCache _cache;
public:
  /** Create new interned string from C string */
  InternedString createString(const char* s);
  /**
   * Create new interned string from const char* and string length
   * NOTE(review): declared but no definition is visible in this commit -
   * confirm it is implemented before calling it (linker error otherwise)
   */
  InternedString createString(const char* s, size_t n);
  /** Create new interned string from std::string */
  InternedString createString(const std::string& s) {
    return createString(s.data());
  }
  /** Free InternedStringContext and freeing buffers when they are deleted */
  ~InternedStringContext();
  friend class InternedString;
};
#endif // INTERNED_STRING_H

60
src/cache/MeasureFile.cpp поставляемый Normal file
Просмотреть файл

@ -0,0 +1,60 @@
#include "MeasureFile.h"
#include "Aggregate.h"
#include <stdio.h>
using namespace std;
using namespace rapidjson;
InternedStringContext MeasureFile::_filterStringCtx;
/**
 * Merge a parsed JSON object {"<filterPath>": {...}, ...} into this
 * MeasureFile, routing each member to the Aggregate at its filter path.
 */
void MeasureFile::mergeJSON(Value& blob) {
  // For each member
  for (auto it = blob.MemberBegin(); it != blob.MemberEnd(); ++it) {
    // First find filter path
    const char* filterPath = it->name.GetString();
    // Check that we have an object
    if (!it->value.IsObject()) {
      // Bug fix: this used printf(), sending the error to stdout, which would
      // corrupt the merged result when output is written to stdout
      fprintf(stderr, "Value of filterPath: %s is not an object\n", filterPath);
      continue;
    }
    // Find PathNode, creating its Aggregate on first sight
    PathNode<Aggregate>* n = _filterRoot.find(filterPath, _filterStringCtx);
    if (!n->target()) {
      n->setTarget(new Aggregate());
    }
    n->target()->mergeJSON(it->value);
#if FIRST_DUMP_ONLY
    break;
#endif
  }
}
/** Output to file as "<path-of-owner>\t{<filter aggregates>}\n" */
void MeasureFile::output(FILE* f, PathNode<MeasureFile>* owner) {
  owner->output(f);
  fputs("\t{", f);
  Aggregate::OutputContext ctx;
  ctx.file = f;
  // First aggregate gets no leading comma
  ctx.comma = false;
  _filterRoot.outputTargetTree(ctx);
  fputs("}\n", f);
}
/** Output to file as "<filePath>\t{<filter aggregates>}\n" */
void MeasureFile::output(FILE* f, const std::string& filePath) {
  fputs(filePath.data(), f);
  fputs("\t{", f);
  Aggregate::OutputContext ctx;
  ctx.file = f;
  // First aggregate gets no leading comma
  ctx.comma = false;
  _filterRoot.outputTargetTree(ctx);
  fputs("}\n", f);
}

32
src/cache/MeasureFile.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,32 @@
#ifndef MEASUREFILE_H
#define MEASUREFILE_H
#include "PathNode.h"
#include "InternedString.h"
#include "rapidjson/document.h"
#include <stdio.h>
class Aggregate;
/**
 * In-memory representation of the aggregated data stored in a single JSON file.
 * This is called a MeasureFile as there may be multiple of these files for a
 * given measure under different channel, product, version and by-date.
 */
class MeasureFile {
  /** Tree of filter-path fragments; each node may hold an Aggregate */
  PathNode<Aggregate> _filterRoot;
  /** Shared string context interning filter-path fragments */
  static InternedStringContext _filterStringCtx;
public:
  /** Merge with JSON from file */
  void mergeJSON(rapidjson::Value& blob);
  /** Output to file, prefixed with the path of `owner` */
  void output(FILE* f, PathNode<MeasureFile>* owner);
  /** Output to file, prefixed with the given file path */
  void output(FILE* f, const std::string& filePath);
};
#endif // MEASUREFILE_H

126
src/cache/PathNode.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,126 @@
#ifndef PATHNODE_H
#define PATHNODE_H
#include <stdio.h>
#include <string>
#include <vector>
#include "InternedString.h"
/** Node on a path separated by slashes */
template<typename Target>
class PathNode {
  /** Path fragment this node represents (empty for the root node) */
  InternedString _value;
  /** Parent node, nullptr for the root node */
  PathNode<Target>* _parent;
  /** Child nodes, one per distinct path fragment below this node */
  std::vector<PathNode<Target>*> _children;
  /** Optional payload attached to this node, owned by this node */
  Target* _target;
  /** Create internal PathNode */
  PathNode(const InternedString& value, PathNode* parent)
    : _value(value), _parent(parent), _target(nullptr) {}
public:
  /** Create new root PathNode */
  PathNode()
    : _value(), _parent(nullptr), _target(nullptr) {}
  /** Delete node, children, targets and remove from parent if any */
  ~PathNode() {
    // Delete target
    if (_target) {
      delete _target;
      _target = nullptr;
    }
    // Delete children; clear their parent pointer first so they don't try to
    // erase themselves from _children while we iterate over it
    for (auto child : _children) {
      child->_parent = nullptr;
      delete child;
    }
    // Remove from parent if we still have one
    if (_parent) {
      std::vector<PathNode<Target>*>& siblings = _parent->_children;
      for (auto it = siblings.begin(); it != siblings.end(); it++) {
        if (*it == this) {
          siblings.erase(it);
          break;
        }
      }
    }
  }
  /** Set a target object, this will be deleted with this node */
  void setTarget(Target* target) {
    _target = target;
  }
  /** Get current target, nullptr if none is set */
  Target* target() const {
    return _target;
  }
  /** Write path (fragments joined by '/', root omitted) to file */
  void output(FILE* f) {
    if (_parent && _parent->_parent) {
      _parent->output(f);
      fputc('/', f);
    }
    _value.output(f);
  }
  /** Invoke output(ctx, parent) on the entire target tree */
  template<typename Context>
  void outputTargetTree(Context& ctx) {
    // If we have a target, output it
    if (_target) {
      _target->output(ctx, this);
    }
    // Call recursively on children
    for (auto child : _children) {
      child->outputTargetTree(ctx);
    }
  }
  /** Find/create PathNode under current node */
  PathNode* find(const char* path, InternedStringContext& ctx) {
    // Locate the end of the first path fragment
    const char* remainder = path;
    while (*remainder != '/' && *remainder != '\0') {
      remainder++;
    }
    size_t n = remainder - path;
    // Find child node whose value matches the fragment exactly.
    // Bug fix: strncmp() alone matched any child whose value merely STARTS
    // with the fragment (e.g. fragment "ab" matched a child "abc"); we also
    // require the child's value to end after n characters.
    PathNode<Target>* next = nullptr;
    for (auto child : _children) {
      if (strncmp(child->_value.data(), path, n) == 0 &&
          child->_value.data()[n] == '\0') {
        next = child;
        break;
      }
    }
    // Create new child node if we need a new one
    if (!next) {
      // Create interned string with const cast hack to avoid copying the
      // string: temporarily terminate the fragment in place
      char tmp = *remainder;
      *(const_cast<char*>(remainder)) = '\0';
      InternedString name(ctx.createString(path));
      *(const_cast<char*>(remainder)) = tmp;
      // Create next node
      next = new PathNode(name, this);
      _children.push_back(next);
    }
    // If there is more path to look up, continue from next
    if (*remainder != '\0') {
      return next->find(remainder + 1, ctx);
    }
    // Otherwise return next
    return next;
  }
};
#endif // PATHNODE_H

56
src/cache/ResultSet.cpp поставляемый Normal file
Просмотреть файл

@ -0,0 +1,56 @@
#include "ResultSet.h"
#include "MeasureFile.h"
#include "Aggregate.h"
#include "rapidjson/document.h"
#include <stdio.h>
#include <string>
using namespace std;
using namespace rapidjson;
/**
 * Merge lines of the form "<filePath>\t<JSON object>" from stream into this
 * result-set; malformed lines are reported to stderr and skipped.
 */
void ResultSet::mergeStream(istream& stream) {
  // Read input line by line
  string line;
  int nb_line = 0;
  while (getline(stream, line)) {
    nb_line++;
    // Find delimiter
    size_t del = line.find('\t');
    if (del == string::npos) {
      fprintf(stderr, "No tab on line %i\n", nb_line);
      continue;
    }
    // Find filePath
    string filePath = line.substr(0, del);
    // Parse JSON document (everything after the tab)
    Document d;
    d.Parse<0>(line.data() + del + 1);
    // Check that we have an object
    if (!d.IsObject()) {
      fprintf(stderr, "JSON root is not an object on line %i\n", nb_line);
      continue;
    }
    // Find blob and merge with it, creating the MeasureFile on first sight
    PathNode<MeasureFile>* n = _fileRoot.find(filePath.data(), _pathStringCtx);
    if(!n->target()) {
      n->setTarget(new MeasureFile());
    }
    n->target()->mergeJSON(d);
  }
}
/** Decompress and aggregate a file - not implemented yet */
void ResultSet::aggregate(const char* filename) {
  // TODO: decompress `filename` (see CompressedFileReader) and merge its lines
}
/** Output the result-set, one "<filePath>\t{...}" line per MeasureFile */
void ResultSet::output(FILE* f) {
  // The FILE* is passed as the Context; each MeasureFile target receives it
  // via MeasureFile::output(FILE*, PathNode<MeasureFile>*)
  _fileRoot.outputTargetTree(f);
}

26
src/cache/ResultSet.h поставляемый Normal file
Просмотреть файл

@ -0,0 +1,26 @@
#ifndef RESULTSET_H
#define RESULTSET_H
#include "PathNode.h"
#include "InternedString.h"
#include <iostream>
class MeasureFile;
/** A collection of results for various measures files */
class ResultSet {
  /** Tree of file-path fragments; each node may hold a MeasureFile */
  PathNode<MeasureFile> _fileRoot;
  /** String context interning file-path fragments */
  InternedStringContext _pathStringCtx;
public:
  /** Merge a result-set file into this ResultSet */
  void mergeStream(std::istream& stream);
  /** Output result-set to file */
  void output(FILE* f);
  /** Decompress and aggregate a file (not implemented yet) */
  void aggregate(const char* filename);
};
#endif // RESULTSET_H

3
tests/CMakeLists.txt Normal file
Просмотреть файл

@ -0,0 +1,3 @@
# Test suites
add_subdirectory(merge-test)
# NOTE(review): decompress tests are disabled - confirm this is intentional
#add_subdirectory(decompress)

Просмотреть файл

@ -0,0 +1,15 @@
# Copy compressed fixtures next to the test binary
file(COPY a-b-c-def-g.txt.xz a-b-c-def-g.txt.lzma DESTINATION .)
set(COMPRESSEDFILEREADERTEST_SRC
  CompressedFileReaderTest.cpp
  ../../src/CompressedFileReader.cpp
)
include_directories(${LIBLZMA_INCLUDE_DIRS})
add_executable(compressedfilereadertest ${COMPRESSEDFILEREADERTEST_SRC})
target_link_libraries(compressedfilereadertest ${LIBLZMA_LIBRARIES})
add_test(
  NAME CompressedFileReaderTest
  COMMAND ./compressedfilereadertest
)

Просмотреть файл

@ -0,0 +1,89 @@
#include "../../src/CompressedFileReader.h"
#include <string.h>
#include <stdio.h>
#include <assert.h>
/**
 * Verify CompressedFileReader against two fixtures containing the lines
 * a, b, c, def, g - one .xz stream and one legacy .lzma stream.
 */
int main(int argc, char* argv[]) {
  // Expected lines, in order, in both fixture files
  const char* expected[] = {"a", "b", "c", "def", "g"};
  const size_t numLines = sizeof(expected) / sizeof(expected[0]);
  // Fixture files exercising the .xz and legacy .lzma code paths
  const char* fixtures[] = {"a-b-c-def-g.txt.xz", "a-b-c-def-g.txt.lzma"};
  const size_t numFixtures = sizeof(fixtures) / sizeof(fixtures[0]);
  for (size_t f = 0; f < numFixtures; f++) {
    FILE* input = fopen(fixtures[f], "r");
    CompressedFileReader reader(input);
    // Each expected line must come back in order
    for (size_t i = 0; i < numLines; i++) {
      char* line = reader.nextLine();
      assert(line);
      assert(strcmp(line, expected[i]) == 0);
    }
    // End of input yields null
    char* line = reader.nextLine();
    assert(!line);
    fclose(input);
  }
  return 0;
}

Просмотреть файл

@ -0,0 +1,5 @@
a
b
c
def
g

Двоичные данные
tests/decompress/a-b-c-def-g.txt.lzma Normal file

Двоичный файл не отображается.

Двоичные данные
tests/decompress/a-b-c-def-g.txt.xz Normal file

Двоичный файл не отображается.

Просмотреть файл

@ -0,0 +1,11 @@
# Copy input fixtures next to the test scripts
file(COPY result-1.txt result-2.txt DESTINATION .)
# Merge via -i/-o command line options
add_test(
  NAME merge-test-with-options
  COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/with-options.sh"
)
# Merge via stdin/stdout pipes
add_test(
  NAME merge-test-with-pipes
  COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/with-pipes.sh"
)

Просмотреть файл

@ -0,0 +1 @@
aurora/24/CYCLE_COLLECTOR/by-submission-date {"20130805/idle_daily/Fennec/Android/16/arm":{"values": [5, 10, 5, -1, -1, -1, -1, -1, 3],"buildId": "20130805004006","revision": "http://hg.mozilla.org/releases/mozilla-aurora/rev/4ea223de889c"}}

Просмотреть файл

@ -0,0 +1 @@
aurora/24/CYCLE_COLLECTOR/by-submission-date {"20130805/idle_daily/Fennec/Android/16/arm":{"values": [10, 5, 10, -1, -1, -1, -1, -1, 1],"buildId": "20130805004006","revision": "http://hg.mozilla.org/releases/mozilla-aurora/rev/4ea223de889c"}}

Просмотреть файл

@ -0,0 +1,12 @@
#!/bin/bash -e
# Merge test files (bug fix: removed leftover debug `echo "got here"`)
../../mergeresults -i result-1.txt -i result-2.txt -o output.txt;
# Check number of lines
test `cat output.txt | wc -l` -eq 1;
# Check that we have 15 in there
test `cat output.txt | grep 15 | wc -l` -eq 1;

10
tests/merge-test/with-pipes.sh Executable file
Просмотреть файл

@ -0,0 +1,10 @@
#!/bin/bash -e
# Merge test files through stdin/stdout
cat result-1.txt result-2.txt | ../../mergeresults > output.txt;
# Check number of lines
test "$(wc -l < output.txt)" -eq 1;
# Check that we have 15 in there
test "$(grep -c 15 output.txt)" -eq 1;