From f747d1c5c316a17be17a4f081e013e1f42dd9795 Mon Sep 17 00:00:00 2001
From: Chenhui Hu <chenhhu@microsoft.com>
Date: Fri, 17 Jan 2020 10:00:59 -0500
Subject: [PATCH] Chenhui/cpu unit test pipeline (#38)

* address review comments

* added full conda path

* minor change

* added conda to PATH

* added build status in README


Former-commit-id: 8caaa3c662a4f293efe7e72623909b456362925b
---
 README.md                                     |  8 +-
 .../dataset/retail/download_data.r            |  2 +-
 .../tests}/source_entire.R                    |  0
 .../tests}/test_datasets.R                    |  0
 forecasting_lib/tests/test_datasets.py        | 66 +++++++++++++++++
 tests/ci/cpu_unit_tests_linux.yml             | 63 ++++++++++++++++
 tests/unit/test_datasets.py                   | 73 -------------------
 7 files changed, 137 insertions(+), 75 deletions(-)
 rename {tests/unit => forecasting_lib/tests}/source_entire.R (100%)
 rename {tests/unit => forecasting_lib/tests}/test_datasets.R (100%)
 create mode 100644 forecasting_lib/tests/test_datasets.py
 create mode 100644 tests/ci/cpu_unit_tests_linux.yml
 delete mode 100644 tests/unit/test_datasets.py

diff --git a/README.md b/README.md
index f65d301d..180ab60b 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
 # Forecasting Best Practices 
 
-This repository contains examples and best practices for building Forecasting solutions and systems, provided as [Jupyter notebooks](examples) and [a library of utility functions](forecasting_lib). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on forecasting problems.
\ No newline at end of file
+This repository contains examples and best practices for building Forecasting solutions and systems, provided as [Jupyter notebooks](examples) and [a library of utility functions](forecasting_lib). The focus of the repository is on state-of-the-art methods and common scenarios that are popular among researchers and practitioners working on forecasting problems.
+
+## Build Status
+| Build | Branch | Status |
+| --- | --- | --- |
+| **Linux CPU** | master | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=master)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=master) |
+| **Linux CPU** | staging | [![Build Status](https://dev.azure.com/best-practices/forecasting/_apis/build/status/cpu_unit_tests_linux?branchName=staging)](https://dev.azure.com/best-practices/forecasting/_build/latest?definitionId=128&branchName=staging) |
\ No newline at end of file
diff --git a/forecasting_lib/forecasting_lib/dataset/retail/download_data.r b/forecasting_lib/forecasting_lib/dataset/retail/download_data.r
index 23f2e7c7..2827cd10 100644
--- a/forecasting_lib/forecasting_lib/dataset/retail/download_data.r
+++ b/forecasting_lib/forecasting_lib/dataset/retail/download_data.r
@@ -13,7 +13,7 @@ data("orangeJuice")
 yx <- orangeJuice[[1]]
 storedemo <- orangeJuice[[2]]
 
-fpath <- file.path("retail_sales", "OrangeJuice_Pt_3Weeks_Weekly", "data")
+fpath <- file.path("contrib", "tsperf", "OrangeJuice_Pt_3Weeks_Weekly", "data")
 
 write.csv(yx, file = file.path(fpath, "yx.csv"), quote = FALSE, na = " ", row.names = FALSE)
 write.csv(storedemo, file = file.path(fpath, "storedemo.csv"), quote = FALSE, na = " ", row.names = FALSE)
diff --git a/tests/unit/source_entire.R b/forecasting_lib/tests/source_entire.R
similarity index 100%
rename from tests/unit/source_entire.R
rename to forecasting_lib/tests/source_entire.R
diff --git a/tests/unit/test_datasets.R b/forecasting_lib/tests/test_datasets.R
similarity index 100%
rename from tests/unit/test_datasets.R
rename to forecasting_lib/tests/test_datasets.R
diff --git a/forecasting_lib/tests/test_datasets.py b/forecasting_lib/tests/test_datasets.py
new file mode 100644
index 00000000..496fa48d
--- /dev/null
+++ b/forecasting_lib/tests/test_datasets.py
@@ -0,0 +1,66 @@
+import os
+import subprocess
+import pandas as pd
+
+
+def test_download_retail_data():
+    RETAIL_DIR = os.path.join(".", "forecasting_lib", "forecasting_lib", "dataset", "retail")
+    DATA_DIR = os.path.join(".", "contrib", "tsperf", "OrangeJuice_Pt_3Weeks_Weekly", "data")
+    SCRIPT_PATH = os.path.join(RETAIL_DIR, "download_data.r")
+    DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
+    # Remove the data files if they already exist
+    for f in DATA_FILE_LIST:
+        file_path = os.path.join(DATA_DIR, f)
+        if os.path.exists(file_path):
+            os.remove(file_path)
+        assert not os.path.exists(file_path)
+    # Call data download script
+    try:
+        subprocess.call(["sudo", "Rscript", SCRIPT_PATH])
+    except subprocess.CalledProcessError as e:
+        print(e.output)
+    # Check downloaded data
+    DATA_DIM_LIST = [(106139, 19), (83, 12)]
+    COLUMN_NAME_LIST = [
+        [
+            "store",
+            "brand",
+            "week",
+            "logmove",
+            "constant",
+            "price1",
+            "price2",
+            "price3",
+            "price4",
+            "price5",
+            "price6",
+            "price7",
+            "price8",
+            "price9",
+            "price10",
+            "price11",
+            "deal",
+            "feat",
+            "profit",
+        ],
+        [
+            "STORE",
+            "AGE60",
+            "EDUC",
+            "ETHNIC",
+            "INCOME",
+            "HHLARGE",
+            "WORKWOM",
+            "HVAL150",
+            "SSTRDIST",
+            "SSTRVOL",
+            "CPDIST5",
+            "CPWVOL5",
+        ],
+    ]
+    for idx, f in enumerate(DATA_FILE_LIST):
+        file_path = os.path.join(DATA_DIR, f)
+        assert os.path.exists(file_path)
+        df = pd.read_csv(file_path, index_col=None)
+        assert df.shape == DATA_DIM_LIST[idx]
+        assert list(df) == COLUMN_NAME_LIST[idx]
diff --git a/tests/ci/cpu_unit_tests_linux.yml b/tests/ci/cpu_unit_tests_linux.yml
new file mode 100644
index 00000000..63bb5bb1
--- /dev/null
+++ b/tests/ci/cpu_unit_tests_linux.yml
@@ -0,0 +1,63 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+# Pull requests against these branches will trigger this build.
+pr:
+- master
+- staging
+
+# Any commit to these branches will trigger the build.
+trigger:
+- staging
+- master
+
+
+jobs:
+- job: cpu_unit_tests_linux
+  timeoutInMinutes: 10 # how long to run the job before automatically cancelling
+  pool:
+    # vmImage: 'ubuntu-16.04' # hosted machine 
+    name: ForecastingAgents
+
+  steps:
+
+# Uncomment if hosted machine
+#  - task: UsePythonVersion@0
+#    inputs:
+#      versionSpec: '3.6.8'
+#      architecture: 'x64'
+#      addToPath: true
+#    displayName: 'Use Python 3.6.8'
+  
+  - bash: |
+      echo "##vso[task.prependpath]/data/anaconda/bin"
+      export PATH="/data/anaconda/bin:$PATH"
+      conda env list
+    displayName: Add Conda to PATH
+
+# Uncomment if needed
+# Conda creation can take around 10 min
+#  - bash: |
+#      python tools/generate_conda_file.py
+#      conda env create -n forecasting_cpu -f forecasting_cpu.yaml
+#    displayName: 'Creating Conda Environment with dependencies'
+
+  - bash: |
+      eval "$(conda shell.bash hook)" && conda activate forecast_cpu
+      pytest --durations=0 forecasting_lib/tests -m "not notebooks and not gpu and not azureml" --junitxml=junit/test-unitttest.xml
+    displayName: 'Run Unit tests'
+
+# Uncomment if needed
+#  - bash: |
+#      echo Remove Conda Environment
+#      conda remove -n forecasting_cpu --all -q --force -y
+#      echo Done Cleanup
+#    displayName: 'Cleanup Task'
+#    condition: always()
+
+  - task: PublishTestResults@2
+    inputs:
+      testResultsFiles: '**/test-unitttest.xml'
+      testRunTitle: 'Test results for PyTest'
+
+
diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py
deleted file mode 100644
index 5d123887..00000000
--- a/tests/unit/test_datasets.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import os
-import sys
-import pytest
-import subprocess
-import pandas as pd
-
-def test_download_retail_data():
-    BENCHMARK_DIR = os.path.join(".", "retail_sales", "OrangeJuice_Pt_3Weeks_Weekly")
-    DATA_DIR = os.path.join(BENCHMARK_DIR, "data")
-    SCRIPT_PATH = os.path.join(BENCHMARK_DIR, "common", "download_data.r")
-    DATA_FILE_LIST = ["yx.csv", "storedemo.csv"]
-    # Remove data files if they are existed
-    for f in DATA_FILE_LIST:
-        file_path = os.path.join(DATA_DIR, f)
-        if os.path.exists(file_path):
-            os.remove(file_path)
-        assert not os.path.exists(file_path)
-    # Call data download script
-    try:
-        subprocess.call(["Rscript", SCRIPT_PATH])
-    except subprocess.CalledProcessError as e:
-        print(e.output)
-    # Check downloaded data
-    DATA_DIM_LIST = [(106139, 19), (83, 12)]
-    COLUMN_NAME_LIST = [["store", "brand", "week", "logmove", "constant",
-                          "price1", "price2", "price3", "price4", "price5",
-                          "price6", "price7", "price8", "price9", "price10",
-                          "price11", "deal", "feat", "profit"],
-                         ["STORE", "AGE60", "EDUC", "ETHNIC", "INCOME",
-                          "HHLARGE", "WORKWOM","HVAL150","SSTRDIST",
-                          "SSTRVOL", "CPDIST5", "CPWVOL5"]
-                        ]
-    for idx, f in enumerate(DATA_FILE_LIST):
-        file_path = os.path.join(DATA_DIR, f)
-        assert os.path.exists(file_path) 
-        df = pd.read_csv(file_path, index_col=None)
-        assert df.shape == DATA_DIM_LIST[idx]
-        assert list(df) == COLUMN_NAME_LIST[idx]
-
-def test_download_energy_data():
-    BENCHMARK_DIR = os.path.join(".", "energy_load", "GEFCom2017_D_Prob_MT_hourly")
-    DATA_DIR = os.path.join(BENCHMARK_DIR, "data")
-    SCRIPT_PATH = os.path.join(BENCHMARK_DIR, "common", "download_data.py")
-    DATA_FILE_LIST = ["2011_smd_hourly.xls", "2012_smd_hourly.xls",
-                      "2013_smd_hourly.xls", "2014_smd_hourly.xls",
-                      "2015_smd_hourly.xls", "2016_smd_hourly.xls",
-                      "2017_smd_hourly.xlsx"]
-    DATA_DIM_LIST = [[(57, 5), (8760, 16)] + [(8760, 14)]*8,
-                     [(57, 5), (8784, 16)] + [(8784, 14)]*8,
-                     [(59, 5), (8760, 16)] + [(8760, 14)]*8,
-                     [(59, 5), (8760, 16)] + [(8760, 14)]*8 + [(0,1)],
-                     [(57, 5), (8760, 16)] + [(8760, 14)]*8,
-                     [(47, 10), (8784, 17)] + [(8784, 14)]*8,
-                     [(51, 13), (8760, 21)] + [(8760, 14)]*8]
-    # Remove data files if they are existed
-    for f in DATA_FILE_LIST:
-        file_path = os.path.join(DATA_DIR, f)
-        if os.path.exists(file_path):
-            os.remove(file_path)
-        assert not os.path.exists(file_path)
-    # Call data download script
-    try:
-        subprocess.check_output(["python", SCRIPT_PATH])
-    except subprocess.CalledProcessError as e:
-        print(e.output)
-    # Check downloaded data (only check dimensions since download_data.py checks column names)
-    for file_idx, f in enumerate(DATA_FILE_LIST):
-        file_path = os.path.join(DATA_DIR, f)
-        assert os.path.exists(file_path) 
-        xls = pd.ExcelFile(file_path)
-        for sheet_idx, s in enumerate(xls.sheet_names):
-            assert xls.parse(s).shape == DATA_DIM_LIST[file_idx][sheet_idx]
-