fix: Some tidying and build fixes (#1984)

* fix: add interpret-ml install
* chore: remove precommit and extra requirements.txt
* chore: fix test errors
* chore: notebook pip install fixes
2023-06-14 10:49:39 +01:00 · 2023-06-14 10:49:39 +01:00 · b23f050599
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -1,13 +0,0 @@
-repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
-    hooks:
-    -   id: trailing-whitespace
-    -   id: end-of-file-fixer
-    -   id: check-added-large-files
-
-   repo: https://github.com/psf/black
-    rev: 22.3.0
-    hooks:
-    -   id: black
-    -   id: black-jupyter
--- a/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/form/FormRecognizerV3Suite.scala
+++ b/cognitive/src/test/scala/com/microsoft/azure/synapse/ml/cognitive/form/FormRecognizerV3Suite.scala
@ -179,7 +179,7 @@ class AnalyzeDocumentSuite extends TransformerFuzzing[AnalyzeDocument] with Form
        "Contoso\nAddress:\n1 Redmond way Suite\n6000 Redmond, WA\n99243\n" +
          "Invoice For: Microsoft\n1020 Enterprise Way",
        "CustomerAddress,CustomerAddressRecipient," +
-          "CustomerName,DueDate,InvoiceDate,InvoiceId,Items,VendorAddress,VendorName")
+          "CustomerName,DueDate,InvoiceDate,InvoiceId,InvoiceTotal,Items,VendorAddress,VendorName")
    }
  }

--- a/environment.yml
+++ b/environment.yml
@ -44,3 +44,5 @@ dependencies:
    - huggingface-hub>=0.8.1
    - langchain==0.0.151
    - openai==0.27.5
+    - black==22.3.0
+    - black[jupyter]==22.3.0
--- a/hooks/pre-commit
+++ b/hooks/pre-commit
@ -1,7 +0,0 @@
-#!/usr/bin/env bash
-
-# If any command fails, exit immediately with that command's exit status
-set -eo pipefail
-
-echo "Running scalastyle.."
-sbt scalastyle test:scalastyle
--- a/notebooks/features/hyperparameter_tuning/HyperOpt-SynapseML.ipynb
+++ b/notebooks/features/hyperparameter_tuning/HyperOpt-SynapseML.ipynb
@ -517,4 +517,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
--- a/notebooks/features/isolation_forest/IsolationForest
+++ b/notebooks/features/isolation_forest/IsolationForest
@ -27,6 +27,24 @@
    " - If you are running it on Synapse, you'll need to [create an AML workspace and set up linked Service](https://microsoft.github.io/SynapseML/docs/next/mlflow/installation/). \n"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "import subprocess\n",
+    "import sys\n",
+    "\n",
+    "for package in [\"sqlparse\", \"raiwidgets\", \"interpret-community\"]:\n",
+    "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
  {
   "cell_type": "markdown",
   "metadata": {
@ -145,27 +163,26 @@
    "experiment_name = f\"/Shared/isolation_forest_experiment-{str(uuid.uuid1())}/\"\n",
    "model_name = f\"isolation-forest-model\"\n",
    "if running_on_synapse():\n",
-    "    import subprocess\n",
-    "    import sys\n",
-    "    from pyspark.sql.functions import udf\n",
    "    from synapse.ml.core.platform import materializing_display as display\n",
    "\n",
    "    # use regular display when running on interactive notebook\n",
-    "    # from notebookutils.visualization import display\n",
-    "\n",
-    "    for package in [\"sqlparse\", \"raiwidgets\", \"interpret-community\"]:\n",
-    "        subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])"
+    "    # from notebookutils.visualization import display"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {},
   "outputs": [],
   "source": [
    "# Bootstrap Spark Session\n",
    "spark = SparkSession.builder.getOrCreate()"
-   ]
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
  },
  {
   "cell_type": "markdown",
@ -564,7 +581,7 @@
   "outputs": [],
   "source": [
    "# Define UDF\n",
-    "vec2array = udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))"
+    "vec2array = F.udf(lambda vec: vec.toArray().tolist(), ArrayType(FloatType()))"
   ]
  },
  {
@ -1013,4 +1030,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 1
-}
+}
--- a/notebooks/features/responsible_ai/Interpretability
+++ b/notebooks/features/responsible_ai/Interpretability
@ -51,7 +51,6 @@
    "\n",
    "if running_on_synapse():\n",
    "    shell = TerminalInteractiveShell.instance()\n",
-    "    shell.define_macro(\"foo\", \"\"\"a,b=10,20\"\"\")\n",
    "    from notebookutils.visualization import display\n",
    "\n",
    "\n",
@ -481,4 +480,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
+}
--- a/pipeline.yaml
+++ b/pipeline.yaml
@ -65,12 +65,10 @@ jobs:
        scriptLocation: inlineScript
        scriptType: bash
        inlineScript: 'sbt scalastyle test:scalastyle'
-    - task: UsePythonVersion@00
-      inputs:
-        versionSpec: '3.8'
-    - script: pip install -r requirements.txt
-      displayName: 'Install requirements'
+    - template: templates/conda.yml
    - bash: |
+        set -e
+        source activate synapseml
        black --diff --color . && black --check -q .
      displayName: 'Python Style Check'

--- a/requirements.txt
+++ b/requirements.txt
@ -1,7 +0,0 @@
-# Copyright (C) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See LICENSE in project root for information.
-
-# Required to auto-format python code
-black==22.3.0
-black[jupyter]==22.3.0
-pre-commit==2.19.0
--- a/website/docs/documentation/transformers/core/_Train.md
+++ b/website/docs/documentation/transformers/core/_Train.md
@ -24,7 +24,7 @@ from synapse.ml.train import *
 from numpy import random

 df = spark.createDataFrame(
-      [(random.rand(), random.rand()) for _ in range(4096)], ["label", "prediction"]
+      [(random.rand(), random.rand()) for _ in range(2048)], ["label", "prediction"]
 )

 cms = (ComputeModelStatistics()
@ -43,8 +43,8 @@ import com.microsoft.azure.synapse.ml.train._
 import scala.util.Random

 val rand = new Random(1337)
-val df = (Seq.fill(4096)(rand.nextDouble())
-      .zip(Seq.fill(4096)(rand.nextDouble()))
+val df = (Seq.fill(2048)(rand.nextDouble())
+      .zip(Seq.fill(2048)(rand.nextDouble()))
      .toDF("label", "prediction"))

 val cms = (new ComputeModelStatistics()