diff --git a/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb b/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb index eb7f3f7bf6..7ce528148e 100644 --- a/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb +++ b/notebooks/features/classification/Classification - Adult Census with Vowpal Wabbit.ipynb @@ -18,8 +18,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -29,7 +31,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", "train.limit(10).toPandas()" @@ -54,19 +58,23 @@ "from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n", "\n", "# Define classification label\n", - "train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1).cache()\n", + "train = (\n", + " train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0))\n", + " .repartition(1)\n", + " .cache()\n", + ")\n", "print(train.count())\n", "\n", "# Specify featurizer\n", - "vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n", - " outputCol=\"features\")\n", + "vw_featurizer = VowpalWabbitFeaturizer(\n", + " inputCols=[\"education\", \"marital-status\", \"hours-per-week\"], outputCol=\"features\"\n", + ")\n", "\n", "# Define VW classification model\n", "args = \"--loss_function=logistic --quiet --holdout_off\"\n", - "vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n", - " labelCol=\"label\",\n", - " passThroughArgs=args,\n", - " numPasses=10)\n", + "vw_model = VowpalWabbitClassifier(\n", + " featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n", + ")\n", "\n", "# Create a pipeline\n", "vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model])" @@ -122,9 +130,10 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n", - " labelCol=\"label\", \n", - " scoredLabelsCol=\"prediction\").transform(prediction)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n", + ").transform(prediction)\n", "metrics.toPandas()" ] } diff --git a/notebooks/features/classification/Classification - Adult Census.ipynb b/notebooks/features/classification/Classification - Adult Census.ipynb index e1216289f3..97abe2d16e 100644 --- a/notebooks/features/classification/Classification - Adult Census.ipynb +++ b/notebooks/features/classification/Classification - Adult Census.ipynb @@ -16,8 +16,10 @@ "execution_count": null, "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ], "outputs": [], @@ -44,7 +46,9 @@ "cell_type": 
"code", "execution_count": null, "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", "train.limit(10).toPandas()" @@ -70,7 +74,10 @@ "source": [ "from synapse.ml.train import TrainClassifier\n", "from pyspark.ml.classification import LogisticRegression\n", - "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)" + "\n", + "model = TrainClassifier(\n", + " model=LogisticRegression(), labelCol=\"income\", numFeatures=256\n", + ").fit(train)" ], "outputs": [], "metadata": {} @@ -89,7 +96,9 @@ "if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n", " model.write().overwrite().save(\"dbfs:/AdultCensus.mml\")\n", "else:\n", - " model.write().overwrite().save(\"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/AdultCensus.mml\")" + " model.write().overwrite().save(\n", + " \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/AdultCensus.mml\"\n", + " )" ], "outputs": [], "metadata": {} diff --git a/notebooks/features/classification/Classification - Before and After SynapseML.ipynb b/notebooks/features/classification/Classification - Before and After SynapseML.ipynb index 9a0fa1bbda..38ed17121f 100644 --- a/notebooks/features/classification/Classification - Before and After SynapseML.ipynb +++ b/notebooks/features/classification/Classification - Before and After SynapseML.ipynb @@ -29,8 +29,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -49,7 +51,9 @@ "metadata": {}, "outputs": [], "source": [ - "rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n", + "rawData = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n", + ")\n", "rawData.show(5)" ] }, @@ -74,12 +78,19 @@ "source": [ "from pyspark.sql.functions import udf\n", "from pyspark.sql.types import *\n", + "\n", + "\n", "def wordCount(s):\n", " return len(s.split())\n", + "\n", + "\n", "def wordLength(s):\n", " import numpy as np\n", + "\n", " ss = [len(w) for w in s.split()]\n", " return round(float(np.mean(ss)), 2)\n", + "\n", + "\n", "wordLengthUDF = udf(wordLength, DoubleType())\n", "wordCountUDF = udf(wordCount, IntegerType())" ] @@ -91,10 +102,15 @@ "outputs": [], "source": [ "from synapse.ml.stages import UDFTransformer\n", + "\n", "wordLength = \"wordLength\"\n", "wordCount = \"wordCount\"\n", - "wordLengthTransformer = UDFTransformer(inputCol=\"text\", outputCol=wordLength, udf=wordLengthUDF)\n", - "wordCountTransformer = UDFTransformer(inputCol=\"text\", outputCol=wordCount, udf=wordCountUDF)\n" + "wordLengthTransformer = UDFTransformer(\n", + " inputCol=\"text\", outputCol=wordLength, udf=wordLengthUDF\n", + ")\n", + "wordCountTransformer = UDFTransformer(\n", + " inputCol=\"text\", outputCol=wordCount, udf=wordCountUDF\n", + ")" ] }, { @@ -104,9 +120,14 @@ "outputs": [], "source": [ "from pyspark.ml import Pipeline\n", - "data = Pipeline(stages=[wordLengthTransformer, 
wordCountTransformer]) \\\n", - " .fit(rawData).transform(rawData) \\\n", - " .withColumn(\"label\", rawData[\"rating\"] > 3).drop(\"rating\")" + "\n", + "data = (\n", + " Pipeline(stages=[wordLengthTransformer, wordCountTransformer])\n", + " .fit(rawData)\n", + " .transform(rawData)\n", + " .withColumn(\"label\", rawData[\"rating\"] > 3)\n", + " .drop(\"rating\")\n", + ")" ] }, { @@ -155,24 +176,22 @@ "# Featurize text column\n", "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"tokenizedText\")\n", "numFeatures = 10000\n", - "hashingScheme = HashingTF(inputCol=\"tokenizedText\",\n", - " outputCol=\"TextFeatures\",\n", - " numFeatures=numFeatures)\n", + "hashingScheme = HashingTF(\n", + " inputCol=\"tokenizedText\", outputCol=\"TextFeatures\", numFeatures=numFeatures\n", + ")\n", "tokenizedData = tokenizer.transform(data)\n", "featurizedData = hashingScheme.transform(tokenizedData)\n", "\n", "# Merge text and numeric features in one feature column\n", "featureColumnsArray = [\"TextFeatures\", \"wordCount\", \"wordLength\"]\n", - "assembler = VectorAssembler(\n", - " inputCols = featureColumnsArray,\n", - " outputCol=\"features\")\n", + "assembler = VectorAssembler(inputCols=featureColumnsArray, outputCol=\"features\")\n", "assembledData = assembler.transform(featurizedData)\n", "\n", "# Select only columns of interest\n", "# Convert rating column from boolean to int\n", - "processedData = assembledData \\\n", - " .select(\"label\", \"features\") \\\n", - " .withColumn(\"label\", assembledData.label.cast(IntegerType()))" + "processedData = assembledData.select(\"label\", \"features\").withColumn(\n", + " \"label\", assembledData.label.cast(IntegerType())\n", + ")" ] }, { @@ -189,10 +208,12 @@ "\n", "# Train the models on the 'train' data\n", "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", - "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", - " for hyperParam in lrHyperParams]\n", - "evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",\n", - " metricName=\"areaUnderROC\")\n", + "logisticRegressions = [\n", + " LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n", + "]\n", + "evaluator = BinaryClassificationEvaluator(\n", + " rawPredictionCol=\"rawPrediction\", metricName=\"areaUnderROC\"\n", + ")\n", "metrics = []\n", "models = []\n", "\n", @@ -245,10 +266,13 @@ "\n", "# Train the models on the 'train' data\n", "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", - "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", - " for hyperParam in lrHyperParams]\n", - "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n", - " for lrm in logisticRegressions]\n", + "logisticRegressions = [\n", + " LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n", + "]\n", + "lrmodels = [\n", + " TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n", + " for lrm in logisticRegressions\n", + "]\n", "\n", "# Select the best model\n", "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", @@ -257,8 +281,10 @@ "# Get AUC on the validation dataset\n", "predictions = bestModel.transform(validation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", - "print(\"Best model's AUC on validation set = \"\n", - " + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100))" + "print(\n", + " \"Best model's AUC on validation set = \"\n", + " + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100)\n", + ")" ] } ], diff --git 
a/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb b/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb index c01d60ed81..be6343fe54 100644 --- a/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb +++ b/notebooks/features/classification/Classification - Twitter Sentiment with Vowpal Wabbit.ipynb @@ -40,6 +40,7 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -78,9 +79,10 @@ "source": [ "def download_data(url, data_folder=DATA_FOLDER, filename=\"downloaded_data.zip\"):\n", " \"\"\"Download and extract data from url\"\"\"\n", - " \n", + "\n", " data_dir = \"./\" + DATA_FOLDER\n", - " if not os.path.exists(data_dir): os.makedirs(data_dir)\n", + " if not os.path.exists(data_dir):\n", + " os.makedirs(data_dir)\n", " downloaded_filepath = os.path.join(data_dir, filename)\n", " print(\"Downloading data...\")\n", " urllib.request.urlretrieve(url, downloaded_filepath)\n", @@ -89,7 +91,8 @@ " zipfile.extractall(data_dir)\n", " zipfile.close()\n", " print(\"Finished data downloading and extraction.\")\n", - " \n", + "\n", + "\n", "download_data(DATA_URL)" ] }, @@ -106,8 +109,12 @@ "metadata": {}, "outputs": [], "source": [ - "df_train = pd.read_csv(os.path.join(\".\", DATA_FOLDER, TRAIN_FILENAME), \n", - " header=None, names=COL_NAMES, encoding=ENCODING)\n", + "df_train = pd.read_csv(\n", + " os.path.join(\".\", DATA_FOLDER, TRAIN_FILENAME),\n", + " header=None,\n", + " names=COL_NAMES,\n", + " encoding=ENCODING,\n", + ")\n", "df_train = spark.createDataFrame(df_train, verifySchema=False)" ] }, @@ -155,10 +162,12 @@ "metadata": {}, "outputs": [], "source": [ - "df_train = df_train.orderBy(rand()) \\\n", - " .limit(100000) \\\n", - " .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0)) \\\n", - " .select([\"label\", \"text\"])" + "df_train = (\n", + " df_train.orderBy(rand())\n", + " .limit(100000)\n", + " .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0))\n", + " .select([\"label\", \"text\"])\n", + ")" ] }, { @@ -177,18 +186,15 @@ "outputs": [], "source": [ "# Specify featurizers\n", - "tokenizer = RegexTokenizer(inputCol=\"text\",\n", - " outputCol=\"words\")\n", + "tokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"words\")\n", "\n", - "count_vectorizer = CountVectorizer(inputCol=\"words\",\n", - " outputCol=\"features\")\n", + "count_vectorizer = CountVectorizer(inputCol=\"words\", outputCol=\"features\")\n", "\n", "# Define VW classification model\n", "args = \"--loss_function=logistic --quiet --holdout_off\"\n", - "vw_model = VowpalWabbitClassifier(featuresCol=\"features\", \n", - " labelCol=\"label\", \n", - " passThroughArgs=args, \n", - " numPasses=10)\n", + "vw_model = VowpalWabbitClassifier(\n", + " featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n", + ")\n", "\n", "# Create a pipeline\n", "vw_pipeline = Pipeline(stages=[tokenizer, count_vectorizer, vw_model])" @@ -225,8 +231,12 @@ "metadata": {}, "outputs": [], "source": [ - "df_test = pd.read_csv(os.path.join(\".\", DATA_FOLDER, TEST_FILENAME), \n", - " header=None, names=COL_NAMES, encoding=ENCODING)\n", + "df_test = pd.read_csv(\n", + " os.path.join(\".\", DATA_FOLDER, TEST_FILENAME),\n", + " header=None,\n", + " names=COL_NAMES,\n", + " encoding=ENCODING,\n", + ")\n", "df_test = 
spark.createDataFrame(df_test, verifySchema=False)" ] }, @@ -244,9 +254,11 @@ "outputs": [], "source": [ "print(\"Number of test samples before filtering: \", df_test.count())\n", - "df_test = df_test.filter(col(\"label\") != 2.0) \\\n", - " .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0)) \\\n", - " .select([\"label\", \"text\"])\n", + "df_test = (\n", + " df_test.filter(col(\"label\") != 2.0)\n", + " .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0))\n", + " .select([\"label\", \"text\"])\n", + ")\n", "print(\"Number of test samples after filtering: \", df_test.count())" ] }, @@ -268,9 +280,9 @@ "outputs": [], "source": [ "# Compute model performance metrics\n", - "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n", - " labelCol=\"label\", \n", - " scoredLabelsCol=\"prediction\").transform(predictions)\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n", + ").transform(predictions)\n", "metrics.toPandas()" ] }, @@ -292,8 +304,10 @@ " points += [(float(row._1()), float(row._2()))]\n", " return points\n", "\n", - "preds = predictions.select(\"label\", \"probability\") \\\n", - " .rdd.map(lambda row: (float(row[\"probability\"][1]), float(row[\"label\"])))\n", + "\n", + "preds = predictions.select(\"label\", \"probability\").rdd.map(\n", + " lambda row: (float(row[\"probability\"][1]), float(row[\"label\"]))\n", + ")\n", "roc_points = CurveMetrics(preds).get_curve(\"roc\")\n", "\n", "# Plot ROC curve\n", diff --git a/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb index 38888de5d6..1455d90092 100644 --- a/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Analyze Text.ipynb @@ -18,14 +18,16 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['TEXT_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", + "\n", + " os.environ[\"TEXT_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", " from notebookutils.visualization import display\n", "\n", - "#put your service keys here\n", - "key = os.environ['TEXT_API_KEY']\n", - "location = 'eastus'" + "# put your service keys here\n", + "key = os.environ[\"TEXT_API_KEY\"]\n", + "location = \"eastus\"" ] }, { @@ -34,12 +36,13 @@ "metadata": {}, "outputs": [], "source": [ - "\n", - "df = spark.createDataFrame(data=[\n", + "df = spark.createDataFrame(\n", + " data=[\n", " [\"en\", \"Hello Seattle\"],\n", - " [\"en\", \"There once was a dog who lived in London and thought she was a human\"]\n", - " ], \n", - " schema=[\"language\",\"text\"])" + " [\"en\", \"There once was a dog who lived in London and thought she was a human\"],\n", + " ],\n", + " schema=[\"language\", \"text\"],\n", + ")" ] }, { @@ -59,7 +62,8 @@ "source": [ "from synapse.ml.cognitive import *\n", "\n", - "text_analyze = (TextAnalyze()\n", + "text_analyze = (\n", + " TextAnalyze()\n", " .setLocation(location)\n", " .setSubscriptionKey(key)\n", " .setTextCol(\"text\")\n", @@ -67,15 +71,15 @@ " .setErrorCol(\"error\")\n", " .setLanguageCol(\"language\")\n", " # set the tasks to perform\n", - " 
.setEntityRecognitionTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n", - " .setKeyPhraseExtractionTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n", + " .setEntityRecognitionTasks([{\"parameters\": {\"model-version\": \"latest\"}}])\n", + " .setKeyPhraseExtractionTasks([{\"parameters\": {\"model-version\": \"latest\"}}])\n", " # Uncomment these lines to add more tasks\n", " # .setEntityRecognitionPiiTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n", " # .setEntityLinkingTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n", " # .setSentimentAnalysisTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n", - " )\n", + ")\n", "\n", - "df_results = text_analyze.transform(df)\n" + "df_results = text_analyze.transform(df)" ] }, { @@ -97,8 +101,11 @@ "\n", "# reformat and display for easier viewing\n", "display(\n", - " df_results.select(\"language\", \"text\", \"error\", col(\"textAnalysis\").getItem(0)) # we are not batching so only have a single result\n", - " .select(\"language\", \"text\", \"error\", \"textAnalysis[0].*\") # explode the Text Analytics tasks into columns\n", + " df_results.select(\n", + " \"language\", \"text\", \"error\", col(\"textAnalysis\").getItem(0)\n", + " ).select( # we are not batching so only have a single result\n", + " \"language\", \"text\", \"error\", \"textAnalysis[0].*\"\n", + " ) # explode the Text Analytics tasks into columns\n", ")" ] } diff --git a/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb index 078e184265..4cbacea6f4 100644 --- a/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Celebrity Quote Analysis.ipynb @@ -30,15 +30,19 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['TEXT_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['BING_IMAGE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n", "\n", - "#put your service keys here\n", - "TEXT_API_KEY = os.environ[\"TEXT_API_KEY\"]\n", - "VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n", + " os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", + " os.environ[\"TEXT_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", + " os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"bing-search-key\"\n", + " )\n", + "\n", + "# put your service keys here\n", + "TEXT_API_KEY = os.environ[\"TEXT_API_KEY\"]\n", + "VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n", "BING_IMAGE_SEARCH_KEY = os.environ[\"BING_IMAGE_SEARCH_KEY\"]" ] }, @@ -61,18 +65,22 @@ }, "outputs": [], "source": [ - "imgsPerBatch = 10 #the number of images Bing will return for each query\n", - "offsets = [(i*imgsPerBatch,) for i in range(100)] # A list of offsets, used to page into the search results\n", + "imgsPerBatch = 10 # the number of images Bing will return for each query\n", + "offsets = [\n", + " (i * imgsPerBatch,) for i in range(100)\n", + "] # A list of offsets, used to page into 
the search results\n", "bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n", "\n", - "bingSearch = BingImageSearch()\\\n", - " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\\\n", - " .setOffsetCol(\"offset\")\\\n", - " .setQuery(\"celebrity quotes\")\\\n", - " .setCount(imgsPerBatch)\\\n", - " .setOutputCol(\"images\")\n", + "bingSearch = (\n", + " BingImageSearch()\n", + " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n", + " .setOffsetCol(\"offset\")\n", + " .setQuery(\"celebrity quotes\")\n", + " .setCount(imgsPerBatch)\n", + " .setOutputCol(\"images\")\n", + ")\n", "\n", - "#Transformer to that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", + "# Transformer to that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", "getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")" ] }, @@ -94,15 +102,19 @@ }, "outputs": [], "source": [ - "celebs = RecognizeDomainSpecificContent()\\\n", - " .setSubscriptionKey(VISION_API_KEY)\\\n", - " .setModel(\"celebrities\")\\\n", - " .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/\")\\\n", - " .setImageUrlCol(\"url\")\\\n", - " .setOutputCol(\"celebs\")\n", + "celebs = (\n", + " RecognizeDomainSpecificContent()\n", + " .setSubscriptionKey(VISION_API_KEY)\n", + " .setModel(\"celebrities\")\n", + " .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/\")\n", + " .setImageUrlCol(\"url\")\n", + " .setOutputCol(\"celebs\")\n", + ")\n", "\n", - "#Extract the first celebrity we see from the structured response\n", - "firstCeleb = SQLTransformer(statement=\"SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__\")" + "# Extract the first celebrity we see from the structured response\n", + "firstCeleb = SQLTransformer(\n", + " statement=\"SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__\"\n", + ")" ] }, { @@ -123,22 +135,32 @@ }, "outputs": [], "source": [ - "from synapse.ml.stages import UDFTransformer \n", + "from synapse.ml.stages import UDFTransformer\n", + "\n", + "recognizeText = (\n", + " RecognizeText()\n", + " .setSubscriptionKey(VISION_API_KEY)\n", + " .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/recognizeText\")\n", + " .setImageUrlCol(\"url\")\n", + " .setMode(\"Printed\")\n", + " .setOutputCol(\"ocr\")\n", + " .setConcurrency(5)\n", + ")\n", "\n", - "recognizeText = RecognizeText()\\\n", - " .setSubscriptionKey(VISION_API_KEY)\\\n", - " .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/recognizeText\")\\\n", - " .setImageUrlCol(\"url\")\\\n", - " .setMode(\"Printed\")\\\n", - " .setOutputCol(\"ocr\")\\\n", - " .setConcurrency(5)\n", "\n", "def getTextFunction(ocrRow):\n", - " if ocrRow is None: return None\n", + " if ocrRow is None:\n", + " return None\n", " return \"\\n\".join([line.text for line in ocrRow.recognitionResult.lines])\n", "\n", + "\n", "# this transformer wil extract a simpler string from the structured output of recognize text\n", - "getText = UDFTransformer().setUDF(udf(getTextFunction)).setInputCol(\"ocr\").setOutputCol(\"text\")\n" + "getText = (\n", + " UDFTransformer()\n", + " .setUDF(udf(getTextFunction))\n", + " .setInputCol(\"ocr\")\n", + " .setOutputCol(\"text\")\n", + ")" ] }, { @@ -158,14 +180,18 @@ }, "outputs": [], "source": [ - "sentimentTransformer = TextSentiment()\\\n", - " .setTextCol(\"text\")\\\n", - " .setUrl(\"https://eastus.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment\")\\\n", - " 
.setSubscriptionKey(TEXT_API_KEY)\\\n", + "sentimentTransformer = (\n", + " TextSentiment()\n", + " .setTextCol(\"text\")\n", + " .setUrl(\"https://eastus.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment\")\n", + " .setSubscriptionKey(TEXT_API_KEY)\n", " .setOutputCol(\"sentiment\")\n", + ")\n", "\n", - "#Extract the sentiment score from the API response body\n", - "getSentiment = SQLTransformer(statement=\"SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__\")" + "# Extract the sentiment score from the API response body\n", + "getSentiment = SQLTransformer(\n", + " statement=\"SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__\"\n", + ")" ] }, { @@ -186,11 +212,25 @@ "outputs": [], "source": [ "from synapse.ml.stages import SelectColumns\n", - "# Select the final coulmns\n", - "cleanupColumns = SelectColumns().setCols([\"url\", \"firstCeleb\", \"text\", \"sentimentLabel\"])\n", "\n", - "celebrityQuoteAnalysis = PipelineModel(stages=[\n", - " bingSearch, getUrls, celebs, firstCeleb, recognizeText, getText, sentimentTransformer, getSentiment, cleanupColumns])\n", + "# Select the final coulmns\n", + "cleanupColumns = SelectColumns().setCols(\n", + " [\"url\", \"firstCeleb\", \"text\", \"sentimentLabel\"]\n", + ")\n", + "\n", + "celebrityQuoteAnalysis = PipelineModel(\n", + " stages=[\n", + " bingSearch,\n", + " getUrls,\n", + " celebs,\n", + " firstCeleb,\n", + " recognizeText,\n", + " getText,\n", + " sentimentTransformer,\n", + " getSentiment,\n", + " cleanupColumns,\n", + " ]\n", + ")\n", "\n", "celebrityQuoteAnalysis.transform(bingParameters).show(5)" ] diff --git a/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb index f0be1097c9..e3cb2e7f0a 100644 --- a/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Create a Multilingual Search Engine from Forms.ipynb @@ -7,19 +7,24 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")\n", - " os.environ['TRANSLATOR_KEY'] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n", + "\n", + " os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", + " os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"azure-search-key\"\n", + " )\n", + " os.environ[\"TRANSLATOR_KEY\"] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n", " from notebookutils.visualization import display\n", "\n", "\n", - "key = os.environ['VISION_API_KEY']\n", - "search_key = os.environ['AZURE_SEARCH_KEY']\n", - "translator_key = os.environ['TRANSLATOR_KEY']\n", + "key = os.environ[\"VISION_API_KEY\"]\n", + "search_key = os.environ[\"AZURE_SEARCH_KEY\"]\n", + "translator_key = os.environ[\"TRANSLATOR_KEY\"]\n", "\n", "search_service = \"mmlspark-azure-search\"\n", "search_index = \"form-demo-index\"" @@ -34,22 +39,24 @@ "from pyspark.sql.functions 
import udf\n", "from pyspark.sql.types import StringType\n", "\n", + "\n", "def blob_to_url(blob):\n", - " [prefix, postfix] = blob.split(\"@\")\n", - " container = prefix.split(\"/\")[-1]\n", - " split_postfix = postfix.split(\"/\")\n", - " account = split_postfix[0]\n", - " filepath = \"/\".join(split_postfix[1:])\n", - " return \"https://{}/{}/{}\".format(account, container, filepath)\n", + " [prefix, postfix] = blob.split(\"@\")\n", + " container = prefix.split(\"/\")[-1]\n", + " split_postfix = postfix.split(\"/\")\n", + " account = split_postfix[0]\n", + " filepath = \"/\".join(split_postfix[1:])\n", + " return \"https://{}/{}/{}\".format(account, container, filepath)\n", "\n", "\n", - "df2 = (spark.read.format(\"binaryFile\")\n", - " .load(\"wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*\")\n", - " .select(\"path\")\n", - " .limit(10)\n", - " .select(udf(blob_to_url, StringType())(\"path\").alias(\"url\"))\n", - " .cache()\n", - " )\n" + "df2 = (\n", + " spark.read.format(\"binaryFile\")\n", + " .load(\"wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*\")\n", + " .select(\"path\")\n", + " .limit(10)\n", + " .select(udf(blob_to_url, StringType())(\"path\").alias(\"url\"))\n", + " .cache()\n", + ")" ] }, { @@ -80,15 +87,17 @@ "source": [ "from synapse.ml.cognitive import AnalyzeInvoices\n", "\n", - "analyzed_df = (AnalyzeInvoices()\n", - " .setSubscriptionKey(key)\n", - " .setLocation(\"eastus\")\n", - " .setImageUrlCol(\"url\")\n", - " .setOutputCol(\"invoices\")\n", - " .setErrorCol(\"errors\")\n", - " .setConcurrency(5)\n", - " .transform(df2)\n", - " .cache())\n" + "analyzed_df = (\n", + " AnalyzeInvoices()\n", + " .setSubscriptionKey(key)\n", + " .setLocation(\"eastus\")\n", + " .setImageUrlCol(\"url\")\n", + " .setOutputCol(\"invoices\")\n", + " .setErrorCol(\"errors\")\n", + " .setConcurrency(5)\n", + " .transform(df2)\n", + " .cache()\n", + ")" ] }, { @@ -108,13 +117,15 @@ "source": [ "from synapse.ml.cognitive import FormOntologyLearner\n", "\n", - "organized_df = (FormOntologyLearner()\n", - " .setInputCol(\"invoices\")\n", - " .setOutputCol(\"extracted\")\n", - " .fit(analyzed_df)\n", - " .transform(analyzed_df)\n", - " .select(\"url\", \"extracted.*\")\n", - " .cache())" + "organized_df = (\n", + " FormOntologyLearner()\n", + " .setInputCol(\"invoices\")\n", + " .setOutputCol(\"extracted\")\n", + " .fit(analyzed_df)\n", + " .transform(analyzed_df)\n", + " .select(\"url\", \"extracted.*\")\n", + " .cache()\n", + ")" ] }, { @@ -133,11 +144,13 @@ "outputs": [], "source": [ "from pyspark.sql.functions import explode, col\n", - "itemized_df = (organized_df\n", - " .select(\"*\", explode(col(\"Items\")).alias(\"Item\"))\n", - " .drop(\"Items\")\n", - " .select(\"Item.*\", \"*\")\n", - " .drop(\"Item\"))\n" + "\n", + "itemized_df = (\n", + " organized_df.select(\"*\", explode(col(\"Items\")).alias(\"Item\"))\n", + " .drop(\"Items\")\n", + " .select(\"Item.*\", \"*\")\n", + " .drop(\"Item\")\n", + ")" ] }, { @@ -166,7 +179,8 @@ "source": [ "from synapse.ml.cognitive import Translate\n", "\n", - "translated_df = (Translate()\n", + "translated_df = (\n", + " Translate()\n", " .setSubscriptionKey(translator_key)\n", " .setLocation(\"eastus\")\n", " .setTextCol(\"Description\")\n", @@ -177,7 +191,8 @@ " .transform(itemized_df)\n", " .withColumn(\"Translations\", col(\"output.translations\")[0])\n", " .drop(\"output\", \"TranslationError\")\n", - " .cache())\n" + " .cache()\n", + ")" ] }, { @@ -198,16 +213,17 @@ "from synapse.ml.cognitive import 
*\n", "from pyspark.sql.functions import monotonically_increasing_id, lit\n", "\n", - "(translated_df\n", - " .withColumn(\"DocID\", monotonically_increasing_id().cast(\"string\"))\n", - " .withColumn(\"SearchAction\", lit(\"upload\"))\n", - " .writeToAzureSearch(\n", - " subscriptionKey=search_key,\n", - " actionCol=\"SearchAction\",\n", - " serviceName=search_service,\n", - " indexName=search_index,\n", - " keyCol=\"DocID\")\n", - ")\n" + "(\n", + " translated_df.withColumn(\"DocID\", monotonically_increasing_id().cast(\"string\"))\n", + " .withColumn(\"SearchAction\", lit(\"upload\"))\n", + " .writeToAzureSearch(\n", + " subscriptionKey=search_key,\n", + " actionCol=\"SearchAction\",\n", + " serviceName=search_service,\n", + " indexName=search_index,\n", + " keyCol=\"DocID\",\n", + " )\n", + ")" ] }, { @@ -217,8 +233,11 @@ "outputs": [], "source": [ "import requests\n", - "url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n", - "requests.post(url, json={\"search\": \"door\"}, headers = {\"api-key\": search_key}).json()" + "\n", + "url = \"https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06\".format(\n", + " search_service, search_index\n", + ")\n", + "requests.post(url, json={\"search\": \"door\"}, headers={\"api-key\": search_key}).json()" ] }, { diff --git a/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb index 95ab3418b9..6a0f295d75 100644 --- a/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Multivariate Anomaly Detection.ipynb @@ -76,10 +76,14 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"anomaly-api-key\")\n", - " os.environ['BLOB_CONNECTION_STRING'] = getSecret(\"mmlspark-build-keys\", \"madtest-connection-string\")" + "\n", + " os.environ[\"ANOMALY_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"anomaly-api-key\")\n", + " os.environ[\"BLOB_CONNECTION_STRING\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"madtest-connection-string\"\n", + " )" ] }, { @@ -226,11 +230,17 @@ }, "outputs": [], "source": [ - "df = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/MVAD/sample.csv\")\n", + "df = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", \"true\")\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/MVAD/sample.csv\")\n", + ")\n", "\n", - "df = df.withColumn(\"sensor_1\", col(\"sensor_1\").cast(DoubleType())) \\\n", - " .withColumn(\"sensor_2\", col(\"sensor_2\").cast(DoubleType())) \\\n", + "df = (\n", + " df.withColumn(\"sensor_1\", col(\"sensor_1\").cast(DoubleType()))\n", + " .withColumn(\"sensor_2\", col(\"sensor_2\").cast(DoubleType()))\n", " .withColumn(\"sensor_3\", col(\"sensor_3\").cast(DoubleType()))\n", + ")\n", "\n", "# Let's inspect the dataframe:\n", "df.show(5)" @@ -299,11 +309,12 @@ "source": [ "trainingStartTime = \"2020-06-01T12:00:00Z\"\n", "trainingEndTime = \"2020-07-02T17:55:00Z\"\n", - "intermediateSaveDir = \"intermediateData\" \n", + 
"intermediateSaveDir = \"intermediateData\"\n", "timestampColumn = \"timestamp\"\n", "inputColumns = [\"sensor_1\", \"sensor_2\", \"sensor_3\"]\n", "\n", - "estimator = (FitMultivariateAnomaly()\n", + "estimator = (\n", + " FitMultivariateAnomaly()\n", " .setSubscriptionKey(anomalyKey)\n", " .setLocation(location)\n", " .setStartTime(trainingStartTime)\n", @@ -314,7 +325,7 @@ " .setInputCols(inputColumns)\n", " .setSlidingWindow(200)\n", " .setConnectionString(connectionString)\n", - " )" + ")" ] }, { @@ -397,15 +408,15 @@ "inferenceStartTime = \"2020-07-02T18:00:00Z\"\n", "inferenceEndTime = \"2020-07-06T05:15:00Z\"\n", "\n", - "result = (model\n", - " .setStartTime(inferenceStartTime)\n", + "result = (\n", + " model.setStartTime(inferenceStartTime)\n", " .setEndTime(inferenceEndTime)\n", " .setOutputCol(\"results\")\n", " .setErrorCol(\"errors\")\n", " .setInputCols(inputColumns)\n", " .setTimestampCol(timestampColumn)\n", " .transform(df)\n", - " )\n", + ")\n", "\n", "result.show(5)" ] @@ -636,10 +647,18 @@ } ], "source": [ - "rdf = (result.select(\"timestamp\",*inputColumns, \"results.contributors\", \"results.isAnomaly\", \"results.severity\")\n", - " .orderBy('timestamp', ascending=True)\n", - " .filter(col('timestamp') >= lit(inferenceStartTime))\n", - " .toPandas())\n", + "rdf = (\n", + " result.select(\n", + " \"timestamp\",\n", + " *inputColumns,\n", + " \"results.contributors\",\n", + " \"results.isAnomaly\",\n", + " \"results.severity\"\n", + " )\n", + " .orderBy(\"timestamp\", ascending=True)\n", + " .filter(col(\"timestamp\") >= lit(inferenceStartTime))\n", + " .toPandas()\n", + ")\n", "\n", "rdf" ] @@ -887,10 +906,13 @@ " if type(x) is list:\n", " return dict([item[::-1] for item in x])\n", " else:\n", - " return {'series_0': 0, 'series_1': 0, 'series_2': 0}\n", + " return {\"series_0\": 0, \"series_1\": 0, \"series_2\": 0}\n", "\n", - "rdf['contributors'] = rdf['contributors'].apply(parse)\n", - "rdf = pd.concat([rdf.drop(['contributors'], axis=1), pd.json_normalize(rdf['contributors'])], axis=1)\n", + "\n", + "rdf[\"contributors\"] = rdf[\"contributors\"].apply(parse)\n", + "rdf = pd.concat(\n", + " [rdf.drop([\"contributors\"], axis=1), pd.json_normalize(rdf[\"contributors\"])], axis=1\n", + ")\n", "rdf" ] }, @@ -927,42 +949,95 @@ "\n", "\n", "####### Main Figure #######\n", - "plt.figure(figsize=(23,8))\n", - "plt.plot(rdf['timestamp'],rdf['sensor_1'], color='tab:orange', linestyle='solid', linewidth=2, label='sensor_1')\n", - "plt.plot(rdf['timestamp'],rdf['sensor_2'], color='tab:green', linestyle='solid', linewidth=2, label='sensor_2')\n", - "plt.plot(rdf['timestamp'],rdf['sensor_3'], color='tab:blue', linestyle='solid', linewidth=2, label='sensor_3')\n", - "plt.grid(axis='y')\n", - "plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", + "plt.figure(figsize=(23, 8))\n", + "plt.plot(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"sensor_1\"],\n", + " color=\"tab:orange\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_1\",\n", + ")\n", + "plt.plot(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"sensor_2\"],\n", + " color=\"tab:green\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_2\",\n", + ")\n", + "plt.plot(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"sensor_3\"],\n", + " color=\"tab:blue\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_3\",\n", + ")\n", + "plt.grid(axis=\"y\")\n", + "plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", 
"plt.legend()\n", "\n", "anoms = list(rdf[\"severity\"] >= minSeverity)\n", "_, _, ymin, ymax = plt.axis()\n", - "plt.vlines(np.where(anoms), ymin=ymin , ymax=ymax , color='r', alpha=0.8)\n", + "plt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color=\"r\", alpha=0.8)\n", "\n", "plt.legend()\n", - "plt.title('A plot of the values from the three sensors with the detected anomalies highlighted in red.')\n", + "plt.title(\n", + " \"A plot of the values from the three sensors with the detected anomalies highlighted in red.\"\n", + ")\n", "plt.show()\n", "\n", "####### Severity Figure #######\n", - "plt.figure(figsize=(23,1))\n", - "plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - "plt.plot(rdf['timestamp'],rdf['severity'], color='black', linestyle='solid', linewidth=2, label='Severity score')\n", - "plt.plot(rdf['timestamp'],[minSeverity]*len(rdf['severity']), color='red', linestyle='dotted', linewidth=1, label='minSeverity')\n", - "plt.grid(axis='y')\n", + "plt.figure(figsize=(23, 1))\n", + "plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + "plt.plot(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"severity\"],\n", + " color=\"black\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"Severity score\",\n", + ")\n", + "plt.plot(\n", + " rdf[\"timestamp\"],\n", + " [minSeverity] * len(rdf[\"severity\"]),\n", + " color=\"red\",\n", + " linestyle=\"dotted\",\n", + " linewidth=1,\n", + " label=\"minSeverity\",\n", + ")\n", + "plt.grid(axis=\"y\")\n", "plt.legend()\n", - "plt.ylim([0,1])\n", + "plt.ylim([0, 1])\n", "plt.title(\"Severity of the detected anomalies\")\n", "plt.show()\n", "\n", "####### Contributors Figure #######\n", - "plt.figure(figsize=(23,1))\n", - "plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - "plt.bar(rdf['timestamp'],rdf['series_0'], width=2, color='tab:orange', label='sensor_1')\n", - "plt.bar(rdf['timestamp'],rdf['series_1'], width=2, color='tab:green', label='sensor_2', bottom=rdf['series_0'])\n", - "plt.bar(rdf['timestamp'],rdf['series_2'], width=2, color='tab:blue', label='sensor_3', bottom=rdf['series_0']+rdf['series_1'])\n", - "plt.grid(axis='y')\n", + "plt.figure(figsize=(23, 1))\n", + "plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + "plt.bar(\n", + " rdf[\"timestamp\"], rdf[\"series_0\"], width=2, color=\"tab:orange\", label=\"sensor_1\"\n", + ")\n", + "plt.bar(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"series_1\"],\n", + " width=2,\n", + " color=\"tab:green\",\n", + " label=\"sensor_2\",\n", + " bottom=rdf[\"series_0\"],\n", + ")\n", + "plt.bar(\n", + " rdf[\"timestamp\"],\n", + " rdf[\"series_2\"],\n", + " width=2,\n", + " color=\"tab:blue\",\n", + " label=\"sensor_3\",\n", + " bottom=rdf[\"series_0\"] + rdf[\"series_1\"],\n", + ")\n", + "plt.grid(axis=\"y\")\n", "plt.legend()\n", - "plt.ylim([0,1])\n", + "plt.ylim([0, 1])\n", "plt.title(\"The contribution of each sensor to the detected anomaly\")\n", "plt.show()" ] diff --git a/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb index 7c7e31f61b..f71b9417fe 100644 --- a/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Overview.ipynb @@ -111,7 +111,7 @@ "from pyspark.sql.functions import lit\n", "from pyspark.ml import PipelineModel\n", "from pyspark.sql.functions import col\n", - "import os\n" + "import 
os" ] }, { @@ -122,14 +122,24 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['COGNITIVE_SERVICE_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['BING_IMAGE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n", - " os.environ['TRANSLATOR_KEY'] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n", - " os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")\n", - " from notebookutils.visualization import display\n" + "\n", + " os.environ[\"ANOMALY_API_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"cognitive-api-key\"\n", + " )\n", + " os.environ[\"COGNITIVE_SERVICE_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"cognitive-api-key\"\n", + " )\n", + " os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"bing-search-key\"\n", + " )\n", + " os.environ[\"TRANSLATOR_KEY\"] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n", + " os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"azure-search-key\"\n", + " )\n", + " from notebookutils.visualization import display" ] }, { @@ -149,7 +159,7 @@ "# A Translator subscription key\n", "translator_key = os.environ[\"TRANSLATOR_KEY\"]\n", "# An Azure search key\n", - "search_key = os.environ['AZURE_SEARCH_KEY']\n" + "search_key = os.environ[\"AZURE_SEARCH_KEY\"]" ] }, { @@ -168,24 +178,32 @@ "outputs": [], "source": [ "# Create a dataframe that's tied to it's column names\n", - "df = spark.createDataFrame([\n", - " (\"I am so happy today, its sunny!\", \"en-US\"),\n", - " (\"I am frustrated by this rush hour traffic\", \"en-US\"),\n", - " (\"The cognitive services on spark aint bad\", \"en-US\"),\n", - "], [\"text\", \"language\"])\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\"I am so happy today, its sunny!\", \"en-US\"),\n", + " (\"I am frustrated by this rush hour traffic\", \"en-US\"),\n", + " (\"The cognitive services on spark aint bad\", \"en-US\"),\n", + " ],\n", + " [\"text\", \"language\"],\n", + ")\n", "\n", "# Run the Text Analytics service with options\n", - "sentiment = (TextSentiment()\n", - " .setTextCol(\"text\")\n", - " .setLocation(\"eastus\")\n", - " .setSubscriptionKey(service_key)\n", - " .setOutputCol(\"sentiment\")\n", - " .setErrorCol(\"error\")\n", - " .setLanguageCol(\"language\"))\n", + "sentiment = (\n", + " TextSentiment()\n", + " .setTextCol(\"text\")\n", + " .setLocation(\"eastus\")\n", + " .setSubscriptionKey(service_key)\n", + " .setOutputCol(\"sentiment\")\n", + " .setErrorCol(\"error\")\n", + " .setLanguageCol(\"language\")\n", + ")\n", "\n", "# Show the results of your text query in a table format\n", - "display(sentiment.transform(df).select(\"text\", col(\n", - " \"sentiment\")[0].getItem(\"sentiment\").alias(\"sentiment\")))" + "display(\n", + " sentiment.transform(df).select(\n", + " \"text\", col(\"sentiment\")[0].getItem(\"sentiment\").alias(\"sentiment\")\n", + " )\n", + ")" ] }, { @@ -203,16 +221,22 @@ "metadata": {}, "outputs": [], "source": [ - "df = spark.createDataFrame([\n", - " (\"20mg of ibuprofen twice a day\",),\n", - " (\"1tsp of Tylenol every 4 hours\",),\n", - " (\"6-drops of Vitamin B-12 
every evening\",)], [\"text\"])\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\"20mg of ibuprofen twice a day\",),\n", + " (\"1tsp of Tylenol every 4 hours\",),\n", + " (\"6-drops of Vitamin B-12 every evening\",),\n", + " ],\n", + " [\"text\"],\n", + ")\n", "\n", - "healthcare = (HealthcareSDK()\n", + "healthcare = (\n", + " HealthcareSDK()\n", " .setSubscriptionKey(service_key)\n", " .setLocation(\"eastus\")\n", " .setLanguage(\"en\")\n", - " .setOutputCol(\"response\"))\n", + " .setOutputCol(\"response\")\n", + ")\n", "\n", "display(healthcare.transform(df))" ] @@ -234,24 +258,30 @@ "from pyspark.sql.functions import col, flatten\n", "\n", "# Create a dataframe including sentences you want to translate\n", - "df = spark.createDataFrame([\n", - " ([\"Hello, what is your name?\", \"Bye\"],)\n", - "], [\"text\",])\n", + "df = spark.createDataFrame(\n", + " [([\"Hello, what is your name?\", \"Bye\"],)],\n", + " [\n", + " \"text\",\n", + " ],\n", + ")\n", "\n", "# Run the Translator service with options\n", - "translate = (Translate()\n", + "translate = (\n", + " Translate()\n", " .setSubscriptionKey(translator_key)\n", " .setLocation(\"eastus\")\n", " .setTextCol(\"text\")\n", " .setToLanguage([\"zh-Hans\"])\n", - " .setOutputCol(\"translation\"))\n", + " .setOutputCol(\"translation\")\n", + ")\n", "\n", "# Show the results of the translation.\n", - "display(translate\n", - " .transform(df)\n", - " .withColumn(\"translation\", flatten(col(\"translation.translations\")))\n", - " .withColumn(\"translation\", col(\"translation.text\"))\n", - " .select(\"translation\"))" + "display(\n", + " translate.transform(df)\n", + " .withColumn(\"translation\", flatten(col(\"translation.translations\")))\n", + " .withColumn(\"translation\", col(\"translation.text\"))\n", + " .select(\"translation\")\n", + ")" ] }, { @@ -271,22 +301,34 @@ "from pyspark.sql.functions import col, explode\n", "\n", "# Create a dataframe containing the source files\n", - "imageDf = spark.createDataFrame([\n", - " (\"https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg\",)\n", - "], [\"source\",])\n", + "imageDf = spark.createDataFrame(\n", + " [\n", + " (\n", + " \"https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg\",\n", + " )\n", + " ],\n", + " [\n", + " \"source\",\n", + " ],\n", + ")\n", "\n", "# Run the Form Recognizer service\n", - "analyzeBusinessCards = (AnalyzeBusinessCards()\n", - " .setSubscriptionKey(service_key)\n", - " .setLocation(\"eastus\")\n", - " .setImageUrlCol(\"source\")\n", - " .setOutputCol(\"businessCards\"))\n", + "analyzeBusinessCards = (\n", + " AnalyzeBusinessCards()\n", + " .setSubscriptionKey(service_key)\n", + " .setLocation(\"eastus\")\n", + " .setImageUrlCol(\"source\")\n", + " .setOutputCol(\"businessCards\")\n", + ")\n", "\n", "# Show the results of recognition.\n", - "display(analyzeBusinessCards\n", - " .transform(imageDf)\n", - " .withColumn(\"documents\", explode(col(\"businessCards.analyzeResult.documentResults.fields\")))\n", - " .select(\"source\", \"documents\"))" + "display(\n", + " analyzeBusinessCards.transform(imageDf)\n", + " .withColumn(\n", + " \"documents\", explode(col(\"businessCards.analyzeResult.documentResults.fields\"))\n", + " )\n", + " .select(\"source\", \"documents\")\n", + ")" ] }, { @@ -305,24 +347,38 @@ "outputs": [], "source": [ "# Create a dataframe with the image URLs\n", - "df = spark.createDataFrame([\n", - " 
(\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/objects.jpg\", ),\n", - " (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/dog.jpg\", ),\n", - " (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/house.jpg\", )\n", - "], [\"image\", ])\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\n", + " \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/objects.jpg\",\n", + " ),\n", + " (\n", + " \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/dog.jpg\",\n", + " ),\n", + " (\n", + " \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/house.jpg\",\n", + " ),\n", + " ],\n", + " [\n", + " \"image\",\n", + " ],\n", + ")\n", "\n", "# Run the Computer Vision service. Analyze Image extracts infortmation from/about the images.\n", - "analysis = (AnalyzeImage()\n", - " .setLocation(\"eastus\")\n", - " .setSubscriptionKey(service_key)\n", - " .setVisualFeatures([\"Categories\", \"Color\", \"Description\", \"Faces\", \"Objects\", \"Tags\"])\n", - " .setOutputCol(\"analysis_results\")\n", - " .setImageUrlCol(\"image\")\n", - " .setErrorCol(\"error\"))\n", + "analysis = (\n", + " AnalyzeImage()\n", + " .setLocation(\"eastus\")\n", + " .setSubscriptionKey(service_key)\n", + " .setVisualFeatures(\n", + " [\"Categories\", \"Color\", \"Description\", \"Faces\", \"Objects\", \"Tags\"]\n", + " )\n", + " .setOutputCol(\"analysis_results\")\n", + " .setImageUrlCol(\"image\")\n", + " .setErrorCol(\"error\")\n", + ")\n", "\n", "# Show the results of what you wanted to pull out of the images.\n", - "display(analysis.transform(df).select(\n", - " \"image\", \"analysis_results.description.tags\"))\n" + "display(analysis.transform(df).select(\"image\", \"analysis_results.description.tags\"))" ] }, { @@ -343,17 +399,19 @@ "# Number of images Bing will return per query\n", "imgsPerBatch = 10\n", "# A list of offsets, used to page into the search results\n", - "offsets = [(i*imgsPerBatch,) for i in range(100)]\n", + "offsets = [(i * imgsPerBatch,) for i in range(100)]\n", "# Since web content is our data, we create a dataframe with options on that data: offsets\n", "bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n", "\n", "# Run the Bing Image Search service with our text query\n", - "bingSearch = (BingImageSearch()\n", - " .setSubscriptionKey(bing_search_key)\n", - " .setOffsetCol(\"offset\")\n", - " .setQuery(\"Martin Luther King Jr. quotes\")\n", - " .setCount(imgsPerBatch)\n", - " .setOutputCol(\"images\"))\n", + "bingSearch = (\n", + " BingImageSearch()\n", + " .setSubscriptionKey(bing_search_key)\n", + " .setOffsetCol(\"offset\")\n", + " .setQuery(\"Martin Luther King Jr. 
quotes\")\n", + " .setCount(imgsPerBatch)\n", + " .setOutputCol(\"images\")\n", + ")\n", "\n", "# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n", "getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")\n", @@ -365,7 +423,7 @@ "pipeline = PipelineModel(stages=[bingSearch, getUrls])\n", "\n", "# Show the results of your search: image URLs\n", - "display(pipeline.transform(bingParameters))\n" + "display(pipeline.transform(bingParameters))" ] }, { @@ -383,20 +441,23 @@ "outputs": [], "source": [ "# Create a dataframe with our audio URLs, tied to the column called \"url\"\n", - "df = spark.createDataFrame([(\"https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav\",)\n", - " ], [\"url\"])\n", + "df = spark.createDataFrame(\n", + " [(\"https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav\",)], [\"url\"]\n", + ")\n", "\n", "# Run the Speech-to-text service to translate the audio into text\n", - "speech_to_text = (SpeechToTextSDK()\n", - " .setSubscriptionKey(service_key)\n", - " .setLocation(\"eastus\")\n", - " .setOutputCol(\"text\")\n", - " .setAudioDataCol(\"url\")\n", - " .setLanguage(\"en-US\")\n", - " .setProfanity(\"Masked\"))\n", + "speech_to_text = (\n", + " SpeechToTextSDK()\n", + " .setSubscriptionKey(service_key)\n", + " .setLocation(\"eastus\")\n", + " .setOutputCol(\"text\")\n", + " .setAudioDataCol(\"url\")\n", + " .setLanguage(\"en-US\")\n", + " .setProfanity(\"Masked\")\n", + ")\n", "\n", "# Show the results of the translation\n", - "display(speech_to_text.transform(df).select(\"url\", \"text.DisplayText\"))\n" + "display(speech_to_text.transform(df).select(\"url\", \"text.DisplayText\"))" ] }, { @@ -416,14 +477,24 @@ "from synapse.ml.cognitive import TextToSpeech\n", "\n", "# Create a dataframe with text and an output file location\n", - "df = spark.createDataFrame([(\"Reading out lod is fun! Check out aka.ms/spark for more information\", \"dbfs:/output.mp3\")], [\"text\", \"output_file\"])\n", - " \n", - "tts = (TextToSpeech()\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\n", + " \"Reading out lod is fun! 
Check out aka.ms/spark for more information\",\n", + " \"dbfs:/output.mp3\",\n", + " )\n", + " ],\n", + " [\"text\", \"output_file\"],\n", + ")\n", + "\n", + "tts = (\n", + " TextToSpeech()\n", " .setSubscriptionKey(service_key)\n", " .setTextCol(\"text\")\n", " .setLocation(\"eastus\")\n", - " .setVoiceName(\"en-US-JennyNeural\") \n", - " .setOutputFileCol(\"output_file\"))\n", + " .setVoiceName(\"en-US-JennyNeural\")\n", + " .setOutputFileCol(\"output_file\")\n", + ")\n", "\n", "# Check to make sure there were no errors during audio creation\n", "display(tts.transform(df))" @@ -445,37 +516,43 @@ "outputs": [], "source": [ "# Create a dataframe with the point data that Anomaly Detector requires\n", - "df = spark.createDataFrame([\n", - " (\"1972-01-01T00:00:00Z\", 826.0),\n", - " (\"1972-02-01T00:00:00Z\", 799.0),\n", - " (\"1972-03-01T00:00:00Z\", 890.0),\n", - " (\"1972-04-01T00:00:00Z\", 900.0),\n", - " (\"1972-05-01T00:00:00Z\", 766.0),\n", - " (\"1972-06-01T00:00:00Z\", 805.0),\n", - " (\"1972-07-01T00:00:00Z\", 821.0),\n", - " (\"1972-08-01T00:00:00Z\", 20000.0),\n", - " (\"1972-09-01T00:00:00Z\", 883.0),\n", - " (\"1972-10-01T00:00:00Z\", 898.0),\n", - " (\"1972-11-01T00:00:00Z\", 957.0),\n", - " (\"1972-12-01T00:00:00Z\", 924.0),\n", - " (\"1973-01-01T00:00:00Z\", 881.0),\n", - " (\"1973-02-01T00:00:00Z\", 837.0),\n", - " (\"1973-03-01T00:00:00Z\", 9000.0)\n", - "], [\"timestamp\", \"value\"]).withColumn(\"group\", lit(\"series1\"))\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\"1972-01-01T00:00:00Z\", 826.0),\n", + " (\"1972-02-01T00:00:00Z\", 799.0),\n", + " (\"1972-03-01T00:00:00Z\", 890.0),\n", + " (\"1972-04-01T00:00:00Z\", 900.0),\n", + " (\"1972-05-01T00:00:00Z\", 766.0),\n", + " (\"1972-06-01T00:00:00Z\", 805.0),\n", + " (\"1972-07-01T00:00:00Z\", 821.0),\n", + " (\"1972-08-01T00:00:00Z\", 20000.0),\n", + " (\"1972-09-01T00:00:00Z\", 883.0),\n", + " (\"1972-10-01T00:00:00Z\", 898.0),\n", + " (\"1972-11-01T00:00:00Z\", 957.0),\n", + " (\"1972-12-01T00:00:00Z\", 924.0),\n", + " (\"1973-01-01T00:00:00Z\", 881.0),\n", + " (\"1973-02-01T00:00:00Z\", 837.0),\n", + " (\"1973-03-01T00:00:00Z\", 9000.0),\n", + " ],\n", + " [\"timestamp\", \"value\"],\n", + ").withColumn(\"group\", lit(\"series1\"))\n", "\n", "# Run the Anomaly Detector service to look for irregular data\n", - "anamoly_detector = (SimpleDetectAnomalies()\n", - " .setSubscriptionKey(anomaly_key)\n", - " .setLocation(\"eastus\")\n", - " .setTimestampCol(\"timestamp\")\n", - " .setValueCol(\"value\")\n", - " .setOutputCol(\"anomalies\")\n", - " .setGroupbyCol(\"group\")\n", - " .setGranularity(\"monthly\"))\n", + "anamoly_detector = (\n", + " SimpleDetectAnomalies()\n", + " .setSubscriptionKey(anomaly_key)\n", + " .setLocation(\"eastus\")\n", + " .setTimestampCol(\"timestamp\")\n", + " .setValueCol(\"value\")\n", + " .setOutputCol(\"anomalies\")\n", + " .setGroupbyCol(\"group\")\n", + " .setGranularity(\"monthly\")\n", + ")\n", "\n", "# Show the full results of the analysis with the anomalies marked as \"True\"\n", - "display(anamoly_detector.transform(df).select(\n", - " \"timestamp\", \"value\", \"anomalies.isAnomaly\"))" + "display(\n", + " anamoly_detector.transform(df).select(\"timestamp\", \"value\", \"anomalies.isAnomaly\")\n", + ")" ] }, { @@ -495,19 +572,22 @@ "source": [ "# Use any requests from the python requests library\n", "\n", + "\n", "def world_bank_request(country):\n", - " return Request(\"GET\", \"http://api.worldbank.org/v2/country/{}?format=json\".format(country))\n", + " return 
Request(\n", + " \"GET\", \"http://api.worldbank.org/v2/country/{}?format=json\".format(country)\n", + " )\n", "\n", "\n", "# Create a dataframe with spcificies which countries we want data on\n", - "df = (spark.createDataFrame([(\"br\",), (\"usa\",)], [\"country\"])\n", - " .withColumn(\"request\", http_udf(world_bank_request)(col(\"country\"))))\n", + "df = spark.createDataFrame([(\"br\",), (\"usa\",)], [\"country\"]).withColumn(\n", + " \"request\", http_udf(world_bank_request)(col(\"country\"))\n", + ")\n", "\n", "# Much faster for big data because of the concurrency :)\n", - "client = (HTTPTransformer()\n", - " .setConcurrency(3)\n", - " .setInputCol(\"request\")\n", - " .setOutputCol(\"response\"))\n", + "client = (\n", + " HTTPTransformer().setConcurrency(3).setInputCol(\"request\").setOutputCol(\"response\")\n", + ")\n", "\n", "# Get the body of the response\n", "\n", @@ -517,9 +597,11 @@ "\n", "\n", "# Show the details of the country data returned\n", - "display(client.transform(df)\n", - " .select(\"country\", udf(get_response_body)(col(\"response\"))\n", - " .alias(\"response\")))\n" + "display(\n", + " client.transform(df).select(\n", + " \"country\", udf(get_response_body)(col(\"response\")).alias(\"response\")\n", + " )\n", + ")" ] }, { @@ -540,25 +622,44 @@ "search_service = \"mmlspark-azure-search\"\n", "search_index = \"test-33467690\"\n", "\n", - "df = spark.createDataFrame([(\"upload\", \"0\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\"),\n", - " (\"upload\", \"1\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\")],\n", - " [\"searchAction\", \"id\", \"url\"])\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\n", + " \"upload\",\n", + " \"0\",\n", + " \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\",\n", + " ),\n", + " (\n", + " \"upload\",\n", + " \"1\",\n", + " \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\",\n", + " ),\n", + " ],\n", + " [\"searchAction\", \"id\", \"url\"],\n", + ")\n", "\n", - "tdf = AnalyzeImage()\\\n", - " .setSubscriptionKey(service_key)\\\n", - " .setLocation(\"eastus\")\\\n", - " .setImageUrlCol(\"url\")\\\n", - " .setOutputCol(\"analyzed\")\\\n", - " .setErrorCol(\"errors\")\\\n", - " .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\\\n", - " .transform(df).select(\"*\", \"analyzed.*\")\\\n", + "tdf = (\n", + " AnalyzeImage()\n", + " .setSubscriptionKey(service_key)\n", + " .setLocation(\"eastus\")\n", + " .setImageUrlCol(\"url\")\n", + " .setOutputCol(\"analyzed\")\n", + " .setErrorCol(\"errors\")\n", + " .setVisualFeatures(\n", + " [\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"]\n", + " )\n", + " .transform(df)\n", + " .select(\"*\", \"analyzed.*\")\n", " .drop(\"errors\", \"analyzed\")\n", + ")\n", "\n", - "tdf.writeToAzureSearch(subscriptionKey=search_key,\n", - " actionCol=\"searchAction\",\n", - " serviceName=search_service,\n", - " indexName=search_index,\n", - " keyCol=\"id\")\n" + "tdf.writeToAzureSearch(\n", + " subscriptionKey=search_key,\n", + " actionCol=\"searchAction\",\n", + " serviceName=search_service,\n", + " indexName=search_index,\n", + " keyCol=\"id\",\n", + ")" ] } ], diff --git a/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb b/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb index d6ff40cdcf..89042a8266 100644 --- 
a/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb +++ b/notebooks/features/cognitive_services/CognitiveServices - Predictive Maintenance.ipynb @@ -43,16 +43,21 @@ "cell_type": "code", "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", "\n", - "service_key = os.environ[\"ANOMALY_API_KEY\"] # Paste your anomaly detector key here\n", - "location = \"westus2\" # Paste your anomaly detector location here\n", + " os.environ[\"ANOMALY_API_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"cognitive-api-key\"\n", + " )\n", "\n", - "assert (service_key is not None)" + "service_key = os.environ[\"ANOMALY_API_KEY\"] # Paste your anomaly detector key here\n", + "location = \"westus2\" # Paste your anomaly detector location here\n", + "\n", + "assert service_key is not None" ], "metadata": {}, "outputs": [], @@ -68,7 +73,11 @@ { "cell_type": "code", "source": [ - "df_signals = spark.read.csv(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/iot/IoTSignals.csv\", header=True, inferSchema=True)" + "df_signals = spark.read.csv(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/iot/IoTSignals.csv\",\n", + " header=True,\n", + " inferSchema=True,\n", + ")" ], "metadata": {}, "outputs": [], @@ -84,7 +93,29 @@ { "cell_type": "code", "source": [ - "from pyspark.sql.functions import col, struct\nfrom synapse.ml.cognitive import SimpleDetectAnomalies\nfrom synapse.ml.core.spark import FluentAPI\n\ndetector = (SimpleDetectAnomalies()\n .setSubscriptionKey(service_key)\n .setLocation(location)\n .setOutputCol(\"anomalies\")\n .setGroupbyCol(\"grouping\")\n .setSensitivity(95)\n .setGranularity(\"secondly\"))\n\ndf_anomaly = (df_signals\n .where(col(\"unitSymbol\") == 'RPM')\n .withColumn(\"timestamp\", col(\"dateTime\").cast(\"string\"))\n .withColumn(\"value\", col(\"measureValue\").cast(\"double\"))\n .withColumn(\"grouping\", struct(\"deviceId\"))\n .mlTransform(detector)).cache()\n\ndf_anomaly.createOrReplaceTempView('df_anomaly')" + "from pyspark.sql.functions import col, struct\n", + "from synapse.ml.cognitive import SimpleDetectAnomalies\n", + "from synapse.ml.core.spark import FluentAPI\n", + "\n", + "detector = (\n", + " SimpleDetectAnomalies()\n", + " .setSubscriptionKey(service_key)\n", + " .setLocation(location)\n", + " .setOutputCol(\"anomalies\")\n", + " .setGroupbyCol(\"grouping\")\n", + " .setSensitivity(95)\n", + " .setGranularity(\"secondly\")\n", + ")\n", + "\n", + "df_anomaly = (\n", + " df_signals.where(col(\"unitSymbol\") == \"RPM\")\n", + " .withColumn(\"timestamp\", col(\"dateTime\").cast(\"string\"))\n", + " .withColumn(\"value\", col(\"measureValue\").cast(\"double\"))\n", + " .withColumn(\"grouping\", struct(\"deviceId\"))\n", + " .mlTransform(detector)\n", + ").cache()\n", + "\n", + "df_anomaly.createOrReplaceTempView(\"df_anomaly\")" ], "metadata": {}, "outputs": [], @@ -100,7 +131,7 @@ { "cell_type": "code", "source": [ - "df_anomaly.select(\"timestamp\",\"value\",\"deviceId\",\"anomalies.isAnomaly\").show(3)\n" + "df_anomaly.select(\"timestamp\", \"value\", \"deviceId\", \"anomalies.isAnomaly\").show(3)" ], "metadata": {}, "outputs": [], @@ -123,7 +154,21 @@ { "cell_type": "code", "source": [ - 
"df_anomaly_single_device = spark.sql(\"\"\"\nselect\n timestamp,\n measureValue,\n anomalies.expectedValue,\n anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue,\n anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue,\n case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly\nfrom\n df_anomaly\nwhere deviceid = 'dev-1' and timestamp < '2020-04-29'\norder by timestamp\nlimit 200\"\"\")" + "df_anomaly_single_device = spark.sql(\n", + " \"\"\"\n", + "select\n", + " timestamp,\n", + " measureValue,\n", + " anomalies.expectedValue,\n", + " anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue,\n", + " anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue,\n", + " case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly\n", + "from\n", + " df_anomaly\n", + "where deviceid = 'dev-1' and timestamp < '2020-04-29'\n", + "order by timestamp\n", + "limit 200\"\"\"\n", + ")" ], "metadata": {}, "outputs": [], @@ -139,7 +184,50 @@ { "cell_type": "code", "source": [ - "import matplotlib.pyplot as plt\nfrom pyspark.sql.functions import col\n\nadf = df_anomaly_single_device.toPandas()\nadf_subset = df_anomaly_single_device.where(col(\"isAnomaly\") == 1).toPandas()\n\nplt.figure(figsize=(23,8))\nplt.plot(adf['timestamp'],adf['expectedUpperValue'], color='darkred', linestyle='solid', linewidth=0.25, label='UpperMargin')\nplt.plot(adf['timestamp'],adf['expectedValue'], color='darkgreen', linestyle='solid', linewidth=2, label='Expected Value')\nplt.plot(adf['timestamp'],adf['measureValue'], 'b', color='royalblue', linestyle='dotted', linewidth=2, label='Actual')\nplt.plot(adf['timestamp'],adf['expectedLowerValue'], color='black', linestyle='solid', linewidth=0.25, label='Lower Margin')\nplt.plot(adf_subset['timestamp'],adf_subset['measureValue'], 'ro', label = 'Anomaly')\nplt.legend()\nplt.title('RPM Anomalies with Confidence Intervals')\nplt.show()" + "import matplotlib.pyplot as plt\n", + "from pyspark.sql.functions import col\n", + "\n", + "adf = df_anomaly_single_device.toPandas()\n", + "adf_subset = df_anomaly_single_device.where(col(\"isAnomaly\") == 1).toPandas()\n", + "\n", + "plt.figure(figsize=(23, 8))\n", + "plt.plot(\n", + " adf[\"timestamp\"],\n", + " adf[\"expectedUpperValue\"],\n", + " color=\"darkred\",\n", + " linestyle=\"solid\",\n", + " linewidth=0.25,\n", + " label=\"UpperMargin\",\n", + ")\n", + "plt.plot(\n", + " adf[\"timestamp\"],\n", + " adf[\"expectedValue\"],\n", + " color=\"darkgreen\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"Expected Value\",\n", + ")\n", + "plt.plot(\n", + " adf[\"timestamp\"],\n", + " adf[\"measureValue\"],\n", + " \"b\",\n", + " color=\"royalblue\",\n", + " linestyle=\"dotted\",\n", + " linewidth=2,\n", + " label=\"Actual\",\n", + ")\n", + "plt.plot(\n", + " adf[\"timestamp\"],\n", + " adf[\"expectedLowerValue\"],\n", + " color=\"black\",\n", + " linestyle=\"solid\",\n", + " linewidth=0.25,\n", + " label=\"Lower Margin\",\n", + ")\n", + "plt.plot(adf_subset[\"timestamp\"], adf_subset[\"measureValue\"], \"ro\", label=\"Anomaly\")\n", + "plt.legend()\n", + "plt.title(\"RPM Anomalies with Confidence Intervals\")\n", + "plt.show()" ], "metadata": {}, "outputs": [], diff --git a/notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb b/notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb index 266e1af9b7..6ba54d803a 100644 --- a/notebooks/features/geospatial_services/GeospatialServices - Flooding 
Risk.ipynb +++ b/notebooks/features/geospatial_services/GeospatialServices - Flooding Risk.ipynb @@ -40,7 +40,7 @@ "retry_strategy = Retry(\n", " total=3,\n", " status_forcelist=[429, 500, 502, 503, 504],\n", - " method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"]\n", + " method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"],\n", ")\n", "adapter = HTTPAdapter(max_retries=retry_strategy)\n", "http = requests.Session()\n", @@ -49,42 +49,51 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['AZURE_MAPS_KEY'] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n", + "\n", + " os.environ[\"AZURE_MAPS_KEY\"] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n", " from notebookutils.visualization import display\n", "\n", "\n", - "\n", "# Azure Maps account key\n", - "azureMapsKey = os.environ[\"AZURE_MAPS_KEY\"] #Replace this with your azure maps key\n", + "azureMapsKey = os.environ[\"AZURE_MAPS_KEY\"] # Replace this with your azure maps key\n", "\n", "# Creator Geo prefix\n", "# for this example, assuming that the creator resource is created in `EAST US 2`.\n", "atlas_geo_prefix = \"us\"\n", "\n", "# Load flood plains data\n", - "flood_plain_geojson = http.get(\"https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson\").content\n", + "flood_plain_geojson = http.get(\n", + " \"https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson\"\n", + ").content\n", "\n", "# Upload this flood plains data to your maps/creator account. 
This is a Long-Running async operation and takes approximately 15~30 seconds to complete\n", - "r= http.post(f'https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}',\n", - " json=json.loads(flood_plain_geojson))\n", + "r = http.post(\n", + " f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}\",\n", + " json=json.loads(flood_plain_geojson),\n", + ")\n", "\n", "# Poll for resource upload completion\n", - "resource_location = r.headers.get('location')\n", + "resource_location = r.headers.get(\"location\")\n", "for _ in range(20):\n", - " resource = json.loads(http.get(f'{resource_location}&subscription-key={azureMapsKey}').content)\n", - " status = resource[\"status\"].lower()\n", - " if status == \"running\":\n", - " time.sleep(5) # wait in a polling loop\n", - " elif status == \"succeeded\":\n", - " break\n", - " else:\n", - " raise ValueError(\"Unknown status {}\".format(status))\n", + " resource = json.loads(\n", + " http.get(f\"{resource_location}&subscription-key={azureMapsKey}\").content\n", + " )\n", + " status = resource[\"status\"].lower()\n", + " if status == \"running\":\n", + " time.sleep(5) # wait in a polling loop\n", + " elif status == \"succeeded\":\n", + " break\n", + " else:\n", + " raise ValueError(\"Unknown status {}\".format(status))\n", "\n", "# Once the above operation returns a HTTP 201, get the user_data_id of the flood plains data, you uploaded to your map account.\n", - "user_data_id_resource_url = resource['resourceLocation']\n", - "user_data_id = json.loads(http.get(f'{user_data_id_resource_url}&subscription-key={azureMapsKey}').content)['udid']" + "user_data_id_resource_url = resource[\"resourceLocation\"]\n", + "user_data_id = json.loads(\n", + " http.get(f\"{user_data_id_resource_url}&subscription-key={azureMapsKey}\").content\n", + ")[\"udid\"]" ] }, { @@ -102,9 +111,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read\\\n", - " .option(\"header\", \"true\")\\\n", - " .csv(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/maps/KingCountyAddress.csv\")\n", + "data = spark.read.option(\"header\", \"true\").csv(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/maps/KingCountyAddress.csv\"\n", + ")\n", "\n", "# Visualize incoming schema\n", "print(\"Schema:\")\n", @@ -135,23 +144,39 @@ "from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch\n", "from synapse.ml.geospatial import *\n", "\n", + "\n", "def extract_location_fields(df):\n", " # Use this function to select only lat/lon columns into the dataframe\n", - " return df.select(col(\"*\"),\n", - " col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lat\").alias(\"Latitude\"),\n", - " col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lon\").alias(\"Longitude\")\n", + " return df.select(\n", + " col(\"*\"),\n", + " col(\"output.response.results\")\n", + " .getItem(0)\n", + " .getField(\"position\")\n", + " .getField(\"lat\")\n", + " .alias(\"Latitude\"),\n", + " col(\"output.response.results\")\n", + " .getItem(0)\n", + " .getField(\"position\")\n", + " .getField(\"lon\")\n", + " .alias(\"Longitude\"),\n", " ).drop(\"output\")\n", - " \n", + "\n", "\n", "# Azure Maps geocoder to enhance the dataframe with location data\n", - "geocoder = (AddressGeocoder()\n", + "geocoder = (\n", + " AddressGeocoder()\n", " .setSubscriptionKey(azureMapsKey)\n", " 
.setAddressCol(\"FullAddress\")\n", - " .setOutputCol(\"output\"))\n", + " .setOutputCol(\"output\")\n", + ")\n", "\n", "# Set up a fixed mini batch transformer to geocode addresses\n", - "batched_dataframe = geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(subset_data.coalesce(1)))\n", - "geocoded_addresses = extract_location_fields(FlattenBatch().transform(batched_dataframe))\n", + "batched_dataframe = geocoder.transform(\n", + " FixedMiniBatchTransformer().setBatchSize(10).transform(subset_data.coalesce(1))\n", + ")\n", + "geocoded_addresses = extract_location_fields(\n", + " FlattenBatch().transform(batched_dataframe)\n", + ")\n", "\n", "# Display the results\n", "display(geocoded_addresses)" @@ -174,22 +199,27 @@ "source": [ "def extract_point_in_polygon_result_fields(df):\n", " # Use this function to select only lat/lon columns into the dataframe\n", - " return df.select(col(\"*\"),\n", + " return df.select(\n", + " col(\"*\"),\n", " col(\"output.result.pointInPolygons\").alias(\"In Polygon\"),\n", - " col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\")\n", + " col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\"),\n", " ).drop(\"output\")\n", "\n", "\n", - "check_point_in_polygon = (CheckPointInPolygon()\n", + "check_point_in_polygon = (\n", + " CheckPointInPolygon()\n", " .setSubscriptionKey(azureMapsKey)\n", " .setGeography(atlas_geo_prefix)\n", " .setUserDataIdentifier(user_data_id)\n", " .setLatitudeCol(\"Latitude\")\n", " .setLongitudeCol(\"Longitude\")\n", - " .setOutputCol(\"output\"))\n", + " .setOutputCol(\"output\")\n", + ")\n", "\n", "\n", - "flood_plain_addresses = extract_point_in_polygon_result_fields(check_point_in_polygon.transform(geocoded_addresses))\n", + "flood_plain_addresses = extract_point_in_polygon_result_fields(\n", + " check_point_in_polygon.transform(geocoded_addresses)\n", + ")\n", "\n", "# Display the results\n", "display(flood_plain_addresses)" @@ -209,7 +239,9 @@ "metadata": {}, "outputs": [], "source": [ - "res = http.delete(f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\")" + "res = http.delete(\n", + " f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\"\n", + ")" ] } ], diff --git a/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb b/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb index d8652bfe94..7e701ebba3 100644 --- a/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb +++ b/notebooks/features/geospatial_services/GeospatialServices - Overview.ipynb @@ -54,7 +54,7 @@ "outputs": [], "source": [ "from pyspark.sql.functions import udf, col\n", - "from pyspark.sql.types import StructType,StructField, DoubleType\n", + "from pyspark.sql.types import StructType, StructField, DoubleType\n", "from pyspark.sql.functions import lit\n", "from pyspark.ml import PipelineModel\n", "from pyspark.sql.functions import col\n", @@ -67,7 +67,7 @@ "retry_strategy = Retry(\n", " total=3,\n", " status_forcelist=[429, 500, 502, 503, 504],\n", - " method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"]\n", + " method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"],\n", ")\n", "adapter = HTTPAdapter(max_retries=retry_strategy)\n", "http = requests.Session()\n", @@ -83,10 +83,12 @@ "source": [ "if 
os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['AZURE_MAPS_KEY'] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n", - " from notebookutils.visualization import display\n" + "\n", + " os.environ[\"AZURE_MAPS_KEY\"] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n", + " from notebookutils.visualization import display" ] }, { @@ -119,30 +121,51 @@ "source": [ "from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch\n", "\n", - "df = spark.createDataFrame([\n", - " (\"One, Microsoft Way, Redmond\",),\n", - " (\"400 Broad St, Seattle\",),\n", - " (\"350 5th Ave, New York\",),\n", - " (\"Pike Pl, Seattle\",),\n", - " (\"Champ de Mars, 5 Avenue Anatole France, 75007 Paris\",)\n", - "], [\"address\",])\n", + "df = spark.createDataFrame(\n", + " [\n", + " (\"One, Microsoft Way, Redmond\",),\n", + " (\"400 Broad St, Seattle\",),\n", + " (\"350 5th Ave, New York\",),\n", + " (\"Pike Pl, Seattle\",),\n", + " (\"Champ de Mars, 5 Avenue Anatole France, 75007 Paris\",),\n", + " ],\n", + " [\n", + " \"address\",\n", + " ],\n", + ")\n", "\n", "\n", "def extract_location_fields(df):\n", " # Use this function to select only lat/lon columns into the dataframe\n", - " return df.select(col(\"*\"),\n", - " col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lat\").alias(\"Latitude\"),\n", - " col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lon\").alias(\"Longitude\")\n", + " return df.select(\n", + " col(\"*\"),\n", + " col(\"output.response.results\")\n", + " .getItem(0)\n", + " .getField(\"position\")\n", + " .getField(\"lat\")\n", + " .alias(\"Latitude\"),\n", + " col(\"output.response.results\")\n", + " .getItem(0)\n", + " .getField(\"position\")\n", + " .getField(\"lon\")\n", + " .alias(\"Longitude\"),\n", " ).drop(\"output\")\n", "\n", + "\n", "# Run the Azure Maps geocoder to enhance the data with location data\n", - "geocoder = (AddressGeocoder()\n", + "geocoder = (\n", + " AddressGeocoder()\n", " .setSubscriptionKey(azureMapsKey)\n", " .setAddressCol(\"address\")\n", - " .setOutputCol(\"output\"))\n", + " .setOutputCol(\"output\")\n", + ")\n", "\n", "# Show the results of your text query in a table format\n", - "display(extract_location_fields(geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))))" + "display(\n", + " extract_location_fields(\n", + " geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))\n", + " )\n", + ")" ] }, { @@ -161,26 +184,46 @@ "outputs": [], "source": [ "# Create a dataframe that's tied to it's column names\n", - "df = spark.createDataFrame(((\n", - " (48.858561, 2.294911),\n", - " (47.639765, -122.127896),\n", - " (47.621028, -122.348170),\n", - " (47.734012, -122.102737)\n", - " )), StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]))\n", + "df = spark.createDataFrame(\n", + " (\n", + " (\n", + " (48.858561, 2.294911),\n", + " (47.639765, -122.127896),\n", + " (47.621028, -122.348170),\n", + " (47.734012, -122.102737),\n", + " )\n", + " ),\n", + " StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]),\n", + ")\n", "\n", "# Run the Azure Maps geocoder to enhance the data with location data\n", - "rev_geocoder = (ReverseAddressGeocoder()\n", + "rev_geocoder = 
(\n", + " ReverseAddressGeocoder()\n", " .setSubscriptionKey(azureMapsKey)\n", " .setLatitudeCol(\"lat\")\n", " .setLongitudeCol(\"lon\")\n", - " .setOutputCol(\"output\"))\n", + " .setOutputCol(\"output\")\n", + ")\n", "\n", "# Show the results of your text query in a table format\n", "\n", - "display(rev_geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df)).select(col(\"*\"),\n", - " col(\"output.response.addresses\").getItem(0).getField(\"address\").getField(\"freeformAddress\").alias(\"In Polygon\"),\n", - " col(\"output.response.addresses\").getItem(0).getField(\"address\").getField(\"country\").alias(\"Intersecting Polygons\")\n", - " ).drop(\"output\"))\n" + "display(\n", + " rev_geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))\n", + " .select(\n", + " col(\"*\"),\n", + " col(\"output.response.addresses\")\n", + " .getItem(0)\n", + " .getField(\"address\")\n", + " .getField(\"freeformAddress\")\n", + " .alias(\"In Polygon\"),\n", + " col(\"output.response.addresses\")\n", + " .getItem(0)\n", + " .getField(\"address\")\n", + " .getField(\"country\")\n", + " .alias(\"Intersecting Polygons\"),\n", + " )\n", + " .drop(\"output\")\n", + ")" ] }, { @@ -211,56 +254,47 @@ "import json\n", "\n", "# Choose a geography, you want your data to reside in.\n", - "# Allowed values \n", + "# Allowed values\n", "# us => North American datacenters\n", "# eu -> European datacenters\n", - "url_geo_prefix = 'us' \n", + "url_geo_prefix = \"us\"\n", "\n", "# Upload a geojson with polygons in them\n", - "r= http.post(f'https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}',\n", - " json= { \n", - " \"type\": \"FeatureCollection\", \n", + "r = http.post(\n", + " f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}\",\n", + " json={\n", + " \"type\": \"FeatureCollection\",\n", " \"features\": [\n", " {\n", " \"type\": \"Feature\",\n", - " \"properties\": { \"geometryId\": \"test_geometry\" },\n", + " \"properties\": {\"geometryId\": \"test_geometry\"},\n", " \"geometry\": {\n", " \"type\": \"Polygon\",\n", - " \"coordinates\":[\n", + " \"coordinates\": [\n", " [\n", - " [\n", - " -122.14290618896484,\n", - " 47.67856488312544\n", - " ],\n", - " [\n", - " -122.03956604003906,\n", - " 47.67856488312544\n", - " ],\n", - " [\n", - " -122.03956604003906,\n", - " 47.7483271435476\n", - " ],\n", - " [\n", - " -122.14290618896484,\n", - " 47.7483271435476\n", - " ],\n", - " [\n", - " -122.14290618896484,\n", - " 47.67856488312544\n", - " ]\n", + " [-122.14290618896484, 47.67856488312544],\n", + " [-122.03956604003906, 47.67856488312544],\n", + " [-122.03956604003906, 47.7483271435476],\n", + " [-122.14290618896484, 47.7483271435476],\n", + " [-122.14290618896484, 47.67856488312544],\n", " ]\n", - " ]\n", - " } \n", - " } \n", - " ] \n", - " })\n", + " ],\n", + " },\n", + " }\n", + " ],\n", + " },\n", + ")\n", "\n", - "long_running_operation = r.headers.get('location')\n", - "time.sleep(30) # Sometimes this may take upto 30 seconds\n", + "long_running_operation = r.headers.get(\"location\")\n", + "time.sleep(30) # Sometimes this may take upto 30 seconds\n", "print(f\"Status Code: {r.status_code}, Long Running Operation: {long_running_operation}\")\n", - "# This Operation completes in approximately 5 ~ 15 seconds \n", - "user_data_id_resource_url = 
json.loads(http.get(f'{long_running_operation}&subscription-key={azureMapsKey}').content)['resourceLocation']\n", - "user_data_id = json.loads(http.get(f'{user_data_id_resource_url}&subscription-key={azureMapsKey}').content)['udid']" + "# This Operation completes in approximately 5 ~ 15 seconds\n", + "user_data_id_resource_url = json.loads(\n", + " http.get(f\"{long_running_operation}&subscription-key={azureMapsKey}\").content\n", + ")[\"resourceLocation\"]\n", + "user_data_id = json.loads(\n", + " http.get(f\"{user_data_id_resource_url}&subscription-key={azureMapsKey}\").content\n", + ")[\"udid\"]" ] }, { @@ -277,27 +311,39 @@ "outputs": [], "source": [ "# Create a dataframe that's tied to it's column names\n", - "df = spark.createDataFrame(((\n", - " (48.858561, 2.294911),\n", - " (47.639765, -122.127896),\n", - " (47.621028, -122.348170),\n", - " (47.734012, -122.102737)\n", - " )), StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]))\n", + "df = spark.createDataFrame(\n", + " (\n", + " (\n", + " (48.858561, 2.294911),\n", + " (47.639765, -122.127896),\n", + " (47.621028, -122.348170),\n", + " (47.734012, -122.102737),\n", + " )\n", + " ),\n", + " StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]),\n", + ")\n", "\n", "# Run the Azure Maps geocoder to enhance the data with location data\n", - "check_point_in_polygon = (CheckPointInPolygon()\n", + "check_point_in_polygon = (\n", + " CheckPointInPolygon()\n", " .setSubscriptionKey(azureMapsKey)\n", " .setGeography(url_geo_prefix)\n", " .setUserDataIdentifier(user_data_id)\n", " .setLatitudeCol(\"lat\")\n", " .setLongitudeCol(\"lon\")\n", - " .setOutputCol(\"output\"))\n", + " .setOutputCol(\"output\")\n", + ")\n", "\n", "# Show the results of your text query in a table format\n", - "display(check_point_in_polygon.transform(df).select(col(\"*\"),\n", + "display(\n", + " check_point_in_polygon.transform(df)\n", + " .select(\n", + " col(\"*\"),\n", " col(\"output.result.pointInPolygons\").alias(\"In Polygon\"),\n", - " col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\")\n", - " ).drop(\"output\"))" + " col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\"),\n", + " )\n", + " .drop(\"output\")\n", + ")" ] }, { @@ -313,7 +359,9 @@ "metadata": {}, "outputs": [], "source": [ - "res = http.delete(f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\")" + "res = http.delete(\n", + " f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\"\n", + ")" ] } ], diff --git a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb index 5d8baa97e9..780ce7725f 100644 --- a/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb +++ b/notebooks/features/isolation_forest/IsolationForest - Multivariate Anomaly Detection.ipynb @@ -82,6 +82,7 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display" ], @@ -120,14 +121,26 @@ "outputs": [], "source": [ "# Table inputs\n", - "timestampColumn = \"timestamp\" # str: the name of the timestamp column in the table\n", - "inputCols 
= ['sensor_1', 'sensor_2', 'sensor_3'] # list(str): the names of the input variables \n", + "timestampColumn = \"timestamp\" # str: the name of the timestamp column in the table\n", + "inputCols = [\n", + " \"sensor_1\",\n", + " \"sensor_2\",\n", + " \"sensor_3\",\n", + "] # list(str): the names of the input variables\n", "\n", - "# Training Start time, and number of days to use for training: \n", - "trainingStartTime = \"2022-02-24T06:00:00Z\" # datetime: datetime for when to start the training\n", - "trainingEndTime = \"2022-03-08T23:55:00Z\" # datetime: datetime for when to end the training\n", - "inferenceStartTime = \"2022-03-09T09:30:00Z\" # datetime: datetime for when to start the training\n", - "inferenceEndTime = \"2022-03-20T23:55:00Z\" # datetime: datetime for when to end the training\n", + "# Training Start time, and number of days to use for training:\n", + "trainingStartTime = (\n", + " \"2022-02-24T06:00:00Z\" # datetime: datetime for when to start the training\n", + ")\n", + "trainingEndTime = (\n", + " \"2022-03-08T23:55:00Z\" # datetime: datetime for when to end the training\n", + ")\n", + "inferenceStartTime = (\n", + " \"2022-03-09T09:30:00Z\" # datetime: datetime for when to start the training\n", + ")\n", + "inferenceEndTime = (\n", + " \"2022-03-20T23:55:00Z\" # datetime: datetime for when to end the training\n", + ")\n", "\n", "# Isolation Forest parameters\n", "contamination = 0.021\n", @@ -169,7 +182,13 @@ }, "outputs": [], "source": [ - "df = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/generated_sample_mvad_data.csv\")" + "df = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", \"true\")\n", + " .load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/generated_sample_mvad_data.csv\"\n", + " )\n", + ")" ] }, { @@ -200,14 +219,13 @@ "outputs": [], "source": [ "df = (\n", - " df\n", - " .orderBy(timestampColumn)\n", + " df.orderBy(timestampColumn)\n", " .withColumn(\"timestamp\", F.date_format(timestampColumn, \"yyyy-MM-dd'T'HH:mm:ss'Z'\"))\n", " .withColumn(\"sensor_1\", F.col(\"sensor_1\").cast(DoubleType()))\n", " .withColumn(\"sensor_2\", F.col(\"sensor_2\").cast(DoubleType()))\n", " .withColumn(\"sensor_3\", F.col(\"sensor_3\").cast(DoubleType()))\n", - " .drop('_c5')\n", - " )\n", + " .drop(\"_c5\")\n", + ")\n", "\n", "display(df)" ] @@ -240,7 +258,10 @@ "outputs": [], "source": [ "# filter to data with timestamps within the training window\n", - "df_train = df.filter((F.col(timestampColumn) >= trainingStartTime) & (F.col(timestampColumn) <= trainingEndTime))\n", + "df_train = df.filter(\n", + " (F.col(timestampColumn) >= trainingStartTime)\n", + " & (F.col(timestampColumn) <= trainingEndTime)\n", + ")\n", "display(df_train)" ] }, @@ -272,7 +293,10 @@ "outputs": [], "source": [ "# filter to data with timestamps within the inference window\n", - "df_test = df.filter((F.col(timestampColumn) >= inferenceStartTime) & (F.col(timestampColumn) <= inferenceEndTime))\n", + "df_test = df.filter(\n", + " (F.col(timestampColumn) >= inferenceStartTime)\n", + " & (F.col(timestampColumn) <= inferenceEndTime)\n", + ")\n", "display(df_test)" ] }, @@ -303,17 +327,19 @@ }, "outputs": [], "source": [ - "isolationForest = (IsolationForest()\n", - " .setNumEstimators(num_estimators)\n", - " .setBootstrap(False)\n", - " .setMaxSamples(max_samples)\n", - " .setMaxFeatures(max_features)\n", - " .setFeaturesCol(\"features\")\n", - " .setPredictionCol(\"predictedLabel\")\n", - " 
.setScoreCol(\"outlierScore\")\n", - " .setContamination(contamination)\n", - " .setContaminationError(0.01 * contamination)\n", - " .setRandomSeed(1))" + "isolationForest = (\n", + " IsolationForest()\n", + " .setNumEstimators(num_estimators)\n", + " .setBootstrap(False)\n", + " .setMaxSamples(max_samples)\n", + " .setMaxFeatures(max_features)\n", + " .setFeaturesCol(\"features\")\n", + " .setPredictionCol(\"predictedLabel\")\n", + " .setScoreCol(\"outlierScore\")\n", + " .setContamination(contamination)\n", + " .setContaminationError(0.01 * contamination)\n", + " .setRandomSeed(1)\n", + ")" ] }, { @@ -350,7 +376,9 @@ " va = VectorAssembler(inputCols=inputCols, outputCol=\"features\")\n", " pipeline = Pipeline(stages=[va, isolationForest])\n", " model = pipeline.fit(df_train)\n", - " mlflow.spark.log_model(model, artifact_path=artifact_path,registered_model_name=model_name)" + " mlflow.spark.log_model(\n", + " model, artifact_path=artifact_path, registered_model_name=model_name\n", + " )" ] }, { @@ -457,14 +485,14 @@ }, "outputs": [], "source": [ - "# Here, we create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column \n", + "# Here, we create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column\n", "# we are trying to explain. In this case, we are trying to explain the \"outlierScore\" output.\n", "shap = TabularSHAP(\n", " inputCols=inputCols,\n", " outputCol=\"shapValues\",\n", " model=model,\n", " targetCol=\"outlierScore\",\n", - " backgroundData=F.broadcast(df_test)\n", + " backgroundData=F.broadcast(df_test),\n", ")" ] }, @@ -522,13 +550,14 @@ }, "outputs": [], "source": [ - "# Here, we extract the SHAP values, the original features and the outlier score column. Then we convert it to a Pandas DataFrame for visualization. \n", - "# For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset), \n", + "# Here, we extract the SHAP values, the original features and the outlier score column. 
Then we convert it to a Pandas DataFrame for visualization.\n", + "# For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset),\n", "# and each of the following elements represents the SHAP values for each feature\n", "shaps = (\n", - " shap_df\n", - " .withColumn(\"shapValues\", vec2array(F.col(\"shapValues\").getItem(0)))\n", - " .select([\"shapValues\", \"outlierScore\"] + inputCols + [timestampColumn, \"prediction\"])\n", + " shap_df.withColumn(\"shapValues\", vec2array(F.col(\"shapValues\").getItem(0)))\n", + " .select(\n", + " [\"shapValues\", \"outlierScore\"] + inputCols + [timestampColumn, \"prediction\"]\n", + " )\n", " .withColumn(\"sensor_1_localimp\", F.col(\"shapValues\")[1])\n", " .withColumn(\"sensor_2_localimp\", F.col(\"shapValues\")[2])\n", " .withColumn(\"sensor_3_localimp\", F.col(\"shapValues\")[3])\n", @@ -565,7 +594,7 @@ }, "outputs": [], "source": [ - "local_importance_values = shaps_local[['shapValues']]\n", + "local_importance_values = shaps_local[[\"shapValues\"]]\n", "eval_data = shaps_local[inputCols]" ] }, @@ -634,8 +663,11 @@ "outputs": [], "source": [ "from interpret_community.adapter import ExplanationAdapter\n", + "\n", "adapter = ExplanationAdapter(inputCols, classification=False)\n", - "global_explanation = adapter.create_global(converted_importance_values, eval_data, expected_values=bias)" + "global_explanation = adapter.create_global(\n", + " converted_importance_values, eval_data, expected_values=bias\n", + ")" ] }, { @@ -687,13 +719,20 @@ "source": [ "# Defining a wrapper class with predict method for creating the Explanation Dashboard\n", "\n", + "\n", "class wrapper(object):\n", " def __init__(self, model):\n", " self.model = model\n", - " \n", + "\n", " def predict(self, data):\n", " sparkdata = spark.createDataFrame(data)\n", - " return model.transform(sparkdata).select('outlierScore').toPandas().values.flatten().tolist()" + " return (\n", + " model.transform(sparkdata)\n", + " .select(\"outlierScore\")\n", + " .toPandas()\n", + " .values.flatten()\n", + " .tolist()\n", + " )" ] }, { @@ -733,50 +772,119 @@ "def visualize(rdf):\n", " anoms = list(rdf[\"prediction\"] == 1)\n", "\n", - " fig = plt.figure(figsize=(26,12))\n", + " fig = plt.figure(figsize=(26, 12))\n", "\n", " ax = fig.add_subplot(611)\n", " ax.title.set_text(f\"Multivariate Anomaly Detection Results\")\n", - " ax.plot(rdf[timestampColumn],rdf[\"sensor_1\"], color='tab:orange', linestyle='solid', linewidth=2, label=\"sensor_1\")\n", - " ax.grid(axis='y')\n", + " ax.plot(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_1\"],\n", + " color=\"tab:orange\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_1\",\n", + " )\n", + " ax.grid(axis=\"y\")\n", " _, _, ymin, ymax = plt.axis()\n", - " ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n", - " ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - " ax.set_ylabel('sensor1_value')\n", + " ax.vlines(\n", + " rdf[timestampColumn][anoms],\n", + " ymin=ymin,\n", + " ymax=ymax,\n", + " color=\"tab:red\",\n", + " alpha=0.2,\n", + " linewidth=6,\n", + " )\n", + " ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + " ax.set_ylabel(\"sensor1_value\")\n", " ax.legend()\n", "\n", " ax = fig.add_subplot(612, sharex=ax)\n", - " ax.plot(rdf[timestampColumn],rdf[\"sensor_2\"], color='tab:green', linestyle='solid', linewidth=2, label=\"sensor_2\")\n", - 
" ax.grid(axis='y')\n", + " ax.plot(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_2\"],\n", + " color=\"tab:green\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_2\",\n", + " )\n", + " ax.grid(axis=\"y\")\n", " _, _, ymin, ymax = plt.axis()\n", - " ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n", - " ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - " ax.set_ylabel('sensor2_value')\n", + " ax.vlines(\n", + " rdf[timestampColumn][anoms],\n", + " ymin=ymin,\n", + " ymax=ymax,\n", + " color=\"tab:red\",\n", + " alpha=0.2,\n", + " linewidth=6,\n", + " )\n", + " ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + " ax.set_ylabel(\"sensor2_value\")\n", " ax.legend()\n", "\n", " ax = fig.add_subplot(613, sharex=ax)\n", - " ax.plot(rdf[timestampColumn],rdf[\"sensor_3\"], color='tab:purple', linestyle='solid', linewidth=2, label=\"sensor_3\")\n", - " ax.grid(axis='y')\n", + " ax.plot(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_3\"],\n", + " color=\"tab:purple\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"sensor_3\",\n", + " )\n", + " ax.grid(axis=\"y\")\n", " _, _, ymin, ymax = plt.axis()\n", - " ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n", - " ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - " ax.set_ylabel('sensor3_value')\n", + " ax.vlines(\n", + " rdf[timestampColumn][anoms],\n", + " ymin=ymin,\n", + " ymax=ymax,\n", + " color=\"tab:red\",\n", + " alpha=0.2,\n", + " linewidth=6,\n", + " )\n", + " ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + " ax.set_ylabel(\"sensor3_value\")\n", " ax.legend()\n", "\n", " ax = fig.add_subplot(614, sharex=ax)\n", - " ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - " ax.plot(rdf[timestampColumn],rdf['outlierScore'], color='black', linestyle='solid', linewidth=2, label='Outlier score')\n", - " ax.set_ylabel('outlier score')\n", - " ax.grid(axis='y')\n", + " ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + " ax.plot(\n", + " rdf[timestampColumn],\n", + " rdf[\"outlierScore\"],\n", + " color=\"black\",\n", + " linestyle=\"solid\",\n", + " linewidth=2,\n", + " label=\"Outlier score\",\n", + " )\n", + " ax.set_ylabel(\"outlier score\")\n", + " ax.grid(axis=\"y\")\n", " ax.legend()\n", - " \n", + "\n", " ax = fig.add_subplot(615, sharex=ax)\n", - " ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n", - " ax.bar(rdf[timestampColumn],rdf['sensor_1_localimp'].abs(), width=2, color='tab:orange', label=\"sensor_1\")\n", - " ax.bar(rdf[timestampColumn],rdf['sensor_2_localimp'].abs(), width=2, color='tab:green', label=\"sensor_2\", bottom=rdf[\"sensor_1_localimp\"].abs())\n", - " ax.bar(rdf[timestampColumn],rdf['sensor_3_localimp'].abs(), width=2, color='tab:purple', label=\"sensor_3\", bottom=rdf[\"sensor_1_localimp\"].abs()+rdf[\"sensor_2_localimp\"].abs())\n", - " ax.set_ylabel('Contribution scores')\n", - " ax.grid(axis='y')\n", + " ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n", + " ax.bar(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_1_localimp\"].abs(),\n", + " width=2,\n", + " color=\"tab:orange\",\n", + " label=\"sensor_1\",\n", + " )\n", + " ax.bar(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_2_localimp\"].abs(),\n", + " width=2,\n", + " 
color=\"tab:green\",\n", + " label=\"sensor_2\",\n", + " bottom=rdf[\"sensor_1_localimp\"].abs(),\n", + " )\n", + " ax.bar(\n", + " rdf[timestampColumn],\n", + " rdf[\"sensor_3_localimp\"].abs(),\n", + " width=2,\n", + " color=\"tab:purple\",\n", + " label=\"sensor_3\",\n", + " bottom=rdf[\"sensor_1_localimp\"].abs() + rdf[\"sensor_2_localimp\"].abs(),\n", + " )\n", + " ax.set_ylabel(\"Contribution scores\")\n", + " ax.grid(axis=\"y\")\n", " ax.legend()\n", "\n", " plt.show()" @@ -824,7 +932,7 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(10,7))\n", + "plt.figure(figsize=(10, 7))\n", "plt.bar(inputCols, global_explanation.global_importance_values)\n", "plt.ylabel(\"global importance values\")" ] @@ -860,6 +968,7 @@ "source": [ "# View the model explanation in the ExplanationDashboard\n", "from raiwidgets import ExplanationDashboard\n", + "\n", "ExplanationDashboard(global_explanation, wrapper(model), dataset=eval_data)" ] } diff --git a/notebooks/features/lightgbm/LightGBM - Overview.ipynb b/notebooks/features/lightgbm/LightGBM - Overview.ipynb index b565916c61..6d444cf8cd 100644 --- a/notebooks/features/lightgbm/LightGBM - Overview.ipynb +++ b/notebooks/features/lightgbm/LightGBM - Overview.ipynb @@ -88,8 +88,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ], "outputs": [], "metadata": {} @@ -98,10 +99,14 @@ "cell_type": "code", "execution_count": null, "source": [ - "df = spark.read.format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .option(\"inferSchema\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n", + "df = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", True)\n", + " .option(\"inferSchema\", True)\n", + " .load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\"\n", + " )\n", + ")\n", "# print dataset size\n", "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", @@ -147,13 +152,11 @@ "execution_count": null, "source": [ "from pyspark.ml.feature import VectorAssembler\n", + "\n", "feature_cols = df.columns[1:]\n", - "featurizer = VectorAssembler(\n", - " inputCols=feature_cols,\n", - " outputCol='features'\n", - ")\n", - "train_data = featurizer.transform(train)['Bankrupt?', 'features']\n", - "test_data = featurizer.transform(test)['Bankrupt?', 'features']" + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "train_data = featurizer.transform(train)[\"Bankrupt?\", \"features\"]\n", + "test_data = featurizer.transform(test)[\"Bankrupt?\", \"features\"]" ], "outputs": [], "metadata": {} @@ -186,7 +189,10 @@ "execution_count": null, "source": [ "from synapse.ml.lightgbm import LightGBMClassifier\n", - "model = LightGBMClassifier(objective=\"binary\", featuresCol=\"features\", labelCol=\"Bankrupt?\", isUnbalance=True)" + "\n", + "model = LightGBMClassifier(\n", + " objective=\"binary\", featuresCol=\"features\", labelCol=\"Bankrupt?\", isUnbalance=True\n", + ")" ], "outputs": [], "metadata": {} @@ -215,10 +221,12 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " model.saveNativeModel(\"/models/lgbmclassifier.model\")\n", - " model = 
LightGBMClassificationModel.loadNativeModelFromFile(\"/models/lgbmclassifier.model\")\n", + " model = LightGBMClassificationModel.loadNativeModelFromFile(\n", + " \"/models/lgbmclassifier.model\"\n", + " )\n", "else:\n", " model.saveNativeModel(\"/lgbmclassifier.model\")\n", - " model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmclassifier.model\")\n" + " model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmclassifier.model\")" ], "outputs": [], "metadata": {} @@ -238,22 +246,24 @@ "import matplotlib.pyplot as plt\n", "\n", "feature_importances = model.getFeatureImportances()\n", - "fi = pd.Series(feature_importances,index = feature_cols)\n", - "fi = fi.sort_values(ascending = True)\n", + "fi = pd.Series(feature_importances, index=feature_cols)\n", + "fi = fi.sort_values(ascending=True)\n", "f_index = fi.index\n", "f_values = fi.values\n", - " \n", - "# print feature importances \n", - "print ('f_index:',f_index)\n", - "print ('f_values:',f_values)\n", + "\n", + "# print feature importances\n", + "print(\"f_index:\", f_index)\n", + "print(\"f_values:\", f_values)\n", "\n", "# plot\n", "x_index = list(range(len(fi)))\n", - "x_index = [x/len(fi) for x in x_index]\n", - "plt.rcParams['figure.figsize'] = (20,20)\n", - "plt.barh(x_index,f_values,height = 0.028 ,align=\"center\",color = 'tan',tick_label=f_index)\n", - "plt.xlabel('importances')\n", - "plt.ylabel('features')\n", + "x_index = [x / len(fi) for x in x_index]\n", + "plt.rcParams[\"figure.figsize\"] = (20, 20)\n", + "plt.barh(\n", + " x_index, f_values, height=0.028, align=\"center\", color=\"tan\", tick_label=f_index\n", + ")\n", + "plt.xlabel(\"importances\")\n", + "plt.ylabel(\"features\")\n", "plt.show()" ], "outputs": [], @@ -281,7 +291,12 @@ "execution_count": null, "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", labelCol='Bankrupt?', scoredLabelsCol='prediction').transform(predictions)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"classification\",\n", + " labelCol=\"Bankrupt?\",\n", + " scoredLabelsCol=\"prediction\",\n", + ").transform(predictions)\n", "display(metrics)" ], "outputs": [], @@ -309,8 +324,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "triazines = spark.read.format(\"libsvm\")\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" + "triazines = spark.read.format(\"libsvm\").load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\"\n", + ")" ], "outputs": [], "metadata": {} @@ -356,10 +372,10 @@ "execution_count": null, "source": [ "from synapse.ml.lightgbm import LightGBMRegressor\n", - "model = LightGBMRegressor(objective='quantile',\n", - " alpha=0.2,\n", - " learningRate=0.3,\n", - " numLeaves=31).fit(train)" + "\n", + "model = LightGBMRegressor(\n", + " objective=\"quantile\", alpha=0.2, learningRate=0.3, numLeaves=31\n", + ").fit(train)" ], "outputs": [], "metadata": {} @@ -395,10 +411,10 @@ "execution_count": null, "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", - " labelCol='label',\n", - " scoresCol='prediction') \\\n", - " .transform(scoredData)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"regression\", labelCol=\"label\", scoresCol=\"prediction\"\n", + ").transform(scoredData)\n", "display(metrics)" ], "outputs": [], @@ -422,7 +438,9 
@@ "cell_type": "code", "execution_count": null, "source": [ - "df = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet\")\n", + "df = spark.read.format(\"parquet\").load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet\"\n", + ")\n", "# print some basic info\n", "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", @@ -445,20 +463,22 @@ "source": [ "from synapse.ml.lightgbm import LightGBMRanker\n", "\n", - "features_col = 'features'\n", - "query_col = 'query'\n", - "label_col = 'labels'\n", - "lgbm_ranker = LightGBMRanker(labelCol=label_col,\n", - " featuresCol=features_col,\n", - " groupCol=query_col,\n", - " predictionCol='preds',\n", - " leafPredictionCol='leafPreds',\n", - " featuresShapCol='importances',\n", - " repartitionByGroupingColumn=True,\n", - " numLeaves=32,\n", - " numIterations=200,\n", - " evalAt=[1,3,5],\n", - " metric='ndcg')" + "features_col = \"features\"\n", + "query_col = \"query\"\n", + "label_col = \"labels\"\n", + "lgbm_ranker = LightGBMRanker(\n", + " labelCol=label_col,\n", + " featuresCol=features_col,\n", + " groupCol=query_col,\n", + " predictionCol=\"preds\",\n", + " leafPredictionCol=\"leafPreds\",\n", + " featuresShapCol=\"importances\",\n", + " repartitionByGroupingColumn=True,\n", + " numLeaves=32,\n", + " numIterations=200,\n", + " evalAt=[1, 3, 5],\n", + " metric=\"ndcg\",\n", + ")" ], "outputs": [], "metadata": {} @@ -483,7 +503,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "dt = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet\")\n", + "dt = spark.read.format(\"parquet\").load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet\"\n", + ")\n", "predictions = lgbm_ranker_model.transform(dt)\n", "predictions.limit(10).toPandas()" ], diff --git a/notebooks/features/onnx/ONNX - Inference on Spark.ipynb b/notebooks/features/onnx/ONNX - Inference on Spark.ipynb index 22571ae22c..d924e7b857 100644 --- a/notebooks/features/onnx/ONNX - Inference on Spark.ipynb +++ b/notebooks/features/onnx/ONNX - Inference on Spark.ipynb @@ -30,6 +30,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display" ], @@ -46,10 +47,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = spark.read.format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .option(\"inferSchema\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n", + "df = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", True)\n", + " .option(\"inferSchema\", True)\n", + " .load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\"\n", + " )\n", + ")\n", "\n", "display(df)" ] @@ -71,29 +76,26 @@ "from synapse.ml.lightgbm import LightGBMClassifier\n", "\n", "feature_cols = df.columns[1:]\n", - "featurizer = VectorAssembler(\n", - " inputCols=feature_cols,\n", - " outputCol='features'\n", - ")\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", "\n", - "train_data = featurizer.transform(df)['Bankrupt?', 'features']\n", + "train_data = featurizer.transform(df)[\"Bankrupt?\", \"features\"]\n", "\n", "model = (\n", - " 
LightGBMClassifier(featuresCol=\"features\", labelCol=\"Bankrupt?\")\n", - " .setEarlyStoppingRound(300)\n", - " .setLambdaL1(0.5)\n", - " .setNumIterations(1000)\n", - " .setNumThreads(-1)\n", - " .setMaxDeltaStep(0.5)\n", - " .setNumLeaves(31)\n", - " .setMaxDepth(-1)\n", - " .setBaggingFraction(0.7)\n", - " .setFeatureFraction(0.7)\n", - " .setBaggingFreq(2)\n", - " .setObjective(\"binary\")\n", - " .setIsUnbalance(True)\n", - " .setMinSumHessianInLeaf(20)\n", - " .setMinGainToSplit(0.01)\n", + " LightGBMClassifier(featuresCol=\"features\", labelCol=\"Bankrupt?\")\n", + " .setEarlyStoppingRound(300)\n", + " .setLambdaL1(0.5)\n", + " .setNumIterations(1000)\n", + " .setNumThreads(-1)\n", + " .setMaxDeltaStep(0.5)\n", + " .setNumLeaves(31)\n", + " .setMaxDepth(-1)\n", + " .setBaggingFraction(0.7)\n", + " .setFeatureFraction(0.7)\n", + " .setBaggingFreq(2)\n", + " .setObjective(\"binary\")\n", + " .setIsUnbalance(True)\n", + " .setMinSumHessianInLeaf(20)\n", + " .setMinGainToSplit(0.01)\n", ")\n", "\n", "model = model.fit(train_data)" @@ -115,12 +117,17 @@ "import lightgbm as lgb\n", "from lightgbm import Booster, LGBMClassifier\n", "\n", + "\n", "def convertModel(lgbm_model: LGBMClassifier or Booster, input_size: int) -> bytes:\n", - " from onnxmltools.convert import convert_lightgbm\n", - " from onnxconverter_common.data_types import FloatTensorType\n", - " initial_types = [(\"input\", FloatTensorType([-1, input_size]))]\n", - " onnx_model = convert_lightgbm(lgbm_model, initial_types=initial_types, target_opset=9)\n", - " return onnx_model.SerializeToString()\n", + " from onnxmltools.convert import convert_lightgbm\n", + " from onnxconverter_common.data_types import FloatTensorType\n", + "\n", + " initial_types = [(\"input\", FloatTensorType([-1, input_size]))]\n", + " onnx_model = convert_lightgbm(\n", + " lgbm_model, initial_types=initial_types, target_opset=9\n", + " )\n", + " return onnx_model.SerializeToString()\n", + "\n", "\n", "booster_model_str = model.getLightGBMBooster().modelStr().get()\n", "booster = lgb.Booster(model_str=booster_model_str)\n", @@ -162,8 +169,7 @@ "outputs": [], "source": [ "onnx_ml = (\n", - " onnx_ml\n", - " .setDeviceType(\"CPU\")\n", + " onnx_ml.setDeviceType(\"CPU\")\n", " .setFeedDict({\"input\": \"features\"})\n", " .setFetchDict({\"probability\": \"probabilities\", \"prediction\": \"label\"})\n", " .setMiniBatchSize(5000)\n", @@ -194,7 +200,14 @@ "cols = list(map(str, testPdf.columns))\n", "testDf = spark.createDataFrame(testPdf)\n", "testDf = testDf.union(testDf).repartition(200)\n", - "testDf = VectorAssembler().setInputCols(cols).setOutputCol(\"features\").transform(testDf).drop(*cols).cache()\n", + "testDf = (\n", + " VectorAssembler()\n", + " .setInputCols(cols)\n", + " .setOutputCol(\"features\")\n", + " .transform(testDf)\n", + " .drop(*cols)\n", + " .cache()\n", + ")\n", "\n", "display(onnx_ml.transform(testDf))" ] diff --git a/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb b/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb index c22be723ee..d91f7ca98c 100644 --- a/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb +++ b/notebooks/features/opencv/OpenCV - Pipeline Image Transformations.ipynb @@ -27,8 +27,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", "\n", "import synapse.ml\n", @@ -58,8 
+60,14 @@ "outputs": [], "source": [ "import time\n", + "\n", "imageStream = spark.readStream.image().load(imageDir)\n", - "query = imageStream.select(\"image.height\").writeStream.format(\"memory\").queryName(\"heights\").start()\n", + "query = (\n", + " imageStream.select(\"image.height\")\n", + " .writeStream.format(\"memory\")\n", + " .queryName(\"heights\")\n", + " .start()\n", + ")\n", "time.sleep(3)\n", "print(\"Streaming query activity: {}\".format(query.isActive))" ] @@ -99,6 +107,7 @@ "outputs": [], "source": [ "from py4j.protocol import Py4JJavaError\n", + "\n", "try:\n", " query.stop()\n", "except Py4JJavaError as e:\n", @@ -123,16 +132,17 @@ "source": [ "from PIL import Image\n", "import matplotlib.pyplot as plt\n", - "data = images.take(3) # take first three rows of the dataframe\n", - "im = data[2][0] # the image is in the first column of a given row\n", + "\n", + "data = images.take(3) # take first three rows of the dataframe\n", + "im = data[2][0] # the image is in the first column of a given row\n", "\n", "print(\"image type: {}, number of fields: {}\".format(type(im), len(im)))\n", "print(\"image path: {}\".format(im.origin))\n", "print(\"height: {}, width: {}, OpenCV type: {}\".format(im.height, im.width, im.mode))\n", "\n", - "arr = toNDArray(im) # convert to numpy array\n", + "arr = toNDArray(im) # convert to numpy array\n", "print(images.count())\n", - "plt.imshow(Image.fromarray(arr, \"RGB\")) # display the image inside notebook\n" + "plt.imshow(Image.fromarray(arr, \"RGB\")) # display the image inside notebook" ] }, { @@ -151,15 +161,17 @@ "source": [ "from synapse.ml.opencv import ImageTransformer\n", "\n", - "tr = (ImageTransformer() # images are resized and then cropped\n", - " .setOutputCol(\"transformed\")\n", - " .resize(size=(200, 200))\n", - " .crop(0, 0, height = 180, width = 180) )\n", + "tr = (\n", + " ImageTransformer() # images are resized and then cropped\n", + " .setOutputCol(\"transformed\")\n", + " .resize(size=(200, 200))\n", + " .crop(0, 0, height=180, width=180)\n", + ")\n", "\n", "small = tr.transform(images).select(\"transformed\")\n", "\n", - "im = small.take(3)[2][0] # take third image\n", - "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook" + "im = small.take(3)[2][0] # take third image\n", + "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook" ] }, { @@ -180,17 +192,19 @@ "from pyspark.sql.functions import udf\n", "from synapse.ml.opencv import ImageSchema, toNDArray, toImage\n", "\n", - "def u(row):\n", - " array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n", - " array[:,:,2] = 0\n", - " return toImage(array) # numpy array back to Spark Row structure\n", "\n", - "noBlueUDF = udf(u,ImageSchema)\n", + "def u(row):\n", + " array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n", + " array[:, :, 2] = 0\n", + " return toImage(array) # numpy array back to Spark Row structure\n", + "\n", + "\n", + "noBlueUDF = udf(u, ImageSchema)\n", "\n", "noblue = small.withColumn(\"noblue\", noBlueUDF(small[\"transformed\"])).select(\"noblue\")\n", "\n", - "im = noblue.take(3)[2][0] # take second image\n", - "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook" + "im = noblue.take(3)[2][0] # take second image\n", + "plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook" ] }, { diff --git a/notebooks/features/other/AzureSearchIndex - Met Artworks.ipynb 
b/notebooks/features/other/AzureSearchIndex - Met Artworks.ipynb index 6cf965cd59..b8addd44c7 100644 --- a/notebooks/features/other/AzureSearchIndex - Met Artworks.ipynb +++ b/notebooks/features/other/AzureSearchIndex - Met Artworks.ipynb @@ -34,10 +34,14 @@ "source": [ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", - " os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")" + "\n", + " os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n", + " os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"azure-search-key\"\n", + " )" ], "outputs": [], "metadata": {} @@ -46,8 +50,8 @@ "cell_type": "code", "execution_count": 4, "source": [ - "VISION_API_KEY = os.environ['VISION_API_KEY']\n", - "AZURE_SEARCH_KEY = os.environ['AZURE_SEARCH_KEY']\n", + "VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n", + "AZURE_SEARCH_KEY = os.environ[\"AZURE_SEARCH_KEY\"]\n", "search_service = \"mmlspark-azure-search\"\n", "search_index = \"test\"" ], @@ -60,14 +64,15 @@ "cell_type": "code", "execution_count": 5, "source": [ - "data = spark.read\\\n", - " .format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\\\n", - " .withColumn(\"searchAction\", lit(\"upload\"))\\\n", - " .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array\"))\\\n", - " .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array\"))\\\n", - " .limit(25)" + "data = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", True)\n", + " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\n", + " .withColumn(\"searchAction\", lit(\"upload\"))\n", + " .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array\"))\n", + " .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array\"))\n", + " .limit(25)\n", + ")" ], "outputs": [], "metadata": { @@ -88,18 +93,25 @@ "from synapse.ml.cognitive import AnalyzeImage\n", "from synapse.ml.stages import SelectColumns\n", "\n", - "#define pipeline\n", - "describeImage = (AnalyzeImage()\n", - " .setSubscriptionKey(VISION_API_KEY)\n", - " .setLocation(\"eastus\")\n", - " .setImageUrlCol(\"PrimaryImageUrl\")\n", - " .setOutputCol(\"RawImageDescription\")\n", - " .setErrorCol(\"Errors\")\n", - " .setVisualFeatures([\"Categories\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\n", - " .setConcurrency(5))\n", + "# define pipeline\n", + "describeImage = (\n", + " AnalyzeImage()\n", + " .setSubscriptionKey(VISION_API_KEY)\n", + " .setLocation(\"eastus\")\n", + " .setImageUrlCol(\"PrimaryImageUrl\")\n", + " .setOutputCol(\"RawImageDescription\")\n", + " .setErrorCol(\"Errors\")\n", + " .setVisualFeatures(\n", + " [\"Categories\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"]\n", + " )\n", + " .setConcurrency(5)\n", + ")\n", "\n", - "df2 = describeImage.transform(data)\\\n", - " .select(\"*\", \"RawImageDescription.*\").drop(\"Errors\", \"RawImageDescription\")" + "df2 = (\n", + " describeImage.transform(data)\n", + " .select(\"*\", \"RawImageDescription.*\")\n", + " .drop(\"Errors\", \"RawImageDescription\")\n", + ")" 
], "outputs": [], "metadata": { @@ -125,12 +137,14 @@ "execution_count": 10, "source": [ "from synapse.ml.cognitive import *\n", + "\n", "df2.writeToAzureSearch(\n", - " subscriptionKey=AZURE_SEARCH_KEY,\n", - " actionCol=\"searchAction\",\n", - " serviceName=search_service,\n", - " indexName=search_index,\n", - " keyCol=\"ObjectID\")" + " subscriptionKey=AZURE_SEARCH_KEY,\n", + " actionCol=\"searchAction\",\n", + " serviceName=search_service,\n", + " indexName=search_index,\n", + " keyCol=\"ObjectID\",\n", + ")" ], "outputs": [], "metadata": { @@ -148,8 +162,12 @@ "cell_type": "code", "execution_count": 12, "source": [ - "url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n", - "requests.post(url, json={\"search\": \"Glass\"}, headers = {\"api-key\": AZURE_SEARCH_KEY}).json()" + "url = \"https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06\".format(\n", + " search_service, search_index\n", + ")\n", + "requests.post(\n", + " url, json={\"search\": \"Glass\"}, headers={\"api-key\": AZURE_SEARCH_KEY}\n", + ").json()" ], "outputs": [], "metadata": { diff --git a/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb b/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb index cb9a907533..eea02cfe70 100644 --- a/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb +++ b/notebooks/features/other/ConditionalKNN - Exploring Art Across Cultures.ipynb @@ -46,8 +46,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ], "metadata": {}, "outputs": [], @@ -80,8 +81,9 @@ "source": [ "# loads the dataset and the two trained CKNN models for querying by medium and culture\n", "df = spark.read.parquet(\n", - " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/met_and_rijks.parquet\")\n", - "display(df.drop(\"Norm_Features\"))\n" + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/met_and_rijks.parquet\"\n", + ")\n", + "display(df.drop(\"Norm_Features\"))" ], "metadata": {}, "outputs": [], @@ -101,14 +103,14 @@ "# mediums = ['prints', 'drawings', 'ceramics', 'textiles', 'paintings', \"musical instruments\",\"glass\", 'accessories', 'photographs', \"metalwork\",\n", "# \"sculptures\", \"weapons\", \"stone\", \"precious\", \"paper\", \"woodwork\", \"leatherwork\", \"uncategorized\"]\n", "\n", - "mediums = ['paintings', 'glass', 'ceramics']\n", + "mediums = [\"paintings\", \"glass\", \"ceramics\"]\n", "\n", "# cultures = ['african (general)', 'american', 'ancient american', 'ancient asian', 'ancient european', 'ancient middle-eastern', 'asian (general)',\n", "# 'austrian', 'belgian', 'british', 'chinese', 'czech', 'dutch', 'egyptian']#, 'european (general)', 'french', 'german', 'greek',\n", "# 'iranian', 'italian', 'japanese', 'latin american', 'middle eastern', 'roman', 'russian', 'south asian', 'southeast asian',\n", "# 'spanish', 'swiss', 'various']\n", "\n", - "cultures = ['japanese', 'american', 'african (general)']\n", + "cultures = [\"japanese\", \"american\", \"african (general)\"]\n", "\n", "# Uncomment the above for more robust and large scale searches!\n", "\n", @@ -118,10 +120,16 @@ "culture_set = set(cultures)\n", "selected_ids = {\"AK-RBK-17525-2\", \"AK-MAK-1204\", 
\"AK-RAK-2015-2-9\"}\n", "\n", - "small_df = df.where(udf(lambda medium, culture, id_val: (medium in medium_set) or (\n", - " culture in culture_set) or (id_val in selected_ids), BooleanType())(\"Classification\", \"Culture\", \"id\"))\n", + "small_df = df.where(\n", + " udf(\n", + " lambda medium, culture, id_val: (medium in medium_set)\n", + " or (culture in culture_set)\n", + " or (id_val in selected_ids),\n", + " BooleanType(),\n", + " )(\"Classification\", \"Culture\", \"id\")\n", + ")\n", "\n", - "small_df.count()\n" + "small_df.count()" ], "metadata": {}, "outputs": [], @@ -138,12 +146,14 @@ { "cell_type": "code", "source": [ - "medium_cknn = (ConditionalKNN()\n", - " .setOutputCol(\"Matches\")\n", - " .setFeaturesCol(\"Norm_Features\")\n", - " .setValuesCol(\"Thumbnail_Url\")\n", - " .setLabelCol(\"Classification\")\n", - " .fit(small_df))" + "medium_cknn = (\n", + " ConditionalKNN()\n", + " .setOutputCol(\"Matches\")\n", + " .setFeaturesCol(\"Norm_Features\")\n", + " .setValuesCol(\"Thumbnail_Url\")\n", + " .setLabelCol(\"Classification\")\n", + " .fit(small_df)\n", + ")" ], "metadata": {}, "outputs": [], @@ -152,12 +162,14 @@ { "cell_type": "code", "source": [ - "culture_cknn = (ConditionalKNN()\n", - " .setOutputCol(\"Matches\")\n", - " .setFeaturesCol(\"Norm_Features\")\n", - " .setValuesCol(\"Thumbnail_Url\")\n", - " .setLabelCol(\"Culture\")\n", - " .fit(small_df))\n" + "culture_cknn = (\n", + " ConditionalKNN()\n", + " .setOutputCol(\"Matches\")\n", + " .setFeaturesCol(\"Norm_Features\")\n", + " .setValuesCol(\"Thumbnail_Url\")\n", + " .setLabelCol(\"Culture\")\n", + " .fit(small_df)\n", + ")" ], "metadata": {}, "outputs": [], @@ -180,8 +192,9 @@ "def add_matches(classes, cknn, df):\n", " results = df\n", " for label in classes:\n", - " results = (cknn.transform(results.withColumn(\"conditioner\", array(lit(label))))\n", - " .withColumnRenamed(\"Matches\", \"Matches_{}\".format(label)))\n", + " results = cknn.transform(\n", + " results.withColumn(\"conditioner\", array(lit(label)))\n", + " ).withColumnRenamed(\"Matches\", \"Matches_{}\".format(label))\n", " return results" ], "metadata": {}, @@ -201,19 +214,19 @@ "def plot_img(axis, url, title):\n", " try:\n", " response = requests.get(url)\n", - " img = Image.open(BytesIO(response.content)).convert('RGB')\n", + " img = Image.open(BytesIO(response.content)).convert(\"RGB\")\n", " axis.imshow(img, aspect=\"equal\")\n", " except:\n", " pass\n", " if title is not None:\n", - " axis.set_title(title, fontsize=4)\n", + " axis.set_title(title, fontsize=4)\n", " axis.axis(\"off\")\n", "\n", "\n", "def plot_urls(url_arr, titles, filename):\n", " nx, ny = url_arr.shape\n", "\n", - " plt.figure(figsize=(nx*5, ny*5), dpi=1600)\n", + " plt.figure(figsize=(nx * 5, ny * 5), dpi=1600)\n", " fig, axes = plt.subplots(ny, nx)\n", "\n", " # reshape required in the case of 1 image query\n", @@ -225,7 +238,7 @@ " if j == 0:\n", " plot_img(axes[j, i], url_arr[i, j], titles[i])\n", " else:\n", - " plot_img(axes[j, i], url_arr[i, j], None)\n", + " plot_img(axes[j, i], url_arr[i, j], None)\n", "\n", " plt.savefig(filename, dpi=1600) # saves the results as a PNG\n", "\n", @@ -248,6 +261,7 @@ "source": [ "# main method to test a particular dataset with two CKNN models and a set of art IDs, saving the result to filename.png\n", "\n", + "\n", "def test_all(data, cknn_medium, cknn_culture, test_ids, root):\n", " is_nice_obj = udf(lambda obj: obj in test_ids, BooleanType())\n", " test_df = data.where(is_nice_obj(\"id\"))\n", @@ -259,19 +273,21 @@ "\n", " 
original_urls = [row[\"Thumbnail_Url\"] for row in results]\n", "\n", - " culture_urls = [[row[\"Matches_{}\".format(\n", - " label)][0][\"value\"] for row in results] for label in cultures]\n", + " culture_urls = [\n", + " [row[\"Matches_{}\".format(label)][0][\"value\"] for row in results]\n", + " for label in cultures\n", + " ]\n", " culture_url_arr = np.array([original_urls] + culture_urls)[:, :]\n", - " plot_urls(culture_url_arr, [\"Original\"] +\n", - " cultures, root + \"matches_by_culture.png\")\n", + " plot_urls(culture_url_arr, [\"Original\"] + cultures, root + \"matches_by_culture.png\")\n", "\n", - " medium_urls = [[row[\"Matches_{}\".format(\n", - " label)][0][\"value\"] for row in results] for label in mediums]\n", + " medium_urls = [\n", + " [row[\"Matches_{}\".format(label)][0][\"value\"] for row in results]\n", + " for label in mediums\n", + " ]\n", " medium_url_arr = np.array([original_urls] + medium_urls)[:, :]\n", - " plot_urls(medium_url_arr, [\"Original\"] +\n", - " mediums, root + \"matches_by_medium.png\")\n", + " plot_urls(medium_url_arr, [\"Original\"] + mediums, root + \"matches_by_medium.png\")\n", "\n", - " return results_df_culture\n" + " return results_df_culture" ], "metadata": {}, "outputs": [], @@ -292,8 +308,7 @@ "cell_type": "code", "source": [ "# sample query\n", - "result_df = test_all(small_df, medium_cknn, culture_cknn,\n", - " selected_ids, root=\".\")\n" + "result_df = test_all(small_df, medium_cknn, culture_cknn, selected_ids, root=\".\")" ], "metadata": {}, "outputs": [], diff --git a/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb b/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb index 1f2f489cb4..45dc70ebaa 100644 --- a/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb +++ b/notebooks/features/other/CyberML - Anomalous Access Detection.ipynb @@ -72,6 +72,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -88,24 +89,28 @@ "metadata": {}, "outputs": [], "source": [ - "spark.sparkContext.setCheckpointDir('dbfs:/checkpoint_path/')\n", + "spark.sparkContext.setCheckpointDir(\"dbfs:/checkpoint_path/\")\n", "\n", "factory = DataFactory(\n", - " num_hr_users = 25,\n", - " num_hr_resources = 50,\n", - " num_fin_users = 35,\n", - " num_fin_resources = 75,\n", - " num_eng_users = 15,\n", - " num_eng_resources = 25,\n", - " single_component = True\n", + " num_hr_users=25,\n", + " num_hr_resources=50,\n", + " num_fin_users=35,\n", + " num_fin_resources=75,\n", + " num_eng_users=15,\n", + " num_eng_resources=25,\n", + " single_component=True,\n", ")\n", "\n", "training_pdf = factory.create_clustered_training_data(ratio=0.4)\n", "\n", "# a tenant id is used when independant datasets originate from different tenants, in this example we set all tenants-ids to the same value\n", - "training_df = spark.createDataFrame(training_pdf).withColumn('tenant_id', f.lit(0))\n", - "ingroup_df = spark.createDataFrame(factory.create_clustered_intra_test_data(training_pdf)).withColumn('tenant_id', f.lit(0))\n", - "outgroup_df = spark.createDataFrame(factory.create_clustered_inter_test_data()).withColumn('tenant_id', f.lit(0))" + "training_df = spark.createDataFrame(training_pdf).withColumn(\"tenant_id\", f.lit(0))\n", + "ingroup_df = spark.createDataFrame(\n", + " factory.create_clustered_intra_test_data(training_pdf)\n", + ").withColumn(\"tenant_id\", f.lit(0))\n", + 
"outgroup_df = spark.createDataFrame(\n", + " factory.create_clustered_inter_test_data()\n", + ").withColumn(\"tenant_id\", f.lit(0))" ] }, { @@ -142,11 +147,11 @@ "outputs": [], "source": [ "access_anomaly = AccessAnomaly(\n", - " tenantCol='tenant_id',\n", - " userCol='user',\n", - " resCol='res',\n", - " likelihoodCol='likelihood',\n", - " maxIter=1000\n", + " tenantCol=\"tenant_id\",\n", + " userCol=\"user\",\n", + " resCol=\"res\",\n", + " likelihoodCol=\"likelihood\",\n", + " maxIter=1000,\n", ")" ] }, @@ -182,10 +187,10 @@ "outputs": [], "source": [ "ingroup_scored_df.agg(\n", - " f.min('anomaly_score').alias('min_anomaly_score'),\n", - " f.max('anomaly_score').alias('max_anomaly_score'),\n", - " f.mean('anomaly_score').alias('mean_anomaly_score'),\n", - " f.stddev('anomaly_score').alias('stddev_anomaly_score'),\n", + " f.min(\"anomaly_score\").alias(\"min_anomaly_score\"),\n", + " f.max(\"anomaly_score\").alias(\"max_anomaly_score\"),\n", + " f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n", + " f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n", ").show()" ] }, @@ -205,10 +210,10 @@ "outputs": [], "source": [ "outgroup_scored_df.agg(\n", - " f.min('anomaly_score').alias('min_anomaly_score'),\n", - " f.max('anomaly_score').alias('max_anomaly_score'),\n", - " f.mean('anomaly_score').alias('mean_anomaly_score'),\n", - " f.stddev('anomaly_score').alias('stddev_anomaly_score'),\n", + " f.min(\"anomaly_score\").alias(\"min_anomaly_score\"),\n", + " f.max(\"anomaly_score\").alias(\"max_anomaly_score\"),\n", + " f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n", + " f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n", ").show()" ] }, @@ -229,38 +234,28 @@ "# Select a subset of results to send to Log Analytics\n", "#\n", "\n", - "full_res_df = outgroup_scored_df.orderBy(f.desc('anomaly_score')).cache()\n", + "full_res_df = outgroup_scored_df.orderBy(f.desc(\"anomaly_score\")).cache()\n", "\n", "from pyspark.sql.window import Window\n", "\n", - "w = Window.partitionBy(\n", - " 'tenant_id',\n", - " 'user',\n", - " 'res' \n", - " ).orderBy(\n", - " f.desc('anomaly_score')\n", - " )\n", + "w = Window.partitionBy(\"tenant_id\", \"user\", \"res\").orderBy(f.desc(\"anomaly_score\"))\n", "\n", "# select values above threshold\n", "results_above_threshold = full_res_df.filter(full_res_df.anomaly_score > 1.0)\n", "\n", "# get distinct resource/user and corresponding timestamp and highest score\n", - "results_to_la = results_above_threshold.withColumn(\n", - " 'index', f.row_number().over(w)\n", - " ).orderBy(\n", - " f.desc('anomaly_score')\n", - " ).select(\n", - " 'tenant_id',\n", - " f.col('user'),\n", - " f.col('res'),\n", - " 'anomaly_score'\n", - " ).where(\n", - " 'index == 1'\n", - " ).limit(100).cache()\n", + "results_to_la = (\n", + " results_above_threshold.withColumn(\"index\", f.row_number().over(w))\n", + " .orderBy(f.desc(\"anomaly_score\"))\n", + " .select(\"tenant_id\", f.col(\"user\"), f.col(\"res\"), \"anomaly_score\")\n", + " .where(\"index == 1\")\n", + " .limit(100)\n", + " .cache()\n", + ")\n", "\n", "# add a fake timestamp to the results\n", - "results_to_la = results_to_la.withColumn('timestamp', f.current_timestamp())\n", - " \n", + "results_to_la = results_to_la.withColumn(\"timestamp\", f.current_timestamp())\n", + "\n", "display(results_to_la)" ] }, @@ -283,7 +278,7 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "print (__version__) # requires version >= 1.9.0\n", + "print(__version__) # requires version >= 
1.9.0\n", "\n", "# run plotly in offline mode\n", "offline.init_notebook_mode()" @@ -295,53 +290,43 @@ "metadata": {}, "outputs": [], "source": [ - "#Find all server accesses of users with high predicted scores\n", + "# Find all server accesses of users with high predicted scores\n", "# For display, limit to top 25 results\n", - "results_to_display = results_to_la.orderBy(\n", - " f.desc('anomaly_score')\n", - " ).limit(25).cache()\n", - "interesting_records = full_res_df.join(results_to_display, ['user'], 'left_semi')\n", - "non_anomalous_records = interesting_records.join(results_to_display, ['user', 'res'], 'left_anti')\n", + "results_to_display = results_to_la.orderBy(f.desc(\"anomaly_score\")).limit(25).cache()\n", + "interesting_records = full_res_df.join(results_to_display, [\"user\"], \"left_semi\")\n", + "non_anomalous_records = interesting_records.join(\n", + " results_to_display, [\"user\", \"res\"], \"left_anti\"\n", + ")\n", "\n", - "top_non_anomalous_records = non_anomalous_records.groupBy(\n", - " 'tenant_id',\n", - " 'user', \n", - " 'res'\n", - " ).agg(\n", - " f.count('*').alias('count'),\n", - " ).select(\n", - " f.col('tenant_id'),\n", - " f.col('user'),\n", - " f.col('res'),\n", - " 'count'\n", - " )\n", + "top_non_anomalous_records = (\n", + " non_anomalous_records.groupBy(\"tenant_id\", \"user\", \"res\")\n", + " .agg(\n", + " f.count(\"*\").alias(\"count\"),\n", + " )\n", + " .select(f.col(\"tenant_id\"), f.col(\"user\"), f.col(\"res\"), \"count\")\n", + ")\n", "\n", - "#pick only a subset of non-anomalous record for UI\n", + "# pick only a subset of non-anomalous record for UI\n", "w = Window.partitionBy(\n", - " 'tenant_id',\n", - " 'user',\n", - " ).orderBy(\n", - " f.desc('count')\n", - " )\n", + " \"tenant_id\",\n", + " \"user\",\n", + ").orderBy(f.desc(\"count\"))\n", "\n", "# pick top non-anomalous set\n", - "top_non_anomalous_accesses = top_non_anomalous_records.withColumn(\n", - " 'index', f.row_number().over(w)\n", - " ).orderBy(\n", - " f.desc('count')\n", - " ).select(\n", - " 'tenant_id',\n", - " f.col('user'),\n", - " f.col('res'),\n", - " f.col('count')\n", - " ).where(\n", - " 'index in (1,2,3,4,5)'\n", - " ).limit(25)\n", + "top_non_anomalous_accesses = (\n", + " top_non_anomalous_records.withColumn(\"index\", f.row_number().over(w))\n", + " .orderBy(f.desc(\"count\"))\n", + " .select(\"tenant_id\", f.col(\"user\"), f.col(\"res\"), f.col(\"count\"))\n", + " .where(\"index in (1,2,3,4,5)\")\n", + " .limit(25)\n", + ")\n", "\n", "# add back anomalous record\n", - "fileShare_accesses = (top_non_anomalous_accesses\n", - " .select('user', 'res', 'count')\n", - " .union(results_to_display.select('user', 'res', f.lit(1).alias('count'))).cache())" + "fileShare_accesses = (\n", + " top_non_anomalous_accesses.select(\"user\", \"res\", \"count\")\n", + " .union(results_to_display.select(\"user\", \"res\", f.lit(1).alias(\"count\")))\n", + " .cache()\n", + ")" ] }, { @@ -354,29 +339,63 @@ "high_scores_df = fileShare_accesses.toPandas()\n", "unique_arr = np.append(high_scores_df.user.unique(), high_scores_df.res.unique())\n", "\n", - "unique_df = pd.DataFrame(data = unique_arr, columns = ['name'])\n", - "unique_df['index'] = range(0, len(unique_df.index))\n", + "unique_df = pd.DataFrame(data=unique_arr, columns=[\"name\"])\n", + "unique_df[\"index\"] = range(0, len(unique_df.index))\n", "\n", "# create index for source & target and color for the normal accesses\n", - "normal_line_color = 'rgba(211, 211, 211, 0.8)'\n", - "anomolous_color = 'red'\n", - "x = 
pd.merge(high_scores_df, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})\n", - "all_access_index_df = pd.merge(x, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})\n", - "all_access_index_df['color'] = normal_line_color\n", + "normal_line_color = \"rgba(211, 211, 211, 0.8)\"\n", + "anomolous_color = \"red\"\n", + "x = (\n", + " pd.merge(high_scores_df, unique_df, how=\"left\", left_on=\"user\", right_on=\"name\")\n", + " .drop([\"name\"], axis=1)\n", + " .rename(columns={\"index\": \"userIndex\"})\n", + ")\n", + "all_access_index_df = (\n", + " pd.merge(x, unique_df, how=\"left\", left_on=\"res\", right_on=\"name\")\n", + " .drop([\"name\"], axis=1)\n", + " .rename(columns={\"index\": \"resIndex\"})\n", + ")\n", + "all_access_index_df[\"color\"] = normal_line_color\n", "\n", - "# results_to_display index, color and \n", - "y = results_to_display.toPandas().drop(['tenant_id', 'timestamp', 'anomaly_score'], axis=1)\n", - "y = pd.merge(y, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})\n", - "high_scores_index_df = pd.merge(y, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})\n", - "high_scores_index_df['count'] = 1\n", - "high_scores_index_df['color'] = anomolous_color\n", + "# results_to_display index, color and\n", + "y = results_to_display.toPandas().drop(\n", + " [\"tenant_id\", \"timestamp\", \"anomaly_score\"], axis=1\n", + ")\n", + "y = (\n", + " pd.merge(y, unique_df, how=\"left\", left_on=\"user\", right_on=\"name\")\n", + " .drop([\"name\"], axis=1)\n", + " .rename(columns={\"index\": \"userIndex\"})\n", + ")\n", + "high_scores_index_df = (\n", + " pd.merge(y, unique_df, how=\"left\", left_on=\"res\", right_on=\"name\")\n", + " .drop([\"name\"], axis=1)\n", + " .rename(columns={\"index\": \"resIndex\"})\n", + ")\n", + "high_scores_index_df[\"count\"] = 1\n", + "high_scores_index_df[\"color\"] = anomolous_color\n", "\n", "# substract 1 for the red entries in all_access df\n", - "hsi_df = high_scores_index_df[['user','res', 'count']].rename(columns={'count' : 'hsiCount'})\n", - "all_access_updated_count_df = pd.merge(all_access_index_df, hsi_df, how='left', left_on=['user', 'res'], right_on=['user', 'res'])\n", - "all_access_updated_count_df['count'] = np.where(all_access_updated_count_df['hsiCount']==1, all_access_updated_count_df['count'] - 1, all_access_updated_count_df['count'])\n", - "all_access_updated_count_df = all_access_updated_count_df.loc[all_access_updated_count_df['count'] > 0]\n", - "all_access_updated_count_df = all_access_updated_count_df[['user','res', 'count', 'userIndex', 'resIndex', 'color']]\n", + "hsi_df = high_scores_index_df[[\"user\", \"res\", \"count\"]].rename(\n", + " columns={\"count\": \"hsiCount\"}\n", + ")\n", + "all_access_updated_count_df = pd.merge(\n", + " all_access_index_df,\n", + " hsi_df,\n", + " how=\"left\",\n", + " left_on=[\"user\", \"res\"],\n", + " right_on=[\"user\", \"res\"],\n", + ")\n", + "all_access_updated_count_df[\"count\"] = np.where(\n", + " all_access_updated_count_df[\"hsiCount\"] == 1,\n", + " all_access_updated_count_df[\"count\"] - 1,\n", + " all_access_updated_count_df[\"count\"],\n", + ")\n", + "all_access_updated_count_df = all_access_updated_count_df.loc[\n", + " all_access_updated_count_df[\"count\"] > 0\n", + "]\n", + "all_access_updated_count_df 
= all_access_updated_count_df[\n", + " [\"user\", \"res\", \"count\", \"userIndex\", \"resIndex\", \"color\"]\n", + "]\n", "\n", "# combine the two tables\n", "frames = [all_access_updated_count_df, high_scores_index_df]\n", @@ -391,41 +410,33 @@ "outputs": [], "source": [ "data_trace = dict(\n", - " type='sankey',\n", - " domain = dict(\n", - " x = [0,1],\n", - " y = [0,1]\n", + " type=\"sankey\",\n", + " domain=dict(x=[0, 1], y=[0, 1]),\n", + " orientation=\"h\",\n", + " valueformat=\".0f\",\n", + " node=dict(\n", + " pad=10,\n", + " thickness=30,\n", + " line=dict(color=\"black\", width=0),\n", + " label=unique_df[\"name\"].dropna(axis=0, how=\"any\"),\n", " ),\n", - " orientation = \"h\",\n", - " valueformat = \".0f\",\n", - " node = dict(\n", - " pad = 10,\n", - " thickness = 30,\n", - " line = dict(\n", - " color = \"black\",\n", - " width = 0\n", - " ),\n", - " label = unique_df['name'].dropna(axis=0, how='any')\n", + " link=dict(\n", + " source=display_df[\"userIndex\"].dropna(axis=0, how=\"any\"),\n", + " target=display_df[\"resIndex\"].dropna(axis=0, how=\"any\"),\n", + " value=display_df[\"count\"].dropna(axis=0, how=\"any\"),\n", + " color=display_df[\"color\"].dropna(axis=0, how=\"any\"),\n", " ),\n", - " link = dict(\n", - " source = display_df['userIndex'].dropna(axis=0, how='any'),\n", - " target = display_df['resIndex'].dropna(axis=0, how='any'),\n", - " value = display_df['count'].dropna(axis=0, how='any'),\n", - " color = display_df['color'].dropna(axis=0, how='any'),\n", - " )\n", ")\n", "\n", - "layout = dict(\n", - " title = \"All resources accessed by users with highest anomalous scores\",\n", - " height = 772,\n", - " font = dict(\n", - " size = 10\n", - " ), \n", + "layout = dict(\n", + " title=\"All resources accessed by users with highest anomalous scores\",\n", + " height=772,\n", + " font=dict(size=10),\n", ")\n", "\n", "fig = dict(data=[data_trace], layout=layout)\n", "\n", - "p = plot(fig, output_type='div')\n", + "p = plot(fig, output_type=\"div\")\n", "\n", "displayHTML(p)" ] diff --git a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb index 289b06f569..a0b48da08d 100644 --- a/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb +++ b/notebooks/features/other/DeepLearning - BiLSTM Medical Entity Extraction.ipynb @@ -45,8 +45,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -90,7 +91,8 @@ "source": [ "modelName = \"BiLSTM\"\n", "modelDir = abspath(\"models\")\n", - "if not os.path.exists(modelDir): os.makedirs(modelDir)\n", + "if not os.path.exists(modelDir):\n", + " os.makedirs(modelDir)\n", "d = ModelDownloader(spark, \"file://\" + modelDir)\n", "modelSchema = d.downloadByName(modelName)\n", "nltk.download(\"punkt\")" @@ -114,7 +116,9 @@ "wordEmbFileName = \"WordEmbeddings_PubMed.pkl\"\n", "pickleFile = join(abspath(\"models\"), wordEmbFileName)\n", "if not os.path.isfile(pickleFile):\n", - " urllib.request.urlretrieve(\"https://mmlspark.blob.core.windows.net/datasets/\" + wordEmbFileName, pickleFile)" + " urllib.request.urlretrieve(\n", + " \"https://mmlspark.blob.core.windows.net/datasets/\" + wordEmbFileName, pickleFile\n", + " )" ] }, { @@ -163,7 +167,7 @@ 
"outputs": [], "source": [ "sentences = sent_tokenize(content)\n", - "df = spark.createDataFrame(enumerate(sentences), [\"index\",\"sentence\"])" + "df = spark.createDataFrame(enumerate(sentences), [\"index\", \"sentence\"])" ] }, { @@ -180,6 +184,7 @@ " nltk.data.path.append(\"/dbfs/nltkdata\")\n", " return partition\n", "\n", + "\n", "df = df.rdd.mapPartitions(prepNLTK).toDF()" ] }, @@ -195,34 +200,38 @@ " prepNLTK(None)\n", " return word_tokenize(sent)\n", "\n", + "\n", "tokenizeUDF = udf(safe_tokenize, ArrayType(StringType()))\n", - "df = df.withColumn(\"tokens\",tokenizeUDF(\"sentence\"))\n", + "df = df.withColumn(\"tokens\", tokenizeUDF(\"sentence\"))\n", "\n", "countUDF = udf(len, IntegerType())\n", - "df = df.withColumn(\"count\",countUDF(\"tokens\"))\n", + "df = df.withColumn(\"count\", countUDF(\"tokens\"))\n", + "\n", "\n", "def wordToEmb(word):\n", " return wordvectors[wordToIndex.get(word.lower(), wordToIndex[\"UNK\"])]\n", "\n", + "\n", "def featurize(tokens):\n", " X = np.zeros((maxSentenceLen, nFeatures))\n", - " X[-len(tokens):,:] = np.array([wordToEmb(word) for word in tokens])\n", + " X[-len(tokens) :, :] = np.array([wordToEmb(word) for word in tokens])\n", " return [float(x) for x in X.reshape(maxSentenceLen, nFeatures).flatten()]\n", "\n", + "\n", "def safe_show(df, retries):\n", " try:\n", " df.show()\n", " except Exception as e:\n", " if retries >= 1:\n", - " safe_show(df, retries-1)\n", + " safe_show(df, retries - 1)\n", " else:\n", " raise e\n", "\n", - "featurizeUDF = udf(featurize, ArrayType(FloatType()))\n", + "\n", + "featurizeUDF = udf(featurize, ArrayType(FloatType()))\n", "\n", "df = df.withColumn(\"features\", featurizeUDF(\"tokens\")).cache()\n", - "safe_show(df, 5) # Can be flaky on build server\n", - " \n" + "safe_show(df, 5) # Can be flaky on build server" ], "metadata": { "collapsed": false, @@ -244,12 +253,14 @@ "metadata": {}, "outputs": [], "source": [ - "model = CNTKModel() \\\n", - " .setModelLocation(modelSchema.uri) \\\n", - " .setInputCol(\"features\") \\\n", - " .setOutputCol(\"probs\") \\\n", - " .setOutputNodeIndex(0) \\\n", + "model = (\n", + " CNTKModel()\n", + " .setModelLocation(modelSchema.uri)\n", + " .setInputCol(\"features\")\n", + " .setOutputCol(\"probs\")\n", + " .setOutputNodeIndex(0)\n", " .setMiniBatchSize(1)\n", + ")\n", "\n", "df = model.transform(df).cache()\n", "df.show()" @@ -263,10 +274,11 @@ "source": [ "def probsToEntities(probs, wordCount):\n", " reshaped_probs = np.array(probs).reshape(maxSentenceLen, nClasses)\n", - " reshaped_probs = reshaped_probs[-wordCount:,:]\n", + " reshaped_probs = reshaped_probs[-wordCount:, :]\n", " return [classToEntity[np.argmax(probs)] for probs in reshaped_probs]\n", "\n", - "toEntityUDF = udf(probsToEntities,ArrayType(StringType()))\n", + "\n", + "toEntityUDF = udf(probsToEntities, ArrayType(StringType()))\n", "df = df.withColumn(\"entities\", toEntityUDF(\"probs\", \"count\"))\n", "df.show()" ] @@ -287,28 +299,33 @@ "# Color Code the Text based on the entity type\n", "colors = {\n", " \"B-Disease\": \"blue\",\n", - " \"I-Disease\":\"blue\",\n", - " \"B-Drug\":\"lime\",\n", - " \"I-Drug\":\"lime\",\n", - " \"B-Chemical\":\"lime\",\n", - " \"I-Chemical\":\"lime\",\n", - " \"O\":\"black\",\n", - " \"NONE\":\"black\"\n", + " \"I-Disease\": \"blue\",\n", + " \"B-Drug\": \"lime\",\n", + " \"I-Drug\": \"lime\",\n", + " \"B-Chemical\": \"lime\",\n", + " \"I-Chemical\": \"lime\",\n", + " \"O\": \"black\",\n", + " \"NONE\": \"black\",\n", "}\n", "\n", + "\n", "def prettyPrint(words, 
annotations):\n", " formattedWords = []\n", - " for word,annotation in zip(words,annotations):\n", - " formattedWord = \"<font color='{}'>{}</font>\".format(colors[annotation], word)\n", - " if annotation in {\"O\",\"NONE\"}:\n", + " for word, annotation in zip(words, annotations):\n", + " formattedWord = \"<font color='{}'>{}</font>\".format(\n", + " colors[annotation], word\n", + " )\n", + " if annotation in {\"O\", \"NONE\"}:\n", " formattedWords.append(formattedWord)\n", " else:\n", " formattedWords.append(\"<b>{}</b>\".format(formattedWord))\n", " return \" \".join(formattedWords)\n", "\n", + "\n", "prettyPrintUDF = udf(prettyPrint, StringType())\n", - "df = df.withColumn(\"formattedSentence\", prettyPrintUDF(\"tokens\", \"entities\")) \\\n", - " .select(\"formattedSentence\")\n", + "df = df.withColumn(\"formattedSentence\", prettyPrintUDF(\"tokens\", \"entities\")).select(\n", + " \"formattedSentence\"\n", + ")\n", "\n", "sentences = [row[\"formattedSentence\"] for row in df.collect()]" ] }, @@ -322,6 +339,7 @@ "outputs": [], "source": [ "from IPython.core.display import display, HTML\n", + "\n", "for sentence in sentences:\n", " display(HTML(sentence))" ] diff --git a/notebooks/features/other/DeepLearning - CIFAR10 Convolutional Network.ipynb b/notebooks/features/other/DeepLearning - CIFAR10 Convolutional Network.ipynb index 979890d5e3..9a8de2f648 100644 --- a/notebooks/features/other/DeepLearning - CIFAR10 Convolutional Network.ipynb +++ b/notebooks/features/other/DeepLearning - CIFAR10 Convolutional Network.ipynb @@ -22,8 +22,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -43,7 +44,9 @@ "\n", "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n", "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n", - "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")" + "imagesWithLabels = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\"\n", + ")" ] }, { @@ -59,7 +62,7 @@ "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " modelDir = \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\"\n", "else:\n", - " modelDir = \"dbfs:/models/\"\n" + " modelDir = \"dbfs:/models/\"" ] }, { @@ -76,7 +79,7 @@ "outputs": [], "source": [ "d = ModelDownloader(spark, modelDir)\n", - "model = d.downloadByName(modelName)\n" + "model = d.downloadByName(modelName)" ] }, { @@ -93,24 +96,33 @@ "outputs": [], "source": [ "import time\n", + "\n", "start = time.time()\n", "\n", "# Use CNTK model to get log probabilities\n", - "cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"output\") \\\n", - " .setModelLocation(model.uri).setOutputNode(\"z\")\n", + "cntkModel = (\n", + " CNTKModel()\n", + " .setInputCol(\"images\")\n", + " .setOutputCol(\"output\")\n", + " .setModelLocation(model.uri)\n", + " .setOutputNode(\"z\")\n", + ")\n", "scoredImages = cntkModel.transform(imagesWithLabels)\n", "\n", "# Transform the log probabilities to predictions\n", - "def argmax(x): return max(enumerate(x),key=lambda p: p[1])[0]\n", + "def argmax(x):\n", + " return max(enumerate(x), key=lambda p: p[1])[0]\n", + "\n", "\n", "argmaxUDF = udf(argmax, IntegerType())\n", - "imagePredictions = scoredImages.withColumn(\"predictions\", 
argmaxUDF(\"output\")) \\\n", - " .select(\"predictions\", \"labels\")\n", + "imagePredictions = scoredImages.withColumn(\"predictions\", argmaxUDF(\"output\")).select(\n", + " \"predictions\", \"labels\"\n", + ")\n", "\n", "numRows = imagePredictions.count()\n", "\n", "end = time.time()\n", - "print(\"classifying {} images took {} seconds\".format(numRows,end-start))" + "print(\"classifying {} images took {} seconds\".format(numRows, end - start))" ] }, { @@ -144,8 +156,18 @@ "\n", "cm = confusion_matrix(y, y_hat)\n", "\n", - "labels = [\"airplane\", \"automobile\", \"bird\", \"cat\", \"deer\", \"dog\", \"frog\",\n", - " \"horse\", \"ship\", \"truck\"]\n", + "labels = [\n", + " \"airplane\",\n", + " \"automobile\",\n", + " \"bird\",\n", + " \"cat\",\n", + " \"deer\",\n", + " \"dog\",\n", + " \"frog\",\n", + " \"horse\",\n", + " \"ship\",\n", + " \"truck\",\n", + "]\n", "plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n", "plt.colorbar()\n", "tick_marks = np.arange(len(labels))\n", diff --git a/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb b/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb index df47c32516..d79b22ef14 100644 --- a/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb +++ b/notebooks/features/other/DeepLearning - Flower Image Classification.ipynb @@ -21,8 +21,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -49,8 +50,13 @@ "source": [ "# Load the images\n", "# use flowers_and_labels.parquet on larger cluster in order to get better results\n", - "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet\") \\\n", - " .withColumnRenamed(\"bytes\",\"image\").sample(.1)\n", + "imagesWithLabels = (\n", + " spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet\"\n", + " )\n", + " .withColumnRenamed(\"bytes\", \"image\")\n", + " .sample(0.1)\n", + ")\n", "\n", "imagesWithLabels.printSchema()" ] @@ -74,17 +80,15 @@ "from synapse.ml.stages import *\n", "\n", "# Make some featurizers\n", - "it = ImageTransformer()\\\n", - " .setOutputCol(\"scaled\")\\\n", - " .resize(size=(60, 60))\n", + "it = ImageTransformer().setOutputCol(\"scaled\").resize(size=(60, 60))\n", + "\n", + "ur = UnrollImage().setInputCol(\"scaled\").setOutputCol(\"features\")\n", "\n", - "ur = UnrollImage()\\\n", - " .setInputCol(\"scaled\")\\\n", - " .setOutputCol(\"features\")\n", - " \n", "dc1 = DropColumns().setCols([\"scaled\", \"image\"])\n", "\n", - "lr1 = LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n", + "lr1 = (\n", + " LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n", + ")\n", "\n", "dc2 = DropColumns().setCols([\"features\"])\n", "\n", @@ -97,20 +101,24 @@ "metadata": {}, "outputs": [], "source": [ - "resnet = ImageFeaturizer()\\\n", - " .setInputCol(\"image\")\\\n", - " .setOutputCol(\"features\")\\\n", - " .setModelLocation(model.uri)\\\n", - " .setLayerNames(model.layerNames)\\\n", + "resnet = (\n", + " ImageFeaturizer()\n", + " .setInputCol(\"image\")\n", + " .setOutputCol(\"features\")\n", + " .setModelLocation(model.uri)\n", + " .setLayerNames(model.layerNames)\n", 
" .setCutOutputLayers(1)\n", - " \n", + ")\n", + "\n", "dc3 = DropColumns().setCols([\"image\"])\n", - " \n", - "lr2 = LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n", + "\n", + "lr2 = (\n", + " LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n", + ")\n", "\n", "dc4 = DropColumns().setCols([\"features\"])\n", "\n", - "deepModel = Pipeline(stages=[resnet, dc3, lr2, dc4]) " + "deepModel = Pipeline(stages=[resnet, dc3, lr2, dc4])" ] }, { @@ -141,10 +149,10 @@ "outputs": [], "source": [ "def timedExperiment(model, train, test):\n", - " start = time.time()\n", - " result = model.fit(train).transform(test).toPandas()\n", - " print(\"Experiment took {}s\".format(time.time() - start))\n", - " return result" + " start = time.time()\n", + " result = model.fit(train).transform(test).toPandas()\n", + " print(\"Experiment took {}s\".format(time.time() - start))\n", + " return result" ] }, { @@ -153,7 +161,7 @@ "metadata": {}, "outputs": [], "source": [ - "train, test = imagesWithLabels.randomSplit([.8,.2])\n", + "train, test = imagesWithLabels.randomSplit([0.8, 0.2])\n", "train.count(), test.count()" ] }, @@ -192,26 +200,30 @@ "from sklearn.metrics import confusion_matrix\n", "import numpy as np\n", "\n", + "\n", "def evaluate(results, name):\n", - " y, y_hat = results[\"labels\"],results[\"prediction\"]\n", + " y, y_hat = results[\"labels\"], results[\"prediction\"]\n", " y = [int(l) for l in y]\n", "\n", - " accuracy = np.mean([1. if pred==true else 0. for (pred,true) in zip(y_hat,y)])\n", + " accuracy = np.mean([1.0 if pred == true else 0.0 for (pred, true) in zip(y_hat, y)])\n", " cm = confusion_matrix(y, y_hat)\n", " cm = cm.astype(\"float\") / cm.sum(axis=1)[:, np.newaxis]\n", "\n", - " plt.text(40, 10,\"$Accuracy$ $=$ ${}\\%$\".format(round(accuracy*100,1)),fontsize=14)\n", + " plt.text(\n", + " 40, 10, \"$Accuracy$ $=$ ${}\\%$\".format(round(accuracy * 100, 1)), fontsize=14\n", + " )\n", " plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n", " plt.colorbar()\n", " plt.xlabel(\"$Predicted$ $label$\", fontsize=18)\n", " plt.ylabel(\"$True$ $Label$\", fontsize=18)\n", " plt.title(\"$Normalized$ $CM$ $for$ ${}$\".format(name))\n", "\n", - "plt.figure(figsize=(12,5))\n", - "plt.subplot(1,2,1)\n", - "evaluate(deepResults,\"CNTKModel + LR\")\n", - "plt.subplot(1,2,2)\n", - "evaluate(basicResults,\"LR\")\n", + "\n", + "plt.figure(figsize=(12, 5))\n", + "plt.subplot(1, 2, 1)\n", + "evaluate(deepResults, \"CNTKModel + LR\")\n", + "plt.subplot(1, 2, 2)\n", + "evaluate(basicResults, \"LR\")\n", "# Note that on the larger dataset the accuracy will bump up from 44% to >90%\n", "display(plt.show())" ] diff --git a/notebooks/features/other/DeepLearning - Transfer Learning.ipynb b/notebooks/features/other/DeepLearning - Transfer Learning.ipynb index 7a597a941a..c91eb73cde 100644 --- a/notebooks/features/other/DeepLearning - Transfer Learning.ipynb +++ b/notebooks/features/other/DeepLearning - Transfer Learning.ipynb @@ -33,6 +33,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", "\n", "\n", @@ -45,8 +46,13 @@ "d = ModelDownloader(spark, modelDir)\n", "model = d.downloadByName(modelName)\n", "print(model.layerNames)\n", - "cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"features\") \\\n", - " .setModelLocation(model.uri).setOutputNode(\"l8\")" + "cntkModel = (\n", + 
" CNTKModel()\n", + " .setInputCol(\"images\")\n", + " .setOutputCol(\"features\")\n", + " .setModelLocation(model.uri)\n", + " .setOutputNode(\"l8\")\n", + ")" ] }, { @@ -62,7 +68,9 @@ "metadata": {}, "outputs": [], "source": [ - "imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")" + "imagesWithLabels = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\"\n", + ")" ] }, { @@ -95,7 +103,7 @@ "metadata": {}, "outputs": [], "source": [ - "featurizedImages = cntkModel.transform(imagesWithLabels).select([\"features\",\"labels\"])" + "featurizedImages = cntkModel.transform(imagesWithLabels).select([\"features\", \"labels\"])" ] }, { @@ -114,9 +122,9 @@ "from synapse.ml.train import TrainClassifier\n", "from pyspark.ml.classification import RandomForestClassifier\n", "\n", - "train,test = featurizedImages.randomSplit([0.75,0.25])\n", + "train, test = featurizedImages.randomSplit([0.75, 0.25])\n", "\n", - "model = TrainClassifier(model=RandomForestClassifier(),labelCol=\"labels\").fit(train)" + "model = TrainClassifier(model=RandomForestClassifier(), labelCol=\"labels\").fit(train)" ] }, { @@ -133,6 +141,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "predictions = model.transform(test)\n", "metrics = ComputeModelStatistics(evaluationMetric=\"accuracy\").transform(predictions)\n", "metrics.show()" diff --git a/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb b/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb index bcd497ce95..942a32995b 100644 --- a/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb +++ b/notebooks/features/other/HyperParameterTuning - Fighting Breast Cancer.ipynb @@ -22,6 +22,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -38,7 +39,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\").cache()\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\"\n", + ").cache()\n", "tune, test = data.randomSplit([0.80, 0.20])\n", "tune.limit(10).toPandas()" ] @@ -58,7 +61,12 @@ "source": [ "from synapse.ml.automl import TuneHyperparameters\n", "from synapse.ml.train import TrainClassifier\n", - "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "from pyspark.ml.classification import (\n", + " LogisticRegression,\n", + " RandomForestClassifier,\n", + " GBTClassifier,\n", + ")\n", + "\n", "logReg = LogisticRegression()\n", "randForest = RandomForestClassifier()\n", "gbt = GBTClassifier()\n", @@ -83,13 +91,14 @@ "source": [ "from synapse.ml.automl import *\n", "\n", - "paramBuilder = \\\n", - " HyperparamBuilder() \\\n", - " .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) \\\n", - " .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5,10])) \\\n", - " .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3,5])) \\\n", - " .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8,16)) \\\n", - " .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3,5]))\n", + "paramBuilder = (\n", + " HyperparamBuilder()\n", + " .addHyperparam(logReg, logReg.regParam, 
RangeHyperParam(0.1, 0.3))\n", + " .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5, 10]))\n", + " .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3, 5]))\n", + " .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16))\n", + " .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5]))\n", + ")\n", "searchSpace = paramBuilder.build()\n", "# The search space is a list of params to tuples of estimator and hyperparam\n", "print(searchSpace)\n", @@ -110,9 +119,14 @@ "outputs": [], "source": [ "bestModel = TuneHyperparameters(\n", - " evaluationMetric=\"accuracy\", models=mmlmodels, numFolds=2,\n", - " numRuns=len(mmlmodels) * 2, parallelism=1,\n", - " paramSpace=randomSpace.space(), seed=0).fit(tune)" + " evaluationMetric=\"accuracy\",\n", + " models=mmlmodels,\n", + " numFolds=2,\n", + " numRuns=len(mmlmodels) * 2,\n", + " parallelism=1,\n", + " paramSpace=randomSpace.space(),\n", + " seed=0,\n", + ").fit(tune)" ] }, { @@ -146,6 +160,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "prediction = bestModel.transform(test)\n", "metrics = ComputeModelStatistics().transform(prediction)\n", "metrics.limit(10).toPandas()" diff --git a/notebooks/features/other/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb b/notebooks/features/other/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb index d8d7469204..ca2b143204 100644 --- a/notebooks/features/other/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb +++ b/notebooks/features/other/TextAnalytics - Amazon Book Reviews with Word2Vec.ipynb @@ -16,8 +16,10 @@ "execution_count": null, "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ], "outputs": [], @@ -27,7 +29,7 @@ "cell_type": "code", "execution_count": null, "source": [ - "import pandas as pd\n" + "import pandas as pd" ], "outputs": [], "metadata": {} @@ -36,7 +38,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n", + ")\n", "data.limit(10).toPandas()" ], "outputs": [], @@ -53,8 +57,7 @@ "cell_type": "code", "execution_count": null, "source": [ - "processedData = data.withColumn(\"label\", data[\"rating\"] > 3) \\\n", - " .select([\"text\", \"label\"])\n", + "processedData = data.withColumn(\"label\", data[\"rating\"] > 3).select([\"text\", \"label\"])\n", "processedData.limit(5).toPandas()" ], "outputs": [], @@ -89,11 +92,13 @@ "source": [ "from pyspark.ml import Pipeline\n", "from pyspark.ml.feature import Tokenizer, Word2Vec\n", + "\n", "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n", "partitions = train.rdd.getNumPartitions()\n", - "word2vec = Word2Vec(maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\",\n", - " numPartitions=partitions)\n", - "textFeaturizer = Pipeline(stages = [tokenizer, word2vec]).fit(train)" + "word2vec = Word2Vec(\n", + " maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\", numPartitions=partitions\n", + ")\n", + "textFeaturizer = Pipeline(stages=[tokenizer, word2vec]).fit(train)" ], "outputs": [], "metadata": {} @@ -128,29 +133,42 @@ "cell_type": "code", "execution_count": null, "source": [ - "from 
pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "from pyspark.ml.classification import (\n", + " LogisticRegression,\n", + " RandomForestClassifier,\n", + " GBTClassifier,\n", + ")\n", "from synapse.ml.train import TrainClassifier\n", "import itertools\n", "\n", - "lrHyperParams = [0.05, 0.2]\n", - "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", - " for hyperParam in lrHyperParams]\n", - "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n", - " for lrm in logisticRegressions]\n", + "lrHyperParams = [0.05, 0.2]\n", + "logisticRegressions = [\n", + " LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n", + "]\n", + "lrmodels = [\n", + " TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n", + " for lrm in logisticRegressions\n", + "]\n", "\n", - "rfHyperParams = itertools.product([5, 10], [2, 3])\n", - "randomForests = [RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n", - " for hyperParam in rfHyperParams]\n", - "rfmodels = [TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain)\n", - " for rfm in randomForests]\n", + "rfHyperParams = itertools.product([5, 10], [2, 3])\n", + "randomForests = [\n", + " RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in rfHyperParams\n", + "]\n", + "rfmodels = [\n", + " TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain) for rfm in randomForests\n", + "]\n", "\n", - "gbtHyperParams = itertools.product([8, 16], [2, 3])\n", - "gbtclassifiers = [GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n", - " for hyperParam in gbtHyperParams]\n", - "gbtmodels = [TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain)\n", - " for gbt in gbtclassifiers]\n", + "gbtHyperParams = itertools.product([8, 16], [2, 3])\n", + "gbtclassifiers = [\n", + " GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in gbtHyperParams\n", + "]\n", + "gbtmodels = [\n", + " TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain) for gbt in gbtclassifiers\n", + "]\n", "\n", - "trainedModels = lrmodels + rfmodels + gbtmodels" + "trainedModels = lrmodels + rfmodels + gbtmodels" ], "outputs": [], "metadata": {} @@ -167,6 +185,7 @@ "execution_count": null, "source": [ "from synapse.ml.automl import FindBestModel\n", + "\n", "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=trainedModels).fit(ptest)\n", "bestModel.getRocCurve().show()\n", "bestModel.getBestModelMetrics().show()\n", @@ -187,12 +206,17 @@ "execution_count": null, "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "predictions = bestModel.transform(pvalidation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", - "print(\"Best model's accuracy on validation set = \"\n", - " + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100))\n", - "print(\"Best model's AUC on validation set = \"\n", - " + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100))" + "print(\n", + " \"Best model's accuracy on validation set = \"\n", + " + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100)\n", + ")\n", + "print(\n", + " \"Best model's AUC on validation set = \"\n", + " + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100)\n", + ")" ], "outputs": [], "metadata": {} diff --git a/notebooks/features/other/TextAnalytics - Amazon Book Reviews.ipynb b/notebooks/features/other/TextAnalytics - Amazon Book Reviews.ipynb index dde7120492..7b577af832 100644 --- 
a/notebooks/features/other/TextAnalytics - Amazon Book Reviews.ipynb +++ b/notebooks/features/other/TextAnalytics - Amazon Book Reviews.ipynb @@ -16,8 +16,10 @@ "execution_count": null, "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ], "outputs": [], @@ -36,7 +38,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n", + ")\n", "data.limit(10).toPandas()" ], "outputs": [], @@ -55,9 +59,17 @@ "execution_count": null, "source": [ "from synapse.ml.featurize.text import TextFeaturizer\n", - "textFeaturizer = TextFeaturizer() \\\n", - " .setInputCol(\"text\").setOutputCol(\"features\") \\\n", - " .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(data)" + "\n", + "textFeaturizer = (\n", + " TextFeaturizer()\n", + " .setInputCol(\"text\")\n", + " .setOutputCol(\"features\")\n", + " .setUseStopWordsRemover(True)\n", + " .setUseIDF(True)\n", + " .setMinDocFreq(5)\n", + " .setNumFeatures(1 << 16)\n", + " .fit(data)\n", + ")" ], "outputs": [], "metadata": {} @@ -84,8 +96,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3) \\\n", - " .select([\"features\", \"label\"])\n", + "processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3).select(\n", + " [\"features\", \"label\"]\n", + ")\n", "processedData.limit(5).toPandas()" ], "outputs": [], @@ -106,10 +119,16 @@ "from pyspark.ml.classification import LogisticRegression\n", "\n", "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", - "logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]\n", + "logisticRegressions = [\n", + " LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n", + "]\n", "\n", "from synapse.ml.train import TrainClassifier\n", - "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(train) for lrm in logisticRegressions]" + "\n", + "lrmodels = [\n", + " TrainClassifier(model=lrm, labelCol=\"label\").fit(train)\n", + " for lrm in logisticRegressions\n", + "]" ], "outputs": [], "metadata": {} @@ -126,10 +145,11 @@ "execution_count": null, "source": [ "from synapse.ml.automl import FindBestModel, BestModel\n", + "\n", "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", "bestModel.getRocCurve().show()\n", "bestModel.getBestModelMetrics().show()\n", - "bestModel.getAllModelMetrics().show()\n" + "bestModel.getAllModelMetrics().show()" ], "outputs": [], "metadata": {} @@ -146,10 +166,13 @@ "execution_count": null, "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "predictions = bestModel.transform(validation)\n", "metrics = ComputeModelStatistics().transform(predictions)\n", - "print(\"Best model's accuracy on validation set = \"\n", - " + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100))" + "print(\n", + " \"Best model's accuracy on validation set = \"\n", + " + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100)\n", + ")" ], "outputs": [], "metadata": {} diff --git a/notebooks/features/regression/Regression - Auto Imports.ipynb 
b/notebooks/features/regression/Regression - Auto Imports.ipynb index 973e934e35..120417d799 100644 --- a/notebooks/features/regression/Regression - Auto Imports.ipynb +++ b/notebooks/features/regression/Regression - Auto Imports.ipynb @@ -36,8 +36,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -47,7 +49,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n" + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\"\n", + ")" ] }, { @@ -94,6 +98,7 @@ "outputs": [], "source": [ "from synapse.ml.stages import SummarizeData\n", + "\n", "summary = SummarizeData().transform(data)\n", "summary.toPandas()" ] @@ -139,10 +144,11 @@ "outputs": [], "source": [ "from synapse.ml.featurize import CleanMissingData\n", - "cols = [\"normalized-losses\", \"stroke\", \"bore\", \"horsepower\",\n", - " \"peak-rpm\", \"price\"]\n", - "cleanModel = CleanMissingData().setCleaningMode(\"Median\") \\\n", - " .setInputCols(cols).setOutputCols(cols)" + "\n", + "cols = [\"normalized-losses\", \"stroke\", \"bore\", \"horsepower\", \"peak-rpm\", \"price\"]\n", + "cleanModel = (\n", + " CleanMissingData().setCleaningMode(\"Median\").setInputCols(cols).setOutputCols(cols)\n", + ")" ] }, { @@ -195,7 +201,7 @@ "\n", "glr = GeneralizedLinearRegression(family=\"poisson\", link=\"log\")\n", "poissonModel = TrainRegressor().setModel(glr).setLabelCol(\"price\").setNumFeatures(256)\n", - "poissonPipe = Pipeline(stages = [cleanModel, poissonModel]).fit(train)\n", + "poissonPipe = Pipeline(stages=[cleanModel, poissonModel]).fit(train)\n", "poissonPrediction = poissonPipe.transform(test)" ] }, @@ -217,8 +223,10 @@ "from pyspark.ml.regression import RandomForestRegressor\n", "\n", "rfr = RandomForestRegressor(maxDepth=30, maxBins=128, numTrees=8, minInstancesPerNode=1)\n", - "randomForestModel = TrainRegressor(model=rfr, labelCol=\"price\", numFeatures=256).fit(train)\n", - "randomForestPipe = Pipeline(stages = [cleanModel, randomForestModel]).fit(train)\n", + "randomForestModel = TrainRegressor(model=rfr, labelCol=\"price\", numFeatures=256).fit(\n", + " train\n", + ")\n", + "randomForestPipe = Pipeline(stages=[cleanModel, randomForestModel]).fit(train)\n", "randomForestPrediction = randomForestPipe.transform(test)" ] }, @@ -245,6 +253,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "poissonMetrics = ComputeModelStatistics().transform(poissonPrediction)\n", "print(\"Poisson Metrics\")\n", "poissonMetrics.toPandas()" @@ -275,10 +284,18 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputePerInstanceStatistics\n", + "\n", + "\n", "def demonstrateEvalPerInstance(pred):\n", - " return ComputePerInstanceStatistics().transform(pred) \\\n", - " .select(\"price\", \"prediction\", \"L1_loss\", \"L2_loss\") \\\n", - " .limit(10).toPandas()\n", + " return (\n", + " ComputePerInstanceStatistics()\n", + " .transform(pred)\n", + " .select(\"price\", \"prediction\", \"L1_loss\", \"L2_loss\")\n", + " .limit(10)\n", + " .toPandas()\n", + " )\n", + "\n", + "\n", "demonstrateEvalPerInstance(poissonPrediction)" ] }, diff --git a/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb 
b/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb index dedd705598..81d3739b8c 100644 --- a/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb +++ b/notebooks/features/regression/Regression - Flight Delays with DataCleaning.ipynb @@ -32,8 +32,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -63,7 +65,9 @@ "metadata": {}, "outputs": [], "source": [ - "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n", + "flightDelay = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\"\n", + ")\n", "# print some basic info\n", "print(\"records read: \" + str(flightDelay.count()))\n", "print(\"Schema: \")\n", @@ -105,11 +109,20 @@ "outputs": [], "source": [ "from synapse.ml.featurize import DataConversion\n", - "flightDelay = DataConversion(cols=[\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\n", - " \"OriginAirportID\",\"DestAirportID\",\n", - " \"CRSDepTime\",\"CRSArrTime\"],\n", - " convertTo=\"double\") \\\n", - " .transform(flightDelay)\n", + "\n", + "flightDelay = DataConversion(\n", + " cols=[\n", + " \"Quarter\",\n", + " \"Month\",\n", + " \"DayofMonth\",\n", + " \"DayOfWeek\",\n", + " \"OriginAirportID\",\n", + " \"DestAirportID\",\n", + " \"CRSDepTime\",\n", + " \"CRSArrTime\",\n", + " ],\n", + " convertTo=\"double\",\n", + ").transform(flightDelay)\n", "flightDelay.printSchema()\n", "flightDelay.limit(10).toPandas()" ] @@ -159,14 +172,13 @@ "from synapse.ml.train import TrainRegressor, TrainedRegressorModel\n", "from pyspark.ml.regression import LinearRegression\n", "\n", - "trainCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n", - " convertTo=\"toCategorical\") \\\n", - " .transform(train)\n", - "testCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n", - " convertTo=\"toCategorical\") \\\n", - " .transform(test)\n", - "lr = LinearRegression().setRegParam(0.1) \\\n", - " .setElasticNetParam(0.3)\n", + "trainCat = DataConversion(\n", + " cols=[\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"], convertTo=\"toCategorical\"\n", + ").transform(train)\n", + "testCat = DataConversion(\n", + " cols=[\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"], convertTo=\"toCategorical\"\n", + ").transform(test)\n", + "lr = LinearRegression().setRegParam(0.1).setElasticNetParam(0.3)\n", "model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)" ] }, @@ -201,6 +213,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "metrics = ComputeModelStatistics().transform(scoredData)\n", "metrics.toPandas()" ] @@ -220,9 +233,11 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputePerInstanceStatistics\n", + "\n", "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", - "evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\") \\\n", - " .limit(10).toPandas()" + "evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(\n", + " 10\n", + ").toPandas()" ] } ], diff --git a/notebooks/features/regression/Regression - Flight Delays.ipynb b/notebooks/features/regression/Regression - Flight Delays.ipynb index a840a7533d..c9e8808320 100644 --- 
a/notebooks/features/regression/Regression - Flight Delays.ipynb +++ b/notebooks/features/regression/Regression - Flight Delays.ipynb @@ -20,8 +20,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -49,7 +51,9 @@ "metadata": {}, "outputs": [], "source": [ - "flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n", + "flightDelay = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\"\n", + ")\n", "# print some basic info\n", "print(\"records read: \" + str(flightDelay.count()))\n", "print(\"Schema: \")\n", @@ -70,7 +74,7 @@ "metadata": {}, "outputs": [], "source": [ - "train,test = flightDelay.randomSplit([0.75, 0.25])" + "train, test = flightDelay.randomSplit([0.75, 0.25])" ] }, { @@ -89,14 +93,23 @@ "from synapse.ml.train import TrainRegressor, TrainedRegressorModel\n", "from pyspark.ml.regression import LinearRegression\n", "from pyspark.ml.feature import StringIndexer\n", + "\n", "# Convert columns to categorical\n", "catCols = [\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"]\n", "trainCat = train\n", "testCat = test\n", "for catCol in catCols:\n", " simodel = StringIndexer(inputCol=catCol, outputCol=catCol + \"Tmp\").fit(train)\n", - " trainCat = simodel.transform(trainCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n", - " testCat = simodel.transform(testCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n", + " trainCat = (\n", + " simodel.transform(trainCat)\n", + " .drop(catCol)\n", + " .withColumnRenamed(catCol + \"Tmp\", catCol)\n", + " )\n", + " testCat = (\n", + " simodel.transform(testCat)\n", + " .drop(catCol)\n", + " .withColumnRenamed(catCol + \"Tmp\", catCol)\n", + " )\n", "lr = LinearRegression().setRegParam(0.1).setElasticNetParam(0.3)\n", "model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)" ] @@ -140,6 +153,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", + "\n", "metrics = ComputeModelStatistics().transform(scoredData)\n", "metrics.toPandas()" ] @@ -159,8 +173,11 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputePerInstanceStatistics\n", + "\n", "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", - "evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()" + "evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(\n", + " 10\n", + ").toPandas()" ] } ], diff --git a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb b/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb index 3c7c930740..fa2b655121 100644 --- a/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. Linear Regressor.ipynb +++ b/notebooks/features/regression/Regression - Vowpal Wabbit vs. LightGBM vs. 
Linear Regressor.ipynb @@ -20,10 +20,12 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -62,9 +64,11 @@ "source": [ "boston = load_boston()\n", "\n", - "feature_cols = ['f' + str(i) for i in range(boston.data.shape[1])]\n", - "header = ['target'] + feature_cols\n", - "df = spark.createDataFrame(pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)).repartition(1)\n", + "feature_cols = [\"f\" + str(i) for i in range(boston.data.shape[1])]\n", + "header = [\"target\"] + feature_cols\n", + "df = spark.createDataFrame(\n", + " pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n", + ").repartition(1)\n", "print(\"Dataframe has {} rows\".format(df.count()))\n", "display(df.limit(10).toPandas())" ] @@ -110,7 +114,7 @@ "outputs": [], "source": [ "features = train_data.columns[1:]\n", - "values = train_data.drop('target').toPandas()\n", + "values = train_data.drop(\"target\").toPandas()\n", "ncols = 5\n", "nrows = math.ceil(len(features) / ncols)" ] @@ -130,9 +134,9 @@ "metadata": {}, "outputs": [], "source": [ - "featurizer = VectorAssembler(inputCols=feature_cols, outputCol='features')\n", - "lr_train_data = featurizer.transform(train_data)['target', 'features']\n", - "lr_test_data = featurizer.transform(test_data)['target', 'features']\n", + "featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n", + "lr_train_data = featurizer.transform(train_data)[\"target\", \"features\"]\n", + "lr_test_data = featurizer.transform(test_data)[\"target\", \"features\"]\n", "display(lr_train_data.limit(10).toPandas())" ] }, @@ -143,7 +147,7 @@ "outputs": [], "source": [ "# By default, `maxIter` is 100. 
Other params you may want to change include: `regParam`, `elasticNetParam`, etc.\n", - "lr = LinearRegression(labelCol='target')\n", + "lr = LinearRegression(labelCol=\"target\")\n", "\n", "lr_model = lr.fit(lr_train_data)\n", "lr_predictions = lr_model.transform(lr_test_data)\n", @@ -169,12 +173,11 @@ "outputs": [], "source": [ "metrics = ComputeModelStatistics(\n", - " evaluationMetric='regression',\n", - " labelCol='target',\n", - " scoresCol='prediction').transform(lr_predictions)\n", + " evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n", + ").transform(lr_predictions)\n", "\n", "results = metrics.toPandas()\n", - "results.insert(0, 'model', ['Spark MLlib - Linear Regression'])\n", + "results.insert(0, \"model\", [\"Spark MLlib - Linear Regression\"])\n", "display(results)" ] }, @@ -198,12 +201,10 @@ "metadata": {}, "outputs": [], "source": [ - "vw_featurizer = VowpalWabbitFeaturizer(\n", - " inputCols=feature_cols,\n", - " outputCol='features')\n", + "vw_featurizer = VowpalWabbitFeaturizer(inputCols=feature_cols, outputCol=\"features\")\n", "\n", - "vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n", - "vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n", + "vw_train_data = vw_featurizer.transform(train_data)[\"target\", \"features\"]\n", + "vw_test_data = vw_featurizer.transform(test_data)[\"target\", \"features\"]\n", "display(vw_train_data.limit(10).toPandas())" ] }, @@ -222,10 +223,7 @@ "source": [ "# Use the same number of iterations as Spark MLlib's Linear Regression (=100)\n", "args = \"--holdout_off --loss_function quantile -l 7 -q :: --power_t 0.3\"\n", - "vwr = VowpalWabbitRegressor(\n", - " labelCol='target',\n", - " passThroughArgs=args,\n", - " numPasses=100)\n", + "vwr = VowpalWabbitRegressor(labelCol=\"target\", passThroughArgs=args, numPasses=100)\n", "\n", "# To reduce number of partitions (which will effect performance), use `vw_train_data.repartition(1)`\n", "vw_train_data_2 = vw_train_data.repartition(1).cache()\n", @@ -243,15 +241,12 @@ "outputs": [], "source": [ "metrics = ComputeModelStatistics(\n", - " evaluationMetric='regression',\n", - " labelCol='target',\n", - " scoresCol='prediction').transform(vw_predictions)\n", + " evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n", + ").transform(vw_predictions)\n", "\n", "vw_result = metrics.toPandas()\n", - "vw_result.insert(0, 'model', ['Vowpal Wabbit'])\n", - "results = results.append(\n", - " vw_result,\n", - " ignore_index=True)\n", + "vw_result.insert(0, \"model\", [\"Vowpal Wabbit\"])\n", + "results = results.append(vw_result, ignore_index=True)\n", "\n", "display(results)" ] @@ -270,12 +265,13 @@ "outputs": [], "source": [ "lgr = LightGBMRegressor(\n", - " objective='quantile',\n", + " objective=\"quantile\",\n", " alpha=0.2,\n", " learningRate=0.3,\n", " numLeaves=31,\n", - " labelCol='target',\n", - " numIterations=100)\n", + " labelCol=\"target\",\n", + " numIterations=100,\n", + ")\n", "\n", "# Using one partition since the training dataset is very small\n", "repartitioned_data = lr_train_data.repartition(1).cache()\n", @@ -293,16 +289,13 @@ "outputs": [], "source": [ "metrics = ComputeModelStatistics(\n", - " evaluationMetric='regression',\n", - " labelCol='target',\n", - " scoresCol='prediction').transform(lg_predictions)\n", + " evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n", + ").transform(lg_predictions)\n", "\n", "lg_result = metrics.toPandas()\n", - 
"lg_result.insert(0, 'model', ['LightGBM'])\n", + "lg_result.insert(0, \"model\", [\"LightGBM\"])\n", "\n", - "results = results.append(\n", - " lg_result,\n", - " ignore_index=True)\n", + "results = results.append(lg_result, ignore_index=True)\n", "\n", "display(results)" ] @@ -327,14 +320,14 @@ " from matplotlib.cm import get_cmap\n", " import matplotlib.pyplot as plt\n", "\n", - " f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30,10))\n", + " f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30, 10))\n", " f.tight_layout()\n", - " yy = [r['target'] for r in train_data.select('target').collect()]\n", + " yy = [r[\"target\"] for r in train_data.select(\"target\").collect()]\n", " for irow in range(nrows):\n", - " axes[irow][0].set_ylabel('target')\n", + " axes[irow][0].set_ylabel(\"target\")\n", " for icol in range(ncols):\n", " try:\n", - " feat = features[irow*ncols + icol]\n", + " feat = features[irow * ncols + icol]\n", " xx = values[feat]\n", " axes[irow][icol].scatter(xx, yy, s=10, alpha=0.25)\n", " axes[irow][icol].set_xlabel(feat)\n", @@ -342,28 +335,29 @@ " except IndexError:\n", " f.delaxes(axes[irow][icol])\n", "\n", - " cmap = get_cmap('YlOrRd')\n", + " cmap = get_cmap(\"YlOrRd\")\n", "\n", - " target = np.array(test_data.select('target').collect()).flatten()\n", + " target = np.array(test_data.select(\"target\").collect()).flatten()\n", " model_preds = [\n", " (\"Spark MLlib Linear Regression\", lr_predictions),\n", " (\"Vowpal Wabbit\", vw_predictions),\n", - " (\"LightGBM\", lg_predictions)]\n", + " (\"LightGBM\", lg_predictions),\n", + " ]\n", "\n", " f, axes = plt.subplots(1, len(model_preds), sharey=True, figsize=(18, 6))\n", " f.tight_layout()\n", "\n", " for i, (model_name, preds) in enumerate(model_preds):\n", - " preds = np.array(preds.select('prediction').collect()).flatten()\n", + " preds = np.array(preds.select(\"prediction\").collect()).flatten()\n", " err = np.absolute(preds - target)\n", "\n", " norm = Normalize()\n", " clrs = cmap(np.asarray(norm(err)))[:, :-1]\n", - " axes[i].scatter(preds, target, s=60, c=clrs, edgecolors='#888888', alpha=0.75)\n", - " axes[i].plot((0, 60), (0, 60), linestyle='--', color='#888888')\n", - " axes[i].set_xlabel('Predicted values')\n", - " if i ==0:\n", - " axes[i].set_ylabel('Actual values')\n", + " axes[i].scatter(preds, target, s=60, c=clrs, edgecolors=\"#888888\", alpha=0.75)\n", + " axes[i].plot((0, 60), (0, 60), linestyle=\"--\", color=\"#888888\")\n", + " axes[i].set_xlabel(\"Predicted values\")\n", + " if i == 0:\n", + " axes[i].set_ylabel(\"Actual values\")\n", " axes[i].set_title(model_name)" ] }, diff --git a/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb b/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb index 4cafc7a8a0..2475f4e2be 100644 --- a/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb +++ b/notebooks/features/responsible_ai/DataBalanceAnalysis - Adult Census Income.ipynb @@ -62,8 +62,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -79,7 +80,9 @@ }, "outputs": [], "source": [ - "df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "df = spark.read.parquet(\n", + " 
\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "display(df)" ] }, @@ -98,7 +101,9 @@ "source": [ "# Convert the \"income\" column from {<=50K, >50K} to {0, 1} to represent our binary classification label column\n", "label_col = \"income\"\n", - "df = df.withColumn(label_col, F.when(F.col(label_col).contains(\"<=50K\"), F.lit(0)).otherwise(F.lit(1)))" + "df = df.withColumn(\n", + " label_col, F.when(F.col(label_col).contains(\"<=50K\"), F.lit(0)).otherwise(F.lit(1))\n", + ")" ] }, { @@ -240,7 +245,11 @@ "outputs": [], "source": [ "# Drill down to feature == \"sex\"\n", - "display(feature_balance_measures.filter(F.col(\"FeatureName\") == \"sex\").sort(F.abs(\"FeatureBalanceMeasure.dp\").desc()))" + "display(\n", + " feature_balance_measures.filter(F.col(\"FeatureName\") == \"sex\").sort(\n", + " F.abs(\"FeatureBalanceMeasure.dp\").desc()\n", + " )\n", + ")" ] }, { @@ -257,7 +266,11 @@ "outputs": [], "source": [ "# Drill down to feature == \"race\"\n", - "display(feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").sort(F.abs(\"FeatureBalanceMeasure.dp\").desc()))" + "display(\n", + " feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").sort(\n", + " F.abs(\"FeatureBalanceMeasure.dp\").desc()\n", + " )\n", + ")" ] }, { @@ -288,15 +301,19 @@ "outputs": [], "source": [ "races = [row[\"race\"] for row in df.groupBy(\"race\").count().select(\"race\").collect()]\n", - "dp_rows = feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").select(\"ClassA\", \"ClassB\", \"FeatureBalanceMeasure.dp\").collect()\n", + "dp_rows = (\n", + " feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\")\n", + " .select(\"ClassA\", \"ClassB\", \"FeatureBalanceMeasure.dp\")\n", + " .collect()\n", + ")\n", "race_dp_values = [(row[\"ClassA\"], row[\"ClassB\"], row[\"dp\"]) for row in dp_rows]\n", "\n", "race_dp_array = np.zeros((len(races), len(races)))\n", "for class_a, class_b, dp_value in race_dp_values:\n", - " i, j = races.index(class_a), races.index(class_b)\n", - " dp_value = round(dp_value, 2)\n", - " race_dp_array[i, j] = dp_value\n", - " race_dp_array[j, i] = -1 * dp_value\n", + " i, j = races.index(class_a), races.index(class_b)\n", + " dp_value = round(dp_value, 2)\n", + " race_dp_array[i, j] = dp_value\n", + " race_dp_array[j, i] = -1 * dp_value\n", "\n", "colormap = \"RdBu\"\n", "dp_min, dp_max = -1.0, 1.0\n", @@ -315,9 +332,9 @@ "plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\", rotation_mode=\"anchor\")\n", "\n", "for i in range(len(races)):\n", - " for j in range(len(races)):\n", - " text = ax.text(j, i, race_dp_array[i, j], ha=\"center\", va=\"center\", color=\"k\")\n", - " \n", + " for j in range(len(races)):\n", + " text = ax.text(j, i, race_dp_array[i, j], ha=\"center\", va=\"center\", color=\"k\")\n", + "\n", "ax.set_title(\"Demographic Parity of Races in Adult Dataset\")\n", "fig.tight_layout()\n", "plt.show()" @@ -426,13 +443,15 @@ "from synapse.ml.exploratory import DistributionBalanceMeasure\n", "\n", "distribution_balance_measures = (\n", - " DistributionBalanceMeasure()\n", - " .setSensitiveCols(cols_of_interest)\n", - " .transform(df)\n", + " DistributionBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n", ")\n", "\n", "# Sort by JS Distance descending\n", - "display(distribution_balance_measures.sort(F.abs(\"DistributionBalanceMeasure.js_dist\").desc()))" + "display(\n", + " distribution_balance_measures.sort(\n", + " 
F.abs(\"DistributionBalanceMeasure.js_dist\").desc()\n", + " )\n", + ")" ] }, { @@ -463,10 +482,20 @@ "outputs": [], "source": [ "distribution_rows = distribution_balance_measures.collect()\n", - "race_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"race\"][0][\"DistributionBalanceMeasure\"]\n", - "sex_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"sex\"][0][\"DistributionBalanceMeasure\"]\n", + "race_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"race\"][0][\n", + " \"DistributionBalanceMeasure\"\n", + "]\n", + "sex_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"sex\"][0][\n", + " \"DistributionBalanceMeasure\"\n", + "]\n", "\n", - "measures_of_interest = [\"kl_divergence\", \"js_dist\", \"inf_norm_dist\", \"total_variation_dist\", \"wasserstein_dist\"]\n", + "measures_of_interest = [\n", + " \"kl_divergence\",\n", + " \"js_dist\",\n", + " \"inf_norm_dist\",\n", + " \"total_variation_dist\",\n", + " \"wasserstein_dist\",\n", + "]\n", "race_measures = [round(race_row[measure], 4) for measure in measures_of_interest]\n", "sex_measures = [round(sex_row[measure], 4) for measure in measures_of_interest]\n", "\n", @@ -474,8 +503,8 @@ "width = 0.35\n", "\n", "fig, ax = plt.subplots()\n", - "rects1 = ax.bar(x - width/2, race_measures, width, label=\"Race\")\n", - "rects2 = ax.bar(x + width/2, sex_measures, width, label=\"Sex\")\n", + "rects1 = ax.bar(x - width / 2, race_measures, width, label=\"Race\")\n", + "rects2 = ax.bar(x + width / 2, sex_measures, width, label=\"Sex\")\n", "\n", "ax.set_xlabel(\"Measure\")\n", "ax.set_ylabel(\"Value\")\n", @@ -486,14 +515,19 @@ "\n", "plt.setp(ax.get_xticklabels(), rotation=20, ha=\"right\", rotation_mode=\"default\")\n", "\n", + "\n", "def autolabel(rects):\n", - " for rect in rects:\n", - " height = rect.get_height()\n", - " ax.annotate('{}'.format(height),\n", - " xy=(rect.get_x() + rect.get_width() / 2, height),\n", - " xytext=(0, 1), # 1 point vertical offset\n", - " textcoords=\"offset points\",\n", - " ha='center', va='bottom')\n", + " for rect in rects:\n", + " height = rect.get_height()\n", + " ax.annotate(\n", + " \"{}\".format(height),\n", + " xy=(rect.get_x() + rect.get_width() / 2, height),\n", + " xytext=(0, 1), # 1 point vertical offset\n", + " textcoords=\"offset points\",\n", + " ha=\"center\",\n", + " va=\"bottom\",\n", + " )\n", + "\n", "\n", "autolabel(rects1)\n", "autolabel(rects2)\n", @@ -571,9 +605,7 @@ "from synapse.ml.exploratory import AggregateBalanceMeasure\n", "\n", "aggregate_balance_measures = (\n", - " AggregateBalanceMeasure()\n", - " .setSensitiveCols(cols_of_interest)\n", - " .transform(df)\n", + " AggregateBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n", ")\n", "\n", "display(aggregate_balance_measures)" diff --git a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb index be834f9716..7b1519b4d0 100644 --- a/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Explanation Dashboard.ipynb @@ -44,6 +44,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display\n", "\n", @@ -80,9 +81,13 @@ }, "outputs": [], "source": [ - "df = 
spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\").cache()\n", + "df = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ").cache()\n", "\n", - "labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n", + "labelIndexer = StringIndexer(\n", + " inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\"\n", + ").fit(df)\n", "print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n", "\n", "training = labelIndexer.transform(df)\n", @@ -99,11 +104,23 @@ "]\n", "categorical_features_idx = [col + \"_idx\" for col in categorical_features]\n", "categorical_features_enc = [col + \"_enc\" for col in categorical_features]\n", - "numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numeric_features = [\n", + " \"age\",\n", + " \"education-num\",\n", + " \"capital-gain\",\n", + " \"capital-loss\",\n", + " \"hours-per-week\",\n", + "]\n", "\n", - "strIndexer = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_idx)\n", - "onehotEnc = OneHotEncoder(inputCols=categorical_features_idx, outputCols=categorical_features_enc)\n", - "vectAssem = VectorAssembler(inputCols=categorical_features_enc + numeric_features, outputCol=\"features\")\n", + "strIndexer = StringIndexer(\n", + " inputCols=categorical_features, outputCols=categorical_features_idx\n", + ")\n", + "onehotEnc = OneHotEncoder(\n", + " inputCols=categorical_features_idx, outputCols=categorical_features_enc\n", + ")\n", + "vectAssem = VectorAssembler(\n", + " inputCols=categorical_features_enc + numeric_features, outputCol=\"features\"\n", + ")\n", "lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n", "pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n", "model = pipeline.fit(training)" @@ -137,7 +154,9 @@ }, "outputs": [], "source": [ - "explain_instances = model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n", + "explain_instances = (\n", + " model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n", + ")\n", "display(explain_instances)" ] }, @@ -179,7 +198,7 @@ " backgroundData=broadcast(training.orderBy(rand()).limit(100).cache()),\n", ")\n", "\n", - "shap_df = shap.transform(explain_instances)\n" + "shap_df = shap.transform(explain_instances)" ] }, { @@ -214,7 +233,9 @@ "shaps = (\n", " shap_df.withColumn(\"probability\", vec_access(col(\"probability\"), lit(1)))\n", " .withColumn(\"shapValues\", vec2array(col(\"shapValues\").getItem(0)))\n", - " .select([\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features)\n", + " .select(\n", + " [\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features\n", + " )\n", ")\n", "\n", "shaps_local = shaps.toPandas()\n", @@ -259,9 +280,9 @@ "\n", "rows = shaps_local.shape[0]\n", "\n", - "local_importance_values = shaps_local[['shapValues']]\n", + "local_importance_values = shaps_local[[\"shapValues\"]]\n", "eval_data = shaps_local[features]\n", - "true_y = np.array(shaps_local[['label']])" + "true_y = np.array(shaps_local[[\"label\"]])" ] }, { @@ -323,8 +344,11 @@ "outputs": [], "source": [ "from interpret_community.adapter import ExplanationAdapter\n", + "\n", "adapter = ExplanationAdapter(features, classification=True)\n", - "global_explanation = 
adapter.create_global(converted_importance_values, eval_data, expected_values=bias)" + "global_explanation = adapter.create_global(\n", + " converted_importance_values, eval_data, expected_values=bias\n", + ")" ] }, { @@ -360,18 +384,30 @@ "outputs": [], "source": [ "class wrapper(object):\n", - " def __init__(self, model):\n", - " self.model = model\n", - " \n", - " def predict(self, data):\n", - " sparkdata = spark.createDataFrame(data)\n", - " return model.transform(sparkdata).select('prediction').toPandas().values.flatten().tolist()\n", - " \n", - " def predict_proba(self, data):\n", - " sparkdata = spark.createDataFrame(data)\n", - " prediction = model.transform(sparkdata).select('probability').toPandas().values.flatten().tolist()\n", - " proba_list = [vector.values.tolist() for vector in prediction]\n", - " return proba_list" + " def __init__(self, model):\n", + " self.model = model\n", + "\n", + " def predict(self, data):\n", + " sparkdata = spark.createDataFrame(data)\n", + " return (\n", + " model.transform(sparkdata)\n", + " .select(\"prediction\")\n", + " .toPandas()\n", + " .values.flatten()\n", + " .tolist()\n", + " )\n", + "\n", + " def predict_proba(self, data):\n", + " sparkdata = spark.createDataFrame(data)\n", + " prediction = (\n", + " model.transform(sparkdata)\n", + " .select(\"probability\")\n", + " .toPandas()\n", + " .values.flatten()\n", + " .tolist()\n", + " )\n", + " proba_list = [vector.values.tolist() for vector in prediction]\n", + " return proba_list" ] }, { @@ -384,7 +420,10 @@ "source": [ "# view the explanation in the ExplanationDashboard\n", "from raiwidgets import ExplanationDashboard\n", - "ExplanationDashboard(global_explanation, wrapper(model), dataset=eval_data, true_y=true_y)" + "\n", + "ExplanationDashboard(\n", + " global_explanation, wrapper(model), dataset=eval_data, true_y=true_y\n", + ")" ] }, { diff --git a/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb b/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb index e4112e6da0..d42cc0eb1b 100644 --- a/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Image Explainers.ipynb @@ -34,26 +34,34 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display\n", "\n", - "vec_slice = udf(lambda vec, indices: (vec.toArray())[indices].tolist(), ArrayType(FloatType()))\n", - "arg_top_k = udf(lambda vec, k: (-vec.toArray()).argsort()[:k].tolist(), ArrayType(IntegerType()))\n", + "vec_slice = udf(\n", + " lambda vec, indices: (vec.toArray())[indices].tolist(), ArrayType(FloatType())\n", + ")\n", + "arg_top_k = udf(\n", + " lambda vec, k: (-vec.toArray()).argsort()[:k].tolist(), ArrayType(IntegerType())\n", + ")\n", + "\n", "\n", "def downloadBytes(url: str):\n", - " with urllib.request.urlopen(url) as url:\n", - " barr = url.read()\n", - " return barr\n", + " with urllib.request.urlopen(url) as url:\n", + " barr = url.read()\n", + " return barr\n", + "\n", "\n", "def rotate_color_channel(bgr_image_array, height, width, nChannels):\n", - " B, G, R, *_ = np.asarray(bgr_image_array).reshape(height, width, nChannels).T\n", - " rgb_image_array = np.array((R, G, B)).T\n", - " return rgb_image_array\n", - " \n", + " B, G, R, *_ = np.asarray(bgr_image_array).reshape(height, width, nChannels).T\n", + " 
rgb_image_array = np.array((R, G, B)).T\n", + " return rgb_image_array\n", + "\n", + "\n", "def plot_superpixels(image_rgb_array, sp_clusters, weights, green_threshold=99):\n", " superpixels = sp_clusters\n", " green_value = np.percentile(weights, green_threshold)\n", - " img = Image.fromarray(image_rgb_array, mode='RGB').convert(\"RGBA\")\n", + " img = Image.fromarray(image_rgb_array, mode=\"RGB\").convert(\"RGBA\")\n", " image_array = np.asarray(img).copy()\n", " for (sp, v) in zip(superpixels, weights):\n", " if v > green_value:\n", @@ -85,32 +93,42 @@ "source": [ "from synapse.ml.io import *\n", "\n", - "image_df = spark.read.image().load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg\")\n", + "image_df = spark.read.image().load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg\"\n", + ")\n", "display(image_df)\n", "\n", "# Rotate the image array from BGR into RGB channels for visualization later.\n", - "row = image_df.select(\"image.height\", \"image.width\", \"image.nChannels\", \"image.data\").head()\n", + "row = image_df.select(\n", + " \"image.height\", \"image.width\", \"image.nChannels\", \"image.data\"\n", + ").head()\n", "locals().update(row.asDict())\n", "rgb_image_array = rotate_color_channel(data, height, width, nChannels)\n", "\n", "# Download the ONNX model\n", - "modelPayload = downloadBytes(\"https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx\")\n", + "modelPayload = downloadBytes(\n", + " \"https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx\"\n", + ")\n", "\n", "featurizer = (\n", - " ImageTransformer(inputCol=\"image\", outputCol=\"features\")\n", - " .resize(224, True)\n", - " .centerCrop(224, 224)\n", - " .normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], color_scale_factor = 1/255)\n", - " .setTensorElementType(FloatType())\n", + " ImageTransformer(inputCol=\"image\", outputCol=\"features\")\n", + " .resize(224, True)\n", + " .centerCrop(224, 224)\n", + " .normalize(\n", + " mean=[0.485, 0.456, 0.406],\n", + " std=[0.229, 0.224, 0.225],\n", + " color_scale_factor=1 / 255,\n", + " )\n", + " .setTensorElementType(FloatType())\n", ")\n", "\n", "onnx = (\n", - " ONNXModel()\n", - " .setModelPayload(modelPayload)\n", - " .setFeedDict({\"data\": \"features\"})\n", - " .setFetchDict({\"rawPrediction\": \"resnetv24_dense0_fwd\"})\n", - " .setSoftMaxDict({\"rawPrediction\": \"probability\"})\n", - " .setMiniBatchSize(1)\n", + " ONNXModel()\n", + " .setModelPayload(modelPayload)\n", + " .setFeedDict({\"data\": \"features\"})\n", + " .setFetchDict({\"rawPrediction\": \"resnetv24_dense0_fwd\"})\n", + " .setSoftMaxDict({\"rawPrediction\": \"probability\"})\n", + " .setMiniBatchSize(1)\n", ")\n", "\n", "model = Pipeline(stages=[featurizer, onnx]).fit(image_df)" @@ -124,8 +142,8 @@ "source": [ "predicted = (\n", " model.transform(image_df)\n", - " .withColumn(\"top2pred\", arg_top_k(col(\"probability\"), lit(2)))\n", - " .withColumn(\"top2prob\", vec_slice(col(\"probability\"), col(\"top2pred\")))\n", + " .withColumn(\"top2pred\", arg_top_k(col(\"probability\"), lit(2)))\n", + " .withColumn(\"top2prob\", vec_slice(col(\"probability\"), col(\"top2pred\")))\n", ")\n", "\n", "display(predicted.select(\"top2pred\", \"top2prob\"))" @@ -183,8 +201,18 @@ "cell_type": "code", "execution_count": null, "source": [ - "plot_superpixels(rgb_image_array, 
lime_row[\"superpixels\"][\"clusters\"], list(lime_row[\"weights_violin\"]), 95)\n", - "plot_superpixels(rgb_image_array, lime_row[\"superpixels\"][\"clusters\"], list(lime_row[\"weights_piano\"]), 95)" + "plot_superpixels(\n", + " rgb_image_array,\n", + " lime_row[\"superpixels\"][\"clusters\"],\n", + " list(lime_row[\"weights_violin\"]),\n", + " 95,\n", + ")\n", + "plot_superpixels(\n", + " rgb_image_array,\n", + " lime_row[\"superpixels\"][\"clusters\"],\n", + " list(lime_row[\"weights_piano\"]),\n", + " 95,\n", + ")" ], "outputs": [], "metadata": {} @@ -250,8 +278,18 @@ "cell_type": "code", "execution_count": null, "source": [ - "plot_superpixels(rgb_image_array, shap_row[\"superpixels\"][\"clusters\"], list(shap_row[\"shaps_violin\"][1:]), 95)\n", - "plot_superpixels(rgb_image_array, shap_row[\"superpixels\"][\"clusters\"], list(shap_row[\"shaps_piano\"][1:]), 95)" + "plot_superpixels(\n", + " rgb_image_array,\n", + " shap_row[\"superpixels\"][\"clusters\"],\n", + " list(shap_row[\"shaps_violin\"][1:]),\n", + " 95,\n", + ")\n", + "plot_superpixels(\n", + " rgb_image_array,\n", + " shap_row[\"superpixels\"][\"clusters\"],\n", + " list(shap_row[\"shaps_piano\"][1:]),\n", + " 95,\n", + ")" ], "outputs": [], "metadata": {} diff --git a/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb b/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb index 735bb53abf..b38781e5ff 100644 --- a/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - PDP and ICE explainer.ipynb @@ -84,8 +84,9 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", - " from notebookutils.visualization import display\n" + " from notebookutils.visualization import display" ] }, { @@ -115,7 +116,9 @@ }, "outputs": [], "source": [ - "df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "df = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "display(df)" ] }, @@ -146,8 +149,23 @@ }, "outputs": [], "source": [ - "categorical_features = [\"race\", \"workclass\", \"marital-status\", \"education\", \"occupation\", \"relationship\", \"native-country\", \"sex\"]\n", - "numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]" + "categorical_features = [\n", + " \"race\",\n", + " \"workclass\",\n", + " \"marital-status\",\n", + " \"education\",\n", + " \"occupation\",\n", + " \"relationship\",\n", + " \"native-country\",\n", + " \"sex\",\n", + "]\n", + "numeric_features = [\n", + " \"age\",\n", + " \"education-num\",\n", + " \"capital-gain\",\n", + " \"capital-loss\",\n", + " \"hours-per-week\",\n", + "]" ] }, { @@ -166,12 +184,24 @@ "string_indexer_outputs = [feature + \"_idx\" for feature in categorical_features]\n", "one_hot_encoder_outputs = [feature + \"_enc\" for feature in categorical_features]\n", "\n", - "pipeline = Pipeline(stages=[\n", - " StringIndexer().setInputCol(\"income\").setOutputCol(\"label\").setStringOrderType(\"alphabetAsc\"),\n", - " StringIndexer().setInputCols(categorical_features).setOutputCols(string_indexer_outputs),\n", - " OneHotEncoder().setInputCols(string_indexer_outputs).setOutputCols(one_hot_encoder_outputs),\n", - " 
VectorAssembler(inputCols=one_hot_encoder_outputs+numeric_features, outputCol=\"features\"),\n", - " GBTClassifier(weightCol=\"fnlwgt\", maxDepth=7, maxIter=100)])\n", + "pipeline = Pipeline(\n", + " stages=[\n", + " StringIndexer()\n", + " .setInputCol(\"income\")\n", + " .setOutputCol(\"label\")\n", + " .setStringOrderType(\"alphabetAsc\"),\n", + " StringIndexer()\n", + " .setInputCols(categorical_features)\n", + " .setOutputCols(string_indexer_outputs),\n", + " OneHotEncoder()\n", + " .setInputCols(string_indexer_outputs)\n", + " .setOutputCols(one_hot_encoder_outputs),\n", + " VectorAssembler(\n", + " inputCols=one_hot_encoder_outputs + numeric_features, outputCol=\"features\"\n", + " ),\n", + " GBTClassifier(weightCol=\"fnlwgt\", maxDepth=7, maxIter=100),\n", + " ]\n", + ")\n", "\n", "model = pipeline.fit(df)" ] @@ -204,7 +234,7 @@ "outputs": [], "source": [ "data = model.transform(df)\n", - "display(data.select('income', 'probability', 'prediction'))" + "display(data.select(\"income\", \"probability\", \"prediction\"))" ] }, { @@ -220,7 +250,9 @@ }, "outputs": [], "source": [ - "eval_auc = BinaryClassificationEvaluator(labelCol=\"label\", rawPredictionCol=\"prediction\")\n", + "eval_auc = BinaryClassificationEvaluator(\n", + " labelCol=\"label\", rawPredictionCol=\"prediction\"\n", + ")\n", "eval_auc.evaluate(data)" ] }, @@ -300,8 +332,14 @@ }, "outputs": [], "source": [ - "pdp = ICETransformer(model=model, targetCol=\"probability\", kind=\"average\", targetClasses=[1],\n", - " categoricalFeatures=categorical_features, numericFeatures=numeric_features)" + "pdp = ICETransformer(\n", + " model=model,\n", + " targetCol=\"probability\",\n", + " kind=\"average\",\n", + " targetClasses=[1],\n", + " categoricalFeatures=categorical_features,\n", + " numericFeatures=numeric_features,\n", + ")" ] }, { @@ -364,49 +402,54 @@ "source": [ "# Helper functions for visualization\n", "\n", + "\n", "def get_pandas_df_from_column(df, col_name):\n", - " keys_df = df.select(F.explode(F.map_keys(F.col(col_name)))).distinct()\n", - " keys = list(map(lambda row: row[0], keys_df.collect()))\n", - " key_cols = list(map(lambda f: F.col(col_name).getItem(f).alias(str(f)), keys))\n", - " final_cols = key_cols\n", - " pandas_df = df.select(final_cols).toPandas()\n", - " return pandas_df\n", + " keys_df = df.select(F.explode(F.map_keys(F.col(col_name)))).distinct()\n", + " keys = list(map(lambda row: row[0], keys_df.collect()))\n", + " key_cols = list(map(lambda f: F.col(col_name).getItem(f).alias(str(f)), keys))\n", + " final_cols = key_cols\n", + " pandas_df = df.select(final_cols).toPandas()\n", + " return pandas_df\n", + "\n", "\n", "def plot_dependence_for_categorical(df, col, col_int=True, figsize=(20, 5)):\n", - " dict_values = {}\n", - " col_names = list(df.columns)\n", + " dict_values = {}\n", + " col_names = list(df.columns)\n", "\n", - " for col_name in col_names:\n", - " dict_values[col_name] = df[col_name][0].toArray()[0]\n", - " marklist= sorted(dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n", - " sortdict=dict(marklist)\n", + " for col_name in col_names:\n", + " dict_values[col_name] = df[col_name][0].toArray()[0]\n", + " marklist = sorted(\n", + " dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n", + " )\n", + " sortdict = dict(marklist)\n", + "\n", + " fig = plt.figure(figsize=figsize)\n", + " plt.bar(sortdict.keys(), sortdict.values())\n", + "\n", + " plt.xlabel(col, size=13)\n", + " plt.ylabel(\"Dependence\")\n", + " plt.show()\n", "\n", - " fig = 
plt.figure(figsize = figsize)\n", - " plt.bar(sortdict.keys(), sortdict.values())\n", "\n", - " plt.xlabel(col, size=13)\n", - " plt.ylabel(\"Dependence\")\n", - " plt.show()\n", - " \n", "def plot_dependence_for_numeric(df, col, col_int=True, figsize=(20, 5)):\n", - " dict_values = {}\n", - " col_names = list(df.columns)\n", + " dict_values = {}\n", + " col_names = list(df.columns)\n", "\n", - " for col_name in col_names:\n", - " dict_values[col_name] = df[col_name][0].toArray()[0]\n", - " marklist= sorted(dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n", - " sortdict=dict(marklist)\n", + " for col_name in col_names:\n", + " dict_values[col_name] = df[col_name][0].toArray()[0]\n", + " marklist = sorted(\n", + " dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n", + " )\n", + " sortdict = dict(marklist)\n", "\n", - " fig = plt.figure(figsize = figsize)\n", + " fig = plt.figure(figsize=figsize)\n", "\n", - " \n", - " plt.plot(list(sortdict.keys()), list(sortdict.values()))\n", + " plt.plot(list(sortdict.keys()), list(sortdict.values()))\n", "\n", - " plt.xlabel(col, size=13)\n", - " plt.ylabel(\"Dependence\")\n", - " plt.ylim(0.0)\n", - " plt.show()\n", - " " + " plt.xlabel(col, size=13)\n", + " plt.ylabel(\"Dependence\")\n", + " plt.ylim(0.0)\n", + " plt.show()" ] }, { @@ -438,8 +481,8 @@ }, "outputs": [], "source": [ - "df_education_num = get_pandas_df_from_column(output_pdp, 'age_dependence')\n", - "plot_dependence_for_numeric(df_education_num, 'age')" + "df_education_num = get_pandas_df_from_column(output_pdp, \"age_dependence\")\n", + "plot_dependence_for_numeric(df_education_num, \"age\")" ] }, { @@ -487,8 +530,8 @@ }, "outputs": [], "source": [ - "df_occupation = get_pandas_df_from_column(output_pdp, 'marital-status_dependence')\n", - "plot_dependence_for_categorical(df_occupation, 'marital-status', False, figsize=(30, 5))" + "df_occupation = get_pandas_df_from_column(output_pdp, \"marital-status_dependence\")\n", + "plot_dependence_for_categorical(df_occupation, \"marital-status\", False, figsize=(30, 5))" ] }, { @@ -537,8 +580,8 @@ }, "outputs": [], "source": [ - "df_education_num = get_pandas_df_from_column(output_pdp, 'capital-gain_dependence')\n", - "plot_dependence_for_numeric(df_education_num, 'capital-gain_dependence')" + "df_education_num = get_pandas_df_from_column(output_pdp, \"capital-gain_dependence\")\n", + "plot_dependence_for_numeric(df_education_num, \"capital-gain_dependence\")" ] }, { @@ -570,12 +613,21 @@ }, "outputs": [], "source": [ - "pdp_cap_gain = ICETransformer(model=model, targetCol=\"probability\", kind=\"average\", targetClasses=[1], \n", - " numericFeatures=[{\"name\": \"capital-gain\", \"numSplits\": 20, \"rangeMin\": 0.0,\n", - " \"rangeMax\": 10000.0}], numSamples=50)\n", + "pdp_cap_gain = ICETransformer(\n", + " model=model,\n", + " targetCol=\"probability\",\n", + " kind=\"average\",\n", + " targetClasses=[1],\n", + " numericFeatures=[\n", + " {\"name\": \"capital-gain\", \"numSplits\": 20, \"rangeMin\": 0.0, \"rangeMax\": 10000.0}\n", + " ],\n", + " numSamples=50,\n", + ")\n", "output_pdp_cap_gain = pdp_cap_gain.transform(df)\n", - "df_education_num_gain = get_pandas_df_from_column(output_pdp_cap_gain, 'capital-gain_dependence')\n", - "plot_dependence_for_numeric(df_education_num_gain, 'capital-gain_dependence')" + "df_education_num_gain = get_pandas_df_from_column(\n", + " output_pdp_cap_gain, \"capital-gain_dependence\"\n", + ")\n", + "plot_dependence_for_numeric(df_education_num_gain, 
\"capital-gain_dependence\")" ] }, { @@ -670,8 +722,14 @@ }, "outputs": [], "source": [ - "ice = ICETransformer(model=model, targetCol=\"probability\", targetClasses=[1], \n", - " categoricalFeatures=categorical_features, numericFeatures=numeric_features, numSamples=50)\n", + "ice = ICETransformer(\n", + " model=model,\n", + " targetCol=\"probability\",\n", + " targetClasses=[1],\n", + " categoricalFeatures=categorical_features,\n", + " numericFeatures=numeric_features,\n", + " numSamples=50,\n", + ")\n", "\n", "output = ice.transform(df)" ] @@ -708,89 +766,89 @@ "\n", "from collections import defaultdict\n", "\n", + "\n", "def plot_ice_numeric(df, col, col_int=True, figsize=(20, 10)):\n", - " dict_values = defaultdict(list)\n", - " col_names = list(df.columns)\n", - " num_instances = df.shape[0]\n", - " \n", - " instances_y = {}\n", - " i = 0\n", + " dict_values = defaultdict(list)\n", + " col_names = list(df.columns)\n", + " num_instances = df.shape[0]\n", "\n", - " for col_name in col_names:\n", + " instances_y = {}\n", + " i = 0\n", + "\n", + " for col_name in col_names:\n", + " for i in range(num_instances):\n", + " dict_values[i].append(df[col_name][i].toArray()[0])\n", + "\n", + " fig = plt.figure(figsize=figsize)\n", " for i in range(num_instances):\n", - " dict_values[i].append(df[col_name][i].toArray()[0])\n", - " \n", - " fig = plt.figure(figsize = figsize)\n", - " for i in range(num_instances):\n", - " plt.plot(col_names, dict_values[i], \"k\")\n", - " \n", - " \n", - " plt.xlabel(col, size=13)\n", - " plt.ylabel(\"Dependence\")\n", - " plt.ylim(0.0)\n", - " \n", - " \n", - " \n", + " plt.plot(col_names, dict_values[i], \"k\")\n", + "\n", + " plt.xlabel(col, size=13)\n", + " plt.ylabel(\"Dependence\")\n", + " plt.ylim(0.0)\n", + "\n", + "\n", "def plot_ice_categorical(df, col, col_int=True, figsize=(20, 10)):\n", - " dict_values = defaultdict(list)\n", - " col_names = list(df.columns)\n", - " num_instances = df.shape[0]\n", - " \n", - " angles = [n / float(df.shape[1]) * 2 * pi for n in range(df.shape[1])]\n", - " angles += angles [:1]\n", - " \n", - " instances_y = {}\n", - " i = 0\n", + " dict_values = defaultdict(list)\n", + " col_names = list(df.columns)\n", + " num_instances = df.shape[0]\n", + "\n", + " angles = [n / float(df.shape[1]) * 2 * pi for n in range(df.shape[1])]\n", + " angles += angles[:1]\n", + "\n", + " instances_y = {}\n", + " i = 0\n", + "\n", + " for col_name in col_names:\n", + " for i in range(num_instances):\n", + " dict_values[i].append(df[col_name][i].toArray()[0])\n", + "\n", + " fig = plt.figure(figsize=figsize)\n", + " ax = plt.subplot(111, polar=True)\n", + " plt.xticks(angles[:-1], col_names)\n", "\n", - " for col_name in col_names:\n", " for i in range(num_instances):\n", - " dict_values[i].append(df[col_name][i].toArray()[0])\n", - " \n", - " fig = plt.figure(figsize = figsize)\n", - " ax = plt.subplot(111, polar=True)\n", - " plt.xticks(angles[:-1], col_names)\n", - " \n", - " for i in range(num_instances):\n", - " values = dict_values[i]\n", - " values += values[:1]\n", - " ax.plot(angles, values, \"k\")\n", - " ax.fill(angles, values, 'teal', alpha=0.1)\n", + " values = dict_values[i]\n", + " values += values[:1]\n", + " ax.plot(angles, values, \"k\")\n", + " ax.fill(angles, values, \"teal\", alpha=0.1)\n", + "\n", + " plt.xlabel(col, size=13)\n", + " plt.show()\n", "\n", - " plt.xlabel(col, size=13)\n", - " plt.show()\n", "\n", "def overlay_ice_with_pdp(df_ice, df_pdp, col, col_int=True, figsize=(20, 5)):\n", - " dict_values = 
defaultdict(list)\n", - " col_names_ice = list(df_ice.columns)\n", - " num_instances = df_ice.shape[0]\n", - " \n", - " instances_y = {}\n", - " i = 0\n", + " dict_values = defaultdict(list)\n", + " col_names_ice = list(df_ice.columns)\n", + " num_instances = df_ice.shape[0]\n", "\n", - " for col_name in col_names_ice:\n", + " instances_y = {}\n", + " i = 0\n", + "\n", + " for col_name in col_names_ice:\n", + " for i in range(num_instances):\n", + " dict_values[i].append(df_ice[col_name][i].toArray()[0])\n", + "\n", + " fig = plt.figure(figsize=figsize)\n", " for i in range(num_instances):\n", - " dict_values[i].append(df_ice[col_name][i].toArray()[0])\n", - " \n", - " fig = plt.figure(figsize = figsize)\n", - " for i in range(num_instances):\n", - " plt.plot(col_names_ice, dict_values[i], \"k\")\n", - " \n", - " dict_values_pdp = {}\n", - " col_names = list(df_pdp.columns)\n", + " plt.plot(col_names_ice, dict_values[i], \"k\")\n", "\n", - " for col_name in col_names:\n", - " dict_values_pdp[col_name] = df_pdp[col_name][0].toArray()[0]\n", - " marklist= sorted(dict_values_pdp.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n", - " sortdict=dict(marklist)\n", - " \n", - " plt.plot(col_names_ice, list(sortdict.values()), \"r\", linewidth=5)\n", - " \n", - " \n", - " \n", - " plt.xlabel(col, size=13)\n", - " plt.ylabel(\"Dependence\")\n", - " plt.ylim(0.0)\n", - " plt.show()\n" + " dict_values_pdp = {}\n", + " col_names = list(df_pdp.columns)\n", + "\n", + " for col_name in col_names:\n", + " dict_values_pdp[col_name] = df_pdp[col_name][0].toArray()[0]\n", + " marklist = sorted(\n", + " dict_values_pdp.items(), key=lambda x: int(x[0]) if col_int else x[0]\n", + " )\n", + " sortdict = dict(marklist)\n", + "\n", + " plt.plot(col_names_ice, list(sortdict.values()), \"r\", linewidth=5)\n", + "\n", + " plt.xlabel(col, size=13)\n", + " plt.ylabel(\"Dependence\")\n", + " plt.ylim(0.0)\n", + " plt.show()" ] }, { @@ -824,10 +882,10 @@ }, "outputs": [], "source": [ - "age_df_ice = get_pandas_df_from_column(output, 'age_dependence')\n", - "age_df_pdp = get_pandas_df_from_column(output_pdp, 'age_dependence')\n", + "age_df_ice = get_pandas_df_from_column(output, \"age_dependence\")\n", + "age_df_pdp = get_pandas_df_from_column(output_pdp, \"age_dependence\")\n", "\n", - "overlay_ice_with_pdp(age_df_ice, age_df_pdp, col='age_dependence', figsize=(30, 10))" + "overlay_ice_with_pdp(age_df_ice, age_df_pdp, col=\"age_dependence\", figsize=(30, 10))" ] }, { @@ -891,9 +949,9 @@ }, "outputs": [], "source": [ - "occupation_dep = get_pandas_df_from_column(output, 'occupation_dependence')\n", + "occupation_dep = get_pandas_df_from_column(output, \"occupation_dependence\")\n", "\n", - "plot_ice_categorical(occupation_dep, 'occupation_dependence', figsize=(30, 10))" + "plot_ice_categorical(occupation_dep, \"occupation_dependence\", figsize=(30, 10))" ] }, { @@ -991,8 +1049,14 @@ }, "outputs": [], "source": [ - "pdp_based_imp = ICETransformer(model=model, targetCol=\"probability\", kind=\"feature\", targetClasses=[1],\n", - " categoricalFeatures=categorical_features, numericFeatures=numeric_features)\n", + "pdp_based_imp = ICETransformer(\n", + " model=model,\n", + " targetCol=\"probability\",\n", + " kind=\"feature\",\n", + " targetClasses=[1],\n", + " categoricalFeatures=categorical_features,\n", + " numericFeatures=numeric_features,\n", + ")\n", "\n", "output_pdp_based_imp = pdp_based_imp.transform(df)\n", "display(output_pdp_based_imp)" @@ -1027,19 +1091,20 @@ "source": [ "# Helper functions for 
visualization\n", "\n", + "\n", "def plot_pdp_based_imp(df, figsize=(35, 5)):\n", - " values_list = list(df.select('pdpBasedDependence').toPandas()['pdpBasedDependence'])\n", - " names = list(df.select('featureNames').toPandas()['featureNames'])\n", - " dependence_values = []\n", - " for vec in values_list:\n", - " dependence_values.append(vec.toArray()[0])\n", + " values_list = list(df.select(\"pdpBasedDependence\").toPandas()[\"pdpBasedDependence\"])\n", + " names = list(df.select(\"featureNames\").toPandas()[\"featureNames\"])\n", + " dependence_values = []\n", + " for vec in values_list:\n", + " dependence_values.append(vec.toArray()[0])\n", "\n", - " fig = plt.figure(figsize = figsize)\n", - " plt.bar(names, dependence_values)\n", + " fig = plt.figure(figsize=figsize)\n", + " plt.bar(names, dependence_values)\n", "\n", - " plt.xlabel(\"Feature names\", size=13)\n", - " plt.ylabel(\"PDP-based-feature-imporance\")\n", - " plt.show()" + " plt.xlabel(\"Feature names\", size=13)\n", + " plt.ylabel(\"PDP-based-feature-imporance\")\n", + " plt.show()" ] }, { diff --git a/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb b/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb index 6bb857e725..b5841f2cd3 100644 --- a/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Snow Leopard Detection.ipynb @@ -14,12 +14,17 @@ "execution_count": null, "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display\n", " from notebookutils.mssparkutils.credentials import getSecret\n", - " os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n", + "\n", + " os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n", + " \"mmlspark-build-keys\", \"bing-search-key\"\n", + " )\n", "\n", "# WARNING this notebook requires alot of memory.\n", "# If you get a heap space error, try dropping the number of images bing returns\n", @@ -41,22 +46,26 @@ "from synapse.ml.core.spark import FluentAPI\n", "from pyspark.sql.functions import lit\n", "\n", + "\n", "def bingPhotoSearch(name, queries, pages):\n", - " offsets = [offset*10 for offset in range(0, pages)] \n", - " parameters = [(query, offset) for offset in offsets for query in queries]\n", - " \n", - " return spark.createDataFrame(parameters, (\"queries\",\"offsets\")) \\\n", - " .mlTransform(\n", - " BingImageSearch() # Apply Bing Image Search\n", - " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n", - " .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n", - " .setQueryCol(\"queries\") # Specify a column containing the query words\n", - " .setCount(10) # Specify the number of images to return per offset\n", - " .setImageType(\"photo\") # Specify a filter to ensure we get photos\n", - " .setOutputCol(\"images\")) \\\n", - " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\")) \\\n", - " .withColumn(\"labels\", lit(name)) \\\n", - " .limit(400)\n" + " offsets = [offset * 10 for offset in range(0, pages)]\n", + " parameters = [(query, offset) for offset in offsets for query in queries]\n", + "\n", + " return (\n", + " spark.createDataFrame(parameters, (\"queries\", \"offsets\"))\n", + " .mlTransform(\n", + " BingImageSearch() # Apply 
Bing Image Search\n", + " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n", + " .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n", + " .setQueryCol(\"queries\") # Specify a column containing the query words\n", + " .setCount(10) # Specify the number of images to return per offset\n", + " .setImageType(\"photo\") # Specify a filter to ensure we get photos\n", + " .setOutputCol(\"images\")\n", + " )\n", + " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n", + " .withColumn(\"labels\", lit(name))\n", + " .limit(400)\n", + " )" ], "outputs": [], "metadata": { @@ -74,12 +83,12 @@ "cell_type": "code", "execution_count": null, "source": [ - "def displayDF(df, n=5, image_cols = set([\"urls\"])):\n", - " rows = df.take(n)\n", - " cols = df.columns\n", - " header = \"\".join([\"\" + c + \"\" for c in cols])\n", - " \n", - " style = \"\"\"\n", + "def displayDF(df, n=5, image_cols=set([\"urls\"])):\n", + " rows = df.take(n)\n", + " cols = df.columns\n", + " header = \"\".join([\"\" + c + \"\" for c in cols])\n", + "\n", + " style = \"\"\"\n", "\n", "\n", "\n", @@ -101,20 +110,20 @@ "}\n", "\n", "\"\"\"\n", - " \n", - " table = []\n", - " for row in rows:\n", - " table.append(\"\")\n", - " for col in cols:\n", - " if col in image_cols:\n", - " rep = ''.format(row[col])\n", - " else:\n", - " rep = row[col]\n", - " table.append(\"{}\".format(rep))\n", - " table.append(\"\")\n", - " tableHTML = \"\".join(table)\n", - " \n", - " body = \"\"\"\n", + "\n", + " table = []\n", + " for row in rows:\n", + " table.append(\"\")\n", + " for col in cols:\n", + " if col in image_cols:\n", + " rep = ''.format(row[col])\n", + " else:\n", + " rep = row[col]\n", + " table.append(\"{}\".format(rep))\n", + " table.append(\"\")\n", + " tableHTML = \"\".join(table)\n", + "\n", + " body = \"\"\"\n", "\n", "\n", " \n", @@ -124,11 +133,13 @@ "
\n", "\n", "\n", - " \"\"\".format(header, tableHTML)\n", - " try:\n", - " displayHTML(style + body)\n", - " except:\n", - " pass" + " \"\"\".format(\n", + " header, tableHTML\n", + " )\n", + " try:\n", + " displayHTML(style + body)\n", + " except:\n", + " pass" ], "outputs": [], "metadata": { @@ -152,7 +163,9 @@ "cell_type": "code", "execution_count": null, "source": [ - "randomWords = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\").cache()\n", + "randomWords = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\"\n", + ").cache()\n", "randomWords.show()" ], "outputs": [], @@ -162,16 +175,19 @@ "cell_type": "code", "execution_count": null, "source": [ - "randomLinks = randomWords \\\n", - " .mlTransform(BingImageSearch()\n", - " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n", - " .setCount(10)\n", - " .setQueryCol(\"words\")\n", - " .setOutputCol(\"images\")) \\\n", - " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\")) \\\n", - " .withColumn(\"label\", lit(\"other\")) \\\n", - " .limit(400)\n", - " \n", + "randomLinks = (\n", + " randomWords.mlTransform(\n", + " BingImageSearch()\n", + " .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n", + " .setCount(10)\n", + " .setQueryCol(\"words\")\n", + " .setOutputCol(\"images\")\n", + " )\n", + " .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n", + " .withColumn(\"label\", lit(\"other\"))\n", + " .limit(400)\n", + ")\n", + "\n", "displayDF(randomLinks)" ], "outputs": [], @@ -183,11 +199,17 @@ "cell_type": "code", "execution_count": null, "source": [ - "images = snowLeopardUrls.union(randomLinks).distinct().repartition(100)\\\n", - " .mlTransform(BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000))\\\n", - " .dropna()\n", + "images = (\n", + " snowLeopardUrls.union(randomLinks)\n", + " .distinct()\n", + " .repartition(100)\n", + " .mlTransform(\n", + " BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000)\n", + " )\n", + " .dropna()\n", + ")\n", "\n", - "train, test = images.randomSplit([.7,.3], seed=1)" + "train, test = images.randomSplit([0.7, 0.3], seed=1)" ], "outputs": [], "metadata": {} @@ -205,23 +227,31 @@ "from synapse.ml.stages import UDFTransformer\n", "from pyspark.sql.types import *\n", "\n", + "\n", "def getIndex(row):\n", - " return float(row[1])\n", + " return float(row[1])\n", + "\n", "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", - " network = ModelDownloader(spark, \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\").downloadByName(\"ResNet50\")\n", + " network = ModelDownloader(\n", + " spark, \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\"\n", + " ).downloadByName(\"ResNet50\")\n", "else:\n", - " network = ModelDownloader(spark, \"dbfs:/Models/\").downloadByName(\"ResNet50\")\n", + " network = ModelDownloader(spark, \"dbfs:/Models/\").downloadByName(\"ResNet50\")\n", "\n", - "model = Pipeline(stages=[\n", - " StringIndexer(inputCol = \"labels\", outputCol=\"index\"),\n", - " ImageFeaturizer(inputCol=\"image\", outputCol=\"features\", cutOutputLayers=1).setModel(network),\n", - " LogisticRegression(maxIter=5, labelCol=\"index\", regParam=10.0),\n", - " UDFTransformer()\\\n", - " .setUDF(udf(getIndex, DoubleType()))\\\n", - " .setInputCol(\"probability\")\\\n", - " .setOutputCol(\"leopard_prob\")\n", - "])\n", + "model = Pipeline(\n", + " stages=[\n", + " 
StringIndexer(inputCol=\"labels\", outputCol=\"index\"),\n", + " ImageFeaturizer(\n", + " inputCol=\"image\", outputCol=\"features\", cutOutputLayers=1\n", + " ).setModel(network),\n", + " LogisticRegression(maxIter=5, labelCol=\"index\", regParam=10.0),\n", + " UDFTransformer()\n", + " .setUDF(udf(getIndex, DoubleType()))\n", + " .setInputCol(\"probability\")\n", + " .setOutputCol(\"leopard_prob\"),\n", + " ]\n", + ")\n", "\n", "fitModel = model.fit(train)" ], @@ -240,14 +270,18 @@ "execution_count": null, "source": [ "def plotConfusionMatrix(df, label, prediction, classLabels):\n", - " from synapse.ml.plot import confusionMatrix\n", - " import matplotlib.pyplot as plt\n", - " fig = plt.figure(figsize=(4.5, 4.5))\n", - " confusionMatrix(df, label, prediction, classLabels)\n", - " display(fig)\n", + " from synapse.ml.plot import confusionMatrix\n", + " import matplotlib.pyplot as plt\n", + "\n", + " fig = plt.figure(figsize=(4.5, 4.5))\n", + " confusionMatrix(df, label, prediction, classLabels)\n", + " display(fig)\n", + "\n", "\n", "if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n", - " plotConfusionMatrix(fitModel.transform(test), \"index\", \"prediction\", fitModel.stages[0].labels)" + " plotConfusionMatrix(\n", + " fitModel.transform(test), \"index\", \"prediction\", fitModel.stages[0].labels\n", + " )" ], "outputs": [], "metadata": { @@ -261,19 +295,23 @@ "import urllib.request\n", "from synapse.ml.lime import ImageLIME\n", "\n", - "test_image_url = \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n", + "test_image_url = (\n", + " \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n", + ")\n", "with urllib.request.urlopen(test_image_url) as url:\n", " barr = url.read()\n", "test_subsample = spark.createDataFrame([(bytearray(barr),)], [\"image\"])\n", "\n", - "lime = ImageLIME()\\\n", - " .setModel(fitModel)\\\n", - " .setPredictionCol(\"leopard_prob\")\\\n", - " .setOutputCol(\"weights\")\\\n", - " .setInputCol(\"image\")\\\n", - " .setCellSize(100.0)\\\n", - " .setModifier(50.0)\\\n", - " .setNSamples(300)\n", + "lime = (\n", + " ImageLIME()\n", + " .setModel(fitModel)\n", + " .setPredictionCol(\"leopard_prob\")\n", + " .setOutputCol(\"weights\")\n", + " .setInputCol(\"image\")\n", + " .setCellSize(100.0)\n", + " .setModifier(50.0)\n", + " .setNSamples(300)\n", + ")\n", "\n", "result = lime.transform(test_subsample)" ], @@ -289,12 +327,13 @@ "import matplotlib.pyplot as plt\n", "import PIL, io, numpy as np\n", "\n", + "\n", "def plot_superpixels(row):\n", - " image_bytes = row['image']\n", - " superpixels = row['superpixels']['clusters']\n", - " weights = list(row['weights'])\n", - " mean_weight = np.percentile(weights,90)\n", - " img = (PIL.Image.open(io.BytesIO(image_bytes))).convert('RGBA')\n", + " image_bytes = row[\"image\"]\n", + " superpixels = row[\"superpixels\"][\"clusters\"]\n", + " weights = list(row[\"weights\"])\n", + " mean_weight = np.percentile(weights, 90)\n", + " img = (PIL.Image.open(io.BytesIO(image_bytes))).convert(\"RGBA\")\n", " image_array = np.asarray(img).copy()\n", " for (sp, w) in zip(superpixels, weights):\n", " if w > mean_weight:\n", @@ -305,6 +344,7 @@ " plt.imshow(image_array)\n", " display()\n", "\n", + "\n", "# Gets first row from the LIME-transformed data frame\n", "if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n", " plot_superpixels(result.take(1)[0])" diff --git a/notebooks/features/responsible_ai/Interpretability - 
Tabular SHAP explainer.ipynb b/notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb index b547aba109..5fc21a0b60 100644 --- a/notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb +++ b/notebooks/features/responsible_ai/Interpretability - Tabular SHAP explainer.ipynb @@ -43,6 +43,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display\n", "\n", @@ -78,9 +79,13 @@ }, "outputs": [], "source": [ - "df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "df = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "\n", - "labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n", + "labelIndexer = StringIndexer(\n", + " inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\"\n", + ").fit(df)\n", "print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n", "\n", "training = labelIndexer.transform(df).cache()\n", @@ -97,11 +102,23 @@ "]\n", "categorical_features_idx = [col + \"_idx\" for col in categorical_features]\n", "categorical_features_enc = [col + \"_enc\" for col in categorical_features]\n", - "numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n", + "numeric_features = [\n", + " \"age\",\n", + " \"education-num\",\n", + " \"capital-gain\",\n", + " \"capital-loss\",\n", + " \"hours-per-week\",\n", + "]\n", "\n", - "strIndexer = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_idx)\n", - "onehotEnc = OneHotEncoder(inputCols=categorical_features_idx, outputCols=categorical_features_enc)\n", - "vectAssem = VectorAssembler(inputCols=categorical_features_enc + numeric_features, outputCol=\"features\")\n", + "strIndexer = StringIndexer(\n", + " inputCols=categorical_features, outputCols=categorical_features_idx\n", + ")\n", + "onehotEnc = OneHotEncoder(\n", + " inputCols=categorical_features_idx, outputCols=categorical_features_enc\n", + ")\n", + "vectAssem = VectorAssembler(\n", + " inputCols=categorical_features_enc + numeric_features, outputCol=\"features\"\n", + ")\n", "lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n", "pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n", "model = pipeline.fit(training)" @@ -134,7 +151,9 @@ }, "outputs": [], "source": [ - "explain_instances = model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n", + "explain_instances = (\n", + " model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n", + ")\n", "display(explain_instances)" ] }, @@ -175,7 +194,7 @@ " backgroundData=broadcast(training.orderBy(rand()).limit(100).cache()),\n", ")\n", "\n", - "shap_df = shap.transform(explain_instances)\n" + "shap_df = shap.transform(explain_instances)" ] }, { @@ -209,7 +228,9 @@ "shaps = (\n", " shap_df.withColumn(\"probability\", vec_access(col(\"probability\"), lit(1)))\n", " .withColumn(\"shapValues\", vec2array(col(\"shapValues\").getItem(0)))\n", - " .select([\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features)\n", + " .select(\n", + " [\"shapValues\", \"probability\", \"label\"] + 
categorical_features + numeric_features\n", + " )\n", ")\n", "\n", "shaps_local = shaps.toPandas()\n", @@ -257,7 +278,10 @@ "fig = make_subplots(\n", " rows=rows,\n", " cols=1,\n", - " subplot_titles=\"Probability: \" + shaps_local[\"probability\"].apply(\"{:.2%}\".format) + \"; Label: \" + shaps_local[\"label\"].astype(str),\n", + " subplot_titles=\"Probability: \"\n", + " + shaps_local[\"probability\"].apply(\"{:.2%}\".format)\n", + " + \"; Label: \"\n", + " + shaps_local[\"label\"].astype(str),\n", ")\n", "\n", "for index, row in shaps_local.iterrows():\n", @@ -266,7 +290,11 @@ " list_of_tuples = list(zip(features_with_base, feature_values, shap_values))\n", " shap_pdf = pd.DataFrame(list_of_tuples, columns=[\"name\", \"value\", \"shap\"])\n", " fig.add_trace(\n", - " go.Bar(x=shap_pdf[\"name\"], y=shap_pdf[\"shap\"], hovertext=\"value: \" + shap_pdf[\"value\"].astype(str)),\n", + " go.Bar(\n", + " x=shap_pdf[\"name\"],\n", + " y=shap_pdf[\"shap\"],\n", + " hovertext=\"value: \" + shap_pdf[\"value\"].astype(str),\n", + " ),\n", " row=index + 1,\n", " col=1,\n", " )\n", @@ -274,7 +302,7 @@ "fig.update_yaxes(range=[-1, 1], fixedrange=True, zerolinecolor=\"black\")\n", "fig.update_xaxes(type=\"category\", tickangle=45, fixedrange=True)\n", "fig.update_layout(height=400 * rows, title_text=\"SHAP explanations\")\n", - "fig.show()\n" + "fig.show()" ] }, { diff --git a/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb b/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb index b216e23cd8..e13d0390ed 100644 --- a/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb +++ b/notebooks/features/spark_serving/SparkServing - Deploying a Classifier.ipynb @@ -16,8 +16,10 @@ "outputs": [], "source": [ "import os\n", + "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()" ] }, @@ -31,7 +33,7 @@ "source": [ "import sys\n", "import numpy as np\n", - "import pandas as pd\n" + "import pandas as pd" ] }, { @@ -47,7 +49,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", "train.limit(10).toPandas()" @@ -75,7 +79,10 @@ "source": [ "from synapse.ml.train import TrainClassifier\n", "from pyspark.ml.classification import LogisticRegression\n", - "model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)" + "\n", + "model = TrainClassifier(\n", + " model=LogisticRegression(), labelCol=\"income\", numFeatures=256\n", + ").fit(train)" ] }, { @@ -92,6 +99,7 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics, TrainedClassifierModel\n", + "\n", "prediction = model.transform(test)\n", "prediction.printSchema()" ] @@ -124,21 +132,23 @@ "from synapse.ml.io import *\n", "import uuid\n", "\n", - "serving_inputs = spark.readStream.server() \\\n", - " .address(\"localhost\", 8898, \"my_api\") \\\n", - " .option(\"name\", \"my_api\") \\\n", - " .load() \\\n", + "serving_inputs = (\n", + " spark.readStream.server()\n", + " .address(\"localhost\", 8898, \"my_api\")\n", 
+ " .option(\"name\", \"my_api\")\n", + " .load()\n", " .parseRequest(\"my_api\", test.schema)\n", + ")\n", "\n", - "serving_outputs = model.transform(serving_inputs) \\\n", - " .makeReply(\"prediction\")\n", + "serving_outputs = model.transform(serving_inputs).makeReply(\"prediction\")\n", "\n", - "server = serving_outputs.writeStream \\\n", - " .server() \\\n", - " .replyTo(\"my_api\") \\\n", - " .queryName(\"my_query\") \\\n", - " .option(\"checkpointLocation\", \"file:///tmp/checkpoints-{}\".format(uuid.uuid1())) \\\n", - " .start()\n" + "server = (\n", + " serving_outputs.writeStream.server()\n", + " .replyTo(\"my_api\")\n", + " .queryName(\"my_query\")\n", + " .option(\"checkpointLocation\", \"file:///tmp/checkpoints-{}\".format(uuid.uuid1()))\n", + " .start()\n", + ")" ] }, { @@ -155,7 +165,8 @@ "outputs": [], "source": [ "import requests\n", - "data = u'{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n", + "\n", + "data = '{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n", "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n", "print(\"Response {}\".format(r.text))" ] @@ -167,7 +178,8 @@ "outputs": [], "source": [ "import requests\n", - "data = u'{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n", + "\n", + "data = '{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n", "r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n", "print(\"Response {}\".format(r.text))" ] @@ -181,7 +193,8 @@ "outputs": [], "source": [ "import time\n", - "time.sleep(20) # wait for server to finish setting up (just to be safe)\n", + "\n", + "time.sleep(20) # wait for server to finish setting up (just to be safe)\n", "server.stop()" ] }, diff --git a/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb b/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb index 831bab5fd9..f5015b22d2 100644 --- a/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb +++ b/notebooks/features/vw/Vowpal Wabbit - Overview.ipynb @@ -92,6 +92,7 @@ "\n", "if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n", " from pyspark.sql import SparkSession\n", + "\n", " spark = SparkSession.builder.getOrCreate()\n", " from notebookutils.visualization import display" ] @@ -102,10 +103,14 @@ "metadata": {}, "outputs": [], "source": [ - "df = spark.read.format(\"csv\")\\\n", - " .option(\"header\", True)\\\n", - " .option(\"inferSchema\", True)\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n", + "df = (\n", + " spark.read.format(\"csv\")\n", + " .option(\"header\", True)\n", + " .option(\"inferSchema\", True)\n", + " .load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\"\n", + " )\n", + ")\n", "# print dataset basic info\n", "print(\"records read: \" + str(df.count()))\n", "print(\"Schema: \")\n", @@ -151,6 +156,7 @@ "outputs": [], "source": [ "from synapse.ml.vw import VowpalWabbitFeaturizer\n", + "\n", "featurizer = VowpalWabbitFeaturizer(inputCols=df.columns[:-1], outputCol=\"features\")\n", "train_data = featurizer.transform(train)[\"target\", \"features\"]\n", "test_data = featurizer.transform(test)[\"target\", \"features\"]" @@ -179,7 +185,10 @@ "outputs": [], "source": [ "from synapse.ml.vw import VowpalWabbitClassifier\n", - "model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", 
featuresCol=\"features\").fit(train_data)" + "\n", + "model = VowpalWabbitClassifier(\n", + " numPasses=20, labelCol=\"target\", featuresCol=\"features\"\n", + ").fit(train_data)" ] }, { @@ -206,7 +215,10 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"classification\", labelCol=\"target\", scoredLabelsCol=\"prediction\"\n", + ").transform(predictions)\n", "display(metrics)" ] }, @@ -232,7 +244,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n", + "data = spark.read.parquet(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n", + ")\n", "data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n", "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", "display(train)" @@ -260,12 +274,15 @@ "from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n", "\n", "# Define classification label\n", - "train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1)\n", + "train = train.withColumn(\n", + " \"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)\n", + ").repartition(1)\n", "print(train.count())\n", "\n", "# Specify featurizer\n", - "vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n", - " outputCol=\"features\")" + "vw_featurizer = VowpalWabbitFeaturizer(\n", + " inputCols=[\"education\", \"marital-status\", \"hours-per-week\"], outputCol=\"features\"\n", + ")" ] }, { @@ -283,10 +300,9 @@ "source": [ "# Define VW classification model\n", "args = \"--loss_function=logistic --quiet --holdout_off\"\n", - "vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n", - " labelCol=\"label\",\n", - " passThroughArgs=args,\n", - " numPasses=10)\n", + "vw_model = VowpalWabbitClassifier(\n", + " featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n", + ")\n", "\n", "# Create a pipeline\n", "vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model])" @@ -336,9 +352,10 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n", - " labelCol=\"label\", \n", - " scoredLabelsCol=\"prediction\").transform(prediction)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n", + ").transform(prediction)\n", "display(metrics)" ] }, @@ -388,8 +405,8 @@ "source": [ "boston = load_boston()\n", "\n", - "feature_cols = ['f' + str(i) for i in range(boston.data.shape[1])]\n", - "header = ['target'] + feature_cols\n", + "feature_cols = [\"f\" + str(i) for i in range(boston.data.shape[1])]\n", + "header = [\"target\"] + feature_cols\n", "df = spark.createDataFrame(\n", " pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n", ").repartition(1)\n", @@ -438,20 +455,20 @@ "outputs": [], "source": [ "features = train_data.columns[1:]\n", - "values = train_data.drop('target').toPandas()\n", + "values = train_data.drop(\"target\").toPandas()\n", "ncols = 5\n", "nrows = math.ceil(len(features) / ncols)\n", 
"\n", - "yy = [r['target'] for r in train_data.select('target').collect()]\n", + "yy = [r[\"target\"] for r in train_data.select(\"target\").collect()]\n", "\n", - "f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30,10))\n", + "f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30, 10))\n", "f.tight_layout()\n", "\n", "for irow in range(nrows):\n", - " axes[irow][0].set_ylabel('target')\n", + " axes[irow][0].set_ylabel(\"target\")\n", " for icol in range(ncols):\n", " try:\n", - " feat = features[irow*ncols + icol]\n", + " feat = features[irow * ncols + icol]\n", " xx = values[feat]\n", "\n", " axes[irow][icol].scatter(xx, yy, s=10, alpha=0.25)\n", @@ -476,10 +493,10 @@ "source": [ "vw_featurizer = VowpalWabbitFeaturizer(\n", " inputCols=feature_cols,\n", - " outputCol='features',\n", + " outputCol=\"features\",\n", ")\n", - "vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n", - "vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n", + "vw_train_data = vw_featurizer.transform(train_data)[\"target\", \"features\"]\n", + "vw_test_data = vw_featurizer.transform(test_data)[\"target\", \"features\"]\n", "display(vw_train_data)" ] }, @@ -500,8 +517,8 @@ "source": [ "args = \"--holdout_off --loss_function quantile -l 7 -q :: --power_t 0.7\"\n", "vwr = VowpalWabbitRegressor(\n", - " labelCol='target',\n", - " featuresCol='features',\n", + " labelCol=\"target\",\n", + " featuresCol=\"features\",\n", " passThroughArgs=args,\n", " numPasses=200,\n", ")\n", @@ -527,13 +544,11 @@ "outputs": [], "source": [ "metrics = ComputeModelStatistics(\n", - " evaluationMetric='regression',\n", - " labelCol='target',\n", - " scoresCol='prediction'\n", + " evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n", ").transform(vw_predictions)\n", "\n", "vw_result = metrics.toPandas()\n", - "vw_result.insert(0, 'model', ['Vowpal Wabbit'])\n", + "vw_result.insert(0, \"model\", [\"Vowpal Wabbit\"])\n", "display(vw_result)" ] }, @@ -543,21 +558,21 @@ "metadata": {}, "outputs": [], "source": [ - "cmap = get_cmap('YlOrRd')\n", - "target = np.array(test_data.select('target').collect()).flatten()\n", + "cmap = get_cmap(\"YlOrRd\")\n", + "target = np.array(test_data.select(\"target\").collect()).flatten()\n", "model_preds = [(\"Vowpal Wabbit\", vw_predictions)]\n", "\n", "f, axe = plt.subplots(figsize=(6, 6))\n", "f.tight_layout()\n", "\n", - "preds = np.array(vw_predictions.select('prediction').collect()).flatten()\n", + "preds = np.array(vw_predictions.select(\"prediction\").collect()).flatten()\n", "err = np.absolute(preds - target)\n", "norm = Normalize()\n", "clrs = cmap(np.asarray(norm(err)))[:, :-1]\n", - "plt.scatter(preds, target, s=60, c=clrs, edgecolors='#888888', alpha=0.75)\n", - "plt.plot((0, 60), (0, 60), linestyle='--', color='#888888')\n", - "axe.set_xlabel('Predicted values')\n", - "axe.set_ylabel('Actual values')\n", + "plt.scatter(preds, target, s=60, c=clrs, edgecolors=\"#888888\", alpha=0.75)\n", + "plt.plot((0, 60), (0, 60), linestyle=\"--\", color=\"#888888\")\n", + "axe.set_xlabel(\"Predicted values\")\n", + "axe.set_ylabel(\"Actual values\")\n", "axe.set_title(\"Vowpal Wabbit\")" ] }, @@ -583,8 +598,9 @@ "metadata": {}, "outputs": [], "source": [ - "triazines = spark.read.format(\"libsvm\")\\\n", - " .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")" + "triazines = spark.read.format(\"libsvm\").load(\n", + " 
\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\"\n", + ")" ] }, { @@ -630,8 +646,10 @@ "outputs": [], "source": [ "from synapse.ml.vw import VowpalWabbitRegressor\n", - "model = (VowpalWabbitRegressor(numPasses=20, passThroughArgs=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n", - " .fit(train))" + "\n", + "model = VowpalWabbitRegressor(\n", + " numPasses=20, passThroughArgs=\"--holdout_off --loss_function quantile -q :: -l 0.1\"\n", + ").fit(train)" ] }, { @@ -658,10 +676,10 @@ "outputs": [], "source": [ "from synapse.ml.train import ComputeModelStatistics\n", - "metrics = ComputeModelStatistics(evaluationMetric='regression',\n", - " labelCol='label',\n", - " scoresCol='prediction') \\\n", - " .transform(scoredData)\n", + "\n", + "metrics = ComputeModelStatistics(\n", + " evaluationMetric=\"regression\", labelCol=\"label\", scoresCol=\"prediction\"\n", + ").transform(scoredData)\n", "display(metrics)" ] }, @@ -685,7 +703,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = spark.read.format(\"json\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")" + "data = spark.read.format(\"json\").load(\n", + " \"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\"\n", + ")" ] }, { @@ -701,23 +721,39 @@ "metadata": {}, "outputs": [], "source": [ - "from pyspark.sql.functions import col \n", - "from pyspark.sql.types import IntegerType, DoubleType \n", - "data = data.withColumn('GUser_id', col('c.GUser.id'))\\\n", - " .withColumn('GUser_major', col('c.GUser.major'))\\\n", - " .withColumn('GUser_hobby', col('c.GUser.hobby'))\\\n", - " .withColumn('GUser_favorite_character', col('c.GUser.favorite_character'))\\\n", - " .withColumn('TAction_0_topic', col('c._multi.TAction.topic')[0])\\\n", - " .withColumn('TAction_1_topic', col('c._multi.TAction.topic')[1])\\\n", - " .withColumn('TAction_2_topic', col('c._multi.TAction.topic')[2])\\\n", - " .withColumn('TAction_3_topic', col('c._multi.TAction.topic')[3])\\\n", - " .withColumn('TAction_4_topic', col('c._multi.TAction.topic')[4])\\\n", - " .withColumn('chosenAction', col('_label_Action').cast(IntegerType()))\\\n", - " .withColumn('label', col('_labelIndex').cast(DoubleType()))\\\n", - " .withColumn('probability', col('_label_probability'))\\\n", - " .select('GUser_id', 'GUser_major', 'GUser_hobby', 'GUser_favorite_character', 'TAction_0_topic', 'TAction_1_topic', 'TAction_2_topic', 'TAction_3_topic', 'TAction_4_topic', 'chosenAction', 'label', 'probability')\n", + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import IntegerType, DoubleType\n", "\n", - "print(\"Schema: \") \n", + "data = (\n", + " data.withColumn(\"GUser_id\", col(\"c.GUser.id\"))\n", + " .withColumn(\"GUser_major\", col(\"c.GUser.major\"))\n", + " .withColumn(\"GUser_hobby\", col(\"c.GUser.hobby\"))\n", + " .withColumn(\"GUser_favorite_character\", col(\"c.GUser.favorite_character\"))\n", + " .withColumn(\"TAction_0_topic\", col(\"c._multi.TAction.topic\")[0])\n", + " .withColumn(\"TAction_1_topic\", col(\"c._multi.TAction.topic\")[1])\n", + " .withColumn(\"TAction_2_topic\", col(\"c._multi.TAction.topic\")[2])\n", + " .withColumn(\"TAction_3_topic\", col(\"c._multi.TAction.topic\")[3])\n", + " .withColumn(\"TAction_4_topic\", col(\"c._multi.TAction.topic\")[4])\n", + " .withColumn(\"chosenAction\", col(\"_label_Action\").cast(IntegerType()))\n", + " .withColumn(\"label\", col(\"_labelIndex\").cast(DoubleType()))\n", + " .withColumn(\"probability\", 
col(\"_label_probability\"))\n", + " .select(\n", + " \"GUser_id\",\n", + " \"GUser_major\",\n", + " \"GUser_hobby\",\n", + " \"GUser_favorite_character\",\n", + " \"TAction_0_topic\",\n", + " \"TAction_1_topic\",\n", + " \"TAction_2_topic\",\n", + " \"TAction_3_topic\",\n", + " \"TAction_4_topic\",\n", + " \"chosenAction\",\n", + " \"label\",\n", + " \"probability\",\n", + " )\n", + ")\n", + "\n", + "print(\"Schema: \")\n", "data.printSchema()" ] }, @@ -734,20 +770,53 @@ "metadata": {}, "outputs": [], "source": [ - "from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitContextualBandit, VectorZipper\n", + "from synapse.ml.vw import (\n", + " VowpalWabbitFeaturizer,\n", + " VowpalWabbitContextualBandit,\n", + " VectorZipper,\n", + ")\n", "from pyspark.ml import Pipeline\n", - "pipeline = Pipeline(stages=[\n", - " VowpalWabbitFeaturizer(inputCols=['GUser_id'], outputCol='GUser_id_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['GUser_major'], outputCol='GUser_major_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['GUser_hobby'], outputCol='GUser_hobby_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['GUser_favorite_character'], outputCol='GUser_favorite_character_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['TAction_0_topic'], outputCol='TAction_0_topic_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['TAction_1_topic'], outputCol='TAction_1_topic_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['TAction_2_topic'], outputCol='TAction_2_topic_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['TAction_3_topic'], outputCol='TAction_3_topic_feature'),\n", - " VowpalWabbitFeaturizer(inputCols=['TAction_4_topic'], outputCol='TAction_4_topic_feature'),\n", - " VectorZipper(inputCols=['TAction_0_topic_feature', 'TAction_1_topic_feature', 'TAction_2_topic_feature', 'TAction_3_topic_feature','TAction_4_topic_feature'], outputCol='features')\n", - "])\n", + "\n", + "pipeline = Pipeline(\n", + " stages=[\n", + " VowpalWabbitFeaturizer(inputCols=[\"GUser_id\"], outputCol=\"GUser_id_feature\"),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"GUser_major\"], outputCol=\"GUser_major_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"GUser_hobby\"], outputCol=\"GUser_hobby_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"GUser_favorite_character\"],\n", + " outputCol=\"GUser_favorite_character_feature\",\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"TAction_0_topic\"], outputCol=\"TAction_0_topic_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"TAction_1_topic\"], outputCol=\"TAction_1_topic_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"TAction_2_topic\"], outputCol=\"TAction_2_topic_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"TAction_3_topic\"], outputCol=\"TAction_3_topic_feature\"\n", + " ),\n", + " VowpalWabbitFeaturizer(\n", + " inputCols=[\"TAction_4_topic\"], outputCol=\"TAction_4_topic_feature\"\n", + " ),\n", + " VectorZipper(\n", + " inputCols=[\n", + " \"TAction_0_topic_feature\",\n", + " \"TAction_1_topic_feature\",\n", + " \"TAction_2_topic_feature\",\n", + " \"TAction_3_topic_feature\",\n", + " \"TAction_4_topic_feature\",\n", + " ],\n", + " outputCol=\"features\",\n", + " ),\n", + " ]\n", + ")\n", "tranformation_pipeline = pipeline.fit(data)\n", "transformed_data = tranformation_pipeline.transform(data)\n", "\n", @@ -767,15 +836,23 @@ "metadata": {}, "outputs": [], "source": [ - "estimator = 
VowpalWabbitContextualBandit() \\\n", - " .setPassThroughArgs(\"--cb_explore_adf --epsilon 0.2 --quiet\") \\\n", - " .setSharedCol('GUser_id_feature') \\\n", - " .setAdditionalSharedFeatures([\"GUser_major_feature\", \"GUser_hobby_feature\", \"GUser_favorite_character_feature\"]) \\\n", - " .setFeaturesCol('features') \\\n", - " .setUseBarrierExecutionMode(False)\\\n", - " .setChosenActionCol('chosenAction')\\\n", - " .setLabelCol('label')\\\n", - " .setProbabilityCol('probability')\n", + "estimator = (\n", + " VowpalWabbitContextualBandit()\n", + " .setPassThroughArgs(\"--cb_explore_adf --epsilon 0.2 --quiet\")\n", + " .setSharedCol(\"GUser_id_feature\")\n", + " .setAdditionalSharedFeatures(\n", + " [\n", + " \"GUser_major_feature\",\n", + " \"GUser_hobby_feature\",\n", + " \"GUser_favorite_character_feature\",\n", + " ]\n", + " )\n", + " .setFeaturesCol(\"features\")\n", + " .setUseBarrierExecutionMode(False)\n", + " .setChosenActionCol(\"chosenAction\")\n", + " .setLabelCol(\"label\")\n", + " .setProbabilityCol(\"probability\")\n", + ")\n", "model = estimator.fit(transformed_data)\n", "display(model.getPerformanceStatistics())" ] diff --git a/requirements.txt b/requirements.txt index 9379faf4ab..9c26fda126 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,5 @@ # Licensed under the MIT License. See LICENSE in project root for information. # Required to auto-format python code -black==22.3.0 \ No newline at end of file +black==22.3.0 +black[jupyter]==22.3.0 \ No newline at end of file