style: format notebooks with Black (#1522)

Serena Ruan 2022-06-09 09:42:27 +08:00 committed by GitHub
Parent 3519038c94
Commit 775fcaa76a
No known key found for this signature
GPG key ID: 4AEE18F83AFDEB23
39 changed files with 2670 additions and 1501 deletions
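Note on tooling: Black can format Jupyter notebooks directly when installed with its jupyter extra (pip install "black[jupyter]"), so a change like this is typically produced by running `black` over the notebook files; the exact command used for this PR is not recorded here. Below is a minimal sketch, using Black's Python API rather than the CLI, of how the code cells of a single notebook could be reformatted. The notebook path is illustrative, not a file from this repository.

import json

import black  # pip install black

# Minimal sketch (not the repository's actual tooling): reformat every code
# cell of one notebook in place using Black's Python API.
path = "notebooks/example.ipynb"  # illustrative path, not from this repo

with open(path, encoding="utf-8") as f:
    nb = json.load(f)

mode = black.Mode()  # Black's default style, same as running `black` on the CLI

for cell in nb.get("cells", []):
    if cell.get("cell_type") != "code":
        continue
    source = "".join(cell["source"])
    try:
        formatted = black.format_str(source, mode=mode)
    except black.InvalidInput:
        continue  # skip cells that are not plain Python (e.g. cell magics)
    # Notebook JSON stores cell source as a list of lines; interior lines keep
    # their trailing "\n", the last line usually has none.
    cell["source"] = formatted.rstrip("\n").splitlines(keepends=True)

with open(path, "w", encoding="utf-8") as f:
    json.dump(nb, f, indent=1, ensure_ascii=False)
    f.write("\n")

In practice the CLI form (black path/to/notebooks) is simpler and is what pre-commit hooks and CI checks usually invoke.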

View file

@ -18,8 +18,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -29,7 +31,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"train.limit(10).toPandas()"
@ -54,19 +58,23 @@
"from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n",
"\n",
"# Define classification label\n",
"train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1).cache()\n",
"train = (\n",
" train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0))\n",
" .repartition(1)\n",
" .cache()\n",
")\n",
"print(train.count())\n",
"\n",
"# Specify featurizer\n",
"vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n",
" outputCol=\"features\")\n",
"vw_featurizer = VowpalWabbitFeaturizer(\n",
" inputCols=[\"education\", \"marital-status\", \"hours-per-week\"], outputCol=\"features\"\n",
")\n",
"\n",
"# Define VW classification model\n",
"args = \"--loss_function=logistic --quiet --holdout_off\"\n",
"vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n",
" labelCol=\"label\",\n",
" passThroughArgs=args,\n",
" numPasses=10)\n",
"vw_model = VowpalWabbitClassifier(\n",
" featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n",
")\n",
"\n",
"# Create a pipeline\n",
"vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model])"
@ -122,9 +130,10 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n",
" labelCol=\"label\", \n",
" scoredLabelsCol=\"prediction\").transform(prediction)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n",
").transform(prediction)\n",
"metrics.toPandas()"
]
}

View file

@ -16,8 +16,10 @@
"execution_count": null,
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
],
"outputs": [],
@ -44,7 +46,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"train.limit(10).toPandas()"
@ -70,7 +74,10 @@
"source": [
"from synapse.ml.train import TrainClassifier\n",
"from pyspark.ml.classification import LogisticRegression\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)"
"\n",
"model = TrainClassifier(\n",
" model=LogisticRegression(), labelCol=\"income\", numFeatures=256\n",
").fit(train)"
],
"outputs": [],
"metadata": {}
@ -89,7 +96,9 @@
"if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n",
" model.write().overwrite().save(\"dbfs:/AdultCensus.mml\")\n",
"else:\n",
" model.write().overwrite().save(\"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/AdultCensus.mml\")"
" model.write().overwrite().save(\n",
" \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/AdultCensus.mml\"\n",
" )"
],
"outputs": [],
"metadata": {}

View file

@ -29,8 +29,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -49,7 +51,9 @@
"metadata": {},
"outputs": [],
"source": [
"rawData = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
"rawData = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n",
")\n",
"rawData.show(5)"
]
},
@ -74,12 +78,19 @@
"source": [
"from pyspark.sql.functions import udf\n",
"from pyspark.sql.types import *\n",
"\n",
"\n",
"def wordCount(s):\n",
" return len(s.split())\n",
"\n",
"\n",
"def wordLength(s):\n",
" import numpy as np\n",
"\n",
" ss = [len(w) for w in s.split()]\n",
" return round(float(np.mean(ss)), 2)\n",
"\n",
"\n",
"wordLengthUDF = udf(wordLength, DoubleType())\n",
"wordCountUDF = udf(wordCount, IntegerType())"
]
@ -91,10 +102,15 @@
"outputs": [],
"source": [
"from synapse.ml.stages import UDFTransformer\n",
"\n",
"wordLength = \"wordLength\"\n",
"wordCount = \"wordCount\"\n",
"wordLengthTransformer = UDFTransformer(inputCol=\"text\", outputCol=wordLength, udf=wordLengthUDF)\n",
"wordCountTransformer = UDFTransformer(inputCol=\"text\", outputCol=wordCount, udf=wordCountUDF)\n"
"wordLengthTransformer = UDFTransformer(\n",
" inputCol=\"text\", outputCol=wordLength, udf=wordLengthUDF\n",
")\n",
"wordCountTransformer = UDFTransformer(\n",
" inputCol=\"text\", outputCol=wordCount, udf=wordCountUDF\n",
")"
]
},
{
@ -104,9 +120,14 @@
"outputs": [],
"source": [
"from pyspark.ml import Pipeline\n",
"data = Pipeline(stages=[wordLengthTransformer, wordCountTransformer]) \\\n",
" .fit(rawData).transform(rawData) \\\n",
" .withColumn(\"label\", rawData[\"rating\"] > 3).drop(\"rating\")"
"\n",
"data = (\n",
" Pipeline(stages=[wordLengthTransformer, wordCountTransformer])\n",
" .fit(rawData)\n",
" .transform(rawData)\n",
" .withColumn(\"label\", rawData[\"rating\"] > 3)\n",
" .drop(\"rating\")\n",
")"
]
},
{
@ -155,24 +176,22 @@
"# Featurize text column\n",
"tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"tokenizedText\")\n",
"numFeatures = 10000\n",
"hashingScheme = HashingTF(inputCol=\"tokenizedText\",\n",
" outputCol=\"TextFeatures\",\n",
" numFeatures=numFeatures)\n",
"hashingScheme = HashingTF(\n",
" inputCol=\"tokenizedText\", outputCol=\"TextFeatures\", numFeatures=numFeatures\n",
")\n",
"tokenizedData = tokenizer.transform(data)\n",
"featurizedData = hashingScheme.transform(tokenizedData)\n",
"\n",
"# Merge text and numeric features in one feature column\n",
"featureColumnsArray = [\"TextFeatures\", \"wordCount\", \"wordLength\"]\n",
"assembler = VectorAssembler(\n",
" inputCols = featureColumnsArray,\n",
" outputCol=\"features\")\n",
"assembler = VectorAssembler(inputCols=featureColumnsArray, outputCol=\"features\")\n",
"assembledData = assembler.transform(featurizedData)\n",
"\n",
"# Select only columns of interest\n",
"# Convert rating column from boolean to int\n",
"processedData = assembledData \\\n",
" .select(\"label\", \"features\") \\\n",
" .withColumn(\"label\", assembledData.label.cast(IntegerType()))"
"processedData = assembledData.select(\"label\", \"features\").withColumn(\n",
" \"label\", assembledData.label.cast(IntegerType())\n",
")"
]
},
{
@ -189,10 +208,12 @@
"\n",
"# Train the models on the 'train' data\n",
"lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n",
"logisticRegressions = [LogisticRegression(regParam = hyperParam)\n",
" for hyperParam in lrHyperParams]\n",
"evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",\n",
" metricName=\"areaUnderROC\")\n",
"logisticRegressions = [\n",
" LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n",
"]\n",
"evaluator = BinaryClassificationEvaluator(\n",
" rawPredictionCol=\"rawPrediction\", metricName=\"areaUnderROC\"\n",
")\n",
"metrics = []\n",
"models = []\n",
"\n",
@ -245,10 +266,13 @@
"\n",
"# Train the models on the 'train' data\n",
"lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n",
"logisticRegressions = [LogisticRegression(regParam = hyperParam)\n",
" for hyperParam in lrHyperParams]\n",
"lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n",
" for lrm in logisticRegressions]\n",
"logisticRegressions = [\n",
" LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n",
"]\n",
"lrmodels = [\n",
" TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n",
" for lrm in logisticRegressions\n",
"]\n",
"\n",
"# Select the best model\n",
"bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n",
@ -257,8 +281,10 @@
"# Get AUC on the validation dataset\n",
"predictions = bestModel.transform(validation)\n",
"metrics = ComputeModelStatistics().transform(predictions)\n",
"print(\"Best model's AUC on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100))"
"print(\n",
" \"Best model's AUC on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100)\n",
")"
]
}
],

View file

@ -40,6 +40,7 @@
"source": [
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -78,9 +79,10 @@
"source": [
"def download_data(url, data_folder=DATA_FOLDER, filename=\"downloaded_data.zip\"):\n",
" \"\"\"Download and extract data from url\"\"\"\n",
" \n",
"\n",
" data_dir = \"./\" + DATA_FOLDER\n",
" if not os.path.exists(data_dir): os.makedirs(data_dir)\n",
" if not os.path.exists(data_dir):\n",
" os.makedirs(data_dir)\n",
" downloaded_filepath = os.path.join(data_dir, filename)\n",
" print(\"Downloading data...\")\n",
" urllib.request.urlretrieve(url, downloaded_filepath)\n",
@ -89,7 +91,8 @@
" zipfile.extractall(data_dir)\n",
" zipfile.close()\n",
" print(\"Finished data downloading and extraction.\")\n",
" \n",
"\n",
"\n",
"download_data(DATA_URL)"
]
},
@ -106,8 +109,12 @@
"metadata": {},
"outputs": [],
"source": [
"df_train = pd.read_csv(os.path.join(\".\", DATA_FOLDER, TRAIN_FILENAME), \n",
" header=None, names=COL_NAMES, encoding=ENCODING)\n",
"df_train = pd.read_csv(\n",
" os.path.join(\".\", DATA_FOLDER, TRAIN_FILENAME),\n",
" header=None,\n",
" names=COL_NAMES,\n",
" encoding=ENCODING,\n",
")\n",
"df_train = spark.createDataFrame(df_train, verifySchema=False)"
]
},
@ -155,10 +162,12 @@
"metadata": {},
"outputs": [],
"source": [
"df_train = df_train.orderBy(rand()) \\\n",
" .limit(100000) \\\n",
" .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0)) \\\n",
" .select([\"label\", \"text\"])"
"df_train = (\n",
" df_train.orderBy(rand())\n",
" .limit(100000)\n",
" .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0))\n",
" .select([\"label\", \"text\"])\n",
")"
]
},
{
@ -177,18 +186,15 @@
"outputs": [],
"source": [
"# Specify featurizers\n",
"tokenizer = RegexTokenizer(inputCol=\"text\",\n",
" outputCol=\"words\")\n",
"tokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"words\")\n",
"\n",
"count_vectorizer = CountVectorizer(inputCol=\"words\",\n",
" outputCol=\"features\")\n",
"count_vectorizer = CountVectorizer(inputCol=\"words\", outputCol=\"features\")\n",
"\n",
"# Define VW classification model\n",
"args = \"--loss_function=logistic --quiet --holdout_off\"\n",
"vw_model = VowpalWabbitClassifier(featuresCol=\"features\", \n",
" labelCol=\"label\", \n",
" passThroughArgs=args, \n",
" numPasses=10)\n",
"vw_model = VowpalWabbitClassifier(\n",
" featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n",
")\n",
"\n",
"# Create a pipeline\n",
"vw_pipeline = Pipeline(stages=[tokenizer, count_vectorizer, vw_model])"
@ -225,8 +231,12 @@
"metadata": {},
"outputs": [],
"source": [
"df_test = pd.read_csv(os.path.join(\".\", DATA_FOLDER, TEST_FILENAME), \n",
" header=None, names=COL_NAMES, encoding=ENCODING)\n",
"df_test = pd.read_csv(\n",
" os.path.join(\".\", DATA_FOLDER, TEST_FILENAME),\n",
" header=None,\n",
" names=COL_NAMES,\n",
" encoding=ENCODING,\n",
")\n",
"df_test = spark.createDataFrame(df_test, verifySchema=False)"
]
},
@ -244,9 +254,11 @@
"outputs": [],
"source": [
"print(\"Number of test samples before filtering: \", df_test.count())\n",
"df_test = df_test.filter(col(\"label\") != 2.0) \\\n",
" .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0)) \\\n",
" .select([\"label\", \"text\"])\n",
"df_test = (\n",
" df_test.filter(col(\"label\") != 2.0)\n",
" .withColumn(\"label\", when(col(\"label\") > 0, 1.0).otherwise(0.0))\n",
" .select([\"label\", \"text\"])\n",
")\n",
"print(\"Number of test samples after filtering: \", df_test.count())"
]
},
@ -268,9 +280,9 @@
"outputs": [],
"source": [
"# Compute model performance metrics\n",
"metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n",
" labelCol=\"label\", \n",
" scoredLabelsCol=\"prediction\").transform(predictions)\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n",
").transform(predictions)\n",
"metrics.toPandas()"
]
},
@ -292,8 +304,10 @@
" points += [(float(row._1()), float(row._2()))]\n",
" return points\n",
"\n",
"preds = predictions.select(\"label\", \"probability\") \\\n",
" .rdd.map(lambda row: (float(row[\"probability\"][1]), float(row[\"label\"])))\n",
"\n",
"preds = predictions.select(\"label\", \"probability\").rdd.map(\n",
" lambda row: (float(row[\"probability\"][1]), float(row[\"label\"]))\n",
")\n",
"roc_points = CurveMetrics(preds).get_curve(\"roc\")\n",
"\n",
"# Plot ROC curve\n",

View file

@ -18,14 +18,16 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['TEXT_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
"\n",
" os.environ[\"TEXT_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" from notebookutils.visualization import display\n",
"\n",
"#put your service keys here\n",
"key = os.environ['TEXT_API_KEY']\n",
"location = 'eastus'"
"# put your service keys here\n",
"key = os.environ[\"TEXT_API_KEY\"]\n",
"location = \"eastus\""
]
},
{
@ -34,12 +36,13 @@
"metadata": {},
"outputs": [],
"source": [
"\n",
"df = spark.createDataFrame(data=[\n",
"df = spark.createDataFrame(\n",
" data=[\n",
" [\"en\", \"Hello Seattle\"],\n",
" [\"en\", \"There once was a dog who lived in London and thought she was a human\"]\n",
" ], \n",
" schema=[\"language\",\"text\"])"
" [\"en\", \"There once was a dog who lived in London and thought she was a human\"],\n",
" ],\n",
" schema=[\"language\", \"text\"],\n",
")"
]
},
{
@ -59,7 +62,8 @@
"source": [
"from synapse.ml.cognitive import *\n",
"\n",
"text_analyze = (TextAnalyze()\n",
"text_analyze = (\n",
" TextAnalyze()\n",
" .setLocation(location)\n",
" .setSubscriptionKey(key)\n",
" .setTextCol(\"text\")\n",
@ -67,15 +71,15 @@
" .setErrorCol(\"error\")\n",
" .setLanguageCol(\"language\")\n",
" # set the tasks to perform\n",
" .setEntityRecognitionTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n",
" .setKeyPhraseExtractionTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n",
" .setEntityRecognitionTasks([{\"parameters\": {\"model-version\": \"latest\"}}])\n",
" .setKeyPhraseExtractionTasks([{\"parameters\": {\"model-version\": \"latest\"}}])\n",
" # Uncomment these lines to add more tasks\n",
" # .setEntityRecognitionPiiTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n",
" # .setEntityLinkingTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n",
" # .setSentimentAnalysisTasks([{\"parameters\": { \"model-version\": \"latest\"}}])\n",
" )\n",
")\n",
"\n",
"df_results = text_analyze.transform(df)\n"
"df_results = text_analyze.transform(df)"
]
},
{
@ -97,8 +101,11 @@
"\n",
"# reformat and display for easier viewing\n",
"display(\n",
" df_results.select(\"language\", \"text\", \"error\", col(\"textAnalysis\").getItem(0)) # we are not batching so only have a single result\n",
" .select(\"language\", \"text\", \"error\", \"textAnalysis[0].*\") # explode the Text Analytics tasks into columns\n",
" df_results.select(\n",
" \"language\", \"text\", \"error\", col(\"textAnalysis\").getItem(0)\n",
" ).select( # we are not batching so only have a single result\n",
" \"language\", \"text\", \"error\", \"textAnalysis[0].*\"\n",
" ) # explode the Text Analytics tasks into columns\n",
")"
]
}

View file

@ -30,15 +30,19 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['TEXT_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['BING_IMAGE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n",
"\n",
"#put your service keys here\n",
"TEXT_API_KEY = os.environ[\"TEXT_API_KEY\"]\n",
"VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n",
" os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ[\"TEXT_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"bing-search-key\"\n",
" )\n",
"\n",
"# put your service keys here\n",
"TEXT_API_KEY = os.environ[\"TEXT_API_KEY\"]\n",
"VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n",
"BING_IMAGE_SEARCH_KEY = os.environ[\"BING_IMAGE_SEARCH_KEY\"]"
]
},
@ -61,18 +65,22 @@
},
"outputs": [],
"source": [
"imgsPerBatch = 10 #the number of images Bing will return for each query\n",
"offsets = [(i*imgsPerBatch,) for i in range(100)] # A list of offsets, used to page into the search results\n",
"imgsPerBatch = 10 # the number of images Bing will return for each query\n",
"offsets = [\n",
" (i * imgsPerBatch,) for i in range(100)\n",
"] # A list of offsets, used to page into the search results\n",
"bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n",
"\n",
"bingSearch = BingImageSearch()\\\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\\\n",
" .setOffsetCol(\"offset\")\\\n",
" .setQuery(\"celebrity quotes\")\\\n",
" .setCount(imgsPerBatch)\\\n",
" .setOutputCol(\"images\")\n",
"bingSearch = (\n",
" BingImageSearch()\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n",
" .setOffsetCol(\"offset\")\n",
" .setQuery(\"celebrity quotes\")\n",
" .setCount(imgsPerBatch)\n",
" .setOutputCol(\"images\")\n",
")\n",
"\n",
"#Transformer to that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n",
"# Transformer to that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n",
"getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")"
]
},
@ -94,15 +102,19 @@
},
"outputs": [],
"source": [
"celebs = RecognizeDomainSpecificContent()\\\n",
" .setSubscriptionKey(VISION_API_KEY)\\\n",
" .setModel(\"celebrities\")\\\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/\")\\\n",
" .setImageUrlCol(\"url\")\\\n",
" .setOutputCol(\"celebs\")\n",
"celebs = (\n",
" RecognizeDomainSpecificContent()\n",
" .setSubscriptionKey(VISION_API_KEY)\n",
" .setModel(\"celebrities\")\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/\")\n",
" .setImageUrlCol(\"url\")\n",
" .setOutputCol(\"celebs\")\n",
")\n",
"\n",
"#Extract the first celebrity we see from the structured response\n",
"firstCeleb = SQLTransformer(statement=\"SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__\")"
"# Extract the first celebrity we see from the structured response\n",
"firstCeleb = SQLTransformer(\n",
" statement=\"SELECT *, celebs.result.celebrities[0].name as firstCeleb FROM __THIS__\"\n",
")"
]
},
{
@ -123,22 +135,32 @@
},
"outputs": [],
"source": [
"from synapse.ml.stages import UDFTransformer \n",
"from synapse.ml.stages import UDFTransformer\n",
"\n",
"recognizeText = (\n",
" RecognizeText()\n",
" .setSubscriptionKey(VISION_API_KEY)\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/recognizeText\")\n",
" .setImageUrlCol(\"url\")\n",
" .setMode(\"Printed\")\n",
" .setOutputCol(\"ocr\")\n",
" .setConcurrency(5)\n",
")\n",
"\n",
"recognizeText = RecognizeText()\\\n",
" .setSubscriptionKey(VISION_API_KEY)\\\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/vision/v2.0/recognizeText\")\\\n",
" .setImageUrlCol(\"url\")\\\n",
" .setMode(\"Printed\")\\\n",
" .setOutputCol(\"ocr\")\\\n",
" .setConcurrency(5)\n",
"\n",
"def getTextFunction(ocrRow):\n",
" if ocrRow is None: return None\n",
" if ocrRow is None:\n",
" return None\n",
" return \"\\n\".join([line.text for line in ocrRow.recognitionResult.lines])\n",
"\n",
"\n",
"# this transformer wil extract a simpler string from the structured output of recognize text\n",
"getText = UDFTransformer().setUDF(udf(getTextFunction)).setInputCol(\"ocr\").setOutputCol(\"text\")\n"
"getText = (\n",
" UDFTransformer()\n",
" .setUDF(udf(getTextFunction))\n",
" .setInputCol(\"ocr\")\n",
" .setOutputCol(\"text\")\n",
")"
]
},
{
@ -158,14 +180,18 @@
},
"outputs": [],
"source": [
"sentimentTransformer = TextSentiment()\\\n",
" .setTextCol(\"text\")\\\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment\")\\\n",
" .setSubscriptionKey(TEXT_API_KEY)\\\n",
"sentimentTransformer = (\n",
" TextSentiment()\n",
" .setTextCol(\"text\")\n",
" .setUrl(\"https://eastus.api.cognitive.microsoft.com/text/analytics/v3.0/sentiment\")\n",
" .setSubscriptionKey(TEXT_API_KEY)\n",
" .setOutputCol(\"sentiment\")\n",
")\n",
"\n",
"#Extract the sentiment score from the API response body\n",
"getSentiment = SQLTransformer(statement=\"SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__\")"
"# Extract the sentiment score from the API response body\n",
"getSentiment = SQLTransformer(\n",
" statement=\"SELECT *, sentiment[0].sentiment as sentimentLabel FROM __THIS__\"\n",
")"
]
},
{
@ -186,11 +212,25 @@
"outputs": [],
"source": [
"from synapse.ml.stages import SelectColumns\n",
"# Select the final coulmns\n",
"cleanupColumns = SelectColumns().setCols([\"url\", \"firstCeleb\", \"text\", \"sentimentLabel\"])\n",
"\n",
"celebrityQuoteAnalysis = PipelineModel(stages=[\n",
" bingSearch, getUrls, celebs, firstCeleb, recognizeText, getText, sentimentTransformer, getSentiment, cleanupColumns])\n",
"# Select the final coulmns\n",
"cleanupColumns = SelectColumns().setCols(\n",
" [\"url\", \"firstCeleb\", \"text\", \"sentimentLabel\"]\n",
")\n",
"\n",
"celebrityQuoteAnalysis = PipelineModel(\n",
" stages=[\n",
" bingSearch,\n",
" getUrls,\n",
" celebs,\n",
" firstCeleb,\n",
" recognizeText,\n",
" getText,\n",
" sentimentTransformer,\n",
" getSentiment,\n",
" cleanupColumns,\n",
" ]\n",
")\n",
"\n",
"celebrityQuoteAnalysis.transform(bingParameters).show(5)"
]

View file

@ -7,19 +7,24 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")\n",
" os.environ['TRANSLATOR_KEY'] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n",
"\n",
" os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"azure-search-key\"\n",
" )\n",
" os.environ[\"TRANSLATOR_KEY\"] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n",
" from notebookutils.visualization import display\n",
"\n",
"\n",
"key = os.environ['VISION_API_KEY']\n",
"search_key = os.environ['AZURE_SEARCH_KEY']\n",
"translator_key = os.environ['TRANSLATOR_KEY']\n",
"key = os.environ[\"VISION_API_KEY\"]\n",
"search_key = os.environ[\"AZURE_SEARCH_KEY\"]\n",
"translator_key = os.environ[\"TRANSLATOR_KEY\"]\n",
"\n",
"search_service = \"mmlspark-azure-search\"\n",
"search_index = \"form-demo-index\""
@ -34,22 +39,24 @@
"from pyspark.sql.functions import udf\n",
"from pyspark.sql.types import StringType\n",
"\n",
"\n",
"def blob_to_url(blob):\n",
" [prefix, postfix] = blob.split(\"@\")\n",
" container = prefix.split(\"/\")[-1]\n",
" split_postfix = postfix.split(\"/\")\n",
" account = split_postfix[0]\n",
" filepath = \"/\".join(split_postfix[1:])\n",
" return \"https://{}/{}/{}\".format(account, container, filepath)\n",
" [prefix, postfix] = blob.split(\"@\")\n",
" container = prefix.split(\"/\")[-1]\n",
" split_postfix = postfix.split(\"/\")\n",
" account = split_postfix[0]\n",
" filepath = \"/\".join(split_postfix[1:])\n",
" return \"https://{}/{}/{}\".format(account, container, filepath)\n",
"\n",
"\n",
"df2 = (spark.read.format(\"binaryFile\")\n",
" .load(\"wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*\")\n",
" .select(\"path\")\n",
" .limit(10)\n",
" .select(udf(blob_to_url, StringType())(\"path\").alias(\"url\"))\n",
" .cache()\n",
" )\n"
"df2 = (\n",
" spark.read.format(\"binaryFile\")\n",
" .load(\"wasbs://ignite2021@mmlsparkdemo.blob.core.windows.net/form_subset/*\")\n",
" .select(\"path\")\n",
" .limit(10)\n",
" .select(udf(blob_to_url, StringType())(\"path\").alias(\"url\"))\n",
" .cache()\n",
")"
]
},
{
@ -80,15 +87,17 @@
"source": [
"from synapse.ml.cognitive import AnalyzeInvoices\n",
"\n",
"analyzed_df = (AnalyzeInvoices()\n",
" .setSubscriptionKey(key)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"url\")\n",
" .setOutputCol(\"invoices\")\n",
" .setErrorCol(\"errors\")\n",
" .setConcurrency(5)\n",
" .transform(df2)\n",
" .cache())\n"
"analyzed_df = (\n",
" AnalyzeInvoices()\n",
" .setSubscriptionKey(key)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"url\")\n",
" .setOutputCol(\"invoices\")\n",
" .setErrorCol(\"errors\")\n",
" .setConcurrency(5)\n",
" .transform(df2)\n",
" .cache()\n",
")"
]
},
{
@ -108,13 +117,15 @@
"source": [
"from synapse.ml.cognitive import FormOntologyLearner\n",
"\n",
"organized_df = (FormOntologyLearner()\n",
" .setInputCol(\"invoices\")\n",
" .setOutputCol(\"extracted\")\n",
" .fit(analyzed_df)\n",
" .transform(analyzed_df)\n",
" .select(\"url\", \"extracted.*\")\n",
" .cache())"
"organized_df = (\n",
" FormOntologyLearner()\n",
" .setInputCol(\"invoices\")\n",
" .setOutputCol(\"extracted\")\n",
" .fit(analyzed_df)\n",
" .transform(analyzed_df)\n",
" .select(\"url\", \"extracted.*\")\n",
" .cache()\n",
")"
]
},
{
@ -133,11 +144,13 @@
"outputs": [],
"source": [
"from pyspark.sql.functions import explode, col\n",
"itemized_df = (organized_df\n",
" .select(\"*\", explode(col(\"Items\")).alias(\"Item\"))\n",
" .drop(\"Items\")\n",
" .select(\"Item.*\", \"*\")\n",
" .drop(\"Item\"))\n"
"\n",
"itemized_df = (\n",
" organized_df.select(\"*\", explode(col(\"Items\")).alias(\"Item\"))\n",
" .drop(\"Items\")\n",
" .select(\"Item.*\", \"*\")\n",
" .drop(\"Item\")\n",
")"
]
},
{
@ -166,7 +179,8 @@
"source": [
"from synapse.ml.cognitive import Translate\n",
"\n",
"translated_df = (Translate()\n",
"translated_df = (\n",
" Translate()\n",
" .setSubscriptionKey(translator_key)\n",
" .setLocation(\"eastus\")\n",
" .setTextCol(\"Description\")\n",
@ -177,7 +191,8 @@
" .transform(itemized_df)\n",
" .withColumn(\"Translations\", col(\"output.translations\")[0])\n",
" .drop(\"output\", \"TranslationError\")\n",
" .cache())\n"
" .cache()\n",
")"
]
},
{
@ -198,16 +213,17 @@
"from synapse.ml.cognitive import *\n",
"from pyspark.sql.functions import monotonically_increasing_id, lit\n",
"\n",
"(translated_df\n",
" .withColumn(\"DocID\", monotonically_increasing_id().cast(\"string\"))\n",
" .withColumn(\"SearchAction\", lit(\"upload\"))\n",
" .writeToAzureSearch(\n",
" subscriptionKey=search_key,\n",
" actionCol=\"SearchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"DocID\")\n",
")\n"
"(\n",
" translated_df.withColumn(\"DocID\", monotonically_increasing_id().cast(\"string\"))\n",
" .withColumn(\"SearchAction\", lit(\"upload\"))\n",
" .writeToAzureSearch(\n",
" subscriptionKey=search_key,\n",
" actionCol=\"SearchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"DocID\",\n",
" )\n",
")"
]
},
{
@ -217,8 +233,11 @@
"outputs": [],
"source": [
"import requests\n",
"url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n",
"requests.post(url, json={\"search\": \"door\"}, headers = {\"api-key\": search_key}).json()"
"\n",
"url = \"https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06\".format(\n",
" search_service, search_index\n",
")\n",
"requests.post(url, json={\"search\": \"door\"}, headers={\"api-key\": search_key}).json()"
]
},
{

View file

@ -76,10 +76,14 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"anomaly-api-key\")\n",
" os.environ['BLOB_CONNECTION_STRING'] = getSecret(\"mmlspark-build-keys\", \"madtest-connection-string\")"
"\n",
" os.environ[\"ANOMALY_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"anomaly-api-key\")\n",
" os.environ[\"BLOB_CONNECTION_STRING\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"madtest-connection-string\"\n",
" )"
]
},
{
@ -226,11 +230,17 @@
},
"outputs": [],
"source": [
"df = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/MVAD/sample.csv\")\n",
"df = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", \"true\")\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/MVAD/sample.csv\")\n",
")\n",
"\n",
"df = df.withColumn(\"sensor_1\", col(\"sensor_1\").cast(DoubleType())) \\\n",
" .withColumn(\"sensor_2\", col(\"sensor_2\").cast(DoubleType())) \\\n",
"df = (\n",
" df.withColumn(\"sensor_1\", col(\"sensor_1\").cast(DoubleType()))\n",
" .withColumn(\"sensor_2\", col(\"sensor_2\").cast(DoubleType()))\n",
" .withColumn(\"sensor_3\", col(\"sensor_3\").cast(DoubleType()))\n",
")\n",
"\n",
"# Let's inspect the dataframe:\n",
"df.show(5)"
@ -299,11 +309,12 @@
"source": [
"trainingStartTime = \"2020-06-01T12:00:00Z\"\n",
"trainingEndTime = \"2020-07-02T17:55:00Z\"\n",
"intermediateSaveDir = \"intermediateData\" \n",
"intermediateSaveDir = \"intermediateData\"\n",
"timestampColumn = \"timestamp\"\n",
"inputColumns = [\"sensor_1\", \"sensor_2\", \"sensor_3\"]\n",
"\n",
"estimator = (FitMultivariateAnomaly()\n",
"estimator = (\n",
" FitMultivariateAnomaly()\n",
" .setSubscriptionKey(anomalyKey)\n",
" .setLocation(location)\n",
" .setStartTime(trainingStartTime)\n",
@ -314,7 +325,7 @@
" .setInputCols(inputColumns)\n",
" .setSlidingWindow(200)\n",
" .setConnectionString(connectionString)\n",
" )"
")"
]
},
{
@ -397,15 +408,15 @@
"inferenceStartTime = \"2020-07-02T18:00:00Z\"\n",
"inferenceEndTime = \"2020-07-06T05:15:00Z\"\n",
"\n",
"result = (model\n",
" .setStartTime(inferenceStartTime)\n",
"result = (\n",
" model.setStartTime(inferenceStartTime)\n",
" .setEndTime(inferenceEndTime)\n",
" .setOutputCol(\"results\")\n",
" .setErrorCol(\"errors\")\n",
" .setInputCols(inputColumns)\n",
" .setTimestampCol(timestampColumn)\n",
" .transform(df)\n",
" )\n",
")\n",
"\n",
"result.show(5)"
]
@ -636,10 +647,18 @@
}
],
"source": [
"rdf = (result.select(\"timestamp\",*inputColumns, \"results.contributors\", \"results.isAnomaly\", \"results.severity\")\n",
" .orderBy('timestamp', ascending=True)\n",
" .filter(col('timestamp') >= lit(inferenceStartTime))\n",
" .toPandas())\n",
"rdf = (\n",
" result.select(\n",
" \"timestamp\",\n",
" *inputColumns,\n",
" \"results.contributors\",\n",
" \"results.isAnomaly\",\n",
" \"results.severity\"\n",
" )\n",
" .orderBy(\"timestamp\", ascending=True)\n",
" .filter(col(\"timestamp\") >= lit(inferenceStartTime))\n",
" .toPandas()\n",
")\n",
"\n",
"rdf"
]
@ -887,10 +906,13 @@
" if type(x) is list:\n",
" return dict([item[::-1] for item in x])\n",
" else:\n",
" return {'series_0': 0, 'series_1': 0, 'series_2': 0}\n",
" return {\"series_0\": 0, \"series_1\": 0, \"series_2\": 0}\n",
"\n",
"rdf['contributors'] = rdf['contributors'].apply(parse)\n",
"rdf = pd.concat([rdf.drop(['contributors'], axis=1), pd.json_normalize(rdf['contributors'])], axis=1)\n",
"\n",
"rdf[\"contributors\"] = rdf[\"contributors\"].apply(parse)\n",
"rdf = pd.concat(\n",
" [rdf.drop([\"contributors\"], axis=1), pd.json_normalize(rdf[\"contributors\"])], axis=1\n",
")\n",
"rdf"
]
},
@ -927,42 +949,95 @@
"\n",
"\n",
"####### Main Figure #######\n",
"plt.figure(figsize=(23,8))\n",
"plt.plot(rdf['timestamp'],rdf['sensor_1'], color='tab:orange', linestyle='solid', linewidth=2, label='sensor_1')\n",
"plt.plot(rdf['timestamp'],rdf['sensor_2'], color='tab:green', linestyle='solid', linewidth=2, label='sensor_2')\n",
"plt.plot(rdf['timestamp'],rdf['sensor_3'], color='tab:blue', linestyle='solid', linewidth=2, label='sensor_3')\n",
"plt.grid(axis='y')\n",
"plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
"plt.figure(figsize=(23, 8))\n",
"plt.plot(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"sensor_1\"],\n",
" color=\"tab:orange\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_1\",\n",
")\n",
"plt.plot(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"sensor_2\"],\n",
" color=\"tab:green\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_2\",\n",
")\n",
"plt.plot(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"sensor_3\"],\n",
" color=\"tab:blue\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_3\",\n",
")\n",
"plt.grid(axis=\"y\")\n",
"plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
"plt.legend()\n",
"\n",
"anoms = list(rdf[\"severity\"] >= minSeverity)\n",
"_, _, ymin, ymax = plt.axis()\n",
"plt.vlines(np.where(anoms), ymin=ymin , ymax=ymax , color='r', alpha=0.8)\n",
"plt.vlines(np.where(anoms), ymin=ymin, ymax=ymax, color=\"r\", alpha=0.8)\n",
"\n",
"plt.legend()\n",
"plt.title('A plot of the values from the three sensors with the detected anomalies highlighted in red.')\n",
"plt.title(\n",
" \"A plot of the values from the three sensors with the detected anomalies highlighted in red.\"\n",
")\n",
"plt.show()\n",
"\n",
"####### Severity Figure #######\n",
"plt.figure(figsize=(23,1))\n",
"plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
"plt.plot(rdf['timestamp'],rdf['severity'], color='black', linestyle='solid', linewidth=2, label='Severity score')\n",
"plt.plot(rdf['timestamp'],[minSeverity]*len(rdf['severity']), color='red', linestyle='dotted', linewidth=1, label='minSeverity')\n",
"plt.grid(axis='y')\n",
"plt.figure(figsize=(23, 1))\n",
"plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
"plt.plot(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"severity\"],\n",
" color=\"black\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"Severity score\",\n",
")\n",
"plt.plot(\n",
" rdf[\"timestamp\"],\n",
" [minSeverity] * len(rdf[\"severity\"]),\n",
" color=\"red\",\n",
" linestyle=\"dotted\",\n",
" linewidth=1,\n",
" label=\"minSeverity\",\n",
")\n",
"plt.grid(axis=\"y\")\n",
"plt.legend()\n",
"plt.ylim([0,1])\n",
"plt.ylim([0, 1])\n",
"plt.title(\"Severity of the detected anomalies\")\n",
"plt.show()\n",
"\n",
"####### Contributors Figure #######\n",
"plt.figure(figsize=(23,1))\n",
"plt.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
"plt.bar(rdf['timestamp'],rdf['series_0'], width=2, color='tab:orange', label='sensor_1')\n",
"plt.bar(rdf['timestamp'],rdf['series_1'], width=2, color='tab:green', label='sensor_2', bottom=rdf['series_0'])\n",
"plt.bar(rdf['timestamp'],rdf['series_2'], width=2, color='tab:blue', label='sensor_3', bottom=rdf['series_0']+rdf['series_1'])\n",
"plt.grid(axis='y')\n",
"plt.figure(figsize=(23, 1))\n",
"plt.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
"plt.bar(\n",
" rdf[\"timestamp\"], rdf[\"series_0\"], width=2, color=\"tab:orange\", label=\"sensor_1\"\n",
")\n",
"plt.bar(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"series_1\"],\n",
" width=2,\n",
" color=\"tab:green\",\n",
" label=\"sensor_2\",\n",
" bottom=rdf[\"series_0\"],\n",
")\n",
"plt.bar(\n",
" rdf[\"timestamp\"],\n",
" rdf[\"series_2\"],\n",
" width=2,\n",
" color=\"tab:blue\",\n",
" label=\"sensor_3\",\n",
" bottom=rdf[\"series_0\"] + rdf[\"series_1\"],\n",
")\n",
"plt.grid(axis=\"y\")\n",
"plt.legend()\n",
"plt.ylim([0,1])\n",
"plt.ylim([0, 1])\n",
"plt.title(\"The contribution of each sensor to the detected anomaly\")\n",
"plt.show()"
]

View file

@ -111,7 +111,7 @@
"from pyspark.sql.functions import lit\n",
"from pyspark.ml import PipelineModel\n",
"from pyspark.sql.functions import col\n",
"import os\n"
"import os"
]
},
{
@ -122,14 +122,24 @@
"source": [
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['COGNITIVE_SERVICE_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['BING_IMAGE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n",
" os.environ['TRANSLATOR_KEY'] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n",
" os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")\n",
" from notebookutils.visualization import display\n"
"\n",
" os.environ[\"ANOMALY_API_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"cognitive-api-key\"\n",
" )\n",
" os.environ[\"COGNITIVE_SERVICE_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"cognitive-api-key\"\n",
" )\n",
" os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"bing-search-key\"\n",
" )\n",
" os.environ[\"TRANSLATOR_KEY\"] = getSecret(\"mmlspark-build-keys\", \"translator-key\")\n",
" os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"azure-search-key\"\n",
" )\n",
" from notebookutils.visualization import display"
]
},
{
@ -149,7 +159,7 @@
"# A Translator subscription key\n",
"translator_key = os.environ[\"TRANSLATOR_KEY\"]\n",
"# An Azure search key\n",
"search_key = os.environ['AZURE_SEARCH_KEY']\n"
"search_key = os.environ[\"AZURE_SEARCH_KEY\"]"
]
},
{
@ -168,24 +178,32 @@
"outputs": [],
"source": [
"# Create a dataframe that's tied to it's column names\n",
"df = spark.createDataFrame([\n",
" (\"I am so happy today, its sunny!\", \"en-US\"),\n",
" (\"I am frustrated by this rush hour traffic\", \"en-US\"),\n",
" (\"The cognitive services on spark aint bad\", \"en-US\"),\n",
"], [\"text\", \"language\"])\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\"I am so happy today, its sunny!\", \"en-US\"),\n",
" (\"I am frustrated by this rush hour traffic\", \"en-US\"),\n",
" (\"The cognitive services on spark aint bad\", \"en-US\"),\n",
" ],\n",
" [\"text\", \"language\"],\n",
")\n",
"\n",
"# Run the Text Analytics service with options\n",
"sentiment = (TextSentiment()\n",
" .setTextCol(\"text\")\n",
" .setLocation(\"eastus\")\n",
" .setSubscriptionKey(service_key)\n",
" .setOutputCol(\"sentiment\")\n",
" .setErrorCol(\"error\")\n",
" .setLanguageCol(\"language\"))\n",
"sentiment = (\n",
" TextSentiment()\n",
" .setTextCol(\"text\")\n",
" .setLocation(\"eastus\")\n",
" .setSubscriptionKey(service_key)\n",
" .setOutputCol(\"sentiment\")\n",
" .setErrorCol(\"error\")\n",
" .setLanguageCol(\"language\")\n",
")\n",
"\n",
"# Show the results of your text query in a table format\n",
"display(sentiment.transform(df).select(\"text\", col(\n",
" \"sentiment\")[0].getItem(\"sentiment\").alias(\"sentiment\")))"
"display(\n",
" sentiment.transform(df).select(\n",
" \"text\", col(\"sentiment\")[0].getItem(\"sentiment\").alias(\"sentiment\")\n",
" )\n",
")"
]
},
{
@ -203,16 +221,22 @@
"metadata": {},
"outputs": [],
"source": [
"df = spark.createDataFrame([\n",
" (\"20mg of ibuprofen twice a day\",),\n",
" (\"1tsp of Tylenol every 4 hours\",),\n",
" (\"6-drops of Vitamin B-12 every evening\",)], [\"text\"])\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\"20mg of ibuprofen twice a day\",),\n",
" (\"1tsp of Tylenol every 4 hours\",),\n",
" (\"6-drops of Vitamin B-12 every evening\",),\n",
" ],\n",
" [\"text\"],\n",
")\n",
"\n",
"healthcare = (HealthcareSDK()\n",
"healthcare = (\n",
" HealthcareSDK()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setLanguage(\"en\")\n",
" .setOutputCol(\"response\"))\n",
" .setOutputCol(\"response\")\n",
")\n",
"\n",
"display(healthcare.transform(df))"
]
@ -234,24 +258,30 @@
"from pyspark.sql.functions import col, flatten\n",
"\n",
"# Create a dataframe including sentences you want to translate\n",
"df = spark.createDataFrame([\n",
" ([\"Hello, what is your name?\", \"Bye\"],)\n",
"], [\"text\",])\n",
"df = spark.createDataFrame(\n",
" [([\"Hello, what is your name?\", \"Bye\"],)],\n",
" [\n",
" \"text\",\n",
" ],\n",
")\n",
"\n",
"# Run the Translator service with options\n",
"translate = (Translate()\n",
"translate = (\n",
" Translate()\n",
" .setSubscriptionKey(translator_key)\n",
" .setLocation(\"eastus\")\n",
" .setTextCol(\"text\")\n",
" .setToLanguage([\"zh-Hans\"])\n",
" .setOutputCol(\"translation\"))\n",
" .setOutputCol(\"translation\")\n",
")\n",
"\n",
"# Show the results of the translation.\n",
"display(translate\n",
" .transform(df)\n",
" .withColumn(\"translation\", flatten(col(\"translation.translations\")))\n",
" .withColumn(\"translation\", col(\"translation.text\"))\n",
" .select(\"translation\"))"
"display(\n",
" translate.transform(df)\n",
" .withColumn(\"translation\", flatten(col(\"translation.translations\")))\n",
" .withColumn(\"translation\", col(\"translation.text\"))\n",
" .select(\"translation\")\n",
")"
]
},
{
@ -271,22 +301,34 @@
"from pyspark.sql.functions import col, explode\n",
"\n",
"# Create a dataframe containing the source files\n",
"imageDf = spark.createDataFrame([\n",
" (\"https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg\",)\n",
"], [\"source\",])\n",
"imageDf = spark.createDataFrame(\n",
" [\n",
" (\n",
" \"https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/business_card.jpg\",\n",
" )\n",
" ],\n",
" [\n",
" \"source\",\n",
" ],\n",
")\n",
"\n",
"# Run the Form Recognizer service\n",
"analyzeBusinessCards = (AnalyzeBusinessCards()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"source\")\n",
" .setOutputCol(\"businessCards\"))\n",
"analyzeBusinessCards = (\n",
" AnalyzeBusinessCards()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"source\")\n",
" .setOutputCol(\"businessCards\")\n",
")\n",
"\n",
"# Show the results of recognition.\n",
"display(analyzeBusinessCards\n",
" .transform(imageDf)\n",
" .withColumn(\"documents\", explode(col(\"businessCards.analyzeResult.documentResults.fields\")))\n",
" .select(\"source\", \"documents\"))"
"display(\n",
" analyzeBusinessCards.transform(imageDf)\n",
" .withColumn(\n",
" \"documents\", explode(col(\"businessCards.analyzeResult.documentResults.fields\"))\n",
" )\n",
" .select(\"source\", \"documents\")\n",
")"
]
},
{
@ -305,24 +347,38 @@
"outputs": [],
"source": [
"# Create a dataframe with the image URLs\n",
"df = spark.createDataFrame([\n",
" (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/objects.jpg\", ),\n",
" (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/dog.jpg\", ),\n",
" (\"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/house.jpg\", )\n",
"], [\"image\", ])\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\n",
" \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/objects.jpg\",\n",
" ),\n",
" (\n",
" \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/dog.jpg\",\n",
" ),\n",
" (\n",
" \"https://raw.githubusercontent.com/Azure-Samples/cognitive-services-sample-data-files/master/ComputerVision/Images/house.jpg\",\n",
" ),\n",
" ],\n",
" [\n",
" \"image\",\n",
" ],\n",
")\n",
"\n",
"# Run the Computer Vision service. Analyze Image extracts infortmation from/about the images.\n",
"analysis = (AnalyzeImage()\n",
" .setLocation(\"eastus\")\n",
" .setSubscriptionKey(service_key)\n",
" .setVisualFeatures([\"Categories\", \"Color\", \"Description\", \"Faces\", \"Objects\", \"Tags\"])\n",
" .setOutputCol(\"analysis_results\")\n",
" .setImageUrlCol(\"image\")\n",
" .setErrorCol(\"error\"))\n",
"analysis = (\n",
" AnalyzeImage()\n",
" .setLocation(\"eastus\")\n",
" .setSubscriptionKey(service_key)\n",
" .setVisualFeatures(\n",
" [\"Categories\", \"Color\", \"Description\", \"Faces\", \"Objects\", \"Tags\"]\n",
" )\n",
" .setOutputCol(\"analysis_results\")\n",
" .setImageUrlCol(\"image\")\n",
" .setErrorCol(\"error\")\n",
")\n",
"\n",
"# Show the results of what you wanted to pull out of the images.\n",
"display(analysis.transform(df).select(\n",
" \"image\", \"analysis_results.description.tags\"))\n"
"display(analysis.transform(df).select(\"image\", \"analysis_results.description.tags\"))"
]
},
{
@ -343,17 +399,19 @@
"# Number of images Bing will return per query\n",
"imgsPerBatch = 10\n",
"# A list of offsets, used to page into the search results\n",
"offsets = [(i*imgsPerBatch,) for i in range(100)]\n",
"offsets = [(i * imgsPerBatch,) for i in range(100)]\n",
"# Since web content is our data, we create a dataframe with options on that data: offsets\n",
"bingParameters = spark.createDataFrame(offsets, [\"offset\"])\n",
"\n",
"# Run the Bing Image Search service with our text query\n",
"bingSearch = (BingImageSearch()\n",
" .setSubscriptionKey(bing_search_key)\n",
" .setOffsetCol(\"offset\")\n",
" .setQuery(\"Martin Luther King Jr. quotes\")\n",
" .setCount(imgsPerBatch)\n",
" .setOutputCol(\"images\"))\n",
"bingSearch = (\n",
" BingImageSearch()\n",
" .setSubscriptionKey(bing_search_key)\n",
" .setOffsetCol(\"offset\")\n",
" .setQuery(\"Martin Luther King Jr. quotes\")\n",
" .setCount(imgsPerBatch)\n",
" .setOutputCol(\"images\")\n",
")\n",
"\n",
"# Transformer that extracts and flattens the richly structured output of Bing Image Search into a simple URL column\n",
"getUrls = BingImageSearch.getUrlTransformer(\"images\", \"url\")\n",
@ -365,7 +423,7 @@
"pipeline = PipelineModel(stages=[bingSearch, getUrls])\n",
"\n",
"# Show the results of your search: image URLs\n",
"display(pipeline.transform(bingParameters))\n"
"display(pipeline.transform(bingParameters))"
]
},
{
@ -383,20 +441,23 @@
"outputs": [],
"source": [
"# Create a dataframe with our audio URLs, tied to the column called \"url\"\n",
"df = spark.createDataFrame([(\"https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav\",)\n",
" ], [\"url\"])\n",
"df = spark.createDataFrame(\n",
" [(\"https://mmlspark.blob.core.windows.net/datasets/Speech/audio2.wav\",)], [\"url\"]\n",
")\n",
"\n",
"# Run the Speech-to-text service to translate the audio into text\n",
"speech_to_text = (SpeechToTextSDK()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setOutputCol(\"text\")\n",
" .setAudioDataCol(\"url\")\n",
" .setLanguage(\"en-US\")\n",
" .setProfanity(\"Masked\"))\n",
"speech_to_text = (\n",
" SpeechToTextSDK()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setOutputCol(\"text\")\n",
" .setAudioDataCol(\"url\")\n",
" .setLanguage(\"en-US\")\n",
" .setProfanity(\"Masked\")\n",
")\n",
"\n",
"# Show the results of the translation\n",
"display(speech_to_text.transform(df).select(\"url\", \"text.DisplayText\"))\n"
"display(speech_to_text.transform(df).select(\"url\", \"text.DisplayText\"))"
]
},
{
@ -416,14 +477,24 @@
"from synapse.ml.cognitive import TextToSpeech\n",
"\n",
"# Create a dataframe with text and an output file location\n",
"df = spark.createDataFrame([(\"Reading out lod is fun! Check out aka.ms/spark for more information\", \"dbfs:/output.mp3\")], [\"text\", \"output_file\"])\n",
" \n",
"tts = (TextToSpeech()\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\n",
" \"Reading out lod is fun! Check out aka.ms/spark for more information\",\n",
" \"dbfs:/output.mp3\",\n",
" )\n",
" ],\n",
" [\"text\", \"output_file\"],\n",
")\n",
"\n",
"tts = (\n",
" TextToSpeech()\n",
" .setSubscriptionKey(service_key)\n",
" .setTextCol(\"text\")\n",
" .setLocation(\"eastus\")\n",
" .setVoiceName(\"en-US-JennyNeural\") \n",
" .setOutputFileCol(\"output_file\"))\n",
" .setVoiceName(\"en-US-JennyNeural\")\n",
" .setOutputFileCol(\"output_file\")\n",
")\n",
"\n",
"# Check to make sure there were no errors during audio creation\n",
"display(tts.transform(df))"
@ -445,37 +516,43 @@
"outputs": [],
"source": [
"# Create a dataframe with the point data that Anomaly Detector requires\n",
"df = spark.createDataFrame([\n",
" (\"1972-01-01T00:00:00Z\", 826.0),\n",
" (\"1972-02-01T00:00:00Z\", 799.0),\n",
" (\"1972-03-01T00:00:00Z\", 890.0),\n",
" (\"1972-04-01T00:00:00Z\", 900.0),\n",
" (\"1972-05-01T00:00:00Z\", 766.0),\n",
" (\"1972-06-01T00:00:00Z\", 805.0),\n",
" (\"1972-07-01T00:00:00Z\", 821.0),\n",
" (\"1972-08-01T00:00:00Z\", 20000.0),\n",
" (\"1972-09-01T00:00:00Z\", 883.0),\n",
" (\"1972-10-01T00:00:00Z\", 898.0),\n",
" (\"1972-11-01T00:00:00Z\", 957.0),\n",
" (\"1972-12-01T00:00:00Z\", 924.0),\n",
" (\"1973-01-01T00:00:00Z\", 881.0),\n",
" (\"1973-02-01T00:00:00Z\", 837.0),\n",
" (\"1973-03-01T00:00:00Z\", 9000.0)\n",
"], [\"timestamp\", \"value\"]).withColumn(\"group\", lit(\"series1\"))\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\"1972-01-01T00:00:00Z\", 826.0),\n",
" (\"1972-02-01T00:00:00Z\", 799.0),\n",
" (\"1972-03-01T00:00:00Z\", 890.0),\n",
" (\"1972-04-01T00:00:00Z\", 900.0),\n",
" (\"1972-05-01T00:00:00Z\", 766.0),\n",
" (\"1972-06-01T00:00:00Z\", 805.0),\n",
" (\"1972-07-01T00:00:00Z\", 821.0),\n",
" (\"1972-08-01T00:00:00Z\", 20000.0),\n",
" (\"1972-09-01T00:00:00Z\", 883.0),\n",
" (\"1972-10-01T00:00:00Z\", 898.0),\n",
" (\"1972-11-01T00:00:00Z\", 957.0),\n",
" (\"1972-12-01T00:00:00Z\", 924.0),\n",
" (\"1973-01-01T00:00:00Z\", 881.0),\n",
" (\"1973-02-01T00:00:00Z\", 837.0),\n",
" (\"1973-03-01T00:00:00Z\", 9000.0),\n",
" ],\n",
" [\"timestamp\", \"value\"],\n",
").withColumn(\"group\", lit(\"series1\"))\n",
"\n",
"# Run the Anomaly Detector service to look for irregular data\n",
"anamoly_detector = (SimpleDetectAnomalies()\n",
" .setSubscriptionKey(anomaly_key)\n",
" .setLocation(\"eastus\")\n",
" .setTimestampCol(\"timestamp\")\n",
" .setValueCol(\"value\")\n",
" .setOutputCol(\"anomalies\")\n",
" .setGroupbyCol(\"group\")\n",
" .setGranularity(\"monthly\"))\n",
"anamoly_detector = (\n",
" SimpleDetectAnomalies()\n",
" .setSubscriptionKey(anomaly_key)\n",
" .setLocation(\"eastus\")\n",
" .setTimestampCol(\"timestamp\")\n",
" .setValueCol(\"value\")\n",
" .setOutputCol(\"anomalies\")\n",
" .setGroupbyCol(\"group\")\n",
" .setGranularity(\"monthly\")\n",
")\n",
"\n",
"# Show the full results of the analysis with the anomalies marked as \"True\"\n",
"display(anamoly_detector.transform(df).select(\n",
" \"timestamp\", \"value\", \"anomalies.isAnomaly\"))"
"display(\n",
" anamoly_detector.transform(df).select(\"timestamp\", \"value\", \"anomalies.isAnomaly\")\n",
")"
]
},
{
@ -495,19 +572,22 @@
"source": [
"# Use any requests from the python requests library\n",
"\n",
"\n",
"def world_bank_request(country):\n",
" return Request(\"GET\", \"http://api.worldbank.org/v2/country/{}?format=json\".format(country))\n",
" return Request(\n",
" \"GET\", \"http://api.worldbank.org/v2/country/{}?format=json\".format(country)\n",
" )\n",
"\n",
"\n",
"# Create a dataframe with spcificies which countries we want data on\n",
"df = (spark.createDataFrame([(\"br\",), (\"usa\",)], [\"country\"])\n",
" .withColumn(\"request\", http_udf(world_bank_request)(col(\"country\"))))\n",
"df = spark.createDataFrame([(\"br\",), (\"usa\",)], [\"country\"]).withColumn(\n",
" \"request\", http_udf(world_bank_request)(col(\"country\"))\n",
")\n",
"\n",
"# Much faster for big data because of the concurrency :)\n",
"client = (HTTPTransformer()\n",
" .setConcurrency(3)\n",
" .setInputCol(\"request\")\n",
" .setOutputCol(\"response\"))\n",
"client = (\n",
" HTTPTransformer().setConcurrency(3).setInputCol(\"request\").setOutputCol(\"response\")\n",
")\n",
"\n",
"# Get the body of the response\n",
"\n",
@ -517,9 +597,11 @@
"\n",
"\n",
"# Show the details of the country data returned\n",
"display(client.transform(df)\n",
" .select(\"country\", udf(get_response_body)(col(\"response\"))\n",
" .alias(\"response\")))\n"
"display(\n",
" client.transform(df).select(\n",
" \"country\", udf(get_response_body)(col(\"response\")).alias(\"response\")\n",
" )\n",
")"
]
},
{
@ -540,25 +622,44 @@
"search_service = \"mmlspark-azure-search\"\n",
"search_index = \"test-33467690\"\n",
"\n",
"df = spark.createDataFrame([(\"upload\", \"0\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\"),\n",
" (\"upload\", \"1\", \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\")],\n",
" [\"searchAction\", \"id\", \"url\"])\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\n",
" \"upload\",\n",
" \"0\",\n",
" \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test1.jpg\",\n",
" ),\n",
" (\n",
" \"upload\",\n",
" \"1\",\n",
" \"https://mmlspark.blob.core.windows.net/datasets/DSIR/test2.jpg\",\n",
" ),\n",
" ],\n",
" [\"searchAction\", \"id\", \"url\"],\n",
")\n",
"\n",
"tdf = AnalyzeImage()\\\n",
" .setSubscriptionKey(service_key)\\\n",
" .setLocation(\"eastus\")\\\n",
" .setImageUrlCol(\"url\")\\\n",
" .setOutputCol(\"analyzed\")\\\n",
" .setErrorCol(\"errors\")\\\n",
" .setVisualFeatures([\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\\\n",
" .transform(df).select(\"*\", \"analyzed.*\")\\\n",
"tdf = (\n",
" AnalyzeImage()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"url\")\n",
" .setOutputCol(\"analyzed\")\n",
" .setErrorCol(\"errors\")\n",
" .setVisualFeatures(\n",
" [\"Categories\", \"Tags\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"]\n",
" )\n",
" .transform(df)\n",
" .select(\"*\", \"analyzed.*\")\n",
" .drop(\"errors\", \"analyzed\")\n",
")\n",
"\n",
"tdf.writeToAzureSearch(subscriptionKey=search_key,\n",
" actionCol=\"searchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"id\")\n"
"tdf.writeToAzureSearch(\n",
" subscriptionKey=search_key,\n",
" actionCol=\"searchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"id\",\n",
")"
]
}
],

View file

@ -43,16 +43,21 @@
"cell_type": "code",
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['ANOMALY_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
"\n",
"service_key = os.environ[\"ANOMALY_API_KEY\"] # Paste your anomaly detector key here\n",
"location = \"westus2\" # Paste your anomaly detector location here\n",
" os.environ[\"ANOMALY_API_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"cognitive-api-key\"\n",
" )\n",
"\n",
"assert (service_key is not None)"
"service_key = os.environ[\"ANOMALY_API_KEY\"] # Paste your anomaly detector key here\n",
"location = \"westus2\" # Paste your anomaly detector location here\n",
"\n",
"assert service_key is not None"
],
"metadata": {},
"outputs": [],
@ -68,7 +73,11 @@
{
"cell_type": "code",
"source": [
"df_signals = spark.read.csv(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/iot/IoTSignals.csv\", header=True, inferSchema=True)"
"df_signals = spark.read.csv(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/iot/IoTSignals.csv\",\n",
" header=True,\n",
" inferSchema=True,\n",
")"
],
"metadata": {},
"outputs": [],
@ -84,7 +93,29 @@
{
"cell_type": "code",
"source": [
"from pyspark.sql.functions import col, struct\nfrom synapse.ml.cognitive import SimpleDetectAnomalies\nfrom synapse.ml.core.spark import FluentAPI\n\ndetector = (SimpleDetectAnomalies()\n .setSubscriptionKey(service_key)\n .setLocation(location)\n .setOutputCol(\"anomalies\")\n .setGroupbyCol(\"grouping\")\n .setSensitivity(95)\n .setGranularity(\"secondly\"))\n\ndf_anomaly = (df_signals\n .where(col(\"unitSymbol\") == 'RPM')\n .withColumn(\"timestamp\", col(\"dateTime\").cast(\"string\"))\n .withColumn(\"value\", col(\"measureValue\").cast(\"double\"))\n .withColumn(\"grouping\", struct(\"deviceId\"))\n .mlTransform(detector)).cache()\n\ndf_anomaly.createOrReplaceTempView('df_anomaly')"
"from pyspark.sql.functions import col, struct\n",
"from synapse.ml.cognitive import SimpleDetectAnomalies\n",
"from synapse.ml.core.spark import FluentAPI\n",
"\n",
"detector = (\n",
" SimpleDetectAnomalies()\n",
" .setSubscriptionKey(service_key)\n",
" .setLocation(location)\n",
" .setOutputCol(\"anomalies\")\n",
" .setGroupbyCol(\"grouping\")\n",
" .setSensitivity(95)\n",
" .setGranularity(\"secondly\")\n",
")\n",
"\n",
"df_anomaly = (\n",
" df_signals.where(col(\"unitSymbol\") == \"RPM\")\n",
" .withColumn(\"timestamp\", col(\"dateTime\").cast(\"string\"))\n",
" .withColumn(\"value\", col(\"measureValue\").cast(\"double\"))\n",
" .withColumn(\"grouping\", struct(\"deviceId\"))\n",
" .mlTransform(detector)\n",
").cache()\n",
"\n",
"df_anomaly.createOrReplaceTempView(\"df_anomaly\")"
],
"metadata": {},
"outputs": [],
@ -100,7 +131,7 @@
{
"cell_type": "code",
"source": [
"df_anomaly.select(\"timestamp\",\"value\",\"deviceId\",\"anomalies.isAnomaly\").show(3)\n"
"df_anomaly.select(\"timestamp\", \"value\", \"deviceId\", \"anomalies.isAnomaly\").show(3)"
],
"metadata": {},
"outputs": [],
@ -123,7 +154,21 @@
{
"cell_type": "code",
"source": [
"df_anomaly_single_device = spark.sql(\"\"\"\nselect\n timestamp,\n measureValue,\n anomalies.expectedValue,\n anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue,\n anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue,\n case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly\nfrom\n df_anomaly\nwhere deviceid = 'dev-1' and timestamp < '2020-04-29'\norder by timestamp\nlimit 200\"\"\")"
"df_anomaly_single_device = spark.sql(\n",
" \"\"\"\n",
"select\n",
" timestamp,\n",
" measureValue,\n",
" anomalies.expectedValue,\n",
" anomalies.expectedValue + anomalies.upperMargin as expectedUpperValue,\n",
" anomalies.expectedValue - anomalies.lowerMargin as expectedLowerValue,\n",
" case when anomalies.isAnomaly=true then 1 else 0 end as isAnomaly\n",
"from\n",
" df_anomaly\n",
"where deviceid = 'dev-1' and timestamp < '2020-04-29'\n",
"order by timestamp\n",
"limit 200\"\"\"\n",
")"
],
"metadata": {},
"outputs": [],
@ -139,7 +184,50 @@
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\nfrom pyspark.sql.functions import col\n\nadf = df_anomaly_single_device.toPandas()\nadf_subset = df_anomaly_single_device.where(col(\"isAnomaly\") == 1).toPandas()\n\nplt.figure(figsize=(23,8))\nplt.plot(adf['timestamp'],adf['expectedUpperValue'], color='darkred', linestyle='solid', linewidth=0.25, label='UpperMargin')\nplt.plot(adf['timestamp'],adf['expectedValue'], color='darkgreen', linestyle='solid', linewidth=2, label='Expected Value')\nplt.plot(adf['timestamp'],adf['measureValue'], 'b', color='royalblue', linestyle='dotted', linewidth=2, label='Actual')\nplt.plot(adf['timestamp'],adf['expectedLowerValue'], color='black', linestyle='solid', linewidth=0.25, label='Lower Margin')\nplt.plot(adf_subset['timestamp'],adf_subset['measureValue'], 'ro', label = 'Anomaly')\nplt.legend()\nplt.title('RPM Anomalies with Confidence Intervals')\nplt.show()"
"import matplotlib.pyplot as plt\n",
"from pyspark.sql.functions import col\n",
"\n",
"adf = df_anomaly_single_device.toPandas()\n",
"adf_subset = df_anomaly_single_device.where(col(\"isAnomaly\") == 1).toPandas()\n",
"\n",
"plt.figure(figsize=(23, 8))\n",
"plt.plot(\n",
" adf[\"timestamp\"],\n",
" adf[\"expectedUpperValue\"],\n",
" color=\"darkred\",\n",
" linestyle=\"solid\",\n",
" linewidth=0.25,\n",
" label=\"UpperMargin\",\n",
")\n",
"plt.plot(\n",
" adf[\"timestamp\"],\n",
" adf[\"expectedValue\"],\n",
" color=\"darkgreen\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"Expected Value\",\n",
")\n",
"plt.plot(\n",
" adf[\"timestamp\"],\n",
" adf[\"measureValue\"],\n",
" \"b\",\n",
" color=\"royalblue\",\n",
" linestyle=\"dotted\",\n",
" linewidth=2,\n",
" label=\"Actual\",\n",
")\n",
"plt.plot(\n",
" adf[\"timestamp\"],\n",
" adf[\"expectedLowerValue\"],\n",
" color=\"black\",\n",
" linestyle=\"solid\",\n",
" linewidth=0.25,\n",
" label=\"Lower Margin\",\n",
")\n",
"plt.plot(adf_subset[\"timestamp\"], adf_subset[\"measureValue\"], \"ro\", label=\"Anomaly\")\n",
"plt.legend()\n",
"plt.title(\"RPM Anomalies with Confidence Intervals\")\n",
"plt.show()"
],
"metadata": {},
"outputs": [],

View file

@ -40,7 +40,7 @@
"retry_strategy = Retry(\n",
" total=3,\n",
" status_forcelist=[429, 500, 502, 503, 504],\n",
" method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"]\n",
" method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"],\n",
")\n",
"adapter = HTTPAdapter(max_retries=retry_strategy)\n",
"http = requests.Session()\n",
@ -49,42 +49,51 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['AZURE_MAPS_KEY'] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n",
"\n",
" os.environ[\"AZURE_MAPS_KEY\"] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n",
" from notebookutils.visualization import display\n",
"\n",
"\n",
"\n",
"# Azure Maps account key\n",
"azureMapsKey = os.environ[\"AZURE_MAPS_KEY\"] #Replace this with your azure maps key\n",
"azureMapsKey = os.environ[\"AZURE_MAPS_KEY\"] # Replace this with your azure maps key\n",
"\n",
"# Creator Geo prefix\n",
"# for this example, assuming that the creator resource is created in `EAST US 2`.\n",
"atlas_geo_prefix = \"us\"\n",
"\n",
"# Load flood plains data\n",
"flood_plain_geojson = http.get(\"https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson\").content\n",
"flood_plain_geojson = http.get(\n",
" \"https://mmlspark.blob.core.windows.net/publicwasb/maps/KingCountyFloodPlains.geojson\"\n",
").content\n",
"\n",
"# Upload this flood plains data to your maps/creator account. This is a Long-Running async operation and takes approximately 15~30 seconds to complete\n",
"r= http.post(f'https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}',\n",
" json=json.loads(flood_plain_geojson))\n",
"r = http.post(\n",
" f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}\",\n",
" json=json.loads(flood_plain_geojson),\n",
")\n",
"\n",
"# Poll for resource upload completion\n",
"resource_location = r.headers.get('location')\n",
"resource_location = r.headers.get(\"location\")\n",
"for _ in range(20):\n",
" resource = json.loads(http.get(f'{resource_location}&subscription-key={azureMapsKey}').content)\n",
" status = resource[\"status\"].lower()\n",
" if status == \"running\":\n",
" time.sleep(5) # wait in a polling loop\n",
" elif status == \"succeeded\":\n",
" break\n",
" else:\n",
" raise ValueError(\"Unknown status {}\".format(status))\n",
" resource = json.loads(\n",
" http.get(f\"{resource_location}&subscription-key={azureMapsKey}\").content\n",
" )\n",
" status = resource[\"status\"].lower()\n",
" if status == \"running\":\n",
" time.sleep(5) # wait in a polling loop\n",
" elif status == \"succeeded\":\n",
" break\n",
" else:\n",
" raise ValueError(\"Unknown status {}\".format(status))\n",
"\n",
"# Once the above operation returns a HTTP 201, get the user_data_id of the flood plains data, you uploaded to your map account.\n",
"user_data_id_resource_url = resource['resourceLocation']\n",
"user_data_id = json.loads(http.get(f'{user_data_id_resource_url}&subscription-key={azureMapsKey}').content)['udid']"
"user_data_id_resource_url = resource[\"resourceLocation\"]\n",
"user_data_id = json.loads(\n",
" http.get(f\"{user_data_id_resource_url}&subscription-key={azureMapsKey}\").content\n",
")[\"udid\"]"
]
},
{
@ -102,9 +111,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read\\\n",
" .option(\"header\", \"true\")\\\n",
" .csv(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/maps/KingCountyAddress.csv\")\n",
"data = spark.read.option(\"header\", \"true\").csv(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/maps/KingCountyAddress.csv\"\n",
")\n",
"\n",
"# Visualize incoming schema\n",
"print(\"Schema:\")\n",
@ -135,23 +144,39 @@
"from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch\n",
"from synapse.ml.geospatial import *\n",
"\n",
"\n",
"def extract_location_fields(df):\n",
" # Use this function to select only lat/lon columns into the dataframe\n",
" return df.select(col(\"*\"),\n",
" col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lat\").alias(\"Latitude\"),\n",
" col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lon\").alias(\"Longitude\")\n",
" return df.select(\n",
" col(\"*\"),\n",
" col(\"output.response.results\")\n",
" .getItem(0)\n",
" .getField(\"position\")\n",
" .getField(\"lat\")\n",
" .alias(\"Latitude\"),\n",
" col(\"output.response.results\")\n",
" .getItem(0)\n",
" .getField(\"position\")\n",
" .getField(\"lon\")\n",
" .alias(\"Longitude\"),\n",
" ).drop(\"output\")\n",
" \n",
"\n",
"\n",
"# Azure Maps geocoder to enhance the dataframe with location data\n",
"geocoder = (AddressGeocoder()\n",
"geocoder = (\n",
" AddressGeocoder()\n",
" .setSubscriptionKey(azureMapsKey)\n",
" .setAddressCol(\"FullAddress\")\n",
" .setOutputCol(\"output\"))\n",
" .setOutputCol(\"output\")\n",
")\n",
"\n",
"# Set up a fixed mini batch transformer to geocode addresses\n",
"batched_dataframe = geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(subset_data.coalesce(1)))\n",
"geocoded_addresses = extract_location_fields(FlattenBatch().transform(batched_dataframe))\n",
"batched_dataframe = geocoder.transform(\n",
" FixedMiniBatchTransformer().setBatchSize(10).transform(subset_data.coalesce(1))\n",
")\n",
"geocoded_addresses = extract_location_fields(\n",
" FlattenBatch().transform(batched_dataframe)\n",
")\n",
"\n",
"# Display the results\n",
"display(geocoded_addresses)"
@ -174,22 +199,27 @@
"source": [
"def extract_point_in_polygon_result_fields(df):\n",
" # Use this function to select only lat/lon columns into the dataframe\n",
" return df.select(col(\"*\"),\n",
" return df.select(\n",
" col(\"*\"),\n",
" col(\"output.result.pointInPolygons\").alias(\"In Polygon\"),\n",
" col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\")\n",
" col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\"),\n",
" ).drop(\"output\")\n",
"\n",
"\n",
"check_point_in_polygon = (CheckPointInPolygon()\n",
"check_point_in_polygon = (\n",
" CheckPointInPolygon()\n",
" .setSubscriptionKey(azureMapsKey)\n",
" .setGeography(atlas_geo_prefix)\n",
" .setUserDataIdentifier(user_data_id)\n",
" .setLatitudeCol(\"Latitude\")\n",
" .setLongitudeCol(\"Longitude\")\n",
" .setOutputCol(\"output\"))\n",
" .setOutputCol(\"output\")\n",
")\n",
"\n",
"\n",
"flood_plain_addresses = extract_point_in_polygon_result_fields(check_point_in_polygon.transform(geocoded_addresses))\n",
"flood_plain_addresses = extract_point_in_polygon_result_fields(\n",
" check_point_in_polygon.transform(geocoded_addresses)\n",
")\n",
"\n",
"# Display the results\n",
"display(flood_plain_addresses)"
@ -209,7 +239,9 @@
"metadata": {},
"outputs": [],
"source": [
"res = http.delete(f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\")"
"res = http.delete(\n",
" f\"https://{atlas_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\"\n",
")"
]
}
],

View file

@ -54,7 +54,7 @@
"outputs": [],
"source": [
"from pyspark.sql.functions import udf, col\n",
"from pyspark.sql.types import StructType,StructField, DoubleType\n",
"from pyspark.sql.types import StructType, StructField, DoubleType\n",
"from pyspark.sql.functions import lit\n",
"from pyspark.ml import PipelineModel\n",
"from pyspark.sql.functions import col\n",
@ -67,7 +67,7 @@
"retry_strategy = Retry(\n",
" total=3,\n",
" status_forcelist=[429, 500, 502, 503, 504],\n",
" method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"]\n",
" method_whitelist=[\"HEAD\", \"GET\", \"PUT\", \"DELETE\", \"OPTIONS\", \"TRACE\"],\n",
")\n",
"adapter = HTTPAdapter(max_retries=retry_strategy)\n",
"http = requests.Session()\n",
@ -83,10 +83,12 @@
"source": [
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['AZURE_MAPS_KEY'] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n",
" from notebookutils.visualization import display\n"
"\n",
" os.environ[\"AZURE_MAPS_KEY\"] = getSecret(\"mmlspark-build-keys\", \"azuremaps-api-key\")\n",
" from notebookutils.visualization import display"
]
},
{
@ -119,30 +121,51 @@
"source": [
"from synapse.ml.stages import FixedMiniBatchTransformer, FlattenBatch\n",
"\n",
"df = spark.createDataFrame([\n",
" (\"One, Microsoft Way, Redmond\",),\n",
" (\"400 Broad St, Seattle\",),\n",
" (\"350 5th Ave, New York\",),\n",
" (\"Pike Pl, Seattle\",),\n",
" (\"Champ de Mars, 5 Avenue Anatole France, 75007 Paris\",)\n",
"], [\"address\",])\n",
"df = spark.createDataFrame(\n",
" [\n",
" (\"One, Microsoft Way, Redmond\",),\n",
" (\"400 Broad St, Seattle\",),\n",
" (\"350 5th Ave, New York\",),\n",
" (\"Pike Pl, Seattle\",),\n",
" (\"Champ de Mars, 5 Avenue Anatole France, 75007 Paris\",),\n",
" ],\n",
" [\n",
" \"address\",\n",
" ],\n",
")\n",
"\n",
"\n",
"def extract_location_fields(df):\n",
" # Use this function to select only lat/lon columns into the dataframe\n",
" return df.select(col(\"*\"),\n",
" col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lat\").alias(\"Latitude\"),\n",
" col(\"output.response.results\").getItem(0).getField(\"position\").getField(\"lon\").alias(\"Longitude\")\n",
" return df.select(\n",
" col(\"*\"),\n",
" col(\"output.response.results\")\n",
" .getItem(0)\n",
" .getField(\"position\")\n",
" .getField(\"lat\")\n",
" .alias(\"Latitude\"),\n",
" col(\"output.response.results\")\n",
" .getItem(0)\n",
" .getField(\"position\")\n",
" .getField(\"lon\")\n",
" .alias(\"Longitude\"),\n",
" ).drop(\"output\")\n",
"\n",
"\n",
"# Run the Azure Maps geocoder to enhance the data with location data\n",
"geocoder = (AddressGeocoder()\n",
"geocoder = (\n",
" AddressGeocoder()\n",
" .setSubscriptionKey(azureMapsKey)\n",
" .setAddressCol(\"address\")\n",
" .setOutputCol(\"output\"))\n",
" .setOutputCol(\"output\")\n",
")\n",
"\n",
"# Show the results of your text query in a table format\n",
"display(extract_location_fields(geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))))"
"display(\n",
" extract_location_fields(\n",
" geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))\n",
" )\n",
")"
]
},
{
@ -161,26 +184,46 @@
"outputs": [],
"source": [
"# Create a dataframe that's tied to it's column names\n",
"df = spark.createDataFrame(((\n",
" (48.858561, 2.294911),\n",
" (47.639765, -122.127896),\n",
" (47.621028, -122.348170),\n",
" (47.734012, -122.102737)\n",
" )), StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]))\n",
"df = spark.createDataFrame(\n",
" (\n",
" (\n",
" (48.858561, 2.294911),\n",
" (47.639765, -122.127896),\n",
" (47.621028, -122.348170),\n",
" (47.734012, -122.102737),\n",
" )\n",
" ),\n",
" StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]),\n",
")\n",
"\n",
"# Run the Azure Maps geocoder to enhance the data with location data\n",
"rev_geocoder = (ReverseAddressGeocoder()\n",
"rev_geocoder = (\n",
" ReverseAddressGeocoder()\n",
" .setSubscriptionKey(azureMapsKey)\n",
" .setLatitudeCol(\"lat\")\n",
" .setLongitudeCol(\"lon\")\n",
" .setOutputCol(\"output\"))\n",
" .setOutputCol(\"output\")\n",
")\n",
"\n",
"# Show the results of your text query in a table format\n",
"\n",
"display(rev_geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df)).select(col(\"*\"),\n",
" col(\"output.response.addresses\").getItem(0).getField(\"address\").getField(\"freeformAddress\").alias(\"In Polygon\"),\n",
" col(\"output.response.addresses\").getItem(0).getField(\"address\").getField(\"country\").alias(\"Intersecting Polygons\")\n",
" ).drop(\"output\"))\n"
"display(\n",
" rev_geocoder.transform(FixedMiniBatchTransformer().setBatchSize(10).transform(df))\n",
" .select(\n",
" col(\"*\"),\n",
" col(\"output.response.addresses\")\n",
" .getItem(0)\n",
" .getField(\"address\")\n",
" .getField(\"freeformAddress\")\n",
" .alias(\"In Polygon\"),\n",
" col(\"output.response.addresses\")\n",
" .getItem(0)\n",
" .getField(\"address\")\n",
" .getField(\"country\")\n",
" .alias(\"Intersecting Polygons\"),\n",
" )\n",
" .drop(\"output\")\n",
")"
]
},
{
@ -211,56 +254,47 @@
"import json\n",
"\n",
"# Choose a geography, you want your data to reside in.\n",
"# Allowed values \n",
"# Allowed values\n",
"# us => North American datacenters\n",
"# eu -> European datacenters\n",
"url_geo_prefix = 'us' \n",
"url_geo_prefix = \"us\"\n",
"\n",
"# Upload a geojson with polygons in them\n",
"r= http.post(f'https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}',\n",
" json= { \n",
" \"type\": \"FeatureCollection\", \n",
"r = http.post(\n",
" f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/upload?api-version=1.0&dataFormat=geojson&subscription-key={azureMapsKey}\",\n",
" json={\n",
" \"type\": \"FeatureCollection\",\n",
" \"features\": [\n",
" {\n",
" \"type\": \"Feature\",\n",
" \"properties\": { \"geometryId\": \"test_geometry\" },\n",
" \"properties\": {\"geometryId\": \"test_geometry\"},\n",
" \"geometry\": {\n",
" \"type\": \"Polygon\",\n",
" \"coordinates\":[\n",
" \"coordinates\": [\n",
" [\n",
" [\n",
" -122.14290618896484,\n",
" 47.67856488312544\n",
" ],\n",
" [\n",
" -122.03956604003906,\n",
" 47.67856488312544\n",
" ],\n",
" [\n",
" -122.03956604003906,\n",
" 47.7483271435476\n",
" ],\n",
" [\n",
" -122.14290618896484,\n",
" 47.7483271435476\n",
" ],\n",
" [\n",
" -122.14290618896484,\n",
" 47.67856488312544\n",
" ]\n",
" [-122.14290618896484, 47.67856488312544],\n",
" [-122.03956604003906, 47.67856488312544],\n",
" [-122.03956604003906, 47.7483271435476],\n",
" [-122.14290618896484, 47.7483271435476],\n",
" [-122.14290618896484, 47.67856488312544],\n",
" ]\n",
" ]\n",
" } \n",
" } \n",
" ] \n",
" })\n",
" ],\n",
" },\n",
" }\n",
" ],\n",
" },\n",
")\n",
"\n",
"long_running_operation = r.headers.get('location')\n",
"time.sleep(30) # Sometimes this may take upto 30 seconds\n",
"long_running_operation = r.headers.get(\"location\")\n",
"time.sleep(30) # Sometimes this may take upto 30 seconds\n",
"print(f\"Status Code: {r.status_code}, Long Running Operation: {long_running_operation}\")\n",
"# This Operation completes in approximately 5 ~ 15 seconds \n",
"user_data_id_resource_url = json.loads(http.get(f'{long_running_operation}&subscription-key={azureMapsKey}').content)['resourceLocation']\n",
"user_data_id = json.loads(http.get(f'{user_data_id_resource_url}&subscription-key={azureMapsKey}').content)['udid']"
"# This Operation completes in approximately 5 ~ 15 seconds\n",
"user_data_id_resource_url = json.loads(\n",
" http.get(f\"{long_running_operation}&subscription-key={azureMapsKey}\").content\n",
")[\"resourceLocation\"]\n",
"user_data_id = json.loads(\n",
" http.get(f\"{user_data_id_resource_url}&subscription-key={azureMapsKey}\").content\n",
")[\"udid\"]"
]
},
{
@ -277,27 +311,39 @@
"outputs": [],
"source": [
"# Create a dataframe that's tied to it's column names\n",
"df = spark.createDataFrame(((\n",
" (48.858561, 2.294911),\n",
" (47.639765, -122.127896),\n",
" (47.621028, -122.348170),\n",
" (47.734012, -122.102737)\n",
" )), StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]))\n",
"df = spark.createDataFrame(\n",
" (\n",
" (\n",
" (48.858561, 2.294911),\n",
" (47.639765, -122.127896),\n",
" (47.621028, -122.348170),\n",
" (47.734012, -122.102737),\n",
" )\n",
" ),\n",
" StructType([StructField(\"lat\", DoubleType()), StructField(\"lon\", DoubleType())]),\n",
")\n",
"\n",
"# Run the Azure Maps geocoder to enhance the data with location data\n",
"check_point_in_polygon = (CheckPointInPolygon()\n",
"check_point_in_polygon = (\n",
" CheckPointInPolygon()\n",
" .setSubscriptionKey(azureMapsKey)\n",
" .setGeography(url_geo_prefix)\n",
" .setUserDataIdentifier(user_data_id)\n",
" .setLatitudeCol(\"lat\")\n",
" .setLongitudeCol(\"lon\")\n",
" .setOutputCol(\"output\"))\n",
" .setOutputCol(\"output\")\n",
")\n",
"\n",
"# Show the results of your text query in a table format\n",
"display(check_point_in_polygon.transform(df).select(col(\"*\"),\n",
"display(\n",
" check_point_in_polygon.transform(df)\n",
" .select(\n",
" col(\"*\"),\n",
" col(\"output.result.pointInPolygons\").alias(\"In Polygon\"),\n",
" col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\")\n",
" ).drop(\"output\"))"
" col(\"output.result.intersectingGeometries\").alias(\"Intersecting Polygons\"),\n",
" )\n",
" .drop(\"output\")\n",
")"
]
},
{
@ -313,7 +359,9 @@
"metadata": {},
"outputs": [],
"source": [
"res = http.delete(f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\")"
"res = http.delete(\n",
" f\"https://{url_geo_prefix}.atlas.microsoft.com/mapData/{user_data_id}?api-version=1.0&subscription-key={azureMapsKey}\"\n",
")"
]
}
],

View file

@ -82,6 +82,7 @@
"source": [
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display"
],
@ -120,14 +121,26 @@
"outputs": [],
"source": [
"# Table inputs\n",
"timestampColumn = \"timestamp\" # str: the name of the timestamp column in the table\n",
"inputCols = ['sensor_1', 'sensor_2', 'sensor_3'] # list(str): the names of the input variables \n",
"timestampColumn = \"timestamp\" # str: the name of the timestamp column in the table\n",
"inputCols = [\n",
" \"sensor_1\",\n",
" \"sensor_2\",\n",
" \"sensor_3\",\n",
"] # list(str): the names of the input variables\n",
"\n",
"# Training Start time, and number of days to use for training: \n",
"trainingStartTime = \"2022-02-24T06:00:00Z\" # datetime: datetime for when to start the training\n",
"trainingEndTime = \"2022-03-08T23:55:00Z\" # datetime: datetime for when to end the training\n",
"inferenceStartTime = \"2022-03-09T09:30:00Z\" # datetime: datetime for when to start the training\n",
"inferenceEndTime = \"2022-03-20T23:55:00Z\" # datetime: datetime for when to end the training\n",
"# Training Start time, and number of days to use for training:\n",
"trainingStartTime = (\n",
" \"2022-02-24T06:00:00Z\" # datetime: datetime for when to start the training\n",
")\n",
"trainingEndTime = (\n",
" \"2022-03-08T23:55:00Z\" # datetime: datetime for when to end the training\n",
")\n",
"inferenceStartTime = (\n",
" \"2022-03-09T09:30:00Z\" # datetime: datetime for when to start the training\n",
")\n",
"inferenceEndTime = (\n",
" \"2022-03-20T23:55:00Z\" # datetime: datetime for when to end the training\n",
")\n",
"\n",
"# Isolation Forest parameters\n",
"contamination = 0.021\n",
@ -169,7 +182,13 @@
},
"outputs": [],
"source": [
"df = spark.read.format(\"csv\").option(\"header\", \"true\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/generated_sample_mvad_data.csv\")"
"df = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", \"true\")\n",
" .load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/generated_sample_mvad_data.csv\"\n",
" )\n",
")"
]
},
{
@ -200,14 +219,13 @@
"outputs": [],
"source": [
"df = (\n",
" df\n",
" .orderBy(timestampColumn)\n",
" df.orderBy(timestampColumn)\n",
" .withColumn(\"timestamp\", F.date_format(timestampColumn, \"yyyy-MM-dd'T'HH:mm:ss'Z'\"))\n",
" .withColumn(\"sensor_1\", F.col(\"sensor_1\").cast(DoubleType()))\n",
" .withColumn(\"sensor_2\", F.col(\"sensor_2\").cast(DoubleType()))\n",
" .withColumn(\"sensor_3\", F.col(\"sensor_3\").cast(DoubleType()))\n",
" .drop('_c5')\n",
" )\n",
" .drop(\"_c5\")\n",
")\n",
"\n",
"display(df)"
]
@ -240,7 +258,10 @@
"outputs": [],
"source": [
"# filter to data with timestamps within the training window\n",
"df_train = df.filter((F.col(timestampColumn) >= trainingStartTime) & (F.col(timestampColumn) <= trainingEndTime))\n",
"df_train = df.filter(\n",
" (F.col(timestampColumn) >= trainingStartTime)\n",
" & (F.col(timestampColumn) <= trainingEndTime)\n",
")\n",
"display(df_train)"
]
},
@ -272,7 +293,10 @@
"outputs": [],
"source": [
"# filter to data with timestamps within the inference window\n",
"df_test = df.filter((F.col(timestampColumn) >= inferenceStartTime) & (F.col(timestampColumn) <= inferenceEndTime))\n",
"df_test = df.filter(\n",
" (F.col(timestampColumn) >= inferenceStartTime)\n",
" & (F.col(timestampColumn) <= inferenceEndTime)\n",
")\n",
"display(df_test)"
]
},
@ -303,17 +327,19 @@
},
"outputs": [],
"source": [
"isolationForest = (IsolationForest()\n",
" .setNumEstimators(num_estimators)\n",
" .setBootstrap(False)\n",
" .setMaxSamples(max_samples)\n",
" .setMaxFeatures(max_features)\n",
" .setFeaturesCol(\"features\")\n",
" .setPredictionCol(\"predictedLabel\")\n",
" .setScoreCol(\"outlierScore\")\n",
" .setContamination(contamination)\n",
" .setContaminationError(0.01 * contamination)\n",
" .setRandomSeed(1))"
"isolationForest = (\n",
" IsolationForest()\n",
" .setNumEstimators(num_estimators)\n",
" .setBootstrap(False)\n",
" .setMaxSamples(max_samples)\n",
" .setMaxFeatures(max_features)\n",
" .setFeaturesCol(\"features\")\n",
" .setPredictionCol(\"predictedLabel\")\n",
" .setScoreCol(\"outlierScore\")\n",
" .setContamination(contamination)\n",
" .setContaminationError(0.01 * contamination)\n",
" .setRandomSeed(1)\n",
")"
]
},
{
@ -350,7 +376,9 @@
" va = VectorAssembler(inputCols=inputCols, outputCol=\"features\")\n",
" pipeline = Pipeline(stages=[va, isolationForest])\n",
" model = pipeline.fit(df_train)\n",
" mlflow.spark.log_model(model, artifact_path=artifact_path,registered_model_name=model_name)"
" mlflow.spark.log_model(\n",
" model, artifact_path=artifact_path, registered_model_name=model_name\n",
" )"
]
},
{
@ -457,14 +485,14 @@
},
"outputs": [],
"source": [
"# Here, we create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column \n",
"# Here, we create a TabularSHAP explainer, set the input columns to all the features the model takes, specify the model and the target output column\n",
"# we are trying to explain. In this case, we are trying to explain the \"outlierScore\" output.\n",
"shap = TabularSHAP(\n",
" inputCols=inputCols,\n",
" outputCol=\"shapValues\",\n",
" model=model,\n",
" targetCol=\"outlierScore\",\n",
" backgroundData=F.broadcast(df_test)\n",
" backgroundData=F.broadcast(df_test),\n",
")"
]
},
@ -522,13 +550,14 @@
},
"outputs": [],
"source": [
"# Here, we extract the SHAP values, the original features and the outlier score column. Then we convert it to a Pandas DataFrame for visualization. \n",
"# For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset), \n",
"# Here, we extract the SHAP values, the original features and the outlier score column. Then we convert it to a Pandas DataFrame for visualization.\n",
"# For each observation, the first element in the SHAP values vector is the base value (the mean output of the background dataset),\n",
"# and each of the following elements represents the SHAP values for each feature\n",
"shaps = (\n",
" shap_df\n",
" .withColumn(\"shapValues\", vec2array(F.col(\"shapValues\").getItem(0)))\n",
" .select([\"shapValues\", \"outlierScore\"] + inputCols + [timestampColumn, \"prediction\"])\n",
" shap_df.withColumn(\"shapValues\", vec2array(F.col(\"shapValues\").getItem(0)))\n",
" .select(\n",
" [\"shapValues\", \"outlierScore\"] + inputCols + [timestampColumn, \"prediction\"]\n",
" )\n",
" .withColumn(\"sensor_1_localimp\", F.col(\"shapValues\")[1])\n",
" .withColumn(\"sensor_2_localimp\", F.col(\"shapValues\")[2])\n",
" .withColumn(\"sensor_3_localimp\", F.col(\"shapValues\")[3])\n",
@ -565,7 +594,7 @@
},
"outputs": [],
"source": [
"local_importance_values = shaps_local[['shapValues']]\n",
"local_importance_values = shaps_local[[\"shapValues\"]]\n",
"eval_data = shaps_local[inputCols]"
]
},
@ -634,8 +663,11 @@
"outputs": [],
"source": [
"from interpret_community.adapter import ExplanationAdapter\n",
"\n",
"adapter = ExplanationAdapter(inputCols, classification=False)\n",
"global_explanation = adapter.create_global(converted_importance_values, eval_data, expected_values=bias)"
"global_explanation = adapter.create_global(\n",
" converted_importance_values, eval_data, expected_values=bias\n",
")"
]
},
{
@ -687,13 +719,20 @@
"source": [
"# Defining a wrapper class with predict method for creating the Explanation Dashboard\n",
"\n",
"\n",
"class wrapper(object):\n",
" def __init__(self, model):\n",
" self.model = model\n",
" \n",
"\n",
" def predict(self, data):\n",
" sparkdata = spark.createDataFrame(data)\n",
" return model.transform(sparkdata).select('outlierScore').toPandas().values.flatten().tolist()"
" return (\n",
" model.transform(sparkdata)\n",
" .select(\"outlierScore\")\n",
" .toPandas()\n",
" .values.flatten()\n",
" .tolist()\n",
" )"
]
},
{
@ -733,50 +772,119 @@
"def visualize(rdf):\n",
" anoms = list(rdf[\"prediction\"] == 1)\n",
"\n",
" fig = plt.figure(figsize=(26,12))\n",
" fig = plt.figure(figsize=(26, 12))\n",
"\n",
" ax = fig.add_subplot(611)\n",
" ax.title.set_text(f\"Multivariate Anomaly Detection Results\")\n",
" ax.plot(rdf[timestampColumn],rdf[\"sensor_1\"], color='tab:orange', linestyle='solid', linewidth=2, label=\"sensor_1\")\n",
" ax.grid(axis='y')\n",
" ax.plot(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_1\"],\n",
" color=\"tab:orange\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_1\",\n",
" )\n",
" ax.grid(axis=\"y\")\n",
" _, _, ymin, ymax = plt.axis()\n",
" ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n",
" ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
" ax.set_ylabel('sensor1_value')\n",
" ax.vlines(\n",
" rdf[timestampColumn][anoms],\n",
" ymin=ymin,\n",
" ymax=ymax,\n",
" color=\"tab:red\",\n",
" alpha=0.2,\n",
" linewidth=6,\n",
" )\n",
" ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
" ax.set_ylabel(\"sensor1_value\")\n",
" ax.legend()\n",
"\n",
" ax = fig.add_subplot(612, sharex=ax)\n",
" ax.plot(rdf[timestampColumn],rdf[\"sensor_2\"], color='tab:green', linestyle='solid', linewidth=2, label=\"sensor_2\")\n",
" ax.grid(axis='y')\n",
" ax.plot(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_2\"],\n",
" color=\"tab:green\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_2\",\n",
" )\n",
" ax.grid(axis=\"y\")\n",
" _, _, ymin, ymax = plt.axis()\n",
" ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n",
" ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
" ax.set_ylabel('sensor2_value')\n",
" ax.vlines(\n",
" rdf[timestampColumn][anoms],\n",
" ymin=ymin,\n",
" ymax=ymax,\n",
" color=\"tab:red\",\n",
" alpha=0.2,\n",
" linewidth=6,\n",
" )\n",
" ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
" ax.set_ylabel(\"sensor2_value\")\n",
" ax.legend()\n",
"\n",
" ax = fig.add_subplot(613, sharex=ax)\n",
" ax.plot(rdf[timestampColumn],rdf[\"sensor_3\"], color='tab:purple', linestyle='solid', linewidth=2, label=\"sensor_3\")\n",
" ax.grid(axis='y')\n",
" ax.plot(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_3\"],\n",
" color=\"tab:purple\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"sensor_3\",\n",
" )\n",
" ax.grid(axis=\"y\")\n",
" _, _, ymin, ymax = plt.axis()\n",
" ax.vlines(rdf[timestampColumn][anoms], ymin=ymin , ymax=ymax , color='tab:red', alpha=0.2, linewidth=6)\n",
" ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
" ax.set_ylabel('sensor3_value')\n",
" ax.vlines(\n",
" rdf[timestampColumn][anoms],\n",
" ymin=ymin,\n",
" ymax=ymax,\n",
" color=\"tab:red\",\n",
" alpha=0.2,\n",
" linewidth=6,\n",
" )\n",
" ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
" ax.set_ylabel(\"sensor3_value\")\n",
" ax.legend()\n",
"\n",
" ax = fig.add_subplot(614, sharex=ax)\n",
" ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
" ax.plot(rdf[timestampColumn],rdf['outlierScore'], color='black', linestyle='solid', linewidth=2, label='Outlier score')\n",
" ax.set_ylabel('outlier score')\n",
" ax.grid(axis='y')\n",
" ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
" ax.plot(\n",
" rdf[timestampColumn],\n",
" rdf[\"outlierScore\"],\n",
" color=\"black\",\n",
" linestyle=\"solid\",\n",
" linewidth=2,\n",
" label=\"Outlier score\",\n",
" )\n",
" ax.set_ylabel(\"outlier score\")\n",
" ax.grid(axis=\"y\")\n",
" ax.legend()\n",
" \n",
"\n",
" ax = fig.add_subplot(615, sharex=ax)\n",
" ax.tick_params(axis='x',which='both',bottom=False,labelbottom=False)\n",
" ax.bar(rdf[timestampColumn],rdf['sensor_1_localimp'].abs(), width=2, color='tab:orange', label=\"sensor_1\")\n",
" ax.bar(rdf[timestampColumn],rdf['sensor_2_localimp'].abs(), width=2, color='tab:green', label=\"sensor_2\", bottom=rdf[\"sensor_1_localimp\"].abs())\n",
" ax.bar(rdf[timestampColumn],rdf['sensor_3_localimp'].abs(), width=2, color='tab:purple', label=\"sensor_3\", bottom=rdf[\"sensor_1_localimp\"].abs()+rdf[\"sensor_2_localimp\"].abs())\n",
" ax.set_ylabel('Contribution scores')\n",
" ax.grid(axis='y')\n",
" ax.tick_params(axis=\"x\", which=\"both\", bottom=False, labelbottom=False)\n",
" ax.bar(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_1_localimp\"].abs(),\n",
" width=2,\n",
" color=\"tab:orange\",\n",
" label=\"sensor_1\",\n",
" )\n",
" ax.bar(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_2_localimp\"].abs(),\n",
" width=2,\n",
" color=\"tab:green\",\n",
" label=\"sensor_2\",\n",
" bottom=rdf[\"sensor_1_localimp\"].abs(),\n",
" )\n",
" ax.bar(\n",
" rdf[timestampColumn],\n",
" rdf[\"sensor_3_localimp\"].abs(),\n",
" width=2,\n",
" color=\"tab:purple\",\n",
" label=\"sensor_3\",\n",
" bottom=rdf[\"sensor_1_localimp\"].abs() + rdf[\"sensor_2_localimp\"].abs(),\n",
" )\n",
" ax.set_ylabel(\"Contribution scores\")\n",
" ax.grid(axis=\"y\")\n",
" ax.legend()\n",
"\n",
" plt.show()"
@ -824,7 +932,7 @@
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10,7))\n",
"plt.figure(figsize=(10, 7))\n",
"plt.bar(inputCols, global_explanation.global_importance_values)\n",
"plt.ylabel(\"global importance values\")"
]
@ -860,6 +968,7 @@
"source": [
"# View the model explanation in the ExplanationDashboard\n",
"from raiwidgets import ExplanationDashboard\n",
"\n",
"ExplanationDashboard(global_explanation, wrapper(model), dataset=eval_data)"
]
}

View file

@ -88,8 +88,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
],
"outputs": [],
"metadata": {}
@ -98,10 +99,14 @@
"cell_type": "code",
"execution_count": null,
"source": [
"df = spark.read.format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .option(\"inferSchema\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n",
"df = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", True)\n",
" .option(\"inferSchema\", True)\n",
" .load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\"\n",
" )\n",
")\n",
"# print dataset size\n",
"print(\"records read: \" + str(df.count()))\n",
"print(\"Schema: \")\n",
@ -147,13 +152,11 @@
"execution_count": null,
"source": [
"from pyspark.ml.feature import VectorAssembler\n",
"\n",
"feature_cols = df.columns[1:]\n",
"featurizer = VectorAssembler(\n",
" inputCols=feature_cols,\n",
" outputCol='features'\n",
")\n",
"train_data = featurizer.transform(train)['Bankrupt?', 'features']\n",
"test_data = featurizer.transform(test)['Bankrupt?', 'features']"
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"train_data = featurizer.transform(train)[\"Bankrupt?\", \"features\"]\n",
"test_data = featurizer.transform(test)[\"Bankrupt?\", \"features\"]"
],
"outputs": [],
"metadata": {}
@ -186,7 +189,10 @@
"execution_count": null,
"source": [
"from synapse.ml.lightgbm import LightGBMClassifier\n",
"model = LightGBMClassifier(objective=\"binary\", featuresCol=\"features\", labelCol=\"Bankrupt?\", isUnbalance=True)"
"\n",
"model = LightGBMClassifier(\n",
" objective=\"binary\", featuresCol=\"features\", labelCol=\"Bankrupt?\", isUnbalance=True\n",
")"
],
"outputs": [],
"metadata": {}
@ -215,10 +221,12 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" model.saveNativeModel(\"/models/lgbmclassifier.model\")\n",
" model = LightGBMClassificationModel.loadNativeModelFromFile(\"/models/lgbmclassifier.model\")\n",
" model = LightGBMClassificationModel.loadNativeModelFromFile(\n",
" \"/models/lgbmclassifier.model\"\n",
" )\n",
"else:\n",
" model.saveNativeModel(\"/lgbmclassifier.model\")\n",
" model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmclassifier.model\")\n"
" model = LightGBMClassificationModel.loadNativeModelFromFile(\"/lgbmclassifier.model\")"
],
"outputs": [],
"metadata": {}
@ -238,22 +246,24 @@
"import matplotlib.pyplot as plt\n",
"\n",
"feature_importances = model.getFeatureImportances()\n",
"fi = pd.Series(feature_importances,index = feature_cols)\n",
"fi = fi.sort_values(ascending = True)\n",
"fi = pd.Series(feature_importances, index=feature_cols)\n",
"fi = fi.sort_values(ascending=True)\n",
"f_index = fi.index\n",
"f_values = fi.values\n",
" \n",
"# print feature importances \n",
"print ('f_index:',f_index)\n",
"print ('f_values:',f_values)\n",
"\n",
"# print feature importances\n",
"print(\"f_index:\", f_index)\n",
"print(\"f_values:\", f_values)\n",
"\n",
"# plot\n",
"x_index = list(range(len(fi)))\n",
"x_index = [x/len(fi) for x in x_index]\n",
"plt.rcParams['figure.figsize'] = (20,20)\n",
"plt.barh(x_index,f_values,height = 0.028 ,align=\"center\",color = 'tan',tick_label=f_index)\n",
"plt.xlabel('importances')\n",
"plt.ylabel('features')\n",
"x_index = [x / len(fi) for x in x_index]\n",
"plt.rcParams[\"figure.figsize\"] = (20, 20)\n",
"plt.barh(\n",
" x_index, f_values, height=0.028, align=\"center\", color=\"tan\", tick_label=f_index\n",
")\n",
"plt.xlabel(\"importances\")\n",
"plt.ylabel(\"features\")\n",
"plt.show()"
],
"outputs": [],
@ -281,7 +291,12 @@
"execution_count": null,
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric=\"classification\", labelCol='Bankrupt?', scoredLabelsCol='prediction').transform(predictions)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"classification\",\n",
" labelCol=\"Bankrupt?\",\n",
" scoredLabelsCol=\"prediction\",\n",
").transform(predictions)\n",
"display(metrics)"
],
"outputs": [],
@ -309,8 +324,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"triazines = spark.read.format(\"libsvm\")\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")"
"triazines = spark.read.format(\"libsvm\").load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\"\n",
")"
],
"outputs": [],
"metadata": {}
@ -356,10 +372,10 @@
"execution_count": null,
"source": [
"from synapse.ml.lightgbm import LightGBMRegressor\n",
"model = LightGBMRegressor(objective='quantile',\n",
" alpha=0.2,\n",
" learningRate=0.3,\n",
" numLeaves=31).fit(train)"
"\n",
"model = LightGBMRegressor(\n",
" objective=\"quantile\", alpha=0.2, learningRate=0.3, numLeaves=31\n",
").fit(train)"
],
"outputs": [],
"metadata": {}
@ -395,10 +411,10 @@
"execution_count": null,
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric='regression',\n",
" labelCol='label',\n",
" scoresCol='prediction') \\\n",
" .transform(scoredData)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"regression\", labelCol=\"label\", scoresCol=\"prediction\"\n",
").transform(scoredData)\n",
"display(metrics)"
],
"outputs": [],
@ -422,7 +438,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"df = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet\")\n",
"df = spark.read.format(\"parquet\").load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet\"\n",
")\n",
"# print some basic info\n",
"print(\"records read: \" + str(df.count()))\n",
"print(\"Schema: \")\n",
@ -445,20 +463,22 @@
"source": [
"from synapse.ml.lightgbm import LightGBMRanker\n",
"\n",
"features_col = 'features'\n",
"query_col = 'query'\n",
"label_col = 'labels'\n",
"lgbm_ranker = LightGBMRanker(labelCol=label_col,\n",
" featuresCol=features_col,\n",
" groupCol=query_col,\n",
" predictionCol='preds',\n",
" leafPredictionCol='leafPreds',\n",
" featuresShapCol='importances',\n",
" repartitionByGroupingColumn=True,\n",
" numLeaves=32,\n",
" numIterations=200,\n",
" evalAt=[1,3,5],\n",
" metric='ndcg')"
"features_col = \"features\"\n",
"query_col = \"query\"\n",
"label_col = \"labels\"\n",
"lgbm_ranker = LightGBMRanker(\n",
" labelCol=label_col,\n",
" featuresCol=features_col,\n",
" groupCol=query_col,\n",
" predictionCol=\"preds\",\n",
" leafPredictionCol=\"leafPreds\",\n",
" featuresShapCol=\"importances\",\n",
" repartitionByGroupingColumn=True,\n",
" numLeaves=32,\n",
" numIterations=200,\n",
" evalAt=[1, 3, 5],\n",
" metric=\"ndcg\",\n",
")"
],
"outputs": [],
"metadata": {}
@ -483,7 +503,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"dt = spark.read.format(\"parquet\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet\")\n",
"dt = spark.read.format(\"parquet\").load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet\"\n",
")\n",
"predictions = lgbm_ranker_model.transform(dt)\n",
"predictions.limit(10).toPandas()"
],

View file

@ -30,6 +30,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display"
],
@ -46,10 +47,14 @@
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .option(\"inferSchema\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\")\n",
"df = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", True)\n",
" .option(\"inferSchema\", True)\n",
" .load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv\"\n",
" )\n",
")\n",
"\n",
"display(df)"
]
@ -71,29 +76,26 @@
"from synapse.ml.lightgbm import LightGBMClassifier\n",
"\n",
"feature_cols = df.columns[1:]\n",
"featurizer = VectorAssembler(\n",
" inputCols=feature_cols,\n",
" outputCol='features'\n",
")\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"\n",
"train_data = featurizer.transform(df)['Bankrupt?', 'features']\n",
"train_data = featurizer.transform(df)[\"Bankrupt?\", \"features\"]\n",
"\n",
"model = (\n",
" LightGBMClassifier(featuresCol=\"features\", labelCol=\"Bankrupt?\")\n",
" .setEarlyStoppingRound(300)\n",
" .setLambdaL1(0.5)\n",
" .setNumIterations(1000)\n",
" .setNumThreads(-1)\n",
" .setMaxDeltaStep(0.5)\n",
" .setNumLeaves(31)\n",
" .setMaxDepth(-1)\n",
" .setBaggingFraction(0.7)\n",
" .setFeatureFraction(0.7)\n",
" .setBaggingFreq(2)\n",
" .setObjective(\"binary\")\n",
" .setIsUnbalance(True)\n",
" .setMinSumHessianInLeaf(20)\n",
" .setMinGainToSplit(0.01)\n",
" LightGBMClassifier(featuresCol=\"features\", labelCol=\"Bankrupt?\")\n",
" .setEarlyStoppingRound(300)\n",
" .setLambdaL1(0.5)\n",
" .setNumIterations(1000)\n",
" .setNumThreads(-1)\n",
" .setMaxDeltaStep(0.5)\n",
" .setNumLeaves(31)\n",
" .setMaxDepth(-1)\n",
" .setBaggingFraction(0.7)\n",
" .setFeatureFraction(0.7)\n",
" .setBaggingFreq(2)\n",
" .setObjective(\"binary\")\n",
" .setIsUnbalance(True)\n",
" .setMinSumHessianInLeaf(20)\n",
" .setMinGainToSplit(0.01)\n",
")\n",
"\n",
"model = model.fit(train_data)"
@ -115,12 +117,17 @@
"import lightgbm as lgb\n",
"from lightgbm import Booster, LGBMClassifier\n",
"\n",
"\n",
"def convertModel(lgbm_model: LGBMClassifier or Booster, input_size: int) -> bytes:\n",
" from onnxmltools.convert import convert_lightgbm\n",
" from onnxconverter_common.data_types import FloatTensorType\n",
" initial_types = [(\"input\", FloatTensorType([-1, input_size]))]\n",
" onnx_model = convert_lightgbm(lgbm_model, initial_types=initial_types, target_opset=9)\n",
" return onnx_model.SerializeToString()\n",
" from onnxmltools.convert import convert_lightgbm\n",
" from onnxconverter_common.data_types import FloatTensorType\n",
"\n",
" initial_types = [(\"input\", FloatTensorType([-1, input_size]))]\n",
" onnx_model = convert_lightgbm(\n",
" lgbm_model, initial_types=initial_types, target_opset=9\n",
" )\n",
" return onnx_model.SerializeToString()\n",
"\n",
"\n",
"booster_model_str = model.getLightGBMBooster().modelStr().get()\n",
"booster = lgb.Booster(model_str=booster_model_str)\n",
@ -162,8 +169,7 @@
"outputs": [],
"source": [
"onnx_ml = (\n",
" onnx_ml\n",
" .setDeviceType(\"CPU\")\n",
" onnx_ml.setDeviceType(\"CPU\")\n",
" .setFeedDict({\"input\": \"features\"})\n",
" .setFetchDict({\"probability\": \"probabilities\", \"prediction\": \"label\"})\n",
" .setMiniBatchSize(5000)\n",
@ -194,7 +200,14 @@
"cols = list(map(str, testPdf.columns))\n",
"testDf = spark.createDataFrame(testPdf)\n",
"testDf = testDf.union(testDf).repartition(200)\n",
"testDf = VectorAssembler().setInputCols(cols).setOutputCol(\"features\").transform(testDf).drop(*cols).cache()\n",
"testDf = (\n",
" VectorAssembler()\n",
" .setInputCols(cols)\n",
" .setOutputCol(\"features\")\n",
" .transform(testDf)\n",
" .drop(*cols)\n",
" .cache()\n",
")\n",
"\n",
"display(onnx_ml.transform(testDf))"
]

View file

@ -27,8 +27,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
"\n",
"import synapse.ml\n",
@ -58,8 +60,14 @@
"outputs": [],
"source": [
"import time\n",
"\n",
"imageStream = spark.readStream.image().load(imageDir)\n",
"query = imageStream.select(\"image.height\").writeStream.format(\"memory\").queryName(\"heights\").start()\n",
"query = (\n",
" imageStream.select(\"image.height\")\n",
" .writeStream.format(\"memory\")\n",
" .queryName(\"heights\")\n",
" .start()\n",
")\n",
"time.sleep(3)\n",
"print(\"Streaming query activity: {}\".format(query.isActive))"
]
@ -99,6 +107,7 @@
"outputs": [],
"source": [
"from py4j.protocol import Py4JJavaError\n",
"\n",
"try:\n",
" query.stop()\n",
"except Py4JJavaError as e:\n",
@ -123,16 +132,17 @@
"source": [
"from PIL import Image\n",
"import matplotlib.pyplot as plt\n",
"data = images.take(3) # take first three rows of the dataframe\n",
"im = data[2][0] # the image is in the first column of a given row\n",
"\n",
"data = images.take(3) # take first three rows of the dataframe\n",
"im = data[2][0] # the image is in the first column of a given row\n",
"\n",
"print(\"image type: {}, number of fields: {}\".format(type(im), len(im)))\n",
"print(\"image path: {}\".format(im.origin))\n",
"print(\"height: {}, width: {}, OpenCV type: {}\".format(im.height, im.width, im.mode))\n",
"\n",
"arr = toNDArray(im) # convert to numpy array\n",
"arr = toNDArray(im) # convert to numpy array\n",
"print(images.count())\n",
"plt.imshow(Image.fromarray(arr, \"RGB\")) # display the image inside notebook\n"
"plt.imshow(Image.fromarray(arr, \"RGB\")) # display the image inside notebook"
]
},
{
@ -151,15 +161,17 @@
"source": [
"from synapse.ml.opencv import ImageTransformer\n",
"\n",
"tr = (ImageTransformer() # images are resized and then cropped\n",
" .setOutputCol(\"transformed\")\n",
" .resize(size=(200, 200))\n",
" .crop(0, 0, height = 180, width = 180) )\n",
"tr = (\n",
" ImageTransformer() # images are resized and then cropped\n",
" .setOutputCol(\"transformed\")\n",
" .resize(size=(200, 200))\n",
" .crop(0, 0, height=180, width=180)\n",
")\n",
"\n",
"small = tr.transform(images).select(\"transformed\")\n",
"\n",
"im = small.take(3)[2][0] # take third image\n",
"plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook"
"im = small.take(3)[2][0] # take third image\n",
"plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook"
]
},
{
@ -180,17 +192,19 @@
"from pyspark.sql.functions import udf\n",
"from synapse.ml.opencv import ImageSchema, toNDArray, toImage\n",
"\n",
"def u(row):\n",
" array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n",
" array[:,:,2] = 0\n",
" return toImage(array) # numpy array back to Spark Row structure\n",
"\n",
"noBlueUDF = udf(u,ImageSchema)\n",
"def u(row):\n",
" array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n",
" array[:, :, 2] = 0\n",
" return toImage(array) # numpy array back to Spark Row structure\n",
"\n",
"\n",
"noBlueUDF = udf(u, ImageSchema)\n",
"\n",
"noblue = small.withColumn(\"noblue\", noBlueUDF(small[\"transformed\"])).select(\"noblue\")\n",
"\n",
"im = noblue.take(3)[2][0] # take second image\n",
"plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook"
"im = noblue.take(3)[2][0] # take second image\n",
"plt.imshow(Image.fromarray(toNDArray(im), \"RGB\")) # display the image inside notebook"
]
},
{

View file

@ -34,10 +34,14 @@
"source": [
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ['VISION_API_KEY'] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ['AZURE_SEARCH_KEY'] = getSecret(\"mmlspark-build-keys\", \"azure-search-key\")"
"\n",
" os.environ[\"VISION_API_KEY\"] = getSecret(\"mmlspark-build-keys\", \"cognitive-api-key\")\n",
" os.environ[\"AZURE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"azure-search-key\"\n",
" )"
],
"outputs": [],
"metadata": {}
@ -46,8 +50,8 @@
"cell_type": "code",
"execution_count": 4,
"source": [
"VISION_API_KEY = os.environ['VISION_API_KEY']\n",
"AZURE_SEARCH_KEY = os.environ['AZURE_SEARCH_KEY']\n",
"VISION_API_KEY = os.environ[\"VISION_API_KEY\"]\n",
"AZURE_SEARCH_KEY = os.environ[\"AZURE_SEARCH_KEY\"]\n",
"search_service = \"mmlspark-azure-search\"\n",
"search_index = \"test\""
],
@ -60,14 +64,15 @@
"cell_type": "code",
"execution_count": 5,
"source": [
"data = spark.read\\\n",
" .format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\\\n",
" .withColumn(\"searchAction\", lit(\"upload\"))\\\n",
" .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array<string>\"))\\\n",
" .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array<string>\"))\\\n",
" .limit(25)"
"data = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", True)\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/metartworks_sample.csv\")\n",
" .withColumn(\"searchAction\", lit(\"upload\"))\n",
" .withColumn(\"Neighbors\", split(col(\"Neighbors\"), \",\").cast(\"array<string>\"))\n",
" .withColumn(\"Tags\", split(col(\"Tags\"), \",\").cast(\"array<string>\"))\n",
" .limit(25)\n",
")"
],
"outputs": [],
"metadata": {
@ -88,18 +93,25 @@
"from synapse.ml.cognitive import AnalyzeImage\n",
"from synapse.ml.stages import SelectColumns\n",
"\n",
"#define pipeline\n",
"describeImage = (AnalyzeImage()\n",
" .setSubscriptionKey(VISION_API_KEY)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"PrimaryImageUrl\")\n",
" .setOutputCol(\"RawImageDescription\")\n",
" .setErrorCol(\"Errors\")\n",
" .setVisualFeatures([\"Categories\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"])\n",
" .setConcurrency(5))\n",
"# define pipeline\n",
"describeImage = (\n",
" AnalyzeImage()\n",
" .setSubscriptionKey(VISION_API_KEY)\n",
" .setLocation(\"eastus\")\n",
" .setImageUrlCol(\"PrimaryImageUrl\")\n",
" .setOutputCol(\"RawImageDescription\")\n",
" .setErrorCol(\"Errors\")\n",
" .setVisualFeatures(\n",
" [\"Categories\", \"Description\", \"Faces\", \"ImageType\", \"Color\", \"Adult\"]\n",
" )\n",
" .setConcurrency(5)\n",
")\n",
"\n",
"df2 = describeImage.transform(data)\\\n",
" .select(\"*\", \"RawImageDescription.*\").drop(\"Errors\", \"RawImageDescription\")"
"df2 = (\n",
" describeImage.transform(data)\n",
" .select(\"*\", \"RawImageDescription.*\")\n",
" .drop(\"Errors\", \"RawImageDescription\")\n",
")"
],
"outputs": [],
"metadata": {
@ -125,12 +137,14 @@
"execution_count": 10,
"source": [
"from synapse.ml.cognitive import *\n",
"\n",
"df2.writeToAzureSearch(\n",
" subscriptionKey=AZURE_SEARCH_KEY,\n",
" actionCol=\"searchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"ObjectID\")"
" subscriptionKey=AZURE_SEARCH_KEY,\n",
" actionCol=\"searchAction\",\n",
" serviceName=search_service,\n",
" indexName=search_index,\n",
" keyCol=\"ObjectID\",\n",
")"
],
"outputs": [],
"metadata": {
@ -148,8 +162,12 @@
"cell_type": "code",
"execution_count": 12,
"source": [
"url = 'https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06'.format(search_service, search_index)\n",
"requests.post(url, json={\"search\": \"Glass\"}, headers = {\"api-key\": AZURE_SEARCH_KEY}).json()"
"url = \"https://{}.search.windows.net/indexes/{}/docs/search?api-version=2019-05-06\".format(\n",
" search_service, search_index\n",
")\n",
"requests.post(\n",
" url, json={\"search\": \"Glass\"}, headers={\"api-key\": AZURE_SEARCH_KEY}\n",
").json()"
],
"outputs": [],
"metadata": {

View file

@ -46,8 +46,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
],
"metadata": {},
"outputs": [],
@ -80,8 +81,9 @@
"source": [
"# loads the dataset and the two trained CKNN models for querying by medium and culture\n",
"df = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/met_and_rijks.parquet\")\n",
"display(df.drop(\"Norm_Features\"))\n"
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/met_and_rijks.parquet\"\n",
")\n",
"display(df.drop(\"Norm_Features\"))"
],
"metadata": {},
"outputs": [],
@ -101,14 +103,14 @@
"# mediums = ['prints', 'drawings', 'ceramics', 'textiles', 'paintings', \"musical instruments\",\"glass\", 'accessories', 'photographs', \"metalwork\",\n",
"# \"sculptures\", \"weapons\", \"stone\", \"precious\", \"paper\", \"woodwork\", \"leatherwork\", \"uncategorized\"]\n",
"\n",
"mediums = ['paintings', 'glass', 'ceramics']\n",
"mediums = [\"paintings\", \"glass\", \"ceramics\"]\n",
"\n",
"# cultures = ['african (general)', 'american', 'ancient american', 'ancient asian', 'ancient european', 'ancient middle-eastern', 'asian (general)',\n",
"# 'austrian', 'belgian', 'british', 'chinese', 'czech', 'dutch', 'egyptian']#, 'european (general)', 'french', 'german', 'greek',\n",
"# 'iranian', 'italian', 'japanese', 'latin american', 'middle eastern', 'roman', 'russian', 'south asian', 'southeast asian',\n",
"# 'spanish', 'swiss', 'various']\n",
"\n",
"cultures = ['japanese', 'american', 'african (general)']\n",
"cultures = [\"japanese\", \"american\", \"african (general)\"]\n",
"\n",
"# Uncomment the above for more robust and large scale searches!\n",
"\n",
@ -118,10 +120,16 @@
"culture_set = set(cultures)\n",
"selected_ids = {\"AK-RBK-17525-2\", \"AK-MAK-1204\", \"AK-RAK-2015-2-9\"}\n",
"\n",
"small_df = df.where(udf(lambda medium, culture, id_val: (medium in medium_set) or (\n",
" culture in culture_set) or (id_val in selected_ids), BooleanType())(\"Classification\", \"Culture\", \"id\"))\n",
"small_df = df.where(\n",
" udf(\n",
" lambda medium, culture, id_val: (medium in medium_set)\n",
" or (culture in culture_set)\n",
" or (id_val in selected_ids),\n",
" BooleanType(),\n",
" )(\"Classification\", \"Culture\", \"id\")\n",
")\n",
"\n",
"small_df.count()\n"
"small_df.count()"
],
"metadata": {},
"outputs": [],
@ -138,12 +146,14 @@
{
"cell_type": "code",
"source": [
"medium_cknn = (ConditionalKNN()\n",
" .setOutputCol(\"Matches\")\n",
" .setFeaturesCol(\"Norm_Features\")\n",
" .setValuesCol(\"Thumbnail_Url\")\n",
" .setLabelCol(\"Classification\")\n",
" .fit(small_df))"
"medium_cknn = (\n",
" ConditionalKNN()\n",
" .setOutputCol(\"Matches\")\n",
" .setFeaturesCol(\"Norm_Features\")\n",
" .setValuesCol(\"Thumbnail_Url\")\n",
" .setLabelCol(\"Classification\")\n",
" .fit(small_df)\n",
")"
],
"metadata": {},
"outputs": [],
@ -152,12 +162,14 @@
{
"cell_type": "code",
"source": [
"culture_cknn = (ConditionalKNN()\n",
" .setOutputCol(\"Matches\")\n",
" .setFeaturesCol(\"Norm_Features\")\n",
" .setValuesCol(\"Thumbnail_Url\")\n",
" .setLabelCol(\"Culture\")\n",
" .fit(small_df))\n"
"culture_cknn = (\n",
" ConditionalKNN()\n",
" .setOutputCol(\"Matches\")\n",
" .setFeaturesCol(\"Norm_Features\")\n",
" .setValuesCol(\"Thumbnail_Url\")\n",
" .setLabelCol(\"Culture\")\n",
" .fit(small_df)\n",
")"
],
"metadata": {},
"outputs": [],
@ -180,8 +192,9 @@
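"# run the CKNN model once per label, conditioning each query on that label and renaming its Matches column\n",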
"def add_matches(classes, cknn, df):\n",
" results = df\n",
" for label in classes:\n",
" results = (cknn.transform(results.withColumn(\"conditioner\", array(lit(label))))\n",
" .withColumnRenamed(\"Matches\", \"Matches_{}\".format(label)))\n",
" results = cknn.transform(\n",
" results.withColumn(\"conditioner\", array(lit(label)))\n",
" ).withColumnRenamed(\"Matches\", \"Matches_{}\".format(label))\n",
" return results"
],
"metadata": {},
@ -201,19 +214,19 @@
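"# fetch the image at the given URL and draw it on the axis; network or decode failures are skipped\n",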
"def plot_img(axis, url, title):\n",
" try:\n",
" response = requests.get(url)\n",
" img = Image.open(BytesIO(response.content)).convert('RGB')\n",
" img = Image.open(BytesIO(response.content)).convert(\"RGB\")\n",
" axis.imshow(img, aspect=\"equal\")\n",
" except:\n",
" pass\n",
" if title is not None:\n",
" axis.set_title(title, fontsize=4)\n",
" axis.set_title(title, fontsize=4)\n",
" axis.axis(\"off\")\n",
"\n",
"\n",
"def plot_urls(url_arr, titles, filename):\n",
" nx, ny = url_arr.shape\n",
"\n",
" plt.figure(figsize=(nx*5, ny*5), dpi=1600)\n",
" plt.figure(figsize=(nx * 5, ny * 5), dpi=1600)\n",
" fig, axes = plt.subplots(ny, nx)\n",
"\n",
" # reshape required in the case of 1 image query\n",
@ -225,7 +238,7 @@
" if j == 0:\n",
" plot_img(axes[j, i], url_arr[i, j], titles[i])\n",
" else:\n",
" plot_img(axes[j, i], url_arr[i, j], None)\n",
" plot_img(axes[j, i], url_arr[i, j], None)\n",
"\n",
" plt.savefig(filename, dpi=1600) # saves the results as a PNG\n",
"\n",
@ -248,6 +261,7 @@
"source": [
"# main method to test a particular dataset with two CKNN models and a set of art IDs, saving the result to filename.png\n",
"\n",
"\n",
"def test_all(data, cknn_medium, cknn_culture, test_ids, root):\n",
" is_nice_obj = udf(lambda obj: obj in test_ids, BooleanType())\n",
" test_df = data.where(is_nice_obj(\"id\"))\n",
@ -259,19 +273,21 @@
"\n",
" original_urls = [row[\"Thumbnail_Url\"] for row in results]\n",
"\n",
" culture_urls = [[row[\"Matches_{}\".format(\n",
" label)][0][\"value\"] for row in results] for label in cultures]\n",
" culture_urls = [\n",
" [row[\"Matches_{}\".format(label)][0][\"value\"] for row in results]\n",
" for label in cultures\n",
" ]\n",
" culture_url_arr = np.array([original_urls] + culture_urls)[:, :]\n",
" plot_urls(culture_url_arr, [\"Original\"] +\n",
" cultures, root + \"matches_by_culture.png\")\n",
" plot_urls(culture_url_arr, [\"Original\"] + cultures, root + \"matches_by_culture.png\")\n",
"\n",
" medium_urls = [[row[\"Matches_{}\".format(\n",
" label)][0][\"value\"] for row in results] for label in mediums]\n",
" medium_urls = [\n",
" [row[\"Matches_{}\".format(label)][0][\"value\"] for row in results]\n",
" for label in mediums\n",
" ]\n",
" medium_url_arr = np.array([original_urls] + medium_urls)[:, :]\n",
" plot_urls(medium_url_arr, [\"Original\"] +\n",
" mediums, root + \"matches_by_medium.png\")\n",
" plot_urls(medium_url_arr, [\"Original\"] + mediums, root + \"matches_by_medium.png\")\n",
"\n",
" return results_df_culture\n"
" return results_df_culture"
],
"metadata": {},
"outputs": [],
@ -292,8 +308,7 @@
"cell_type": "code",
"source": [
"# sample query\n",
"result_df = test_all(small_df, medium_cknn, culture_cknn,\n",
" selected_ids, root=\".\")\n"
"result_df = test_all(small_df, medium_cknn, culture_cknn, selected_ids, root=\".\")"
],
"metadata": {},
"outputs": [],


@ -72,6 +72,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -88,24 +89,28 @@
"metadata": {},
"outputs": [],
"source": [
"spark.sparkContext.setCheckpointDir('dbfs:/checkpoint_path/')\n",
"spark.sparkContext.setCheckpointDir(\"dbfs:/checkpoint_path/\")\n",
"\n",
"factory = DataFactory(\n",
" num_hr_users = 25,\n",
" num_hr_resources = 50,\n",
" num_fin_users = 35,\n",
" num_fin_resources = 75,\n",
" num_eng_users = 15,\n",
" num_eng_resources = 25,\n",
" single_component = True\n",
" num_hr_users=25,\n",
" num_hr_resources=50,\n",
" num_fin_users=35,\n",
" num_fin_resources=75,\n",
" num_eng_users=15,\n",
" num_eng_resources=25,\n",
" single_component=True,\n",
")\n",
"\n",
"training_pdf = factory.create_clustered_training_data(ratio=0.4)\n",
"\n",
"# a tenant id is used when independant datasets originate from different tenants, in this example we set all tenants-ids to the same value\n",
"training_df = spark.createDataFrame(training_pdf).withColumn('tenant_id', f.lit(0))\n",
"ingroup_df = spark.createDataFrame(factory.create_clustered_intra_test_data(training_pdf)).withColumn('tenant_id', f.lit(0))\n",
"outgroup_df = spark.createDataFrame(factory.create_clustered_inter_test_data()).withColumn('tenant_id', f.lit(0))"
"training_df = spark.createDataFrame(training_pdf).withColumn(\"tenant_id\", f.lit(0))\n",
"ingroup_df = spark.createDataFrame(\n",
" factory.create_clustered_intra_test_data(training_pdf)\n",
").withColumn(\"tenant_id\", f.lit(0))\n",
"outgroup_df = spark.createDataFrame(\n",
" factory.create_clustered_inter_test_data()\n",
").withColumn(\"tenant_id\", f.lit(0))"
]
},
{
@ -142,11 +147,11 @@
"outputs": [],
"source": [
"access_anomaly = AccessAnomaly(\n",
" tenantCol='tenant_id',\n",
" userCol='user',\n",
" resCol='res',\n",
" likelihoodCol='likelihood',\n",
" maxIter=1000\n",
" tenantCol=\"tenant_id\",\n",
" userCol=\"user\",\n",
" resCol=\"res\",\n",
" likelihoodCol=\"likelihood\",\n",
" maxIter=1000,\n",
")"
]
},
@ -182,10 +187,10 @@
"outputs": [],
"source": [
"ingroup_scored_df.agg(\n",
" f.min('anomaly_score').alias('min_anomaly_score'),\n",
" f.max('anomaly_score').alias('max_anomaly_score'),\n",
" f.mean('anomaly_score').alias('mean_anomaly_score'),\n",
" f.stddev('anomaly_score').alias('stddev_anomaly_score'),\n",
" f.min(\"anomaly_score\").alias(\"min_anomaly_score\"),\n",
" f.max(\"anomaly_score\").alias(\"max_anomaly_score\"),\n",
" f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n",
" f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n",
").show()"
]
},
@ -205,10 +210,10 @@
"outputs": [],
"source": [
"outgroup_scored_df.agg(\n",
" f.min('anomaly_score').alias('min_anomaly_score'),\n",
" f.max('anomaly_score').alias('max_anomaly_score'),\n",
" f.mean('anomaly_score').alias('mean_anomaly_score'),\n",
" f.stddev('anomaly_score').alias('stddev_anomaly_score'),\n",
" f.min(\"anomaly_score\").alias(\"min_anomaly_score\"),\n",
" f.max(\"anomaly_score\").alias(\"max_anomaly_score\"),\n",
" f.mean(\"anomaly_score\").alias(\"mean_anomaly_score\"),\n",
" f.stddev(\"anomaly_score\").alias(\"stddev_anomaly_score\"),\n",
").show()"
]
},
@ -229,38 +234,28 @@
"# Select a subset of results to send to Log Analytics\n",
"#\n",
"\n",
"full_res_df = outgroup_scored_df.orderBy(f.desc('anomaly_score')).cache()\n",
"full_res_df = outgroup_scored_df.orderBy(f.desc(\"anomaly_score\")).cache()\n",
"\n",
"from pyspark.sql.window import Window\n",
"\n",
"w = Window.partitionBy(\n",
" 'tenant_id',\n",
" 'user',\n",
" 'res' \n",
" ).orderBy(\n",
" f.desc('anomaly_score')\n",
" )\n",
"w = Window.partitionBy(\"tenant_id\", \"user\", \"res\").orderBy(f.desc(\"anomaly_score\"))\n",
"\n",
"# select values above threshold\n",
"results_above_threshold = full_res_df.filter(full_res_df.anomaly_score > 1.0)\n",
"\n",
"# get distinct resource/user and corresponding timestamp and highest score\n",
"results_to_la = results_above_threshold.withColumn(\n",
" 'index', f.row_number().over(w)\n",
" ).orderBy(\n",
" f.desc('anomaly_score')\n",
" ).select(\n",
" 'tenant_id',\n",
" f.col('user'),\n",
" f.col('res'),\n",
" 'anomaly_score'\n",
" ).where(\n",
" 'index == 1'\n",
" ).limit(100).cache()\n",
"results_to_la = (\n",
" results_above_threshold.withColumn(\"index\", f.row_number().over(w))\n",
" .orderBy(f.desc(\"anomaly_score\"))\n",
" .select(\"tenant_id\", f.col(\"user\"), f.col(\"res\"), \"anomaly_score\")\n",
" .where(\"index == 1\")\n",
" .limit(100)\n",
" .cache()\n",
")\n",
"\n",
"# add a fake timestamp to the results\n",
"results_to_la = results_to_la.withColumn('timestamp', f.current_timestamp())\n",
" \n",
"results_to_la = results_to_la.withColumn(\"timestamp\", f.current_timestamp())\n",
"\n",
"display(results_to_la)"
]
},
@ -283,7 +278,7 @@
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"print (__version__) # requires version >= 1.9.0\n",
"print(__version__) # requires version >= 1.9.0\n",
"\n",
"# run plotly in offline mode\n",
"offline.init_notebook_mode()"
@ -295,53 +290,43 @@
"metadata": {},
"outputs": [],
"source": [
"#Find all server accesses of users with high predicted scores\n",
"# Find all server accesses of users with high predicted scores\n",
"# For display, limit to top 25 results\n",
"results_to_display = results_to_la.orderBy(\n",
" f.desc('anomaly_score')\n",
" ).limit(25).cache()\n",
"interesting_records = full_res_df.join(results_to_display, ['user'], 'left_semi')\n",
"non_anomalous_records = interesting_records.join(results_to_display, ['user', 'res'], 'left_anti')\n",
"results_to_display = results_to_la.orderBy(f.desc(\"anomaly_score\")).limit(25).cache()\n",
"interesting_records = full_res_df.join(results_to_display, [\"user\"], \"left_semi\")\n",
"non_anomalous_records = interesting_records.join(\n",
" results_to_display, [\"user\", \"res\"], \"left_anti\"\n",
")\n",
"\n",
"top_non_anomalous_records = non_anomalous_records.groupBy(\n",
" 'tenant_id',\n",
" 'user', \n",
" 'res'\n",
" ).agg(\n",
" f.count('*').alias('count'),\n",
" ).select(\n",
" f.col('tenant_id'),\n",
" f.col('user'),\n",
" f.col('res'),\n",
" 'count'\n",
" )\n",
"top_non_anomalous_records = (\n",
" non_anomalous_records.groupBy(\"tenant_id\", \"user\", \"res\")\n",
" .agg(\n",
" f.count(\"*\").alias(\"count\"),\n",
" )\n",
" .select(f.col(\"tenant_id\"), f.col(\"user\"), f.col(\"res\"), \"count\")\n",
")\n",
"\n",
"#pick only a subset of non-anomalous record for UI\n",
"# pick only a subset of non-anomalous record for UI\n",
"w = Window.partitionBy(\n",
" 'tenant_id',\n",
" 'user',\n",
" ).orderBy(\n",
" f.desc('count')\n",
" )\n",
" \"tenant_id\",\n",
" \"user\",\n",
").orderBy(f.desc(\"count\"))\n",
"\n",
"# pick top non-anomalous set\n",
"top_non_anomalous_accesses = top_non_anomalous_records.withColumn(\n",
" 'index', f.row_number().over(w)\n",
" ).orderBy(\n",
" f.desc('count')\n",
" ).select(\n",
" 'tenant_id',\n",
" f.col('user'),\n",
" f.col('res'),\n",
" f.col('count')\n",
" ).where(\n",
" 'index in (1,2,3,4,5)'\n",
" ).limit(25)\n",
"top_non_anomalous_accesses = (\n",
" top_non_anomalous_records.withColumn(\"index\", f.row_number().over(w))\n",
" .orderBy(f.desc(\"count\"))\n",
" .select(\"tenant_id\", f.col(\"user\"), f.col(\"res\"), f.col(\"count\"))\n",
" .where(\"index in (1,2,3,4,5)\")\n",
" .limit(25)\n",
")\n",
"\n",
"# add back anomalous record\n",
"fileShare_accesses = (top_non_anomalous_accesses\n",
" .select('user', 'res', 'count')\n",
" .union(results_to_display.select('user', 'res', f.lit(1).alias('count'))).cache())"
"fileShare_accesses = (\n",
" top_non_anomalous_accesses.select(\"user\", \"res\", \"count\")\n",
" .union(results_to_display.select(\"user\", \"res\", f.lit(1).alias(\"count\")))\n",
" .cache()\n",
")"
]
},
{
@ -354,29 +339,63 @@
"high_scores_df = fileShare_accesses.toPandas()\n",
"unique_arr = np.append(high_scores_df.user.unique(), high_scores_df.res.unique())\n",
"\n",
"unique_df = pd.DataFrame(data = unique_arr, columns = ['name'])\n",
"unique_df['index'] = range(0, len(unique_df.index))\n",
"unique_df = pd.DataFrame(data=unique_arr, columns=[\"name\"])\n",
"unique_df[\"index\"] = range(0, len(unique_df.index))\n",
"\n",
"# create index for source & target and color for the normal accesses\n",
"normal_line_color = 'rgba(211, 211, 211, 0.8)'\n",
"anomolous_color = 'red'\n",
"x = pd.merge(high_scores_df, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})\n",
"all_access_index_df = pd.merge(x, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})\n",
"all_access_index_df['color'] = normal_line_color\n",
"normal_line_color = \"rgba(211, 211, 211, 0.8)\"\n",
"anomolous_color = \"red\"\n",
"x = (\n",
" pd.merge(high_scores_df, unique_df, how=\"left\", left_on=\"user\", right_on=\"name\")\n",
" .drop([\"name\"], axis=1)\n",
" .rename(columns={\"index\": \"userIndex\"})\n",
")\n",
"all_access_index_df = (\n",
" pd.merge(x, unique_df, how=\"left\", left_on=\"res\", right_on=\"name\")\n",
" .drop([\"name\"], axis=1)\n",
" .rename(columns={\"index\": \"resIndex\"})\n",
")\n",
"all_access_index_df[\"color\"] = normal_line_color\n",
"\n",
"# results_to_display index, color and \n",
"y = results_to_display.toPandas().drop(['tenant_id', 'timestamp', 'anomaly_score'], axis=1)\n",
"y = pd.merge(y, unique_df, how='left', left_on='user', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'userIndex'})\n",
"high_scores_index_df = pd.merge(y, unique_df, how='left', left_on='res', right_on='name').drop(['name'], axis=1).rename(columns={'index' : 'resIndex'})\n",
"high_scores_index_df['count'] = 1\n",
"high_scores_index_df['color'] = anomolous_color\n",
"# results_to_display index, color and\n",
"y = results_to_display.toPandas().drop(\n",
" [\"tenant_id\", \"timestamp\", \"anomaly_score\"], axis=1\n",
")\n",
"y = (\n",
" pd.merge(y, unique_df, how=\"left\", left_on=\"user\", right_on=\"name\")\n",
" .drop([\"name\"], axis=1)\n",
" .rename(columns={\"index\": \"userIndex\"})\n",
")\n",
"high_scores_index_df = (\n",
" pd.merge(y, unique_df, how=\"left\", left_on=\"res\", right_on=\"name\")\n",
" .drop([\"name\"], axis=1)\n",
" .rename(columns={\"index\": \"resIndex\"})\n",
")\n",
"high_scores_index_df[\"count\"] = 1\n",
"high_scores_index_df[\"color\"] = anomolous_color\n",
"\n",
"# substract 1 for the red entries in all_access df\n",
"hsi_df = high_scores_index_df[['user','res', 'count']].rename(columns={'count' : 'hsiCount'})\n",
"all_access_updated_count_df = pd.merge(all_access_index_df, hsi_df, how='left', left_on=['user', 'res'], right_on=['user', 'res'])\n",
"all_access_updated_count_df['count'] = np.where(all_access_updated_count_df['hsiCount']==1, all_access_updated_count_df['count'] - 1, all_access_updated_count_df['count'])\n",
"all_access_updated_count_df = all_access_updated_count_df.loc[all_access_updated_count_df['count'] > 0]\n",
"all_access_updated_count_df = all_access_updated_count_df[['user','res', 'count', 'userIndex', 'resIndex', 'color']]\n",
"hsi_df = high_scores_index_df[[\"user\", \"res\", \"count\"]].rename(\n",
" columns={\"count\": \"hsiCount\"}\n",
")\n",
"all_access_updated_count_df = pd.merge(\n",
" all_access_index_df,\n",
" hsi_df,\n",
" how=\"left\",\n",
" left_on=[\"user\", \"res\"],\n",
" right_on=[\"user\", \"res\"],\n",
")\n",
"all_access_updated_count_df[\"count\"] = np.where(\n",
" all_access_updated_count_df[\"hsiCount\"] == 1,\n",
" all_access_updated_count_df[\"count\"] - 1,\n",
" all_access_updated_count_df[\"count\"],\n",
")\n",
"all_access_updated_count_df = all_access_updated_count_df.loc[\n",
" all_access_updated_count_df[\"count\"] > 0\n",
"]\n",
"all_access_updated_count_df = all_access_updated_count_df[\n",
" [\"user\", \"res\", \"count\", \"userIndex\", \"resIndex\", \"color\"]\n",
"]\n",
"\n",
"# combine the two tables\n",
"frames = [all_access_updated_count_df, high_scores_index_df]\n",
@ -391,41 +410,33 @@
"outputs": [],
"source": [
"data_trace = dict(\n",
" type='sankey',\n",
" domain = dict(\n",
" x = [0,1],\n",
" y = [0,1]\n",
" type=\"sankey\",\n",
" domain=dict(x=[0, 1], y=[0, 1]),\n",
" orientation=\"h\",\n",
" valueformat=\".0f\",\n",
" node=dict(\n",
" pad=10,\n",
" thickness=30,\n",
" line=dict(color=\"black\", width=0),\n",
" label=unique_df[\"name\"].dropna(axis=0, how=\"any\"),\n",
" ),\n",
" orientation = \"h\",\n",
" valueformat = \".0f\",\n",
" node = dict(\n",
" pad = 10,\n",
" thickness = 30,\n",
" line = dict(\n",
" color = \"black\",\n",
" width = 0\n",
" ),\n",
" label = unique_df['name'].dropna(axis=0, how='any')\n",
" link=dict(\n",
" source=display_df[\"userIndex\"].dropna(axis=0, how=\"any\"),\n",
" target=display_df[\"resIndex\"].dropna(axis=0, how=\"any\"),\n",
" value=display_df[\"count\"].dropna(axis=0, how=\"any\"),\n",
" color=display_df[\"color\"].dropna(axis=0, how=\"any\"),\n",
" ),\n",
" link = dict(\n",
" source = display_df['userIndex'].dropna(axis=0, how='any'),\n",
" target = display_df['resIndex'].dropna(axis=0, how='any'),\n",
" value = display_df['count'].dropna(axis=0, how='any'),\n",
" color = display_df['color'].dropna(axis=0, how='any'),\n",
" )\n",
")\n",
"\n",
"layout = dict(\n",
" title = \"All resources accessed by users with highest anomalous scores\",\n",
" height = 772,\n",
" font = dict(\n",
" size = 10\n",
" ), \n",
"layout = dict(\n",
" title=\"All resources accessed by users with highest anomalous scores\",\n",
" height=772,\n",
" font=dict(size=10),\n",
")\n",
"\n",
"fig = dict(data=[data_trace], layout=layout)\n",
"\n",
"p = plot(fig, output_type='div')\n",
"p = plot(fig, output_type=\"div\")\n",
"\n",
"displayHTML(p)"
]


@ -45,8 +45,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -90,7 +91,8 @@
"source": [
"modelName = \"BiLSTM\"\n",
"modelDir = abspath(\"models\")\n",
"if not os.path.exists(modelDir): os.makedirs(modelDir)\n",
"if not os.path.exists(modelDir):\n",
" os.makedirs(modelDir)\n",
"d = ModelDownloader(spark, \"file://\" + modelDir)\n",
"modelSchema = d.downloadByName(modelName)\n",
"nltk.download(\"punkt\")"
@ -114,7 +116,9 @@
"wordEmbFileName = \"WordEmbeddings_PubMed.pkl\"\n",
"pickleFile = join(abspath(\"models\"), wordEmbFileName)\n",
"if not os.path.isfile(pickleFile):\n",
" urllib.request.urlretrieve(\"https://mmlspark.blob.core.windows.net/datasets/\" + wordEmbFileName, pickleFile)"
" urllib.request.urlretrieve(\n",
" \"https://mmlspark.blob.core.windows.net/datasets/\" + wordEmbFileName, pickleFile\n",
" )"
]
},
{
@ -163,7 +167,7 @@
"outputs": [],
"source": [
"sentences = sent_tokenize(content)\n",
"df = spark.createDataFrame(enumerate(sentences), [\"index\",\"sentence\"])"
"df = spark.createDataFrame(enumerate(sentences), [\"index\", \"sentence\"])"
]
},
{
@ -180,6 +184,7 @@
" nltk.data.path.append(\"/dbfs/nltkdata\")\n",
" return partition\n",
"\n",
"\n",
"df = df.rdd.mapPartitions(prepNLTK).toDF()"
]
},
@ -195,34 +200,38 @@
" prepNLTK(None)\n",
" return word_tokenize(sent)\n",
"\n",
"\n",
"tokenizeUDF = udf(safe_tokenize, ArrayType(StringType()))\n",
"df = df.withColumn(\"tokens\",tokenizeUDF(\"sentence\"))\n",
"df = df.withColumn(\"tokens\", tokenizeUDF(\"sentence\"))\n",
"\n",
"countUDF = udf(len, IntegerType())\n",
"df = df.withColumn(\"count\",countUDF(\"tokens\"))\n",
"df = df.withColumn(\"count\", countUDF(\"tokens\"))\n",
"\n",
"\n",
"def wordToEmb(word):\n",
" return wordvectors[wordToIndex.get(word.lower(), wordToIndex[\"UNK\"])]\n",
"\n",
"\n",
"def featurize(tokens):\n",
" X = np.zeros((maxSentenceLen, nFeatures))\n",
" X[-len(tokens):,:] = np.array([wordToEmb(word) for word in tokens])\n",
" X[-len(tokens) :, :] = np.array([wordToEmb(word) for word in tokens])\n",
" return [float(x) for x in X.reshape(maxSentenceLen, nFeatures).flatten()]\n",
"\n",
"\n",
"def safe_show(df, retries):\n",
" try:\n",
" df.show()\n",
" except Exception as e:\n",
" if retries >= 1:\n",
" safe_show(df, retries-1)\n",
" safe_show(df, retries - 1)\n",
" else:\n",
" raise e\n",
"\n",
"featurizeUDF = udf(featurize, ArrayType(FloatType()))\n",
"\n",
"featurizeUDF = udf(featurize, ArrayType(FloatType()))\n",
"\n",
"df = df.withColumn(\"features\", featurizeUDF(\"tokens\")).cache()\n",
"safe_show(df, 5) # Can be flaky on build server\n",
" \n"
"safe_show(df, 5) # Can be flaky on build server"
],
"metadata": {
"collapsed": false,
@ -244,12 +253,14 @@
"metadata": {},
"outputs": [],
"source": [
"model = CNTKModel() \\\n",
" .setModelLocation(modelSchema.uri) \\\n",
" .setInputCol(\"features\") \\\n",
" .setOutputCol(\"probs\") \\\n",
" .setOutputNodeIndex(0) \\\n",
"model = (\n",
" CNTKModel()\n",
" .setModelLocation(modelSchema.uri)\n",
" .setInputCol(\"features\")\n",
" .setOutputCol(\"probs\")\n",
" .setOutputNodeIndex(0)\n",
" .setMiniBatchSize(1)\n",
")\n",
"\n",
"df = model.transform(df).cache()\n",
"df.show()"
@ -263,10 +274,11 @@
"source": [
"def probsToEntities(probs, wordCount):\n",
" reshaped_probs = np.array(probs).reshape(maxSentenceLen, nClasses)\n",
" reshaped_probs = reshaped_probs[-wordCount:,:]\n",
" reshaped_probs = reshaped_probs[-wordCount:, :]\n",
" return [classToEntity[np.argmax(probs)] for probs in reshaped_probs]\n",
"\n",
"toEntityUDF = udf(probsToEntities,ArrayType(StringType()))\n",
"\n",
"toEntityUDF = udf(probsToEntities, ArrayType(StringType()))\n",
"df = df.withColumn(\"entities\", toEntityUDF(\"probs\", \"count\"))\n",
"df.show()"
]
@ -287,28 +299,33 @@
"# Color Code the Text based on the entity type\n",
"colors = {\n",
" \"B-Disease\": \"blue\",\n",
" \"I-Disease\":\"blue\",\n",
" \"B-Drug\":\"lime\",\n",
" \"I-Drug\":\"lime\",\n",
" \"B-Chemical\":\"lime\",\n",
" \"I-Chemical\":\"lime\",\n",
" \"O\":\"black\",\n",
" \"NONE\":\"black\"\n",
" \"I-Disease\": \"blue\",\n",
" \"B-Drug\": \"lime\",\n",
" \"I-Drug\": \"lime\",\n",
" \"B-Chemical\": \"lime\",\n",
" \"I-Chemical\": \"lime\",\n",
" \"O\": \"black\",\n",
" \"NONE\": \"black\",\n",
"}\n",
"\n",
"\n",
"def prettyPrint(words, annotations):\n",
" formattedWords = []\n",
" for word,annotation in zip(words,annotations):\n",
" formattedWord = \"<font size = '2' color = '{}'>{}</font>\".format(colors[annotation], word)\n",
" if annotation in {\"O\",\"NONE\"}:\n",
" for word, annotation in zip(words, annotations):\n",
" formattedWord = \"<font size = '2' color = '{}'>{}</font>\".format(\n",
" colors[annotation], word\n",
" )\n",
" if annotation in {\"O\", \"NONE\"}:\n",
" formattedWords.append(formattedWord)\n",
" else:\n",
" formattedWords.append(\"<b>{}</b>\".format(formattedWord))\n",
" return \" \".join(formattedWords)\n",
"\n",
"\n",
"prettyPrintUDF = udf(prettyPrint, StringType())\n",
"df = df.withColumn(\"formattedSentence\", prettyPrintUDF(\"tokens\", \"entities\")) \\\n",
" .select(\"formattedSentence\")\n",
"df = df.withColumn(\"formattedSentence\", prettyPrintUDF(\"tokens\", \"entities\")).select(\n",
" \"formattedSentence\"\n",
")\n",
"\n",
"sentences = [row[\"formattedSentence\"] for row in df.collect()]"
]
@ -322,6 +339,7 @@
"outputs": [],
"source": [
"from IPython.core.display import display, HTML\n",
"\n",
"for sentence in sentences:\n",
" display(HTML(sentence))"
]


@ -22,8 +22,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -43,7 +44,9 @@
"\n",
"# Please note that this is a copy of the CIFAR10 dataset originally found here:\n",
"# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n",
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
"imagesWithLabels = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\"\n",
")"
]
},
{
@ -59,7 +62,7 @@
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" modelDir = \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\"\n",
"else:\n",
" modelDir = \"dbfs:/models/\"\n"
" modelDir = \"dbfs:/models/\""
]
},
{
@ -76,7 +79,7 @@
"outputs": [],
"source": [
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n"
"model = d.downloadByName(modelName)"
]
},
{
@ -93,24 +96,33 @@
"outputs": [],
"source": [
"import time\n",
"\n",
"start = time.time()\n",
"\n",
"# Use CNTK model to get log probabilities\n",
"cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"output\") \\\n",
" .setModelLocation(model.uri).setOutputNode(\"z\")\n",
"cntkModel = (\n",
" CNTKModel()\n",
" .setInputCol(\"images\")\n",
" .setOutputCol(\"output\")\n",
" .setModelLocation(model.uri)\n",
" .setOutputNode(\"z\")\n",
")\n",
"scoredImages = cntkModel.transform(imagesWithLabels)\n",
"\n",
"# Transform the log probabilities to predictions\n",
"def argmax(x): return max(enumerate(x),key=lambda p: p[1])[0]\n",
"def argmax(x):\n",
" return max(enumerate(x), key=lambda p: p[1])[0]\n",
"\n",
"\n",
"argmaxUDF = udf(argmax, IntegerType())\n",
"imagePredictions = scoredImages.withColumn(\"predictions\", argmaxUDF(\"output\")) \\\n",
" .select(\"predictions\", \"labels\")\n",
"imagePredictions = scoredImages.withColumn(\"predictions\", argmaxUDF(\"output\")).select(\n",
" \"predictions\", \"labels\"\n",
")\n",
"\n",
"numRows = imagePredictions.count()\n",
"\n",
"end = time.time()\n",
"print(\"classifying {} images took {} seconds\".format(numRows,end-start))"
"print(\"classifying {} images took {} seconds\".format(numRows, end - start))"
]
},
{
@ -144,8 +156,18 @@
"\n",
"cm = confusion_matrix(y, y_hat)\n",
"\n",
"labels = [\"airplane\", \"automobile\", \"bird\", \"cat\", \"deer\", \"dog\", \"frog\",\n",
" \"horse\", \"ship\", \"truck\"]\n",
"labels = [\n",
" \"airplane\",\n",
" \"automobile\",\n",
" \"bird\",\n",
" \"cat\",\n",
" \"deer\",\n",
" \"dog\",\n",
" \"frog\",\n",
" \"horse\",\n",
" \"ship\",\n",
" \"truck\",\n",
"]\n",
"plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n",
"plt.colorbar()\n",
"tick_marks = np.arange(len(labels))\n",


@ -21,8 +21,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -49,8 +50,13 @@
"source": [
"# Load the images\n",
"# use flowers_and_labels.parquet on larger cluster in order to get better results\n",
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet\") \\\n",
" .withColumnRenamed(\"bytes\",\"image\").sample(.1)\n",
"imagesWithLabels = (\n",
" spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/flowers_and_labels2.parquet\"\n",
" )\n",
" .withColumnRenamed(\"bytes\", \"image\")\n",
" .sample(0.1)\n",
")\n",
"\n",
"imagesWithLabels.printSchema()"
]
@ -74,17 +80,15 @@
"from synapse.ml.stages import *\n",
"\n",
"# Make some featurizers\n",
"it = ImageTransformer()\\\n",
" .setOutputCol(\"scaled\")\\\n",
" .resize(size=(60, 60))\n",
"it = ImageTransformer().setOutputCol(\"scaled\").resize(size=(60, 60))\n",
"\n",
"ur = UnrollImage().setInputCol(\"scaled\").setOutputCol(\"features\")\n",
"\n",
"ur = UnrollImage()\\\n",
" .setInputCol(\"scaled\")\\\n",
" .setOutputCol(\"features\")\n",
" \n",
"dc1 = DropColumns().setCols([\"scaled\", \"image\"])\n",
"\n",
"lr1 = LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n",
"lr1 = (\n",
" LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n",
")\n",
"\n",
"dc2 = DropColumns().setCols([\"features\"])\n",
"\n",
@ -97,20 +101,24 @@
"metadata": {},
"outputs": [],
"source": [
"resnet = ImageFeaturizer()\\\n",
" .setInputCol(\"image\")\\\n",
" .setOutputCol(\"features\")\\\n",
" .setModelLocation(model.uri)\\\n",
" .setLayerNames(model.layerNames)\\\n",
"resnet = (\n",
" ImageFeaturizer()\n",
" .setInputCol(\"image\")\n",
" .setOutputCol(\"features\")\n",
" .setModelLocation(model.uri)\n",
" .setLayerNames(model.layerNames)\n",
" .setCutOutputLayers(1)\n",
" \n",
")\n",
"\n",
"dc3 = DropColumns().setCols([\"image\"])\n",
" \n",
"lr2 = LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n",
"\n",
"lr2 = (\n",
" LogisticRegression().setMaxIter(8).setFeaturesCol(\"features\").setLabelCol(\"labels\")\n",
")\n",
"\n",
"dc4 = DropColumns().setCols([\"features\"])\n",
"\n",
"deepModel = Pipeline(stages=[resnet, dc3, lr2, dc4]) "
"deepModel = Pipeline(stages=[resnet, dc3, lr2, dc4])"
]
},
{
@ -141,10 +149,10 @@
"outputs": [],
"source": [
"def timedExperiment(model, train, test):\n",
" start = time.time()\n",
" result = model.fit(train).transform(test).toPandas()\n",
" print(\"Experiment took {}s\".format(time.time() - start))\n",
" return result"
" start = time.time()\n",
" result = model.fit(train).transform(test).toPandas()\n",
" print(\"Experiment took {}s\".format(time.time() - start))\n",
" return result"
]
},
{
@ -153,7 +161,7 @@
"metadata": {},
"outputs": [],
"source": [
"train, test = imagesWithLabels.randomSplit([.8,.2])\n",
"train, test = imagesWithLabels.randomSplit([0.8, 0.2])\n",
"train.count(), test.count()"
]
},
@ -192,26 +200,30 @@
"from sklearn.metrics import confusion_matrix\n",
"import numpy as np\n",
"\n",
"\n",
"def evaluate(results, name):\n",
" y, y_hat = results[\"labels\"],results[\"prediction\"]\n",
" y, y_hat = results[\"labels\"], results[\"prediction\"]\n",
" y = [int(l) for l in y]\n",
"\n",
" accuracy = np.mean([1. if pred==true else 0. for (pred,true) in zip(y_hat,y)])\n",
" accuracy = np.mean([1.0 if pred == true else 0.0 for (pred, true) in zip(y_hat, y)])\n",
" cm = confusion_matrix(y, y_hat)\n",
" cm = cm.astype(\"float\") / cm.sum(axis=1)[:, np.newaxis]\n",
"\n",
" plt.text(40, 10,\"$Accuracy$ $=$ ${}\\%$\".format(round(accuracy*100,1)),fontsize=14)\n",
" plt.text(\n",
" 40, 10, \"$Accuracy$ $=$ ${}\\%$\".format(round(accuracy * 100, 1)), fontsize=14\n",
" )\n",
" plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n",
" plt.colorbar()\n",
" plt.xlabel(\"$Predicted$ $label$\", fontsize=18)\n",
" plt.ylabel(\"$True$ $Label$\", fontsize=18)\n",
" plt.title(\"$Normalized$ $CM$ $for$ ${}$\".format(name))\n",
"\n",
"plt.figure(figsize=(12,5))\n",
"plt.subplot(1,2,1)\n",
"evaluate(deepResults,\"CNTKModel + LR\")\n",
"plt.subplot(1,2,2)\n",
"evaluate(basicResults,\"LR\")\n",
"\n",
"plt.figure(figsize=(12, 5))\n",
"plt.subplot(1, 2, 1)\n",
"evaluate(deepResults, \"CNTKModel + LR\")\n",
"plt.subplot(1, 2, 2)\n",
"evaluate(basicResults, \"LR\")\n",
"# Note that on the larger dataset the accuracy will bump up from 44% to >90%\n",
"display(plt.show())"
]


@ -33,6 +33,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
"\n",
"\n",
@ -45,8 +46,13 @@
"d = ModelDownloader(spark, modelDir)\n",
"model = d.downloadByName(modelName)\n",
"print(model.layerNames)\n",
"cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"features\") \\\n",
" .setModelLocation(model.uri).setOutputNode(\"l8\")"
"cntkModel = (\n",
" CNTKModel()\n",
" .setInputCol(\"images\")\n",
" .setOutputCol(\"features\")\n",
" .setModelLocation(model.uri)\n",
" .setOutputNode(\"l8\")\n",
")"
]
},
{
@ -62,7 +68,9 @@
"metadata": {},
"outputs": [],
"source": [
"imagesWithLabels = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\")"
"imagesWithLabels = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/CIFAR10_test.parquet\"\n",
")"
]
},
{
@ -95,7 +103,7 @@
"metadata": {},
"outputs": [],
"source": [
"featurizedImages = cntkModel.transform(imagesWithLabels).select([\"features\",\"labels\"])"
"featurizedImages = cntkModel.transform(imagesWithLabels).select([\"features\", \"labels\"])"
]
},
{
@ -114,9 +122,9 @@
"from synapse.ml.train import TrainClassifier\n",
"from pyspark.ml.classification import RandomForestClassifier\n",
"\n",
"train,test = featurizedImages.randomSplit([0.75,0.25])\n",
"train, test = featurizedImages.randomSplit([0.75, 0.25])\n",
"\n",
"model = TrainClassifier(model=RandomForestClassifier(),labelCol=\"labels\").fit(train)"
"model = TrainClassifier(model=RandomForestClassifier(), labelCol=\"labels\").fit(train)"
]
},
{
@ -133,6 +141,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"predictions = model.transform(test)\n",
"metrics = ComputeModelStatistics(evaluationMetric=\"accuracy\").transform(predictions)\n",
"metrics.show()"


@ -22,6 +22,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -38,7 +39,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\").cache()\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BreastCancer.parquet\"\n",
").cache()\n",
"tune, test = data.randomSplit([0.80, 0.20])\n",
"tune.limit(10).toPandas()"
]
@ -58,7 +61,12 @@
"source": [
"from synapse.ml.automl import TuneHyperparameters\n",
"from synapse.ml.train import TrainClassifier\n",
"from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n",
"from pyspark.ml.classification import (\n",
" LogisticRegression,\n",
" RandomForestClassifier,\n",
" GBTClassifier,\n",
")\n",
"\n",
"logReg = LogisticRegression()\n",
"randForest = RandomForestClassifier()\n",
"gbt = GBTClassifier()\n",
@ -83,13 +91,14 @@
"source": [
"from synapse.ml.automl import *\n",
"\n",
"paramBuilder = \\\n",
" HyperparamBuilder() \\\n",
" .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)) \\\n",
" .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5,10])) \\\n",
" .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3,5])) \\\n",
" .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8,16)) \\\n",
" .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3,5]))\n",
"paramBuilder = (\n",
" HyperparamBuilder()\n",
" .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3))\n",
" .addHyperparam(randForest, randForest.numTrees, DiscreteHyperParam([5, 10]))\n",
" .addHyperparam(randForest, randForest.maxDepth, DiscreteHyperParam([3, 5]))\n",
" .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(8, 16))\n",
" .addHyperparam(gbt, gbt.maxDepth, DiscreteHyperParam([3, 5]))\n",
")\n",
"searchSpace = paramBuilder.build()\n",
"# The search space is a list of params to tuples of estimator and hyperparam\n",
"print(searchSpace)\n",
@ -110,9 +119,14 @@
"outputs": [],
"source": [
"bestModel = TuneHyperparameters(\n",
" evaluationMetric=\"accuracy\", models=mmlmodels, numFolds=2,\n",
" numRuns=len(mmlmodels) * 2, parallelism=1,\n",
" paramSpace=randomSpace.space(), seed=0).fit(tune)"
" evaluationMetric=\"accuracy\",\n",
" models=mmlmodels,\n",
" numFolds=2,\n",
" numRuns=len(mmlmodels) * 2,\n",
" parallelism=1,\n",
" paramSpace=randomSpace.space(),\n",
" seed=0,\n",
").fit(tune)"
]
},
{
@ -146,6 +160,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"prediction = bestModel.transform(test)\n",
"metrics = ComputeModelStatistics().transform(prediction)\n",
"metrics.limit(10).toPandas()"


@ -16,8 +16,10 @@
"execution_count": null,
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
],
"outputs": [],
@ -27,7 +29,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"import pandas as pd\n"
"import pandas as pd"
],
"outputs": [],
"metadata": {}
@ -36,7 +38,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n",
")\n",
"data.limit(10).toPandas()"
],
"outputs": [],
@ -53,8 +57,7 @@
"cell_type": "code",
"execution_count": null,
"source": [
"processedData = data.withColumn(\"label\", data[\"rating\"] > 3) \\\n",
" .select([\"text\", \"label\"])\n",
"processedData = data.withColumn(\"label\", data[\"rating\"] > 3).select([\"text\", \"label\"])\n",
"processedData.limit(5).toPandas()"
],
"outputs": [],
@ -89,11 +92,13 @@
"source": [
"from pyspark.ml import Pipeline\n",
"from pyspark.ml.feature import Tokenizer, Word2Vec\n",
"\n",
"tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n",
"partitions = train.rdd.getNumPartitions()\n",
"word2vec = Word2Vec(maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\",\n",
" numPartitions=partitions)\n",
"textFeaturizer = Pipeline(stages = [tokenizer, word2vec]).fit(train)"
"word2vec = Word2Vec(\n",
" maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\", numPartitions=partitions\n",
")\n",
"textFeaturizer = Pipeline(stages=[tokenizer, word2vec]).fit(train)"
],
"outputs": [],
"metadata": {}
@ -128,29 +133,42 @@
"cell_type": "code",
"execution_count": null,
"source": [
"from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n",
"from pyspark.ml.classification import (\n",
" LogisticRegression,\n",
" RandomForestClassifier,\n",
" GBTClassifier,\n",
")\n",
"from synapse.ml.train import TrainClassifier\n",
"import itertools\n",
"\n",
"lrHyperParams = [0.05, 0.2]\n",
"logisticRegressions = [LogisticRegression(regParam = hyperParam)\n",
" for hyperParam in lrHyperParams]\n",
"lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n",
" for lrm in logisticRegressions]\n",
"lrHyperParams = [0.05, 0.2]\n",
"logisticRegressions = [\n",
" LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n",
"]\n",
"lrmodels = [\n",
" TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n",
" for lrm in logisticRegressions\n",
"]\n",
"\n",
"rfHyperParams = itertools.product([5, 10], [2, 3])\n",
"randomForests = [RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n",
" for hyperParam in rfHyperParams]\n",
"rfmodels = [TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain)\n",
" for rfm in randomForests]\n",
"rfHyperParams = itertools.product([5, 10], [2, 3])\n",
"randomForests = [\n",
" RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n",
" for hyperParam in rfHyperParams\n",
"]\n",
"rfmodels = [\n",
" TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain) for rfm in randomForests\n",
"]\n",
"\n",
"gbtHyperParams = itertools.product([8, 16], [2, 3])\n",
"gbtclassifiers = [GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n",
" for hyperParam in gbtHyperParams]\n",
"gbtmodels = [TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain)\n",
" for gbt in gbtclassifiers]\n",
"gbtHyperParams = itertools.product([8, 16], [2, 3])\n",
"gbtclassifiers = [\n",
" GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n",
" for hyperParam in gbtHyperParams\n",
"]\n",
"gbtmodels = [\n",
" TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain) for gbt in gbtclassifiers\n",
"]\n",
"\n",
"trainedModels = lrmodels + rfmodels + gbtmodels"
"trainedModels = lrmodels + rfmodels + gbtmodels"
],
"outputs": [],
"metadata": {}
@ -167,6 +185,7 @@
"execution_count": null,
"source": [
"from synapse.ml.automl import FindBestModel\n",
"\n",
"bestModel = FindBestModel(evaluationMetric=\"AUC\", models=trainedModels).fit(ptest)\n",
"bestModel.getRocCurve().show()\n",
"bestModel.getBestModelMetrics().show()\n",
@ -187,12 +206,17 @@
"execution_count": null,
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"predictions = bestModel.transform(pvalidation)\n",
"metrics = ComputeModelStatistics().transform(predictions)\n",
"print(\"Best model's accuracy on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100))\n",
"print(\"Best model's AUC on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100))"
"print(\n",
" \"Best model's accuracy on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100)\n",
")\n",
"print(\n",
" \"Best model's AUC on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"AUC\"] * 100)\n",
")"
],
"outputs": [],
"metadata": {}


@ -16,8 +16,10 @@
"execution_count": null,
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
],
"outputs": [],
@ -36,7 +38,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/BookReviewsFromAmazon10K.parquet\"\n",
")\n",
"data.limit(10).toPandas()"
],
"outputs": [],
@ -55,9 +59,17 @@
"execution_count": null,
"source": [
"from synapse.ml.featurize.text import TextFeaturizer\n",
"textFeaturizer = TextFeaturizer() \\\n",
" .setInputCol(\"text\").setOutputCol(\"features\") \\\n",
" .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(data)"
"\n",
"textFeaturizer = (\n",
" TextFeaturizer()\n",
" .setInputCol(\"text\")\n",
" .setOutputCol(\"features\")\n",
" .setUseStopWordsRemover(True)\n",
" .setUseIDF(True)\n",
" .setMinDocFreq(5)\n",
" .setNumFeatures(1 << 16)\n",
" .fit(data)\n",
")"
],
"outputs": [],
"metadata": {}
@ -84,8 +96,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3) \\\n",
" .select([\"features\", \"label\"])\n",
"processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3).select(\n",
" [\"features\", \"label\"]\n",
")\n",
"processedData.limit(5).toPandas()"
],
"outputs": [],
@ -106,10 +119,16 @@
"from pyspark.ml.classification import LogisticRegression\n",
"\n",
"lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n",
"logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]\n",
"logisticRegressions = [\n",
" LogisticRegression(regParam=hyperParam) for hyperParam in lrHyperParams\n",
"]\n",
"\n",
"from synapse.ml.train import TrainClassifier\n",
"lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(train) for lrm in logisticRegressions]"
"\n",
"lrmodels = [\n",
" TrainClassifier(model=lrm, labelCol=\"label\").fit(train)\n",
" for lrm in logisticRegressions\n",
"]"
],
"outputs": [],
"metadata": {}
@ -126,10 +145,11 @@
"execution_count": null,
"source": [
"from synapse.ml.automl import FindBestModel, BestModel\n",
"\n",
"bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n",
"bestModel.getRocCurve().show()\n",
"bestModel.getBestModelMetrics().show()\n",
"bestModel.getAllModelMetrics().show()\n"
"bestModel.getAllModelMetrics().show()"
],
"outputs": [],
"metadata": {}
@ -146,10 +166,13 @@
"execution_count": null,
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"predictions = bestModel.transform(validation)\n",
"metrics = ComputeModelStatistics().transform(predictions)\n",
"print(\"Best model's accuracy on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100))"
"print(\n",
" \"Best model's accuracy on validation set = \"\n",
" + \"{0:.2f}%\".format(metrics.first()[\"accuracy\"] * 100)\n",
")"
],
"outputs": [],
"metadata": {}


@ -36,8 +36,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -47,7 +49,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\")\n"
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AutomobilePriceRaw.parquet\"\n",
")"
]
},
{
@ -94,6 +98,7 @@
"outputs": [],
"source": [
"from synapse.ml.stages import SummarizeData\n",
"\n",
"summary = SummarizeData().transform(data)\n",
"summary.toPandas()"
]
@ -139,10 +144,11 @@
"outputs": [],
"source": [
"from synapse.ml.featurize import CleanMissingData\n",
"cols = [\"normalized-losses\", \"stroke\", \"bore\", \"horsepower\",\n",
" \"peak-rpm\", \"price\"]\n",
"cleanModel = CleanMissingData().setCleaningMode(\"Median\") \\\n",
" .setInputCols(cols).setOutputCols(cols)"
"\n",
"cols = [\"normalized-losses\", \"stroke\", \"bore\", \"horsepower\", \"peak-rpm\", \"price\"]\n",
"cleanModel = (\n",
" CleanMissingData().setCleaningMode(\"Median\").setInputCols(cols).setOutputCols(cols)\n",
")"
]
},
{
@ -195,7 +201,7 @@
"\n",
"glr = GeneralizedLinearRegression(family=\"poisson\", link=\"log\")\n",
"poissonModel = TrainRegressor().setModel(glr).setLabelCol(\"price\").setNumFeatures(256)\n",
"poissonPipe = Pipeline(stages = [cleanModel, poissonModel]).fit(train)\n",
"poissonPipe = Pipeline(stages=[cleanModel, poissonModel]).fit(train)\n",
"poissonPrediction = poissonPipe.transform(test)"
]
},
@ -217,8 +223,10 @@
"from pyspark.ml.regression import RandomForestRegressor\n",
"\n",
"rfr = RandomForestRegressor(maxDepth=30, maxBins=128, numTrees=8, minInstancesPerNode=1)\n",
"randomForestModel = TrainRegressor(model=rfr, labelCol=\"price\", numFeatures=256).fit(train)\n",
"randomForestPipe = Pipeline(stages = [cleanModel, randomForestModel]).fit(train)\n",
"randomForestModel = TrainRegressor(model=rfr, labelCol=\"price\", numFeatures=256).fit(\n",
" train\n",
")\n",
"randomForestPipe = Pipeline(stages=[cleanModel, randomForestModel]).fit(train)\n",
"randomForestPrediction = randomForestPipe.transform(test)"
]
},
@ -245,6 +253,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"poissonMetrics = ComputeModelStatistics().transform(poissonPrediction)\n",
"print(\"Poisson Metrics\")\n",
"poissonMetrics.toPandas()"
@ -275,10 +284,18 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputePerInstanceStatistics\n",
"\n",
"\n",
"def demonstrateEvalPerInstance(pred):\n",
" return ComputePerInstanceStatistics().transform(pred) \\\n",
" .select(\"price\", \"prediction\", \"L1_loss\", \"L2_loss\") \\\n",
" .limit(10).toPandas()\n",
" return (\n",
" ComputePerInstanceStatistics()\n",
" .transform(pred)\n",
" .select(\"price\", \"prediction\", \"L1_loss\", \"L2_loss\")\n",
" .limit(10)\n",
" .toPandas()\n",
" )\n",
"\n",
"\n",
"demonstrateEvalPerInstance(poissonPrediction)"
]
},


@ -32,8 +32,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -63,7 +65,9 @@
"metadata": {},
"outputs": [],
"source": [
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"flightDelay = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\"\n",
")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
@ -105,11 +109,20 @@
"outputs": [],
"source": [
"from synapse.ml.featurize import DataConversion\n",
"flightDelay = DataConversion(cols=[\"Quarter\",\"Month\",\"DayofMonth\",\"DayOfWeek\",\n",
" \"OriginAirportID\",\"DestAirportID\",\n",
" \"CRSDepTime\",\"CRSArrTime\"],\n",
" convertTo=\"double\") \\\n",
" .transform(flightDelay)\n",
"\n",
"flightDelay = DataConversion(\n",
" cols=[\n",
" \"Quarter\",\n",
" \"Month\",\n",
" \"DayofMonth\",\n",
" \"DayOfWeek\",\n",
" \"OriginAirportID\",\n",
" \"DestAirportID\",\n",
" \"CRSDepTime\",\n",
" \"CRSArrTime\",\n",
" ],\n",
" convertTo=\"double\",\n",
").transform(flightDelay)\n",
"flightDelay.printSchema()\n",
"flightDelay.limit(10).toPandas()"
]
@ -159,14 +172,13 @@
"from synapse.ml.train import TrainRegressor, TrainedRegressorModel\n",
"from pyspark.ml.regression import LinearRegression\n",
"\n",
"trainCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n",
" convertTo=\"toCategorical\") \\\n",
" .transform(train)\n",
"testCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n",
" convertTo=\"toCategorical\") \\\n",
" .transform(test)\n",
"lr = LinearRegression().setRegParam(0.1) \\\n",
" .setElasticNetParam(0.3)\n",
"trainCat = DataConversion(\n",
" cols=[\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"], convertTo=\"toCategorical\"\n",
").transform(train)\n",
"testCat = DataConversion(\n",
" cols=[\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"], convertTo=\"toCategorical\"\n",
").transform(test)\n",
"lr = LinearRegression().setRegParam(0.1).setElasticNetParam(0.3)\n",
"model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)"
]
},
@ -201,6 +213,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"metrics = ComputeModelStatistics().transform(scoredData)\n",
"metrics.toPandas()"
]
@ -220,9 +233,11 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputePerInstanceStatistics\n",
"\n",
"evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n",
"evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\") \\\n",
" .limit(10).toPandas()"
"evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(\n",
" 10\n",
").toPandas()"
]
}
],


@ -20,8 +20,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -49,7 +51,9 @@
"metadata": {},
"outputs": [],
"source": [
"flightDelay = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\")\n",
"flightDelay = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/On_Time_Performance_2012_9.parquet\"\n",
")\n",
"# print some basic info\n",
"print(\"records read: \" + str(flightDelay.count()))\n",
"print(\"Schema: \")\n",
@ -70,7 +74,7 @@
"metadata": {},
"outputs": [],
"source": [
"train,test = flightDelay.randomSplit([0.75, 0.25])"
"train, test = flightDelay.randomSplit([0.75, 0.25])"
]
},
{
@ -89,14 +93,23 @@
"from synapse.ml.train import TrainRegressor, TrainedRegressorModel\n",
"from pyspark.ml.regression import LinearRegression\n",
"from pyspark.ml.feature import StringIndexer\n",
"\n",
"# Convert columns to categorical\n",
"catCols = [\"Carrier\", \"DepTimeBlk\", \"ArrTimeBlk\"]\n",
"trainCat = train\n",
"testCat = test\n",
"for catCol in catCols:\n",
" simodel = StringIndexer(inputCol=catCol, outputCol=catCol + \"Tmp\").fit(train)\n",
" trainCat = simodel.transform(trainCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n",
" testCat = simodel.transform(testCat).drop(catCol).withColumnRenamed(catCol + \"Tmp\", catCol)\n",
" trainCat = (\n",
" simodel.transform(trainCat)\n",
" .drop(catCol)\n",
" .withColumnRenamed(catCol + \"Tmp\", catCol)\n",
" )\n",
" testCat = (\n",
" simodel.transform(testCat)\n",
" .drop(catCol)\n",
" .withColumnRenamed(catCol + \"Tmp\", catCol)\n",
" )\n",
"lr = LinearRegression().setRegParam(0.1).setElasticNetParam(0.3)\n",
"model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)"
]
@ -140,6 +153,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"\n",
"metrics = ComputeModelStatistics().transform(scoredData)\n",
"metrics.toPandas()"
]
@ -159,8 +173,11 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputePerInstanceStatistics\n",
"\n",
"evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n",
"evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()"
"evalPerInstance.select(\"ArrDelay\", \"prediction\", \"L1_loss\", \"L2_loss\").limit(\n",
" 10\n",
").toPandas()"
]
}
],


@ -20,10 +20,12 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -62,9 +64,11 @@
"source": [
"boston = load_boston()\n",
"\n",
"feature_cols = ['f' + str(i) for i in range(boston.data.shape[1])]\n",
"header = ['target'] + feature_cols\n",
"df = spark.createDataFrame(pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)).repartition(1)\n",
"feature_cols = [\"f\" + str(i) for i in range(boston.data.shape[1])]\n",
"header = [\"target\"] + feature_cols\n",
"df = spark.createDataFrame(\n",
" pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n",
").repartition(1)\n",
"print(\"Dataframe has {} rows\".format(df.count()))\n",
"display(df.limit(10).toPandas())"
]
@ -110,7 +114,7 @@
"outputs": [],
"source": [
"features = train_data.columns[1:]\n",
"values = train_data.drop('target').toPandas()\n",
"values = train_data.drop(\"target\").toPandas()\n",
"ncols = 5\n",
"nrows = math.ceil(len(features) / ncols)"
]
@ -130,9 +134,9 @@
"metadata": {},
"outputs": [],
"source": [
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol='features')\n",
"lr_train_data = featurizer.transform(train_data)['target', 'features']\n",
"lr_test_data = featurizer.transform(test_data)['target', 'features']\n",
"featurizer = VectorAssembler(inputCols=feature_cols, outputCol=\"features\")\n",
"lr_train_data = featurizer.transform(train_data)[\"target\", \"features\"]\n",
"lr_test_data = featurizer.transform(test_data)[\"target\", \"features\"]\n",
"display(lr_train_data.limit(10).toPandas())"
]
},
@ -143,7 +147,7 @@
"outputs": [],
"source": [
"# By default, `maxIter` is 100. Other params you may want to change include: `regParam`, `elasticNetParam`, etc.\n",
"lr = LinearRegression(labelCol='target')\n",
"lr = LinearRegression(labelCol=\"target\")\n",
"\n",
"lr_model = lr.fit(lr_train_data)\n",
"lr_predictions = lr_model.transform(lr_test_data)\n",
@ -169,12 +173,11 @@
"outputs": [],
"source": [
"metrics = ComputeModelStatistics(\n",
" evaluationMetric='regression',\n",
" labelCol='target',\n",
" scoresCol='prediction').transform(lr_predictions)\n",
" evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n",
").transform(lr_predictions)\n",
"\n",
"results = metrics.toPandas()\n",
"results.insert(0, 'model', ['Spark MLlib - Linear Regression'])\n",
"results.insert(0, \"model\", [\"Spark MLlib - Linear Regression\"])\n",
"display(results)"
]
},
@ -198,12 +201,10 @@
"metadata": {},
"outputs": [],
"source": [
"vw_featurizer = VowpalWabbitFeaturizer(\n",
" inputCols=feature_cols,\n",
" outputCol='features')\n",
"vw_featurizer = VowpalWabbitFeaturizer(inputCols=feature_cols, outputCol=\"features\")\n",
"\n",
"vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n",
"vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n",
"vw_train_data = vw_featurizer.transform(train_data)[\"target\", \"features\"]\n",
"vw_test_data = vw_featurizer.transform(test_data)[\"target\", \"features\"]\n",
"display(vw_train_data.limit(10).toPandas())"
]
},
@ -222,10 +223,7 @@
"source": [
"# Use the same number of iterations as Spark MLlib's Linear Regression (=100)\n",
"args = \"--holdout_off --loss_function quantile -l 7 -q :: --power_t 0.3\"\n",
"vwr = VowpalWabbitRegressor(\n",
" labelCol='target',\n",
" passThroughArgs=args,\n",
" numPasses=100)\n",
"vwr = VowpalWabbitRegressor(labelCol=\"target\", passThroughArgs=args, numPasses=100)\n",
"\n",
"# To reduce number of partitions (which will effect performance), use `vw_train_data.repartition(1)`\n",
"vw_train_data_2 = vw_train_data.repartition(1).cache()\n",
@ -243,15 +241,12 @@
"outputs": [],
"source": [
"metrics = ComputeModelStatistics(\n",
" evaluationMetric='regression',\n",
" labelCol='target',\n",
" scoresCol='prediction').transform(vw_predictions)\n",
" evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n",
").transform(vw_predictions)\n",
"\n",
"vw_result = metrics.toPandas()\n",
"vw_result.insert(0, 'model', ['Vowpal Wabbit'])\n",
"results = results.append(\n",
" vw_result,\n",
" ignore_index=True)\n",
"vw_result.insert(0, \"model\", [\"Vowpal Wabbit\"])\n",
"results = results.append(vw_result, ignore_index=True)\n",
"\n",
"display(results)"
]
@ -270,12 +265,13 @@
"outputs": [],
"source": [
"lgr = LightGBMRegressor(\n",
" objective='quantile',\n",
" objective=\"quantile\",\n",
" alpha=0.2,\n",
" learningRate=0.3,\n",
" numLeaves=31,\n",
" labelCol='target',\n",
" numIterations=100)\n",
" labelCol=\"target\",\n",
" numIterations=100,\n",
")\n",
"\n",
"# Using one partition since the training dataset is very small\n",
"repartitioned_data = lr_train_data.repartition(1).cache()\n",
@ -293,16 +289,13 @@
"outputs": [],
"source": [
"metrics = ComputeModelStatistics(\n",
" evaluationMetric='regression',\n",
" labelCol='target',\n",
" scoresCol='prediction').transform(lg_predictions)\n",
" evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n",
").transform(lg_predictions)\n",
"\n",
"lg_result = metrics.toPandas()\n",
"lg_result.insert(0, 'model', ['LightGBM'])\n",
"lg_result.insert(0, \"model\", [\"LightGBM\"])\n",
"\n",
"results = results.append(\n",
" lg_result,\n",
" ignore_index=True)\n",
"results = results.append(lg_result, ignore_index=True)\n",
"\n",
"display(results)"
]
@ -327,14 +320,14 @@
" from matplotlib.cm import get_cmap\n",
" import matplotlib.pyplot as plt\n",
"\n",
" f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30,10))\n",
" f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30, 10))\n",
" f.tight_layout()\n",
" yy = [r['target'] for r in train_data.select('target').collect()]\n",
" yy = [r[\"target\"] for r in train_data.select(\"target\").collect()]\n",
" for irow in range(nrows):\n",
" axes[irow][0].set_ylabel('target')\n",
" axes[irow][0].set_ylabel(\"target\")\n",
" for icol in range(ncols):\n",
" try:\n",
" feat = features[irow*ncols + icol]\n",
" feat = features[irow * ncols + icol]\n",
" xx = values[feat]\n",
" axes[irow][icol].scatter(xx, yy, s=10, alpha=0.25)\n",
" axes[irow][icol].set_xlabel(feat)\n",
@ -342,28 +335,29 @@
" except IndexError:\n",
" f.delaxes(axes[irow][icol])\n",
"\n",
" cmap = get_cmap('YlOrRd')\n",
" cmap = get_cmap(\"YlOrRd\")\n",
"\n",
" target = np.array(test_data.select('target').collect()).flatten()\n",
" target = np.array(test_data.select(\"target\").collect()).flatten()\n",
" model_preds = [\n",
" (\"Spark MLlib Linear Regression\", lr_predictions),\n",
" (\"Vowpal Wabbit\", vw_predictions),\n",
" (\"LightGBM\", lg_predictions)]\n",
" (\"LightGBM\", lg_predictions),\n",
" ]\n",
"\n",
" f, axes = plt.subplots(1, len(model_preds), sharey=True, figsize=(18, 6))\n",
" f.tight_layout()\n",
"\n",
" for i, (model_name, preds) in enumerate(model_preds):\n",
" preds = np.array(preds.select('prediction').collect()).flatten()\n",
" preds = np.array(preds.select(\"prediction\").collect()).flatten()\n",
" err = np.absolute(preds - target)\n",
"\n",
" norm = Normalize()\n",
" clrs = cmap(np.asarray(norm(err)))[:, :-1]\n",
" axes[i].scatter(preds, target, s=60, c=clrs, edgecolors='#888888', alpha=0.75)\n",
" axes[i].plot((0, 60), (0, 60), linestyle='--', color='#888888')\n",
" axes[i].set_xlabel('Predicted values')\n",
" if i ==0:\n",
" axes[i].set_ylabel('Actual values')\n",
" axes[i].scatter(preds, target, s=60, c=clrs, edgecolors=\"#888888\", alpha=0.75)\n",
" axes[i].plot((0, 60), (0, 60), linestyle=\"--\", color=\"#888888\")\n",
" axes[i].set_xlabel(\"Predicted values\")\n",
" if i == 0:\n",
" axes[i].set_ylabel(\"Actual values\")\n",
" axes[i].set_title(model_name)"
]
},

View file

@ -62,8 +62,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -79,7 +80,9 @@
},
"outputs": [],
"source": [
"df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"df = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"display(df)"
]
},
@ -98,7 +101,9 @@
"source": [
"# Convert the \"income\" column from {<=50K, >50K} to {0, 1} to represent our binary classification label column\n",
"label_col = \"income\"\n",
"df = df.withColumn(label_col, F.when(F.col(label_col).contains(\"<=50K\"), F.lit(0)).otherwise(F.lit(1)))"
"df = df.withColumn(\n",
" label_col, F.when(F.col(label_col).contains(\"<=50K\"), F.lit(0)).otherwise(F.lit(1))\n",
")"
]
},
{
@ -240,7 +245,11 @@
"outputs": [],
"source": [
"# Drill down to feature == \"sex\"\n",
"display(feature_balance_measures.filter(F.col(\"FeatureName\") == \"sex\").sort(F.abs(\"FeatureBalanceMeasure.dp\").desc()))"
"display(\n",
" feature_balance_measures.filter(F.col(\"FeatureName\") == \"sex\").sort(\n",
" F.abs(\"FeatureBalanceMeasure.dp\").desc()\n",
" )\n",
")"
]
},
{
@ -257,7 +266,11 @@
"outputs": [],
"source": [
"# Drill down to feature == \"race\"\n",
"display(feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").sort(F.abs(\"FeatureBalanceMeasure.dp\").desc()))"
"display(\n",
" feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").sort(\n",
" F.abs(\"FeatureBalanceMeasure.dp\").desc()\n",
" )\n",
")"
]
},
{
@ -288,15 +301,19 @@
"outputs": [],
"source": [
"races = [row[\"race\"] for row in df.groupBy(\"race\").count().select(\"race\").collect()]\n",
"dp_rows = feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\").select(\"ClassA\", \"ClassB\", \"FeatureBalanceMeasure.dp\").collect()\n",
"dp_rows = (\n",
" feature_balance_measures.filter(F.col(\"FeatureName\") == \"race\")\n",
" .select(\"ClassA\", \"ClassB\", \"FeatureBalanceMeasure.dp\")\n",
" .collect()\n",
")\n",
"race_dp_values = [(row[\"ClassA\"], row[\"ClassB\"], row[\"dp\"]) for row in dp_rows]\n",
"\n",
"race_dp_array = np.zeros((len(races), len(races)))\n",
"for class_a, class_b, dp_value in race_dp_values:\n",
" i, j = races.index(class_a), races.index(class_b)\n",
" dp_value = round(dp_value, 2)\n",
" race_dp_array[i, j] = dp_value\n",
" race_dp_array[j, i] = -1 * dp_value\n",
" i, j = races.index(class_a), races.index(class_b)\n",
" dp_value = round(dp_value, 2)\n",
" race_dp_array[i, j] = dp_value\n",
" race_dp_array[j, i] = -1 * dp_value\n",
"\n",
"colormap = \"RdBu\"\n",
"dp_min, dp_max = -1.0, 1.0\n",
@ -315,9 +332,9 @@
"plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\", rotation_mode=\"anchor\")\n",
"\n",
"for i in range(len(races)):\n",
" for j in range(len(races)):\n",
" text = ax.text(j, i, race_dp_array[i, j], ha=\"center\", va=\"center\", color=\"k\")\n",
" \n",
" for j in range(len(races)):\n",
" text = ax.text(j, i, race_dp_array[i, j], ha=\"center\", va=\"center\", color=\"k\")\n",
"\n",
"ax.set_title(\"Demographic Parity of Races in Adult Dataset\")\n",
"fig.tight_layout()\n",
"plt.show()"
@ -426,13 +443,15 @@
"from synapse.ml.exploratory import DistributionBalanceMeasure\n",
"\n",
"distribution_balance_measures = (\n",
" DistributionBalanceMeasure()\n",
" .setSensitiveCols(cols_of_interest)\n",
" .transform(df)\n",
" DistributionBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n",
")\n",
"\n",
"# Sort by JS Distance descending\n",
"display(distribution_balance_measures.sort(F.abs(\"DistributionBalanceMeasure.js_dist\").desc()))"
"display(\n",
" distribution_balance_measures.sort(\n",
" F.abs(\"DistributionBalanceMeasure.js_dist\").desc()\n",
" )\n",
")"
]
},
{
@ -463,10 +482,20 @@
"outputs": [],
"source": [
"distribution_rows = distribution_balance_measures.collect()\n",
"race_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"race\"][0][\"DistributionBalanceMeasure\"]\n",
"sex_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"sex\"][0][\"DistributionBalanceMeasure\"]\n",
"race_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"race\"][0][\n",
" \"DistributionBalanceMeasure\"\n",
"]\n",
"sex_row = [row for row in distribution_rows if row[\"FeatureName\"] == \"sex\"][0][\n",
" \"DistributionBalanceMeasure\"\n",
"]\n",
"\n",
"measures_of_interest = [\"kl_divergence\", \"js_dist\", \"inf_norm_dist\", \"total_variation_dist\", \"wasserstein_dist\"]\n",
"measures_of_interest = [\n",
" \"kl_divergence\",\n",
" \"js_dist\",\n",
" \"inf_norm_dist\",\n",
" \"total_variation_dist\",\n",
" \"wasserstein_dist\",\n",
"]\n",
"race_measures = [round(race_row[measure], 4) for measure in measures_of_interest]\n",
"sex_measures = [round(sex_row[measure], 4) for measure in measures_of_interest]\n",
"\n",
@ -474,8 +503,8 @@
"width = 0.35\n",
"\n",
"fig, ax = plt.subplots()\n",
"rects1 = ax.bar(x - width/2, race_measures, width, label=\"Race\")\n",
"rects2 = ax.bar(x + width/2, sex_measures, width, label=\"Sex\")\n",
"rects1 = ax.bar(x - width / 2, race_measures, width, label=\"Race\")\n",
"rects2 = ax.bar(x + width / 2, sex_measures, width, label=\"Sex\")\n",
"\n",
"ax.set_xlabel(\"Measure\")\n",
"ax.set_ylabel(\"Value\")\n",
@ -486,14 +515,19 @@
"\n",
"plt.setp(ax.get_xticklabels(), rotation=20, ha=\"right\", rotation_mode=\"default\")\n",
"\n",
"\n",
"def autolabel(rects):\n",
" for rect in rects:\n",
" height = rect.get_height()\n",
" ax.annotate('{}'.format(height),\n",
" xy=(rect.get_x() + rect.get_width() / 2, height),\n",
" xytext=(0, 1), # 1 point vertical offset\n",
" textcoords=\"offset points\",\n",
" ha='center', va='bottom')\n",
" for rect in rects:\n",
" height = rect.get_height()\n",
" ax.annotate(\n",
" \"{}\".format(height),\n",
" xy=(rect.get_x() + rect.get_width() / 2, height),\n",
" xytext=(0, 1), # 1 point vertical offset\n",
" textcoords=\"offset points\",\n",
" ha=\"center\",\n",
" va=\"bottom\",\n",
" )\n",
"\n",
"\n",
"autolabel(rects1)\n",
"autolabel(rects2)\n",
@ -571,9 +605,7 @@
"from synapse.ml.exploratory import AggregateBalanceMeasure\n",
"\n",
"aggregate_balance_measures = (\n",
" AggregateBalanceMeasure()\n",
" .setSensitiveCols(cols_of_interest)\n",
" .transform(df)\n",
" AggregateBalanceMeasure().setSensitiveCols(cols_of_interest).transform(df)\n",
")\n",
"\n",
"display(aggregate_balance_measures)"

View file

@ -44,6 +44,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n",
"\n",
@ -80,9 +81,13 @@
},
"outputs": [],
"source": [
"df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\").cache()\n",
"df = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
").cache()\n",
"\n",
"labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n",
"labelIndexer = StringIndexer(\n",
" inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\"\n",
").fit(df)\n",
"print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n",
"\n",
"training = labelIndexer.transform(df)\n",
@ -99,11 +104,23 @@
"]\n",
"categorical_features_idx = [col + \"_idx\" for col in categorical_features]\n",
"categorical_features_enc = [col + \"_enc\" for col in categorical_features]\n",
"numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n",
"numeric_features = [\n",
" \"age\",\n",
" \"education-num\",\n",
" \"capital-gain\",\n",
" \"capital-loss\",\n",
" \"hours-per-week\",\n",
"]\n",
"\n",
"strIndexer = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_idx)\n",
"onehotEnc = OneHotEncoder(inputCols=categorical_features_idx, outputCols=categorical_features_enc)\n",
"vectAssem = VectorAssembler(inputCols=categorical_features_enc + numeric_features, outputCol=\"features\")\n",
"strIndexer = StringIndexer(\n",
" inputCols=categorical_features, outputCols=categorical_features_idx\n",
")\n",
"onehotEnc = OneHotEncoder(\n",
" inputCols=categorical_features_idx, outputCols=categorical_features_enc\n",
")\n",
"vectAssem = VectorAssembler(\n",
" inputCols=categorical_features_enc + numeric_features, outputCol=\"features\"\n",
")\n",
"lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n",
"pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n",
"model = pipeline.fit(training)"
@ -137,7 +154,9 @@
},
"outputs": [],
"source": [
"explain_instances = model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n",
"explain_instances = (\n",
" model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n",
")\n",
"display(explain_instances)"
]
},
@ -179,7 +198,7 @@
" backgroundData=broadcast(training.orderBy(rand()).limit(100).cache()),\n",
")\n",
"\n",
"shap_df = shap.transform(explain_instances)\n"
"shap_df = shap.transform(explain_instances)"
]
},
{
@ -214,7 +233,9 @@
"shaps = (\n",
" shap_df.withColumn(\"probability\", vec_access(col(\"probability\"), lit(1)))\n",
" .withColumn(\"shapValues\", vec2array(col(\"shapValues\").getItem(0)))\n",
" .select([\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features)\n",
" .select(\n",
" [\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features\n",
" )\n",
")\n",
"\n",
"shaps_local = shaps.toPandas()\n",
@ -259,9 +280,9 @@
"\n",
"rows = shaps_local.shape[0]\n",
"\n",
"local_importance_values = shaps_local[['shapValues']]\n",
"local_importance_values = shaps_local[[\"shapValues\"]]\n",
"eval_data = shaps_local[features]\n",
"true_y = np.array(shaps_local[['label']])"
"true_y = np.array(shaps_local[[\"label\"]])"
]
},
{
@ -323,8 +344,11 @@
"outputs": [],
"source": [
"from interpret_community.adapter import ExplanationAdapter\n",
"\n",
"adapter = ExplanationAdapter(features, classification=True)\n",
"global_explanation = adapter.create_global(converted_importance_values, eval_data, expected_values=bias)"
"global_explanation = adapter.create_global(\n",
" converted_importance_values, eval_data, expected_values=bias\n",
")"
]
},
{
@ -360,18 +384,30 @@
"outputs": [],
"source": [
"class wrapper(object):\n",
" def __init__(self, model):\n",
" self.model = model\n",
" \n",
" def predict(self, data):\n",
" sparkdata = spark.createDataFrame(data)\n",
" return model.transform(sparkdata).select('prediction').toPandas().values.flatten().tolist()\n",
" \n",
" def predict_proba(self, data):\n",
" sparkdata = spark.createDataFrame(data)\n",
" prediction = model.transform(sparkdata).select('probability').toPandas().values.flatten().tolist()\n",
" proba_list = [vector.values.tolist() for vector in prediction]\n",
" return proba_list"
" def __init__(self, model):\n",
" self.model = model\n",
"\n",
" def predict(self, data):\n",
" sparkdata = spark.createDataFrame(data)\n",
" return (\n",
" model.transform(sparkdata)\n",
" .select(\"prediction\")\n",
" .toPandas()\n",
" .values.flatten()\n",
" .tolist()\n",
" )\n",
"\n",
" def predict_proba(self, data):\n",
" sparkdata = spark.createDataFrame(data)\n",
" prediction = (\n",
" model.transform(sparkdata)\n",
" .select(\"probability\")\n",
" .toPandas()\n",
" .values.flatten()\n",
" .tolist()\n",
" )\n",
" proba_list = [vector.values.tolist() for vector in prediction]\n",
" return proba_list"
]
},
{
@ -384,7 +420,10 @@
"source": [
"# view the explanation in the ExplanationDashboard\n",
"from raiwidgets import ExplanationDashboard\n",
"ExplanationDashboard(global_explanation, wrapper(model), dataset=eval_data, true_y=true_y)"
"\n",
"ExplanationDashboard(\n",
" global_explanation, wrapper(model), dataset=eval_data, true_y=true_y\n",
")"
]
},
{

View file

@ -34,26 +34,34 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n",
"\n",
"vec_slice = udf(lambda vec, indices: (vec.toArray())[indices].tolist(), ArrayType(FloatType()))\n",
"arg_top_k = udf(lambda vec, k: (-vec.toArray()).argsort()[:k].tolist(), ArrayType(IntegerType()))\n",
"vec_slice = udf(\n",
" lambda vec, indices: (vec.toArray())[indices].tolist(), ArrayType(FloatType())\n",
")\n",
"arg_top_k = udf(\n",
" lambda vec, k: (-vec.toArray()).argsort()[:k].tolist(), ArrayType(IntegerType())\n",
")\n",
"\n",
"\n",
"def downloadBytes(url: str):\n",
" with urllib.request.urlopen(url) as url:\n",
" barr = url.read()\n",
" return barr\n",
" with urllib.request.urlopen(url) as url:\n",
" barr = url.read()\n",
" return barr\n",
"\n",
"\n",
"def rotate_color_channel(bgr_image_array, height, width, nChannels):\n",
" B, G, R, *_ = np.asarray(bgr_image_array).reshape(height, width, nChannels).T\n",
" rgb_image_array = np.array((R, G, B)).T\n",
" return rgb_image_array\n",
" \n",
" B, G, R, *_ = np.asarray(bgr_image_array).reshape(height, width, nChannels).T\n",
" rgb_image_array = np.array((R, G, B)).T\n",
" return rgb_image_array\n",
"\n",
"\n",
"def plot_superpixels(image_rgb_array, sp_clusters, weights, green_threshold=99):\n",
" superpixels = sp_clusters\n",
" green_value = np.percentile(weights, green_threshold)\n",
" img = Image.fromarray(image_rgb_array, mode='RGB').convert(\"RGBA\")\n",
" img = Image.fromarray(image_rgb_array, mode=\"RGB\").convert(\"RGBA\")\n",
" image_array = np.asarray(img).copy()\n",
" for (sp, v) in zip(superpixels, weights):\n",
" if v > green_value:\n",
@ -85,32 +93,42 @@
"source": [
"from synapse.ml.io import *\n",
"\n",
"image_df = spark.read.image().load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg\")\n",
"image_df = spark.read.image().load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/explainers/images/david-lusvardi-dWcUncxocQY-unsplash.jpg\"\n",
")\n",
"display(image_df)\n",
"\n",
"# Rotate the image array from BGR into RGB channels for visualization later.\n",
"row = image_df.select(\"image.height\", \"image.width\", \"image.nChannels\", \"image.data\").head()\n",
"row = image_df.select(\n",
" \"image.height\", \"image.width\", \"image.nChannels\", \"image.data\"\n",
").head()\n",
"locals().update(row.asDict())\n",
"rgb_image_array = rotate_color_channel(data, height, width, nChannels)\n",
"\n",
"# Download the ONNX model\n",
"modelPayload = downloadBytes(\"https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx\")\n",
"modelPayload = downloadBytes(\n",
" \"https://mmlspark.blob.core.windows.net/publicwasb/ONNXModels/resnet50-v2-7.onnx\"\n",
")\n",
"\n",
"featurizer = (\n",
" ImageTransformer(inputCol=\"image\", outputCol=\"features\")\n",
" .resize(224, True)\n",
" .centerCrop(224, 224)\n",
" .normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], color_scale_factor = 1/255)\n",
" .setTensorElementType(FloatType())\n",
" ImageTransformer(inputCol=\"image\", outputCol=\"features\")\n",
" .resize(224, True)\n",
" .centerCrop(224, 224)\n",
" .normalize(\n",
" mean=[0.485, 0.456, 0.406],\n",
" std=[0.229, 0.224, 0.225],\n",
" color_scale_factor=1 / 255,\n",
" )\n",
" .setTensorElementType(FloatType())\n",
")\n",
"\n",
"onnx = (\n",
" ONNXModel()\n",
" .setModelPayload(modelPayload)\n",
" .setFeedDict({\"data\": \"features\"})\n",
" .setFetchDict({\"rawPrediction\": \"resnetv24_dense0_fwd\"})\n",
" .setSoftMaxDict({\"rawPrediction\": \"probability\"})\n",
" .setMiniBatchSize(1)\n",
" ONNXModel()\n",
" .setModelPayload(modelPayload)\n",
" .setFeedDict({\"data\": \"features\"})\n",
" .setFetchDict({\"rawPrediction\": \"resnetv24_dense0_fwd\"})\n",
" .setSoftMaxDict({\"rawPrediction\": \"probability\"})\n",
" .setMiniBatchSize(1)\n",
")\n",
"\n",
"model = Pipeline(stages=[featurizer, onnx]).fit(image_df)"
@ -124,8 +142,8 @@
"source": [
"predicted = (\n",
" model.transform(image_df)\n",
" .withColumn(\"top2pred\", arg_top_k(col(\"probability\"), lit(2)))\n",
" .withColumn(\"top2prob\", vec_slice(col(\"probability\"), col(\"top2pred\")))\n",
" .withColumn(\"top2pred\", arg_top_k(col(\"probability\"), lit(2)))\n",
" .withColumn(\"top2prob\", vec_slice(col(\"probability\"), col(\"top2pred\")))\n",
")\n",
"\n",
"display(predicted.select(\"top2pred\", \"top2prob\"))"
@ -183,8 +201,18 @@
"cell_type": "code",
"execution_count": null,
"source": [
"plot_superpixels(rgb_image_array, lime_row[\"superpixels\"][\"clusters\"], list(lime_row[\"weights_violin\"]), 95)\n",
"plot_superpixels(rgb_image_array, lime_row[\"superpixels\"][\"clusters\"], list(lime_row[\"weights_piano\"]), 95)"
"plot_superpixels(\n",
" rgb_image_array,\n",
" lime_row[\"superpixels\"][\"clusters\"],\n",
" list(lime_row[\"weights_violin\"]),\n",
" 95,\n",
")\n",
"plot_superpixels(\n",
" rgb_image_array,\n",
" lime_row[\"superpixels\"][\"clusters\"],\n",
" list(lime_row[\"weights_piano\"]),\n",
" 95,\n",
")"
],
"outputs": [],
"metadata": {}
@ -250,8 +278,18 @@
"cell_type": "code",
"execution_count": null,
"source": [
"plot_superpixels(rgb_image_array, shap_row[\"superpixels\"][\"clusters\"], list(shap_row[\"shaps_violin\"][1:]), 95)\n",
"plot_superpixels(rgb_image_array, shap_row[\"superpixels\"][\"clusters\"], list(shap_row[\"shaps_piano\"][1:]), 95)"
"plot_superpixels(\n",
" rgb_image_array,\n",
" shap_row[\"superpixels\"][\"clusters\"],\n",
" list(shap_row[\"shaps_violin\"][1:]),\n",
" 95,\n",
")\n",
"plot_superpixels(\n",
" rgb_image_array,\n",
" shap_row[\"superpixels\"][\"clusters\"],\n",
" list(shap_row[\"shaps_piano\"][1:]),\n",
" 95,\n",
")"
],
"outputs": [],
"metadata": {}

View file

@ -84,8 +84,9 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n"
" from notebookutils.visualization import display"
]
},
{
@ -115,7 +116,9 @@
},
"outputs": [],
"source": [
"df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"df = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"display(df)"
]
},
@ -146,8 +149,23 @@
},
"outputs": [],
"source": [
"categorical_features = [\"race\", \"workclass\", \"marital-status\", \"education\", \"occupation\", \"relationship\", \"native-country\", \"sex\"]\n",
"numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]"
"categorical_features = [\n",
" \"race\",\n",
" \"workclass\",\n",
" \"marital-status\",\n",
" \"education\",\n",
" \"occupation\",\n",
" \"relationship\",\n",
" \"native-country\",\n",
" \"sex\",\n",
"]\n",
"numeric_features = [\n",
" \"age\",\n",
" \"education-num\",\n",
" \"capital-gain\",\n",
" \"capital-loss\",\n",
" \"hours-per-week\",\n",
"]"
]
},
{
@ -166,12 +184,24 @@
"string_indexer_outputs = [feature + \"_idx\" for feature in categorical_features]\n",
"one_hot_encoder_outputs = [feature + \"_enc\" for feature in categorical_features]\n",
"\n",
"pipeline = Pipeline(stages=[\n",
" StringIndexer().setInputCol(\"income\").setOutputCol(\"label\").setStringOrderType(\"alphabetAsc\"),\n",
" StringIndexer().setInputCols(categorical_features).setOutputCols(string_indexer_outputs),\n",
" OneHotEncoder().setInputCols(string_indexer_outputs).setOutputCols(one_hot_encoder_outputs),\n",
" VectorAssembler(inputCols=one_hot_encoder_outputs+numeric_features, outputCol=\"features\"),\n",
" GBTClassifier(weightCol=\"fnlwgt\", maxDepth=7, maxIter=100)])\n",
"pipeline = Pipeline(\n",
" stages=[\n",
" StringIndexer()\n",
" .setInputCol(\"income\")\n",
" .setOutputCol(\"label\")\n",
" .setStringOrderType(\"alphabetAsc\"),\n",
" StringIndexer()\n",
" .setInputCols(categorical_features)\n",
" .setOutputCols(string_indexer_outputs),\n",
" OneHotEncoder()\n",
" .setInputCols(string_indexer_outputs)\n",
" .setOutputCols(one_hot_encoder_outputs),\n",
" VectorAssembler(\n",
" inputCols=one_hot_encoder_outputs + numeric_features, outputCol=\"features\"\n",
" ),\n",
" GBTClassifier(weightCol=\"fnlwgt\", maxDepth=7, maxIter=100),\n",
" ]\n",
")\n",
"\n",
"model = pipeline.fit(df)"
]
@ -204,7 +234,7 @@
"outputs": [],
"source": [
"data = model.transform(df)\n",
"display(data.select('income', 'probability', 'prediction'))"
"display(data.select(\"income\", \"probability\", \"prediction\"))"
]
},
{
@ -220,7 +250,9 @@
},
"outputs": [],
"source": [
"eval_auc = BinaryClassificationEvaluator(labelCol=\"label\", rawPredictionCol=\"prediction\")\n",
"eval_auc = BinaryClassificationEvaluator(\n",
" labelCol=\"label\", rawPredictionCol=\"prediction\"\n",
")\n",
"eval_auc.evaluate(data)"
]
},
@ -300,8 +332,14 @@
},
"outputs": [],
"source": [
"pdp = ICETransformer(model=model, targetCol=\"probability\", kind=\"average\", targetClasses=[1],\n",
" categoricalFeatures=categorical_features, numericFeatures=numeric_features)"
"pdp = ICETransformer(\n",
" model=model,\n",
" targetCol=\"probability\",\n",
" kind=\"average\",\n",
" targetClasses=[1],\n",
" categoricalFeatures=categorical_features,\n",
" numericFeatures=numeric_features,\n",
")"
]
},
{
@ -364,49 +402,54 @@
"source": [
"# Helper functions for visualization\n",
"\n",
"\n",
"def get_pandas_df_from_column(df, col_name):\n",
" keys_df = df.select(F.explode(F.map_keys(F.col(col_name)))).distinct()\n",
" keys = list(map(lambda row: row[0], keys_df.collect()))\n",
" key_cols = list(map(lambda f: F.col(col_name).getItem(f).alias(str(f)), keys))\n",
" final_cols = key_cols\n",
" pandas_df = df.select(final_cols).toPandas()\n",
" return pandas_df\n",
" keys_df = df.select(F.explode(F.map_keys(F.col(col_name)))).distinct()\n",
" keys = list(map(lambda row: row[0], keys_df.collect()))\n",
" key_cols = list(map(lambda f: F.col(col_name).getItem(f).alias(str(f)), keys))\n",
" final_cols = key_cols\n",
" pandas_df = df.select(final_cols).toPandas()\n",
" return pandas_df\n",
"\n",
"\n",
"def plot_dependence_for_categorical(df, col, col_int=True, figsize=(20, 5)):\n",
" dict_values = {}\n",
" col_names = list(df.columns)\n",
" dict_values = {}\n",
" col_names = list(df.columns)\n",
"\n",
" for col_name in col_names:\n",
" dict_values[col_name] = df[col_name][0].toArray()[0]\n",
" marklist= sorted(dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n",
" sortdict=dict(marklist)\n",
" for col_name in col_names:\n",
" dict_values[col_name] = df[col_name][0].toArray()[0]\n",
" marklist = sorted(\n",
" dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n",
" )\n",
" sortdict = dict(marklist)\n",
"\n",
" fig = plt.figure(figsize=figsize)\n",
" plt.bar(sortdict.keys(), sortdict.values())\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.show()\n",
"\n",
" fig = plt.figure(figsize = figsize)\n",
" plt.bar(sortdict.keys(), sortdict.values())\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.show()\n",
" \n",
"def plot_dependence_for_numeric(df, col, col_int=True, figsize=(20, 5)):\n",
" dict_values = {}\n",
" col_names = list(df.columns)\n",
" dict_values = {}\n",
" col_names = list(df.columns)\n",
"\n",
" for col_name in col_names:\n",
" dict_values[col_name] = df[col_name][0].toArray()[0]\n",
" marklist= sorted(dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n",
" sortdict=dict(marklist)\n",
" for col_name in col_names:\n",
" dict_values[col_name] = df[col_name][0].toArray()[0]\n",
" marklist = sorted(\n",
" dict_values.items(), key=lambda x: int(x[0]) if col_int else x[0]\n",
" )\n",
" sortdict = dict(marklist)\n",
"\n",
" fig = plt.figure(figsize = figsize)\n",
" fig = plt.figure(figsize=figsize)\n",
"\n",
" \n",
" plt.plot(list(sortdict.keys()), list(sortdict.values()))\n",
" plt.plot(list(sortdict.keys()), list(sortdict.values()))\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
" plt.show()\n",
" "
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
" plt.show()"
]
},
{
@ -438,8 +481,8 @@
},
"outputs": [],
"source": [
"df_education_num = get_pandas_df_from_column(output_pdp, 'age_dependence')\n",
"plot_dependence_for_numeric(df_education_num, 'age')"
"df_education_num = get_pandas_df_from_column(output_pdp, \"age_dependence\")\n",
"plot_dependence_for_numeric(df_education_num, \"age\")"
]
},
{
@ -487,8 +530,8 @@
},
"outputs": [],
"source": [
"df_occupation = get_pandas_df_from_column(output_pdp, 'marital-status_dependence')\n",
"plot_dependence_for_categorical(df_occupation, 'marital-status', False, figsize=(30, 5))"
"df_occupation = get_pandas_df_from_column(output_pdp, \"marital-status_dependence\")\n",
"plot_dependence_for_categorical(df_occupation, \"marital-status\", False, figsize=(30, 5))"
]
},
{
@ -537,8 +580,8 @@
},
"outputs": [],
"source": [
"df_education_num = get_pandas_df_from_column(output_pdp, 'capital-gain_dependence')\n",
"plot_dependence_for_numeric(df_education_num, 'capital-gain_dependence')"
"df_education_num = get_pandas_df_from_column(output_pdp, \"capital-gain_dependence\")\n",
"plot_dependence_for_numeric(df_education_num, \"capital-gain_dependence\")"
]
},
{
@ -570,12 +613,21 @@
},
"outputs": [],
"source": [
"pdp_cap_gain = ICETransformer(model=model, targetCol=\"probability\", kind=\"average\", targetClasses=[1], \n",
" numericFeatures=[{\"name\": \"capital-gain\", \"numSplits\": 20, \"rangeMin\": 0.0,\n",
" \"rangeMax\": 10000.0}], numSamples=50)\n",
"pdp_cap_gain = ICETransformer(\n",
" model=model,\n",
" targetCol=\"probability\",\n",
" kind=\"average\",\n",
" targetClasses=[1],\n",
" numericFeatures=[\n",
" {\"name\": \"capital-gain\", \"numSplits\": 20, \"rangeMin\": 0.0, \"rangeMax\": 10000.0}\n",
" ],\n",
" numSamples=50,\n",
")\n",
"output_pdp_cap_gain = pdp_cap_gain.transform(df)\n",
"df_education_num_gain = get_pandas_df_from_column(output_pdp_cap_gain, 'capital-gain_dependence')\n",
"plot_dependence_for_numeric(df_education_num_gain, 'capital-gain_dependence')"
"df_education_num_gain = get_pandas_df_from_column(\n",
" output_pdp_cap_gain, \"capital-gain_dependence\"\n",
")\n",
"plot_dependence_for_numeric(df_education_num_gain, \"capital-gain_dependence\")"
]
},
{
@ -670,8 +722,14 @@
},
"outputs": [],
"source": [
"ice = ICETransformer(model=model, targetCol=\"probability\", targetClasses=[1], \n",
" categoricalFeatures=categorical_features, numericFeatures=numeric_features, numSamples=50)\n",
"ice = ICETransformer(\n",
" model=model,\n",
" targetCol=\"probability\",\n",
" targetClasses=[1],\n",
" categoricalFeatures=categorical_features,\n",
" numericFeatures=numeric_features,\n",
" numSamples=50,\n",
")\n",
"\n",
"output = ice.transform(df)"
]
@ -708,89 +766,89 @@
"\n",
"from collections import defaultdict\n",
"\n",
"\n",
"def plot_ice_numeric(df, col, col_int=True, figsize=(20, 10)):\n",
" dict_values = defaultdict(list)\n",
" col_names = list(df.columns)\n",
" num_instances = df.shape[0]\n",
" \n",
" instances_y = {}\n",
" i = 0\n",
" dict_values = defaultdict(list)\n",
" col_names = list(df.columns)\n",
" num_instances = df.shape[0]\n",
"\n",
" for col_name in col_names:\n",
" instances_y = {}\n",
" i = 0\n",
"\n",
" for col_name in col_names:\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df[col_name][i].toArray()[0])\n",
"\n",
" fig = plt.figure(figsize=figsize)\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df[col_name][i].toArray()[0])\n",
" \n",
" fig = plt.figure(figsize = figsize)\n",
" for i in range(num_instances):\n",
" plt.plot(col_names, dict_values[i], \"k\")\n",
" \n",
" \n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
" \n",
" \n",
" \n",
" plt.plot(col_names, dict_values[i], \"k\")\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
"\n",
"\n",
"def plot_ice_categorical(df, col, col_int=True, figsize=(20, 10)):\n",
" dict_values = defaultdict(list)\n",
" col_names = list(df.columns)\n",
" num_instances = df.shape[0]\n",
" \n",
" angles = [n / float(df.shape[1]) * 2 * pi for n in range(df.shape[1])]\n",
" angles += angles [:1]\n",
" \n",
" instances_y = {}\n",
" i = 0\n",
" dict_values = defaultdict(list)\n",
" col_names = list(df.columns)\n",
" num_instances = df.shape[0]\n",
"\n",
" angles = [n / float(df.shape[1]) * 2 * pi for n in range(df.shape[1])]\n",
" angles += angles[:1]\n",
"\n",
" instances_y = {}\n",
" i = 0\n",
"\n",
" for col_name in col_names:\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df[col_name][i].toArray()[0])\n",
"\n",
" fig = plt.figure(figsize=figsize)\n",
" ax = plt.subplot(111, polar=True)\n",
" plt.xticks(angles[:-1], col_names)\n",
"\n",
" for col_name in col_names:\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df[col_name][i].toArray()[0])\n",
" \n",
" fig = plt.figure(figsize = figsize)\n",
" ax = plt.subplot(111, polar=True)\n",
" plt.xticks(angles[:-1], col_names)\n",
" \n",
" for i in range(num_instances):\n",
" values = dict_values[i]\n",
" values += values[:1]\n",
" ax.plot(angles, values, \"k\")\n",
" ax.fill(angles, values, 'teal', alpha=0.1)\n",
" values = dict_values[i]\n",
" values += values[:1]\n",
" ax.plot(angles, values, \"k\")\n",
" ax.fill(angles, values, \"teal\", alpha=0.1)\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.show()\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.show()\n",
"\n",
"def overlay_ice_with_pdp(df_ice, df_pdp, col, col_int=True, figsize=(20, 5)):\n",
" dict_values = defaultdict(list)\n",
" col_names_ice = list(df_ice.columns)\n",
" num_instances = df_ice.shape[0]\n",
" \n",
" instances_y = {}\n",
" i = 0\n",
" dict_values = defaultdict(list)\n",
" col_names_ice = list(df_ice.columns)\n",
" num_instances = df_ice.shape[0]\n",
"\n",
" for col_name in col_names_ice:\n",
" instances_y = {}\n",
" i = 0\n",
"\n",
" for col_name in col_names_ice:\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df_ice[col_name][i].toArray()[0])\n",
"\n",
" fig = plt.figure(figsize=figsize)\n",
" for i in range(num_instances):\n",
" dict_values[i].append(df_ice[col_name][i].toArray()[0])\n",
" \n",
" fig = plt.figure(figsize = figsize)\n",
" for i in range(num_instances):\n",
" plt.plot(col_names_ice, dict_values[i], \"k\")\n",
" \n",
" dict_values_pdp = {}\n",
" col_names = list(df_pdp.columns)\n",
" plt.plot(col_names_ice, dict_values[i], \"k\")\n",
"\n",
" for col_name in col_names:\n",
" dict_values_pdp[col_name] = df_pdp[col_name][0].toArray()[0]\n",
" marklist= sorted(dict_values_pdp.items(), key=lambda x: int(x[0]) if col_int else x[0]) \n",
" sortdict=dict(marklist)\n",
" \n",
" plt.plot(col_names_ice, list(sortdict.values()), \"r\", linewidth=5)\n",
" \n",
" \n",
" \n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
" plt.show()\n"
" dict_values_pdp = {}\n",
" col_names = list(df_pdp.columns)\n",
"\n",
" for col_name in col_names:\n",
" dict_values_pdp[col_name] = df_pdp[col_name][0].toArray()[0]\n",
" marklist = sorted(\n",
" dict_values_pdp.items(), key=lambda x: int(x[0]) if col_int else x[0]\n",
" )\n",
" sortdict = dict(marklist)\n",
"\n",
" plt.plot(col_names_ice, list(sortdict.values()), \"r\", linewidth=5)\n",
"\n",
" plt.xlabel(col, size=13)\n",
" plt.ylabel(\"Dependence\")\n",
" plt.ylim(0.0)\n",
" plt.show()"
]
},
{
@ -824,10 +882,10 @@
},
"outputs": [],
"source": [
"age_df_ice = get_pandas_df_from_column(output, 'age_dependence')\n",
"age_df_pdp = get_pandas_df_from_column(output_pdp, 'age_dependence')\n",
"age_df_ice = get_pandas_df_from_column(output, \"age_dependence\")\n",
"age_df_pdp = get_pandas_df_from_column(output_pdp, \"age_dependence\")\n",
"\n",
"overlay_ice_with_pdp(age_df_ice, age_df_pdp, col='age_dependence', figsize=(30, 10))"
"overlay_ice_with_pdp(age_df_ice, age_df_pdp, col=\"age_dependence\", figsize=(30, 10))"
]
},
{
@ -891,9 +949,9 @@
},
"outputs": [],
"source": [
"occupation_dep = get_pandas_df_from_column(output, 'occupation_dependence')\n",
"occupation_dep = get_pandas_df_from_column(output, \"occupation_dependence\")\n",
"\n",
"plot_ice_categorical(occupation_dep, 'occupation_dependence', figsize=(30, 10))"
"plot_ice_categorical(occupation_dep, \"occupation_dependence\", figsize=(30, 10))"
]
},
{
@ -991,8 +1049,14 @@
},
"outputs": [],
"source": [
"pdp_based_imp = ICETransformer(model=model, targetCol=\"probability\", kind=\"feature\", targetClasses=[1],\n",
" categoricalFeatures=categorical_features, numericFeatures=numeric_features)\n",
"pdp_based_imp = ICETransformer(\n",
" model=model,\n",
" targetCol=\"probability\",\n",
" kind=\"feature\",\n",
" targetClasses=[1],\n",
" categoricalFeatures=categorical_features,\n",
" numericFeatures=numeric_features,\n",
")\n",
"\n",
"output_pdp_based_imp = pdp_based_imp.transform(df)\n",
"display(output_pdp_based_imp)"
@ -1027,19 +1091,20 @@
"source": [
"# Helper functions for visualization\n",
"\n",
"\n",
"def plot_pdp_based_imp(df, figsize=(35, 5)):\n",
" values_list = list(df.select('pdpBasedDependence').toPandas()['pdpBasedDependence'])\n",
" names = list(df.select('featureNames').toPandas()['featureNames'])\n",
" dependence_values = []\n",
" for vec in values_list:\n",
" dependence_values.append(vec.toArray()[0])\n",
" values_list = list(df.select(\"pdpBasedDependence\").toPandas()[\"pdpBasedDependence\"])\n",
" names = list(df.select(\"featureNames\").toPandas()[\"featureNames\"])\n",
" dependence_values = []\n",
" for vec in values_list:\n",
" dependence_values.append(vec.toArray()[0])\n",
"\n",
" fig = plt.figure(figsize = figsize)\n",
" plt.bar(names, dependence_values)\n",
" fig = plt.figure(figsize=figsize)\n",
" plt.bar(names, dependence_values)\n",
"\n",
" plt.xlabel(\"Feature names\", size=13)\n",
" plt.ylabel(\"PDP-based-feature-imporance\")\n",
" plt.show()"
" plt.xlabel(\"Feature names\", size=13)\n",
" plt.ylabel(\"PDP-based-feature-imporance\")\n",
" plt.show()"
]
},
{

View file

@ -14,12 +14,17 @@
"execution_count": null,
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n",
" from notebookutils.mssparkutils.credentials import getSecret\n",
" os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\"mmlspark-build-keys\", \"bing-search-key\")\n",
"\n",
" os.environ[\"BING_IMAGE_SEARCH_KEY\"] = getSecret(\n",
" \"mmlspark-build-keys\", \"bing-search-key\"\n",
" )\n",
"\n",
"# WARNING this notebook requires alot of memory.\n",
"# If you get a heap space error, try dropping the number of images bing returns\n",
@ -41,22 +46,26 @@
"from synapse.ml.core.spark import FluentAPI\n",
"from pyspark.sql.functions import lit\n",
"\n",
"\n",
"def bingPhotoSearch(name, queries, pages):\n",
" offsets = [offset*10 for offset in range(0, pages)] \n",
" parameters = [(query, offset) for offset in offsets for query in queries]\n",
" \n",
" return spark.createDataFrame(parameters, (\"queries\",\"offsets\")) \\\n",
" .mlTransform(\n",
" BingImageSearch() # Apply Bing Image Search\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n",
" .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n",
" .setQueryCol(\"queries\") # Specify a column containing the query words\n",
" .setCount(10) # Specify the number of images to return per offset\n",
" .setImageType(\"photo\") # Specify a filter to ensure we get photos\n",
" .setOutputCol(\"images\")) \\\n",
" .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\")) \\\n",
" .withColumn(\"labels\", lit(name)) \\\n",
" .limit(400)\n"
" offsets = [offset * 10 for offset in range(0, pages)]\n",
" parameters = [(query, offset) for offset in offsets for query in queries]\n",
"\n",
" return (\n",
" spark.createDataFrame(parameters, (\"queries\", \"offsets\"))\n",
" .mlTransform(\n",
" BingImageSearch() # Apply Bing Image Search\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY) # Set the API Key\n",
" .setOffsetCol(\"offsets\") # Specify a column containing the offsets\n",
" .setQueryCol(\"queries\") # Specify a column containing the query words\n",
" .setCount(10) # Specify the number of images to return per offset\n",
" .setImageType(\"photo\") # Specify a filter to ensure we get photos\n",
" .setOutputCol(\"images\")\n",
" )\n",
" .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n",
" .withColumn(\"labels\", lit(name))\n",
" .limit(400)\n",
" )"
],
"outputs": [],
"metadata": {
@ -74,12 +83,12 @@
"cell_type": "code",
"execution_count": null,
"source": [
"def displayDF(df, n=5, image_cols = set([\"urls\"])):\n",
" rows = df.take(n)\n",
" cols = df.columns\n",
" header = \"\".join([\"<th>\" + c + \"</th>\" for c in cols])\n",
" \n",
" style = \"\"\"\n",
"def displayDF(df, n=5, image_cols=set([\"urls\"])):\n",
" rows = df.take(n)\n",
" cols = df.columns\n",
" header = \"\".join([\"<th>\" + c + \"</th>\" for c in cols])\n",
"\n",
" style = \"\"\"\n",
"<!DOCTYPE html>\n",
"<html>\n",
"<head>\n",
@ -101,20 +110,20 @@
"}\n",
"</style>\n",
"</head>\"\"\"\n",
" \n",
" table = []\n",
" for row in rows:\n",
" table.append(\"<tr>\")\n",
" for col in cols:\n",
" if col in image_cols:\n",
" rep = '<img src=\"{}\", width=\"100\">'.format(row[col])\n",
" else:\n",
" rep = row[col]\n",
" table.append(\"<td>{}</td>\".format(rep))\n",
" table.append(\"</tr>\")\n",
" tableHTML = \"\".join(table)\n",
" \n",
" body = \"\"\"\n",
"\n",
" table = []\n",
" for row in rows:\n",
" table.append(\"<tr>\")\n",
" for col in cols:\n",
" if col in image_cols:\n",
" rep = '<img src=\"{}\", width=\"100\">'.format(row[col])\n",
" else:\n",
" rep = row[col]\n",
" table.append(\"<td>{}</td>\".format(rep))\n",
" table.append(\"</tr>\")\n",
" tableHTML = \"\".join(table)\n",
"\n",
" body = \"\"\"\n",
"<body>\n",
"<table>\n",
" <tr>\n",
@ -124,11 +133,13 @@
"</table>\n",
"</body>\n",
"</html>\n",
" \"\"\".format(header, tableHTML)\n",
" try:\n",
" displayHTML(style + body)\n",
" except:\n",
" pass"
" \"\"\".format(\n",
" header, tableHTML\n",
" )\n",
" try:\n",
" displayHTML(style + body)\n",
" except:\n",
" pass"
],
"outputs": [],
"metadata": {
@ -152,7 +163,9 @@
"cell_type": "code",
"execution_count": null,
"source": [
"randomWords = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\").cache()\n",
"randomWords = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet\"\n",
").cache()\n",
"randomWords.show()"
],
"outputs": [],
@ -162,16 +175,19 @@
"cell_type": "code",
"execution_count": null,
"source": [
"randomLinks = randomWords \\\n",
" .mlTransform(BingImageSearch()\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n",
" .setCount(10)\n",
" .setQueryCol(\"words\")\n",
" .setOutputCol(\"images\")) \\\n",
" .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\")) \\\n",
" .withColumn(\"label\", lit(\"other\")) \\\n",
" .limit(400)\n",
" \n",
"randomLinks = (\n",
" randomWords.mlTransform(\n",
" BingImageSearch()\n",
" .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)\n",
" .setCount(10)\n",
" .setQueryCol(\"words\")\n",
" .setOutputCol(\"images\")\n",
" )\n",
" .mlTransform(BingImageSearch.getUrlTransformer(\"images\", \"urls\"))\n",
" .withColumn(\"label\", lit(\"other\"))\n",
" .limit(400)\n",
")\n",
"\n",
"displayDF(randomLinks)"
],
"outputs": [],
@ -183,11 +199,17 @@
"cell_type": "code",
"execution_count": null,
"source": [
"images = snowLeopardUrls.union(randomLinks).distinct().repartition(100)\\\n",
" .mlTransform(BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000))\\\n",
" .dropna()\n",
"images = (\n",
" snowLeopardUrls.union(randomLinks)\n",
" .distinct()\n",
" .repartition(100)\n",
" .mlTransform(\n",
" BingImageSearch.downloadFromUrls(\"urls\", \"image\", concurrency=5, timeout=5000)\n",
" )\n",
" .dropna()\n",
")\n",
"\n",
"train, test = images.randomSplit([.7,.3], seed=1)"
"train, test = images.randomSplit([0.7, 0.3], seed=1)"
],
"outputs": [],
"metadata": {}
@ -205,23 +227,31 @@
"from synapse.ml.stages import UDFTransformer\n",
"from pyspark.sql.types import *\n",
"\n",
"\n",
"def getIndex(row):\n",
" return float(row[1])\n",
" return float(row[1])\n",
"\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" network = ModelDownloader(spark, \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\").downloadByName(\"ResNet50\")\n",
" network = ModelDownloader(\n",
" spark, \"abfss://synapse@mmlsparkeuap.dfs.core.windows.net/models/\"\n",
" ).downloadByName(\"ResNet50\")\n",
"else:\n",
" network = ModelDownloader(spark, \"dbfs:/Models/\").downloadByName(\"ResNet50\")\n",
" network = ModelDownloader(spark, \"dbfs:/Models/\").downloadByName(\"ResNet50\")\n",
"\n",
"model = Pipeline(stages=[\n",
" StringIndexer(inputCol = \"labels\", outputCol=\"index\"),\n",
" ImageFeaturizer(inputCol=\"image\", outputCol=\"features\", cutOutputLayers=1).setModel(network),\n",
" LogisticRegression(maxIter=5, labelCol=\"index\", regParam=10.0),\n",
" UDFTransformer()\\\n",
" .setUDF(udf(getIndex, DoubleType()))\\\n",
" .setInputCol(\"probability\")\\\n",
" .setOutputCol(\"leopard_prob\")\n",
"])\n",
"model = Pipeline(\n",
" stages=[\n",
" StringIndexer(inputCol=\"labels\", outputCol=\"index\"),\n",
" ImageFeaturizer(\n",
" inputCol=\"image\", outputCol=\"features\", cutOutputLayers=1\n",
" ).setModel(network),\n",
" LogisticRegression(maxIter=5, labelCol=\"index\", regParam=10.0),\n",
" UDFTransformer()\n",
" .setUDF(udf(getIndex, DoubleType()))\n",
" .setInputCol(\"probability\")\n",
" .setOutputCol(\"leopard_prob\"),\n",
" ]\n",
")\n",
"\n",
"fitModel = model.fit(train)"
],
@ -240,14 +270,18 @@
"execution_count": null,
"source": [
"def plotConfusionMatrix(df, label, prediction, classLabels):\n",
" from synapse.ml.plot import confusionMatrix\n",
" import matplotlib.pyplot as plt\n",
" fig = plt.figure(figsize=(4.5, 4.5))\n",
" confusionMatrix(df, label, prediction, classLabels)\n",
" display(fig)\n",
" from synapse.ml.plot import confusionMatrix\n",
" import matplotlib.pyplot as plt\n",
"\n",
" fig = plt.figure(figsize=(4.5, 4.5))\n",
" confusionMatrix(df, label, prediction, classLabels)\n",
" display(fig)\n",
"\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n",
" plotConfusionMatrix(fitModel.transform(test), \"index\", \"prediction\", fitModel.stages[0].labels)"
" plotConfusionMatrix(\n",
" fitModel.transform(test), \"index\", \"prediction\", fitModel.stages[0].labels\n",
" )"
],
"outputs": [],
"metadata": {
@ -261,19 +295,23 @@
"import urllib.request\n",
"from synapse.ml.lime import ImageLIME\n",
"\n",
"test_image_url = \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n",
"test_image_url = (\n",
" \"https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/snow_leopard1.jpg\"\n",
")\n",
"with urllib.request.urlopen(test_image_url) as url:\n",
" barr = url.read()\n",
"test_subsample = spark.createDataFrame([(bytearray(barr),)], [\"image\"])\n",
"\n",
"lime = ImageLIME()\\\n",
" .setModel(fitModel)\\\n",
" .setPredictionCol(\"leopard_prob\")\\\n",
" .setOutputCol(\"weights\")\\\n",
" .setInputCol(\"image\")\\\n",
" .setCellSize(100.0)\\\n",
" .setModifier(50.0)\\\n",
" .setNSamples(300)\n",
"lime = (\n",
" ImageLIME()\n",
" .setModel(fitModel)\n",
" .setPredictionCol(\"leopard_prob\")\n",
" .setOutputCol(\"weights\")\n",
" .setInputCol(\"image\")\n",
" .setCellSize(100.0)\n",
" .setModifier(50.0)\n",
" .setNSamples(300)\n",
")\n",
"\n",
"result = lime.transform(test_subsample)"
],
@ -289,12 +327,13 @@
"import matplotlib.pyplot as plt\n",
"import PIL, io, numpy as np\n",
"\n",
"\n",
"def plot_superpixels(row):\n",
" image_bytes = row['image']\n",
" superpixels = row['superpixels']['clusters']\n",
" weights = list(row['weights'])\n",
" mean_weight = np.percentile(weights,90)\n",
" img = (PIL.Image.open(io.BytesIO(image_bytes))).convert('RGBA')\n",
" image_bytes = row[\"image\"]\n",
" superpixels = row[\"superpixels\"][\"clusters\"]\n",
" weights = list(row[\"weights\"])\n",
" mean_weight = np.percentile(weights, 90)\n",
" img = (PIL.Image.open(io.BytesIO(image_bytes))).convert(\"RGBA\")\n",
" image_array = np.asarray(img).copy()\n",
" for (sp, w) in zip(superpixels, weights):\n",
" if w > mean_weight:\n",
@ -305,6 +344,7 @@
" plt.imshow(image_array)\n",
" display()\n",
"\n",
"\n",
"# Gets first row from the LIME-transformed data frame\n",
"if os.environ.get(\"AZURE_SERVICE\", None) != \"Microsoft.ProjectArcadia\":\n",
" plot_superpixels(result.take(1)[0])"

View file

@ -43,6 +43,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display\n",
"\n",
@ -78,9 +79,13 @@
},
"outputs": [],
"source": [
"df = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"df = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"\n",
"labelIndexer = StringIndexer(inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\").fit(df)\n",
"labelIndexer = StringIndexer(\n",
" inputCol=\"income\", outputCol=\"label\", stringOrderType=\"alphabetAsc\"\n",
").fit(df)\n",
"print(\"Label index assigment: \" + str(set(zip(labelIndexer.labels, [0, 1]))))\n",
"\n",
"training = labelIndexer.transform(df).cache()\n",
@ -97,11 +102,23 @@
"]\n",
"categorical_features_idx = [col + \"_idx\" for col in categorical_features]\n",
"categorical_features_enc = [col + \"_enc\" for col in categorical_features]\n",
"numeric_features = [\"age\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\"]\n",
"numeric_features = [\n",
" \"age\",\n",
" \"education-num\",\n",
" \"capital-gain\",\n",
" \"capital-loss\",\n",
" \"hours-per-week\",\n",
"]\n",
"\n",
"strIndexer = StringIndexer(inputCols=categorical_features, outputCols=categorical_features_idx)\n",
"onehotEnc = OneHotEncoder(inputCols=categorical_features_idx, outputCols=categorical_features_enc)\n",
"vectAssem = VectorAssembler(inputCols=categorical_features_enc + numeric_features, outputCol=\"features\")\n",
"strIndexer = StringIndexer(\n",
" inputCols=categorical_features, outputCols=categorical_features_idx\n",
")\n",
"onehotEnc = OneHotEncoder(\n",
" inputCols=categorical_features_idx, outputCols=categorical_features_enc\n",
")\n",
"vectAssem = VectorAssembler(\n",
" inputCols=categorical_features_enc + numeric_features, outputCol=\"features\"\n",
")\n",
"lr = LogisticRegression(featuresCol=\"features\", labelCol=\"label\", weightCol=\"fnlwgt\")\n",
"pipeline = Pipeline(stages=[strIndexer, onehotEnc, vectAssem, lr])\n",
"model = pipeline.fit(training)"
@ -134,7 +151,9 @@
},
"outputs": [],
"source": [
"explain_instances = model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n",
"explain_instances = (\n",
" model.transform(training).orderBy(rand()).limit(5).repartition(200).cache()\n",
")\n",
"display(explain_instances)"
]
},
@ -175,7 +194,7 @@
" backgroundData=broadcast(training.orderBy(rand()).limit(100).cache()),\n",
")\n",
"\n",
"shap_df = shap.transform(explain_instances)\n"
"shap_df = shap.transform(explain_instances)"
]
},
{
@ -209,7 +228,9 @@
"shaps = (\n",
" shap_df.withColumn(\"probability\", vec_access(col(\"probability\"), lit(1)))\n",
" .withColumn(\"shapValues\", vec2array(col(\"shapValues\").getItem(0)))\n",
" .select([\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features)\n",
" .select(\n",
" [\"shapValues\", \"probability\", \"label\"] + categorical_features + numeric_features\n",
" )\n",
")\n",
"\n",
"shaps_local = shaps.toPandas()\n",
@ -257,7 +278,10 @@
"fig = make_subplots(\n",
" rows=rows,\n",
" cols=1,\n",
" subplot_titles=\"Probability: \" + shaps_local[\"probability\"].apply(\"{:.2%}\".format) + \"; Label: \" + shaps_local[\"label\"].astype(str),\n",
" subplot_titles=\"Probability: \"\n",
" + shaps_local[\"probability\"].apply(\"{:.2%}\".format)\n",
" + \"; Label: \"\n",
" + shaps_local[\"label\"].astype(str),\n",
")\n",
"\n",
"for index, row in shaps_local.iterrows():\n",
@ -266,7 +290,11 @@
" list_of_tuples = list(zip(features_with_base, feature_values, shap_values))\n",
" shap_pdf = pd.DataFrame(list_of_tuples, columns=[\"name\", \"value\", \"shap\"])\n",
" fig.add_trace(\n",
" go.Bar(x=shap_pdf[\"name\"], y=shap_pdf[\"shap\"], hovertext=\"value: \" + shap_pdf[\"value\"].astype(str)),\n",
" go.Bar(\n",
" x=shap_pdf[\"name\"],\n",
" y=shap_pdf[\"shap\"],\n",
" hovertext=\"value: \" + shap_pdf[\"value\"].astype(str),\n",
" ),\n",
" row=index + 1,\n",
" col=1,\n",
" )\n",
@ -274,7 +302,7 @@
"fig.update_yaxes(range=[-1, 1], fixedrange=True, zerolinecolor=\"black\")\n",
"fig.update_xaxes(type=\"category\", tickangle=45, fixedrange=True)\n",
"fig.update_layout(height=400 * rows, title_text=\"SHAP explanations\")\n",
"fig.show()\n"
"fig.show()"
]
},
{

View file

@ -16,8 +16,10 @@
"outputs": [],
"source": [
"import os\n",
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()"
]
},
@ -31,7 +33,7 @@
"source": [
"import sys\n",
"import numpy as np\n",
"import pandas as pd\n"
"import pandas as pd"
]
},
{
@ -47,7 +49,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"train.limit(10).toPandas()"
@ -75,7 +79,10 @@
"source": [
"from synapse.ml.train import TrainClassifier\n",
"from pyspark.ml.classification import LogisticRegression\n",
"model = TrainClassifier(model=LogisticRegression(), labelCol=\"income\", numFeatures=256).fit(train)"
"\n",
"model = TrainClassifier(\n",
" model=LogisticRegression(), labelCol=\"income\", numFeatures=256\n",
").fit(train)"
]
},
{
@ -92,6 +99,7 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics, TrainedClassifierModel\n",
"\n",
"prediction = model.transform(test)\n",
"prediction.printSchema()"
]
@ -124,21 +132,23 @@
"from synapse.ml.io import *\n",
"import uuid\n",
"\n",
"serving_inputs = spark.readStream.server() \\\n",
" .address(\"localhost\", 8898, \"my_api\") \\\n",
" .option(\"name\", \"my_api\") \\\n",
" .load() \\\n",
"serving_inputs = (\n",
" spark.readStream.server()\n",
" .address(\"localhost\", 8898, \"my_api\")\n",
" .option(\"name\", \"my_api\")\n",
" .load()\n",
" .parseRequest(\"my_api\", test.schema)\n",
")\n",
"\n",
"serving_outputs = model.transform(serving_inputs) \\\n",
" .makeReply(\"prediction\")\n",
"serving_outputs = model.transform(serving_inputs).makeReply(\"prediction\")\n",
"\n",
"server = serving_outputs.writeStream \\\n",
" .server() \\\n",
" .replyTo(\"my_api\") \\\n",
" .queryName(\"my_query\") \\\n",
" .option(\"checkpointLocation\", \"file:///tmp/checkpoints-{}\".format(uuid.uuid1())) \\\n",
" .start()\n"
"server = (\n",
" serving_outputs.writeStream.server()\n",
" .replyTo(\"my_api\")\n",
" .queryName(\"my_query\")\n",
" .option(\"checkpointLocation\", \"file:///tmp/checkpoints-{}\".format(uuid.uuid1()))\n",
" .start()\n",
")"
]
},
{
@ -155,7 +165,8 @@
"outputs": [],
"source": [
"import requests\n",
"data = u'{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n",
"\n",
"data = '{\"education\":\" 10th\",\"marital-status\":\"Divorced\",\"hours-per-week\":40.0}'\n",
"r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
"print(\"Response {}\".format(r.text))"
]
@ -167,7 +178,8 @@
"outputs": [],
"source": [
"import requests\n",
"data = u'{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n",
"\n",
"data = '{\"education\":\" Masters\",\"marital-status\":\"Married-civ-spouse\",\"hours-per-week\":40.0}'\n",
"r = requests.post(data=data, url=\"http://localhost:8898/my_api\")\n",
"print(\"Response {}\".format(r.text))"
]
@ -181,7 +193,8 @@
"outputs": [],
"source": [
"import time\n",
"time.sleep(20) # wait for server to finish setting up (just to be safe)\n",
"\n",
"time.sleep(20) # wait for server to finish setting up (just to be safe)\n",
"server.stop()"
]
},

View file

@ -92,6 +92,7 @@
"\n",
"if os.environ.get(\"AZURE_SERVICE\", None) == \"Microsoft.ProjectArcadia\":\n",
" from pyspark.sql import SparkSession\n",
"\n",
" spark = SparkSession.builder.getOrCreate()\n",
" from notebookutils.visualization import display"
]
@ -102,10 +103,14 @@
"metadata": {},
"outputs": [],
"source": [
"df = spark.read.format(\"csv\")\\\n",
" .option(\"header\", True)\\\n",
" .option(\"inferSchema\", True)\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\")\n",
"df = (\n",
" spark.read.format(\"csv\")\n",
" .option(\"header\", True)\n",
" .option(\"inferSchema\", True)\n",
" .load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv\"\n",
" )\n",
")\n",
"# print dataset basic info\n",
"print(\"records read: \" + str(df.count()))\n",
"print(\"Schema: \")\n",
@ -151,6 +156,7 @@
"outputs": [],
"source": [
"from synapse.ml.vw import VowpalWabbitFeaturizer\n",
"\n",
"featurizer = VowpalWabbitFeaturizer(inputCols=df.columns[:-1], outputCol=\"features\")\n",
"train_data = featurizer.transform(train)[\"target\", \"features\"]\n",
"test_data = featurizer.transform(test)[\"target\", \"features\"]"
@ -179,7 +185,10 @@
"outputs": [],
"source": [
"from synapse.ml.vw import VowpalWabbitClassifier\n",
"model = VowpalWabbitClassifier(numPasses=20, labelCol=\"target\", featuresCol=\"features\").fit(train_data)"
"\n",
"model = VowpalWabbitClassifier(\n",
" numPasses=20, labelCol=\"target\", featuresCol=\"features\"\n",
").fit(train_data)"
]
},
{
@ -206,7 +215,10 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric='classification', labelCol='target', scoredLabelsCol='prediction').transform(predictions)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"classification\", labelCol=\"target\", scoredLabelsCol=\"prediction\"\n",
").transform(predictions)\n",
"display(metrics)"
]
},
@ -232,7 +244,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.parquet(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\")\n",
"data = spark.read.parquet(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet\"\n",
")\n",
"data = data.select([\"education\", \"marital-status\", \"hours-per-week\", \"income\"])\n",
"train, test = data.randomSplit([0.75, 0.25], seed=123)\n",
"display(train)"
@ -260,12 +274,15 @@
"from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitClassifier\n",
"\n",
"# Define classification label\n",
"train = train.withColumn(\"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)).repartition(1)\n",
"train = train.withColumn(\n",
" \"label\", when(col(\"income\").contains(\"<\"), 0.0).otherwise(1.0)\n",
").repartition(1)\n",
"print(train.count())\n",
"\n",
"# Specify featurizer\n",
"vw_featurizer = VowpalWabbitFeaturizer(inputCols=[\"education\", \"marital-status\", \"hours-per-week\"],\n",
" outputCol=\"features\")"
"vw_featurizer = VowpalWabbitFeaturizer(\n",
" inputCols=[\"education\", \"marital-status\", \"hours-per-week\"], outputCol=\"features\"\n",
")"
]
},
{
@ -283,10 +300,9 @@
"source": [
"# Define VW classification model\n",
"args = \"--loss_function=logistic --quiet --holdout_off\"\n",
"vw_model = VowpalWabbitClassifier(featuresCol=\"features\",\n",
" labelCol=\"label\",\n",
" passThroughArgs=args,\n",
" numPasses=10)\n",
"vw_model = VowpalWabbitClassifier(\n",
" featuresCol=\"features\", labelCol=\"label\", passThroughArgs=args, numPasses=10\n",
")\n",
"\n",
"# Create a pipeline\n",
"vw_pipeline = Pipeline(stages=[vw_featurizer, vw_model])"
@ -336,9 +352,10 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric=\"classification\", \n",
" labelCol=\"label\", \n",
" scoredLabelsCol=\"prediction\").transform(prediction)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"classification\", labelCol=\"label\", scoredLabelsCol=\"prediction\"\n",
").transform(prediction)\n",
"display(metrics)"
]
},
@ -388,8 +405,8 @@
"source": [
"boston = load_boston()\n",
"\n",
"feature_cols = ['f' + str(i) for i in range(boston.data.shape[1])]\n",
"header = ['target'] + feature_cols\n",
"feature_cols = [\"f\" + str(i) for i in range(boston.data.shape[1])]\n",
"header = [\"target\"] + feature_cols\n",
"df = spark.createDataFrame(\n",
" pd.DataFrame(data=np.column_stack((boston.target, boston.data)), columns=header)\n",
").repartition(1)\n",
@ -438,20 +455,20 @@
"outputs": [],
"source": [
"features = train_data.columns[1:]\n",
"values = train_data.drop('target').toPandas()\n",
"values = train_data.drop(\"target\").toPandas()\n",
"ncols = 5\n",
"nrows = math.ceil(len(features) / ncols)\n",
"\n",
"yy = [r['target'] for r in train_data.select('target').collect()]\n",
"yy = [r[\"target\"] for r in train_data.select(\"target\").collect()]\n",
"\n",
"f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30,10))\n",
"f, axes = plt.subplots(nrows, ncols, sharey=True, figsize=(30, 10))\n",
"f.tight_layout()\n",
"\n",
"for irow in range(nrows):\n",
" axes[irow][0].set_ylabel('target')\n",
" axes[irow][0].set_ylabel(\"target\")\n",
" for icol in range(ncols):\n",
" try:\n",
" feat = features[irow*ncols + icol]\n",
" feat = features[irow * ncols + icol]\n",
" xx = values[feat]\n",
"\n",
" axes[irow][icol].scatter(xx, yy, s=10, alpha=0.25)\n",
@ -476,10 +493,10 @@
"source": [
"vw_featurizer = VowpalWabbitFeaturizer(\n",
" inputCols=feature_cols,\n",
" outputCol='features',\n",
" outputCol=\"features\",\n",
")\n",
"vw_train_data = vw_featurizer.transform(train_data)['target', 'features']\n",
"vw_test_data = vw_featurizer.transform(test_data)['target', 'features']\n",
"vw_train_data = vw_featurizer.transform(train_data)[\"target\", \"features\"]\n",
"vw_test_data = vw_featurizer.transform(test_data)[\"target\", \"features\"]\n",
"display(vw_train_data)"
]
},
@ -500,8 +517,8 @@
"source": [
"args = \"--holdout_off --loss_function quantile -l 7 -q :: --power_t 0.7\"\n",
"vwr = VowpalWabbitRegressor(\n",
" labelCol='target',\n",
" featuresCol='features',\n",
" labelCol=\"target\",\n",
" featuresCol=\"features\",\n",
" passThroughArgs=args,\n",
" numPasses=200,\n",
")\n",
@ -527,13 +544,11 @@
"outputs": [],
"source": [
"metrics = ComputeModelStatistics(\n",
" evaluationMetric='regression',\n",
" labelCol='target',\n",
" scoresCol='prediction'\n",
" evaluationMetric=\"regression\", labelCol=\"target\", scoresCol=\"prediction\"\n",
").transform(vw_predictions)\n",
"\n",
"vw_result = metrics.toPandas()\n",
"vw_result.insert(0, 'model', ['Vowpal Wabbit'])\n",
"vw_result.insert(0, \"model\", [\"Vowpal Wabbit\"])\n",
"display(vw_result)"
]
},
@ -543,21 +558,21 @@
"metadata": {},
"outputs": [],
"source": [
"cmap = get_cmap('YlOrRd')\n",
"target = np.array(test_data.select('target').collect()).flatten()\n",
"cmap = get_cmap(\"YlOrRd\")\n",
"target = np.array(test_data.select(\"target\").collect()).flatten()\n",
"model_preds = [(\"Vowpal Wabbit\", vw_predictions)]\n",
"\n",
"f, axe = plt.subplots(figsize=(6, 6))\n",
"f.tight_layout()\n",
"\n",
"preds = np.array(vw_predictions.select('prediction').collect()).flatten()\n",
"preds = np.array(vw_predictions.select(\"prediction\").collect()).flatten()\n",
"err = np.absolute(preds - target)\n",
"norm = Normalize()\n",
"clrs = cmap(np.asarray(norm(err)))[:, :-1]\n",
"plt.scatter(preds, target, s=60, c=clrs, edgecolors='#888888', alpha=0.75)\n",
"plt.plot((0, 60), (0, 60), linestyle='--', color='#888888')\n",
"axe.set_xlabel('Predicted values')\n",
"axe.set_ylabel('Actual values')\n",
"plt.scatter(preds, target, s=60, c=clrs, edgecolors=\"#888888\", alpha=0.75)\n",
"plt.plot((0, 60), (0, 60), linestyle=\"--\", color=\"#888888\")\n",
"axe.set_xlabel(\"Predicted values\")\n",
"axe.set_ylabel(\"Actual values\")\n",
"axe.set_title(\"Vowpal Wabbit\")"
]
},
@ -583,8 +598,9 @@
"metadata": {},
"outputs": [],
"source": [
"triazines = spark.read.format(\"libsvm\")\\\n",
" .load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\")"
"triazines = spark.read.format(\"libsvm\").load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight\"\n",
")"
]
},
{
@ -630,8 +646,10 @@
"outputs": [],
"source": [
"from synapse.ml.vw import VowpalWabbitRegressor\n",
"model = (VowpalWabbitRegressor(numPasses=20, passThroughArgs=\"--holdout_off --loss_function quantile -q :: -l 0.1\")\n",
" .fit(train))"
"\n",
"model = VowpalWabbitRegressor(\n",
" numPasses=20, passThroughArgs=\"--holdout_off --loss_function quantile -q :: -l 0.1\"\n",
").fit(train)"
]
},
{
@ -658,10 +676,10 @@
"outputs": [],
"source": [
"from synapse.ml.train import ComputeModelStatistics\n",
"metrics = ComputeModelStatistics(evaluationMetric='regression',\n",
" labelCol='label',\n",
" scoresCol='prediction') \\\n",
" .transform(scoredData)\n",
"\n",
"metrics = ComputeModelStatistics(\n",
" evaluationMetric=\"regression\", labelCol=\"label\", scoresCol=\"prediction\"\n",
").transform(scoredData)\n",
"display(metrics)"
]
},
@ -685,7 +703,9 @@
"metadata": {},
"outputs": [],
"source": [
"data = spark.read.format(\"json\").load(\"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\")"
"data = spark.read.format(\"json\").load(\n",
" \"wasbs://publicwasb@mmlspark.blob.core.windows.net/vwcb_input.dsjson\"\n",
")"
]
},
{
@ -701,23 +721,39 @@
"metadata": {},
"outputs": [],
"source": [
"from pyspark.sql.functions import col \n",
"from pyspark.sql.types import IntegerType, DoubleType \n",
"data = data.withColumn('GUser_id', col('c.GUser.id'))\\\n",
" .withColumn('GUser_major', col('c.GUser.major'))\\\n",
" .withColumn('GUser_hobby', col('c.GUser.hobby'))\\\n",
" .withColumn('GUser_favorite_character', col('c.GUser.favorite_character'))\\\n",
" .withColumn('TAction_0_topic', col('c._multi.TAction.topic')[0])\\\n",
" .withColumn('TAction_1_topic', col('c._multi.TAction.topic')[1])\\\n",
" .withColumn('TAction_2_topic', col('c._multi.TAction.topic')[2])\\\n",
" .withColumn('TAction_3_topic', col('c._multi.TAction.topic')[3])\\\n",
" .withColumn('TAction_4_topic', col('c._multi.TAction.topic')[4])\\\n",
" .withColumn('chosenAction', col('_label_Action').cast(IntegerType()))\\\n",
" .withColumn('label', col('_labelIndex').cast(DoubleType()))\\\n",
" .withColumn('probability', col('_label_probability'))\\\n",
" .select('GUser_id', 'GUser_major', 'GUser_hobby', 'GUser_favorite_character', 'TAction_0_topic', 'TAction_1_topic', 'TAction_2_topic', 'TAction_3_topic', 'TAction_4_topic', 'chosenAction', 'label', 'probability')\n",
"from pyspark.sql.functions import col\n",
"from pyspark.sql.types import IntegerType, DoubleType\n",
"\n",
"print(\"Schema: \") \n",
"data = (\n",
" data.withColumn(\"GUser_id\", col(\"c.GUser.id\"))\n",
" .withColumn(\"GUser_major\", col(\"c.GUser.major\"))\n",
" .withColumn(\"GUser_hobby\", col(\"c.GUser.hobby\"))\n",
" .withColumn(\"GUser_favorite_character\", col(\"c.GUser.favorite_character\"))\n",
" .withColumn(\"TAction_0_topic\", col(\"c._multi.TAction.topic\")[0])\n",
" .withColumn(\"TAction_1_topic\", col(\"c._multi.TAction.topic\")[1])\n",
" .withColumn(\"TAction_2_topic\", col(\"c._multi.TAction.topic\")[2])\n",
" .withColumn(\"TAction_3_topic\", col(\"c._multi.TAction.topic\")[3])\n",
" .withColumn(\"TAction_4_topic\", col(\"c._multi.TAction.topic\")[4])\n",
" .withColumn(\"chosenAction\", col(\"_label_Action\").cast(IntegerType()))\n",
" .withColumn(\"label\", col(\"_labelIndex\").cast(DoubleType()))\n",
" .withColumn(\"probability\", col(\"_label_probability\"))\n",
" .select(\n",
" \"GUser_id\",\n",
" \"GUser_major\",\n",
" \"GUser_hobby\",\n",
" \"GUser_favorite_character\",\n",
" \"TAction_0_topic\",\n",
" \"TAction_1_topic\",\n",
" \"TAction_2_topic\",\n",
" \"TAction_3_topic\",\n",
" \"TAction_4_topic\",\n",
" \"chosenAction\",\n",
" \"label\",\n",
" \"probability\",\n",
" )\n",
")\n",
"\n",
"print(\"Schema: \")\n",
"data.printSchema()"
]
},
@ -734,20 +770,53 @@
"metadata": {},
"outputs": [],
"source": [
"from synapse.ml.vw import VowpalWabbitFeaturizer, VowpalWabbitContextualBandit, VectorZipper\n",
"from synapse.ml.vw import (\n",
" VowpalWabbitFeaturizer,\n",
" VowpalWabbitContextualBandit,\n",
" VectorZipper,\n",
")\n",
"from pyspark.ml import Pipeline\n",
"pipeline = Pipeline(stages=[\n",
" VowpalWabbitFeaturizer(inputCols=['GUser_id'], outputCol='GUser_id_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['GUser_major'], outputCol='GUser_major_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['GUser_hobby'], outputCol='GUser_hobby_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['GUser_favorite_character'], outputCol='GUser_favorite_character_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['TAction_0_topic'], outputCol='TAction_0_topic_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['TAction_1_topic'], outputCol='TAction_1_topic_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['TAction_2_topic'], outputCol='TAction_2_topic_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['TAction_3_topic'], outputCol='TAction_3_topic_feature'),\n",
" VowpalWabbitFeaturizer(inputCols=['TAction_4_topic'], outputCol='TAction_4_topic_feature'),\n",
" VectorZipper(inputCols=['TAction_0_topic_feature', 'TAction_1_topic_feature', 'TAction_2_topic_feature', 'TAction_3_topic_feature','TAction_4_topic_feature'], outputCol='features')\n",
"])\n",
"\n",
"pipeline = Pipeline(\n",
" stages=[\n",
" VowpalWabbitFeaturizer(inputCols=[\"GUser_id\"], outputCol=\"GUser_id_feature\"),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"GUser_major\"], outputCol=\"GUser_major_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"GUser_hobby\"], outputCol=\"GUser_hobby_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"GUser_favorite_character\"],\n",
" outputCol=\"GUser_favorite_character_feature\",\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"TAction_0_topic\"], outputCol=\"TAction_0_topic_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"TAction_1_topic\"], outputCol=\"TAction_1_topic_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"TAction_2_topic\"], outputCol=\"TAction_2_topic_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"TAction_3_topic\"], outputCol=\"TAction_3_topic_feature\"\n",
" ),\n",
" VowpalWabbitFeaturizer(\n",
" inputCols=[\"TAction_4_topic\"], outputCol=\"TAction_4_topic_feature\"\n",
" ),\n",
" VectorZipper(\n",
" inputCols=[\n",
" \"TAction_0_topic_feature\",\n",
" \"TAction_1_topic_feature\",\n",
" \"TAction_2_topic_feature\",\n",
" \"TAction_3_topic_feature\",\n",
" \"TAction_4_topic_feature\",\n",
" ],\n",
" outputCol=\"features\",\n",
" ),\n",
" ]\n",
")\n",
"tranformation_pipeline = pipeline.fit(data)\n",
"transformed_data = tranformation_pipeline.transform(data)\n",
"\n",
@ -767,15 +836,23 @@
"metadata": {},
"outputs": [],
"source": [
"estimator = VowpalWabbitContextualBandit() \\\n",
" .setPassThroughArgs(\"--cb_explore_adf --epsilon 0.2 --quiet\") \\\n",
" .setSharedCol('GUser_id_feature') \\\n",
" .setAdditionalSharedFeatures([\"GUser_major_feature\", \"GUser_hobby_feature\", \"GUser_favorite_character_feature\"]) \\\n",
" .setFeaturesCol('features') \\\n",
" .setUseBarrierExecutionMode(False)\\\n",
" .setChosenActionCol('chosenAction')\\\n",
" .setLabelCol('label')\\\n",
" .setProbabilityCol('probability')\n",
"estimator = (\n",
" VowpalWabbitContextualBandit()\n",
" .setPassThroughArgs(\"--cb_explore_adf --epsilon 0.2 --quiet\")\n",
" .setSharedCol(\"GUser_id_feature\")\n",
" .setAdditionalSharedFeatures(\n",
" [\n",
" \"GUser_major_feature\",\n",
" \"GUser_hobby_feature\",\n",
" \"GUser_favorite_character_feature\",\n",
" ]\n",
" )\n",
" .setFeaturesCol(\"features\")\n",
" .setUseBarrierExecutionMode(False)\n",
" .setChosenActionCol(\"chosenAction\")\n",
" .setLabelCol(\"label\")\n",
" .setProbabilityCol(\"probability\")\n",
")\n",
"model = estimator.fit(transformed_data)\n",
"display(model.getPerformanceStatistics())"
]

View file

@ -2,4 +2,5 @@
# Licensed under the MIT License. See LICENSE in project root for information.
# Required to auto-format python code
black==22.3.0
black==22.3.0
black[jupyter]==22.3.0
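
All of the wrapping above is Black's default 88-column style, now reproducible locally because requirements.txt pins both black and black[jupyter] at 22.3.0. As a minimal sketch (not part of this commit), and assuming only the public black.format_str and black.Mode API of the pinned release, the same rewrapping can be checked on a single cell's source:

import black

# One over-long line taken from the notebook cells reformatted in this commit.
src = (
    'data = spark.read.parquet('
    '"wasbs://publicwasb@mmlspark.blob.core.windows.net/AdultCensusIncome.parquet")\n'
)

# format_str applies the same default (88-column) wrapping shown in the diff;
# the black[jupyter] extra adds the notebook-aware CLI on top of this API.
print(black.format_str(src, mode=black.Mode()))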