From 17166c7c12b620d84cf186ec618f5dd347233ccc Mon Sep 17 00:00:00 2001
From: Lily Ma
Date: Fri, 8 Nov 2019 10:17:12 -0800
Subject: [PATCH] Delete AnomalousRATraining.ipynb

---
 BYOML/Notebooks/AnomalousRATraining.ipynb | 1 -
 1 file changed, 1 deletion(-)
 delete mode 100644 BYOML/Notebooks/AnomalousRATraining.ipynb

diff --git a/BYOML/Notebooks/AnomalousRATraining.ipynb b/BYOML/Notebooks/AnomalousRATraining.ipynb
deleted file mode 100644
index f48807df07..0000000000
--- a/BYOML/Notebooks/AnomalousRATraining.ipynb
+++ /dev/null
@@ -1 +0,0 @@
-{"cells":[{"cell_type":"markdown","source":["This notebook trains the anomalous resource access model. The data used here is File Share Access Events from Windows machines, loaded from a Blob Storage container.\nThe trained model is then saved back to Blob Storage, where it can be used by the Scoring Notebook.\n\nSteps:\n 0. One-time: Install the following packages on your cluster by navigating to the 'Clusters' tab on the left\n - sentinel_ai (whl package from GitHub Utilities folder)\n - azure_sentinel_ml_utilities (whl package from GitHub Utilities folder)\n - azure-storage-blob (from PyPI)\n - scikit-surprise (from PyPI)\n - numpy==1.15.0 (from PyPI)\n - pyarrow==0.12.0 (from PyPI)\n \n 1. One-time: Set credentials in Key Vault so the notebook can access:\n - Storage Account\n 2. Ensure the relative paths to Blob Storage are correct.\n 3. Run the notebook to produce the model.\n \n One-time (setting up the storage key in Key Vault):\n - Refer: https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#access-azure-blob-storage-directly\n \n Storing and retrieving secrets using Azure Key Vault:\n - https://docs.azuredatabricks.net/user-guide/secrets/secret-scopes.html#akv-ss"],"metadata":{}},{"cell_type":"code","source":["import datetime as dt\n\n# Storage Account information\nstorage_account = 'YOUR STORAGE ACCOUNT HERE'\nstorage_key = dbutils.secrets.get(scope = 'NAME HERE', key = 'KEY NAME HERE')\ncontainer = 'CONTAINER NAME HERE'\nmount_point_name = 'MOUNT POINT NAME HERE'\n\ntrain_base_path = 'PATHNAME HERE'\n\n# Project name\nproject = 'PROJECT NAME HERE'\n\n###\n### Note: when training periodically, specify the time range relative to the current time,\n### as in the commented lines below.\n###\n# Time range for training\n# train_start_time = dt.datetime.now() - dt.timedelta(days=65)\n# train_end_time = dt.datetime.now() - dt.timedelta(days=10)\ntrain_start_time = dt.datetime.strptime('Dec 1 2018', '%b %d %Y')\ntrain_end_time = dt.datetime.strptime('Jan 20 2019', '%b %d %Y')"],"metadata":{},"outputs":[],"execution_count":2},
"]}}],"execution_count":2},{"cell_type":"code","source":["###\n### You can do this one-time in a separate Notebook, so that you don't cause accidental errors in other Notebooks mounting/unmounting the folder\n###\n\n# Mount the Storage Container\n# (Refer:- https://docs.databricks.com/spark/latest/data-sources/azure/azure-storage.html#mount-azure-blob-storage-containers-with-dbfs)\ndbutils.fs.mount(\n source = \"wasbs://\" + container + \"@\" + storage_account + \".blob.core.windows.net\",\n mount_point = mount_point_name,\n extra_configs = {\"fs.azure.account.key.\" + storage_account + \".blob.core.windows.net\":storage_key})"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":3},{"cell_type":"code","source":["import numpy as np\nimport pandas as pd\n\nfrom pyspark.sql import functions as f, types as t\nfrom pyspark.sql.functions import udf\n\n# ML\nfrom sentinel_ai.peer_anomaly import spark_collaborative_filtering as scf \n\n# spark\nfrom sentinel_ai.utils import sparkutils\n\n#utils\nfrom azure_sentinel_ml_utilities.azure_storage import blob_manager"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"code","source":["checkpoint_dir = mount_point_name + 'cache/{0}/checkpoints'.format(project)\ndbutils.fs.mkdirs(checkpoint_dir)\nsparkutils.set_checkpointdir(spark, checkpoint_dir)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":5},{"cell_type":"markdown","source":["# Load Dataset"],"metadata":{}},{"cell_type":"code","source":["class FileShareDataset:\n \n def __init__(self, storage_account, storage_key):\n self.storage_account = storage_account\n self.blob_manager = blob_manager(storage_account, storage_key)\n # Spark conf set for spark.read.csv to work\n spark.conf.set(\n \"fs.azure.account.key.\" + storage_account + \".blob.core.windows.net\",\n storage_key)\n\n @staticmethod\n def get_schema():\n return t.StructType([\n t.StructField('Timestamp', t.TimestampType()),\n t.StructField('Actor', t.StringType()),\n t.StructField('Resource', t.StringType()),\n t.StructField('categoricalFeatures', t.StringType()),\n t.StructField('count_', t.IntegerType())\n ])\n\n @staticmethod\n def _make_days_delta():\n @udf('double')\n def days_delta(d2, d1):\n return 1.0 + (d2 - d1).days\n\n return days_delta\n\n def get_raw_df(self, start_time, end_time, container, root, use_schema=True):\n \n blob_names = self.blob_manager.enumerate_blob_names(start_time, end_time, container, root)\n full_blob_names = [\"wasbs://\" + container + \"@\" + self.storage_account + \".blob.core.windows.net/\" + bn for bn in blob_names]\n \n schema = FileShareDataset.get_schema() if use_schema else None\n \n if use_schema:\n return spark.read.csv(full_blob_names, schema=schema, sep='\\t', header=False)\n else:\n return spark.read.csv(full_blob_names, sep='\\t', header=False)\n\n def processed_df(self, df):\n dd = FileShareDataset._make_days_delta()\n\n return df.select(\n f.col('Timestamp').alias('timestamp1'),\n f.col('Timestamp').alias('timestamp2'),\n 'Actor',\n 'Resource',\n 'count_'\n ).groupBy(\n 'Actor',\n 'Resource'\n ).agg({\n 'timestamp1': 'min',\n 'timestamp2': 'max',\n 'count_': 'sum'\n }).select(\n f.lit('0').alias('tid'),\n f.col('min(timestamp1)').alias('min_timestamp'),\n f.col('max(timestamp2)').alias('max_timestamp'),\n f.col('Actor').alias('user'),\n f.col('Resource').alias('res'),\n (f.col('sum(count_)')/dd(f.col('max(timestamp2)'), f.col('min(timestamp1)'))).alias('score')\n )\n\n def get_dataset(self, start_time, end_time, container, root):\n return self.processed_df(self.get_raw_df(start_time, end_time, container, root)).cache()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":7},{"cell_type":"code","source":["def getdataset():\n return FileShareDataset(storage_account, storage_key).get_dataset(train_start_time, train_end_time, container, train_base_path)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":8},{"cell_type":"code","source":["ptraining = getdataset()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":9},{"cell_type":"code","source":["print(ptraining.first())"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":10},{"cell_type":"code","source":["print(ptraining.select('tid').distinct().count())"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":11},{"cell_type":"code","source":["ptraining.describe().show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":12},{"cell_type":"code","source":["sparkutils.df_stats(ptraining)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":13},{"cell_type":"markdown","source":["# Build Model"],"metadata":{}},{"cell_type":"code","source":["# Model building\naccess_anomaly = scf.AccessAnomaly(tenant_colname='tid', score_colname='score')\naccess_anomaly_model = access_anomaly.fit(ptraining)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":15},{"cell_type":"markdown","source":["# Save Model"],"metadata":{}},{"cell_type":"code","source":["model_output = '{root}/{project}/model_output'.format(root=mount_point_name + 'models/', project=project)\nprint(model_output)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":17},{"cell_type":"code","source":["access_anomaly_model.save(\n '{model_output}/access_anomaly_model'.format(model_output=model_output)\n)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":18},{"cell_type":"code","source":["# unmount blob storage\ndbutils.fs.unmount(mount_point_name)"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":19},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":20}],"metadata":{"name":"AnomalousRATraining","notebookId":896007305526707},"nbformat":4,"nbformat_minor":0}