From f0806bce04d8c13b389dc9338ed41a2f72dc9d4b Mon Sep 17 00:00:00 2001 From: Cody <54814569+lostmygithubaccount@users.noreply.github.com> Date: Sun, 29 Nov 2020 12:20:15 -0800 Subject: [PATCH] update dask tutorial (#309) * saving * updates --- tutorials/using-dask/1.intro-to-dask.ipynb | 105 +++++++++++++++++---- 1 file changed, 86 insertions(+), 19 deletions(-) diff --git a/tutorials/using-dask/1.intro-to-dask.ipynb b/tutorials/using-dask/1.intro-to-dask.ipynb index 9b695703b..f0b706c4d 100644 --- a/tutorials/using-dask/1.intro-to-dask.ipynb +++ b/tutorials/using-dask/1.intro-to-dask.ipynb @@ -83,7 +83,7 @@ "from distributed import Client\n", "\n", "c = Client()\n", - "# c.restart()\n", + "c.restart()\n", "c" ] }, @@ -201,8 +201,8 @@ "metadata": {}, "outputs": [], "source": [ - "container_name = \"isdweatherdatacontainer\"\n", - "\n", + "color = \"green\"\n", + "container_name = \"nyctlc\"\n", "storage_options = {\"account_name\": \"azureopendatastorage\"}" ] }, @@ -222,8 +222,7 @@ "metadata": {}, "outputs": [], "source": [ - "%%time\n", - "fs.ls(f\"{container_name}/ISDWeather/year=2020\")" + "fs.ls(f\"{container_name}\")" ] }, { @@ -232,7 +231,25 @@ "metadata": {}, "outputs": [], "source": [ - "files = fs.glob(f\"{container_name}/ISDWeather/year=2020/month=2/*.parquet\")\n", + "fs.ls(f\"{container_name}/{color}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fs.ls(f\"{container_name}/{color}/puYear=2016/\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "files = fs.glob(f\"{container_name}/{color}/puYear=2016/puMonth=12/*.parquet\")\n", "files = [f\"az://{file}\" for file in files]\n", "files[-5:]" ] @@ -240,25 +257,20 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%%time\n", - "ddf = dd.read_parquet(\n", - " files, storage_options=storage_options, chunksize=\"100MB\"\n", + "ddf = (\n", + " dd.read_parquet(files, storage_options=storage_options)\n", + " .repartition(npartitions=8)\n", + " .persist()\n", ")\n", "ddf" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ddf = ddf.persist() # persist all or some of data in RAM" - ] - }, { "cell_type": "code", "execution_count": null, @@ -276,7 +288,62 @@ "outputs": [], "source": [ "%%time\n", - "ddf.describe().compute()" + "len(ddf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ddf.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.style.use(\"dark_background\")\n", + "\n", + "ddf[\"tipAmount\"].compute().hist(\n", + " figsize=(16, 8), bins=256, range=(0.1, 20),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = ddf.compute()\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "gbs = round(df.memory_usage(index=True, deep=True).sum() / 1e9, 2)\n", + "print(f\"df is {gbs} GBs\")" ] }, {