Родитель
2122cd3a2d
Коммит
f0806bce04
|
@ -83,7 +83,7 @@
|
|||
"from distributed import Client\n",
|
||||
"\n",
|
||||
"c = Client()\n",
|
||||
"# c.restart()\n",
|
||||
"c.restart()\n",
|
||||
"c"
|
||||
]
|
||||
},
|
||||
|
@ -201,8 +201,8 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"container_name = \"isdweatherdatacontainer\"\n",
|
||||
"\n",
|
||||
"color = \"green\"\n",
|
||||
"container_name = \"nyctlc\"\n",
|
||||
"storage_options = {\"account_name\": \"azureopendatastorage\"}"
|
||||
]
|
||||
},
|
||||
|
@ -222,8 +222,7 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"fs.ls(f\"{container_name}/ISDWeather/year=2020\")"
|
||||
"fs.ls(f\"{container_name}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -232,7 +231,25 @@
|
|||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"files = fs.glob(f\"{container_name}/ISDWeather/year=2020/month=2/*.parquet\")\n",
|
||||
"fs.ls(f\"{container_name}/{color}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"fs.ls(f\"{container_name}/{color}/puYear=2016/\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"files = fs.glob(f\"{container_name}/{color}/puYear=2016/puMonth=12/*.parquet\")\n",
|
||||
"files = [f\"az://{file}\" for file in files]\n",
|
||||
"files[-5:]"
|
||||
]
|
||||
|
@ -240,25 +257,20 @@
|
|||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"ddf = dd.read_parquet(\n",
|
||||
" files, storage_options=storage_options, chunksize=\"100MB\"\n",
|
||||
"ddf = (\n",
|
||||
" dd.read_parquet(files, storage_options=storage_options)\n",
|
||||
" .repartition(npartitions=8)\n",
|
||||
" .persist()\n",
|
||||
")\n",
|
||||
"ddf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ddf = ddf.persist() # persist all or some of data in RAM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
|
@ -276,7 +288,62 @@
|
|||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"ddf.describe().compute()"
|
||||
"len(ddf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ddf.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"plt.style.use(\"dark_background\")\n",
|
||||
"\n",
|
||||
"ddf[\"tipAmount\"].compute().hist(\n",
|
||||
" figsize=(16, 8), bins=256, range=(0.1, 20),\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df = ddf.compute()\n",
|
||||
"df.info()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"df.describe()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"gbs = round(df.memory_usage(index=True, deep=True).sum() / 1e9, 2)\n",
|
||||
"print(f\"df is {gbs} GBs\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
|
Загрузка…
Ссылка в новой задаче