* saving

* updates
This commit is contained in:
Cody 2020-11-29 12:20:15 -08:00 коммит произвёл GitHub
Родитель 2122cd3a2d
Коммит f0806bce04
1 изменённый файл: 86 добавлений и 19 удалений

Просмотреть файл

@@ -83,7 +83,7 @@
"from distributed import Client\n",
"\n",
"c = Client()\n",
"# c.restart()\n",
"c.restart()\n",
"c"
]
},
@@ -201,8 +201,8 @@
"metadata": {},
"outputs": [],
"source": [
"container_name = \"isdweatherdatacontainer\"\n",
"\n",
"color = \"green\"\n",
"container_name = \"nyctlc\"\n",
"storage_options = {\"account_name\": \"azureopendatastorage\"}"
]
},
@@ -222,8 +222,7 @@
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"fs.ls(f\"{container_name}/ISDWeather/year=2020\")"
"fs.ls(f\"{container_name}\")"
]
},
{
@@ -232,7 +231,25 @@
"metadata": {},
"outputs": [],
"source": [
"files = fs.glob(f\"{container_name}/ISDWeather/year=2020/month=2/*.parquet\")\n",
"fs.ls(f\"{container_name}/{color}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fs.ls(f\"{container_name}/{color}/puYear=2016/\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"files = fs.glob(f\"{container_name}/{color}/puYear=2016/puMonth=12/*.parquet\")\n",
"files = [f\"az://{file}\" for file in files]\n",
"files[-5:]"
]
@@ -240,25 +257,20 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%%time\n",
"ddf = dd.read_parquet(\n",
" files, storage_options=storage_options, chunksize=\"100MB\"\n",
"ddf = (\n",
" dd.read_parquet(files, storage_options=storage_options)\n",
" .repartition(npartitions=8)\n",
" .persist()\n",
")\n",
"ddf"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# ddf = ddf.persist() # persist all or some of data in RAM"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -276,7 +288,62 @@
"outputs": [],
"source": [
"%%time\n",
"ddf.describe().compute()"
"len(ddf)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ddf.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"plt.style.use(\"dark_background\")\n",
"\n",
"ddf[\"tipAmount\"].compute().hist(\n",
" figsize=(16, 8), bins=256, range=(0.1, 20),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = ddf.compute()\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"gbs = round(df.memory_usage(index=True, deep=True).sum() / 1e9, 2)\n",
"print(f\"df is {gbs} GBs\")"
]
},
{