From a4ba6529f3e3acf071f49be90b307068e21f2522 Mon Sep 17 00:00:00 2001 From: gaudyb <85708998+gaudyb@users.noreply.github.com> Date: Mon, 22 Apr 2024 13:58:41 -0600 Subject: [PATCH] snapshot python implementation (#714) --- .yarn/versions/cb6d33ac.yml | 9 ++ .../schema/docs/markdown/schema.filetype.md | 22 +++ javascript/schema/docs/markdown/schema.md | 2 + .../markdown/schema.snapshotargs.filetype.md | 11 ++ .../docs/markdown/schema.snapshotargs.md | 19 +++ .../docs/markdown/schema.snapshotargs.name.md | 11 ++ javascript/schema/docs/report/schema.api.json | 151 ++++++++++++++++++ javascript/schema/docs/report/schema.api.md | 22 +++ javascript/schema/src/workflow/verbs.ts | 14 ++ python/datashaper/datashaper/__init__.py | 2 + .../datashaper/datashaper/engine/__init__.py | 2 + python/datashaper/datashaper/engine/types.py | 8 + .../datashaper/engine/verbs/snapshot.py | 37 +++++ .../datashaper/tests/verbs/snapshot_test.py | 33 ++++ 14 files changed, 343 insertions(+) create mode 100644 .yarn/versions/cb6d33ac.yml create mode 100644 javascript/schema/docs/markdown/schema.filetype.md create mode 100644 javascript/schema/docs/markdown/schema.snapshotargs.filetype.md create mode 100644 javascript/schema/docs/markdown/schema.snapshotargs.md create mode 100644 javascript/schema/docs/markdown/schema.snapshotargs.name.md create mode 100644 python/datashaper/datashaper/engine/verbs/snapshot.py create mode 100644 python/datashaper/tests/verbs/snapshot_test.py diff --git a/.yarn/versions/cb6d33ac.yml b/.yarn/versions/cb6d33ac.yml new file mode 100644 index 000000000..bf4897c3b --- /dev/null +++ b/.yarn/versions/cb6d33ac.yml @@ -0,0 +1,9 @@ +releases: + "@datashaper/schema": minor + +declined: + - "@datashaper/app-framework" + - "@datashaper/react" + - "@datashaper/tables" + - "@datashaper/webapp" + - "@datashaper/workflow" diff --git a/javascript/schema/docs/markdown/schema.filetype.md b/javascript/schema/docs/markdown/schema.filetype.md new file mode 100644 index 000000000..16990beb6 --- /dev/null +++ b/javascript/schema/docs/markdown/schema.filetype.md @@ -0,0 +1,22 @@ + + +[Home](./index.md) > [@datashaper/schema](./schema.md) > [FileType](./schema.filetype.md) + +## FileType enum + +These are the available formats for the snapshot verb. + +Signature: + +```typescript +export declare enum FileType +``` + +## Enumeration Members + +| Member | Value | Description | +| --- | --- | --- | +| Csv | "csv" | | +| Json | "json" | | +| Parquet | "parquet" | | + diff --git a/javascript/schema/docs/markdown/schema.md b/javascript/schema/docs/markdown/schema.md index b0f6cb008..ee4f1b6cf 100644 --- a/javascript/schema/docs/markdown/schema.md +++ b/javascript/schema/docs/markdown/schema.md @@ -19,6 +19,7 @@ | [DateComparisonOperator](./schema.datecomparisonoperator.md) | | | [ErrorCode](./schema.errorcode.md) | | | [FieldAggregateOperation](./schema.fieldaggregateoperation.md) | This is the subset of aggregate functions that can operate on a single field so we don't accommodate additional args. See https://uwdata.github.io/arquero/api/op\#aggregate-functions | +| [FileType](./schema.filetype.md) | These are the available formats for the snapshot verb. | | [FilterCompareType](./schema.filtercomparetype.md) | Indicates the comparison type used for a filter operation. This is done on a row-by-row basis. | | [JoinStrategy](./schema.joinstrategy.md) | | | [KnownProfile](./schema.knownprofile.md) | | @@ -98,6 +99,7 @@ | [ResourceSchema](./schema.resourceschema.md) | Parent class for any resource type understood by the system. Any object type that extends from Resource is expected to have a standalone schema published. For project state, this can be left as generic as possible for now. | | [RollupArgs](./schema.rollupargs.md) | | | [SampleArgs](./schema.sampleargs.md) | | +| [SnapshotArgs](./schema.snapshotargs.md) | | | [SpreadArgs](./schema.spreadargs.md) | | | [StepJsonCommon](./schema.stepjsoncommon.md) | Common step properties | | [StringsArgs](./schema.stringsargs.md) | | diff --git a/javascript/schema/docs/markdown/schema.snapshotargs.filetype.md b/javascript/schema/docs/markdown/schema.snapshotargs.filetype.md new file mode 100644 index 000000000..696dce660 --- /dev/null +++ b/javascript/schema/docs/markdown/schema.snapshotargs.filetype.md @@ -0,0 +1,11 @@ + + +[Home](./index.md) > [@datashaper/schema](./schema.md) > [SnapshotArgs](./schema.snapshotargs.md) > [fileType](./schema.snapshotargs.filetype.md) + +## SnapshotArgs.fileType property + +Signature: + +```typescript +fileType: FileType; +``` diff --git a/javascript/schema/docs/markdown/schema.snapshotargs.md b/javascript/schema/docs/markdown/schema.snapshotargs.md new file mode 100644 index 000000000..aefebfa31 --- /dev/null +++ b/javascript/schema/docs/markdown/schema.snapshotargs.md @@ -0,0 +1,19 @@ + + +[Home](./index.md) > [@datashaper/schema](./schema.md) > [SnapshotArgs](./schema.snapshotargs.md) + +## SnapshotArgs interface + +Signature: + +```typescript +export interface SnapshotArgs +``` + +## Properties + +| Property | Modifiers | Type | Description | +| --- | --- | --- | --- | +| [fileType](./schema.snapshotargs.filetype.md) | | [FileType](./schema.filetype.md) | | +| [name](./schema.snapshotargs.name.md) | | string | | + diff --git a/javascript/schema/docs/markdown/schema.snapshotargs.name.md b/javascript/schema/docs/markdown/schema.snapshotargs.name.md new file mode 100644 index 000000000..12a1a3bf6 --- /dev/null +++ b/javascript/schema/docs/markdown/schema.snapshotargs.name.md @@ -0,0 +1,11 @@ + + +[Home](./index.md) > [@datashaper/schema](./schema.md) > [SnapshotArgs](./schema.snapshotargs.md) > [name](./schema.snapshotargs.name.md) + +## SnapshotArgs.name property + +Signature: + +```typescript +name: string; +``` diff --git a/javascript/schema/docs/report/schema.api.json b/javascript/schema/docs/report/schema.api.json index 2ab48f653..155de03f9 100644 --- a/javascript/schema/docs/report/schema.api.json +++ b/javascript/schema/docs/report/schema.api.json @@ -5619,6 +5619,85 @@ ], "extendsTokenRanges": [] }, + { + "kind": "Enum", + "canonicalReference": "@datashaper/schema!FileType:enum", + "docComment": "/**\n * These are the available formats for the snapshot verb.\n */\n", + "excerptTokens": [ + { + "kind": "Content", + "text": "export declare enum FileType " + } + ], + "releaseTag": "Public", + "name": "FileType", + "preserveMemberOrder": false, + "members": [ + { + "kind": "EnumMember", + "canonicalReference": "@datashaper/schema!FileType.Csv:member", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "Csv = " + }, + { + "kind": "Content", + "text": "\"csv\"" + } + ], + "initializerTokenRange": { + "startIndex": 1, + "endIndex": 2 + }, + "releaseTag": "Public", + "name": "Csv" + }, + { + "kind": "EnumMember", + "canonicalReference": "@datashaper/schema!FileType.Json:member", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "Json = " + }, + { + "kind": "Content", + "text": "\"json\"" + } + ], + "initializerTokenRange": { + "startIndex": 1, + "endIndex": 2 + }, + "releaseTag": "Public", + "name": "Json" + }, + { + "kind": "EnumMember", + "canonicalReference": "@datashaper/schema!FileType.Parquet:member", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "Parquet = " + }, + { + "kind": "Content", + "text": "\"parquet\"" + } + ], + "initializerTokenRange": { + "startIndex": 1, + "endIndex": 2 + }, + "releaseTag": "Public", + "name": "Parquet" + } + ] + }, { "kind": "Interface", "canonicalReference": "@datashaper/schema!FillArgs:interface", @@ -9269,6 +9348,78 @@ } ] }, + { + "kind": "Interface", + "canonicalReference": "@datashaper/schema!SnapshotArgs:interface", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "export interface SnapshotArgs " + } + ], + "releaseTag": "Public", + "name": "SnapshotArgs", + "preserveMemberOrder": false, + "members": [ + { + "kind": "PropertySignature", + "canonicalReference": "@datashaper/schema!SnapshotArgs#fileType:member", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "fileType: " + }, + { + "kind": "Reference", + "text": "FileType", + "canonicalReference": "@datashaper/schema!FileType:enum" + }, + { + "kind": "Content", + "text": ";" + } + ], + "isReadonly": false, + "isOptional": false, + "releaseTag": "Public", + "name": "fileType", + "propertyTypeTokenRange": { + "startIndex": 1, + "endIndex": 2 + } + }, + { + "kind": "PropertySignature", + "canonicalReference": "@datashaper/schema!SnapshotArgs#name:member", + "docComment": "", + "excerptTokens": [ + { + "kind": "Content", + "text": "name: " + }, + { + "kind": "Content", + "text": "string" + }, + { + "kind": "Content", + "text": ";" + } + ], + "isReadonly": false, + "isOptional": false, + "releaseTag": "Public", + "name": "name", + "propertyTypeTokenRange": { + "startIndex": 1, + "endIndex": 2 + } + } + ], + "extendsTokenRanges": [] + }, { "kind": "Enum", "canonicalReference": "@datashaper/schema!SortDirection:enum", diff --git a/javascript/schema/docs/report/schema.api.md b/javascript/schema/docs/report/schema.api.md index 80f082550..d9fdab987 100644 --- a/javascript/schema/docs/report/schema.api.md +++ b/javascript/schema/docs/report/schema.api.md @@ -490,6 +490,18 @@ export interface FieldMetadata { type?: DataType; } +// Warning: (ae-missing-release-tag) "FileType" is exported by the package, but it is missing a release tag (@alpha, @beta, @public, or @internal) +// +// @public +export enum FileType { + // (undocumented) + Csv = "csv", + // (undocumented) + Json = "json", + // (undocumented) + Parquet = "parquet" +} + // Warning: (ae-missing-release-tag) "FillArgs" is exported by the package, but it is missing a release tag (@alpha, @beta, @public, or @internal) // // @public (undocumented) @@ -897,6 +909,16 @@ export enum SetOp { Union = "union" } +// Warning: (ae-missing-release-tag) "SnapshotArgs" is exported by the package, but it is missing a release tag (@alpha, @beta, @public, or @internal) +// +// @public (undocumented) +export interface SnapshotArgs { + // (undocumented) + fileType: FileType; + // (undocumented) + name: string; +} + // Warning: (ae-missing-release-tag) "SortDirection" is exported by the package, but it is missing a release tag (@alpha, @beta, @public, or @internal) // // @public (undocumented) diff --git a/javascript/schema/src/workflow/verbs.ts b/javascript/schema/src/workflow/verbs.ts index 9fc0ce23c..81e0f923b 100644 --- a/javascript/schema/src/workflow/verbs.ts +++ b/javascript/schema/src/workflow/verbs.ts @@ -358,6 +358,15 @@ export enum WindowFunction { UUID = 'uuid' } +/** + * These are the available formats for the snapshot verb. + */ +export enum FileType { + Csv = 'csv', + Json = 'json', + Parquet = 'parquet' +} + export interface AggregateArgs extends RollupArgs { /** * Column to group by @@ -722,6 +731,11 @@ export interface PrintArgs { limit?: number } +export interface SnapshotArgs { + name: string + fileType: FileType +} + export interface StringsReplaceArgs extends StringsArgs { pattern: string replacement: string diff --git a/python/datashaper/datashaper/__init__.py b/python/datashaper/datashaper/__init__.py index 4bd527268..62cc81e25 100644 --- a/python/datashaper/datashaper/__init__.py +++ b/python/datashaper/datashaper/__init__.py @@ -9,6 +9,7 @@ from .engine import ( Criterion, DataType, FieldAggregateOperation, + FileType, FilterArgs, FilterCompareType, InputColumnArgs, @@ -116,6 +117,7 @@ __all__ = [ "Category", "Criterion", "FieldAggregateOperation", + "FileType", "FilterArgs", "FilterCompareType", "InputColumnArgs", diff --git a/python/datashaper/datashaper/engine/__init__.py b/python/datashaper/datashaper/engine/__init__.py index ee403e779..015f1e6ee 100644 --- a/python/datashaper/datashaper/engine/__init__.py +++ b/python/datashaper/datashaper/engine/__init__.py @@ -15,6 +15,7 @@ from .types import ( Criterion, DataType, FieldAggregateOperation, + FileType, FilterArgs, FilterCompareType, InputColumnArgs, @@ -55,6 +56,7 @@ __all__ = [ "Category", "Criterion", "FieldAggregateOperation", + "FileType", "FilterArgs", "FilterCompareType", "InputColumnArgs", diff --git a/python/datashaper/datashaper/engine/types.py b/python/datashaper/datashaper/engine/types.py index 190ec77d0..5b19af1fd 100644 --- a/python/datashaper/datashaper/engine/types.py +++ b/python/datashaper/datashaper/engine/types.py @@ -243,6 +243,14 @@ class WindowFunction(str, Enum): UUID = "uuid" +class FileType(str, Enum): + """File type used for the snapshot verb..""" + + Json = "json" + Csv = "csv" + Parquet = "parquet" + + @dataclass class OrderByInstruction: """Details regarding how to order a column.""" diff --git a/python/datashaper/datashaper/engine/verbs/snapshot.py b/python/datashaper/datashaper/engine/verbs/snapshot.py new file mode 100644 index 000000000..72c7febc9 --- /dev/null +++ b/python/datashaper/datashaper/engine/verbs/snapshot.py @@ -0,0 +1,37 @@ +# +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. See LICENSE file in the project. +# +"""Snapshot verb implementation.""" +from typing import cast + +import pandas as pd + +from datashaper.engine.types import FileType +from datashaper.engine.verbs.verbs_mapping import verb +from datashaper.table_store.types import VerbResult, create_verb_result + +from .verb_input import VerbInput + + +@verb(name="snapshot") +def snapshot( + input: VerbInput, + name: str, + file_type: FileType, + **_kwargs: dict, +) -> VerbResult: + """Snapshot verb implementation.""" + output = cast(pd.DataFrame, input.get_input()) + file_name = "./" + name + "." + file_type + + if file_type == FileType.Csv: + output.to_csv(file_name) + + if file_type == FileType.Json: + output.to_json(file_name, orient="records", compression="infer") + + if file_type == FileType.Parquet: + output.to_parquet(file_name) + + return create_verb_result(output) diff --git a/python/datashaper/tests/verbs/snapshot_test.py b/python/datashaper/tests/verbs/snapshot_test.py new file mode 100644 index 000000000..6afcae016 --- /dev/null +++ b/python/datashaper/tests/verbs/snapshot_test.py @@ -0,0 +1,33 @@ +import pandas as pd + +from datashaper.engine.verbs import VerbInput, VerbManager +from datashaper.table_store.types import TableContainer, VerbResult + + +def make_verb_input(data: list, columns: list[str]): + pd_table = pd.DataFrame(data=data, columns=columns) + table_container = TableContainer(pd_table) + return VerbInput(table_container) + + +def test_snapshot_csv(): + verb_input = make_verb_input([[1], [2], [3], [4], [5]], ["id"]) + snapshot = VerbManager.get().get_verb("snapshot").func + output: VerbResult = snapshot(input=verb_input, name="test-file", file_type="csv") + output: TableContainer = output.output + + +def test_snapshot_json(): + verb_input = make_verb_input([[1], [2], [3], [4], [5]], ["id"]) + snapshot = VerbManager.get().get_verb("snapshot").func + output: VerbResult = snapshot(input=verb_input, name="test-file", file_type="json") + output: TableContainer = output.output + + +def test_snapshot_parquet(): + verb_input = make_verb_input([[1], [2], [3], [4], [5]], ["id"]) + snapshot = VerbManager.get().get_verb("snapshot").func + output: VerbResult = snapshot( + input=verb_input, name="test-file", file_type="parquet" + ) + output: TableContainer = output.output