This commit is contained in:
Rashmi Shukla 2021-09-12 18:24:09 +00:00
Parent 57d1311d7b
Commit 741deee3e4
8 changed files with 0 additions and 457 deletions

View file

@@ -1,63 +0,0 @@
/***This Artifact belongs to the Data SQL Ninja Engineering Team***/
-- STEP 1: Create a master key. Only necessary if one does not already exist.
CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'MyUltraSecurePassword!12345!';
GO
-- STEP 2: Create a database scoped credential
-- Azure Data Lake Credential
CREATE DATABASE SCOPED CREDENTIAL AzureCredential
WITH IDENTITY = '<AAD AppID>@https://login.microsoftonline.com/<tenant id>/oauth2/token',
SECRET = '<secret key>';
/* Blob Storage Credential
CREATE DATABASE SCOPED CREDENTIAL AzureCredential
WITH IDENTITY = 'SHARED ACCESS SIGNATURE',
SECRET = 'your key here';
*/
-- STEP 3: Create an external data source - type HADOOP for ADLS
CREATE EXTERNAL DATA SOURCE AzureStorage
WITH (TYPE = HADOOP, LOCATION = 'adl://<adls name>.azuredatalakestore.net', CREDENTIAL = AzureCredential);
GO
/* Blob Storage Data Source - wabs syntax with Hadoop type
CREATE EXTERNAL DATA SOURCE AzureStorage
WITH (TYPE = HADOOP, LOCATION = 'wasbs://container@storageacct.blob.core.windows.net',
CREDENTIAL = AzureCredential);
*/
-- STEP 4: Create an external file format
CREATE EXTERNAL FILE FORMAT TextFileFormat
WITH (FORMAT_TYPE = DELIMITEDTEXT, FORMAT_OPTIONS
(FIELD_TERMINATOR = '0x01', STRING_DELIMITER = '"', -- DATE_FORMAT = 'yyyy-MM-dd HH:mm:ss.fff',
USE_TYPE_DEFAULT = FALSE),
DATA_COMPRESSION = 'org.apache.hadoop.io.compress.GzipCodec'
);
GO
-- STEP 5: Create external table pointing to blob storage files
CREATE EXTERNAL TABLE [ext_ACCOUNT_FACT]
(
[ACCT_PK_ID] bigint NOT NULL,
[PERSON_PK_ID] bigint NOT NULL,
[SALES_PERSON_PK_ID] int NOT NULL,
[BATCH_ID] bigint NULL,
[START_TMSP] datetime NULL,
[END_TMSP] datetime NULL,
[ACCT_NAME] varchar(50) NULL,
[ACCT_FLAG] varchar(2) NULL,
[ACCT_STATUS] varchar(24) NULL,
[ACCT_STATUS_CHG_DATE] datetime NULL,
[ACCT_TYPE_CODE] varchar(30) NULL
)
WITH ( LOCATION='/data/test/', DATA_SOURCE = AzureStorage, FILE_FORMAT = TextFileFormat, REJECT_TYPE = VALUE, REJECT_VALUE = 0 );
GO
-- STEP 6: Create Table As Select (CTAS) operation - invokes Polybase to pull information out of one or more text files in ADLS into DW tables
-- note you need to split the input text files to take advantage of parallel load on the compute nodes
CREATE TABLE [POC_DM].[ACCOUNT_FACT]
WITH (DISTRIBUTION = HASH([ACCT_PK_ID]))
AS SELECT * FROM ext_ACCOUNT_FACT
OPTION (LABEL = 'CTAS : Load ACCOUNT_FACT');
GO

View file

@@ -1,165 +0,0 @@
/***This Artifact belongs to the Data SQL Ninja Engineering Team***/
declare @sourceschema varchar(128) = 'SRC_POC_DM'
declare @targetschema varchar(128) = 'TARG_POC_DM'
declare @blobstore varchar(100) = '<account>.blob.core.windows.net'
set nocount on
-- ensure your data warehouse has a master key
-- CREATE MASTER KEY;
-- Use your blob storage key to provide SQL DW access to blob storage
if not exists(select * from [sys].[database_scoped_credentials] where [name]='AzureStorageCredential')
CREATE DATABASE SCOPED CREDENTIAL AzureStorageCredential WITH IDENTITY = 'SHARED ACCESS SIGNATURE', SECRET = 'your key here...=='
-- Create the file format definition
if not exists(select * from [sys].[external_file_formats] where [name]='TextFileFormat')
CREATE EXTERNAL FILE FORMAT TextFileFormat WITH (FORMAT_TYPE = DELIMITEDTEXT, FORMAT_OPTIONS (FIELD_TERMINATOR = '0x01', --STRING_DELIMITER = '',
USE_TYPE_DEFAULT = FALSE), DATA_COMPRESSION = 'org.apache.hadoop.io.compress.GzipCodec')
declare @objectid int, @table varchar(128), @colid int, @column varchar(128), @type varchar(128), @length smallint, @precision tinyint, @scale tinyint, @nullable bit, @colstr varchar(150)
declare @cmd varchar(max), @distcol varchar(128), @disttype varchar(50)
declare @trows int = 0, @trow int = 1, @crows int = 0, @crow int = 1, @start datetime, @tblrows bigint
-- Ensure target schema exists
if not exists(select * from sys.schemas where name = @targetschema)
begin
select @cmd = 'CREATE SCHEMA ' + @targetschema
exec(@cmd)
end
-- Check external table schema exists
if not exists(select * from sys.schemas where name = 'ASB')
exec('CREATE SCHEMA ASB')
-- cleanup of any previous failed run
IF OBJECT_ID('tempdb..#tables') IS NOT NULL
DROP TABLE #tables
IF OBJECT_ID('tempdb..#columns') IS NOT NULL
DROP TABLE #columns
create table #tables
(
rowid int not null,
objectid int not null,
[table] varchar(128) not null
)
WITH ( HEAP , DISTRIBUTION = ROUND_ROBIN )
create table #columns
(
colid int,
[column] varchar(128),
[type] varchar(128),
[length] smallint,
[precision] tinyint,
[scale] tinyint,
[nullable] bit
)
WITH ( HEAP , DISTRIBUTION = ROUND_ROBIN )
-- Set up to process all tables in the defined source schema
insert into #tables
select row_number() over (order by tb.name), object_id, tb.name
from sys.tables tb join sys.schemas s on (tb.schema_id=s.schema_id)
where s.name = @sourceschema
select @trows = count(*) from #tables
--select * from #tables
-- initial cleanup of any previous run - if an external table still exists, you will have to drop it first
if exists(select * from sys.external_data_sources where name='AzureStorage')
drop external data source AzureStorage
select @objectid=objectid, @table=[table] from #tables where rowid=@trow
while (@trow <= @trows)
begin
select @start = getdate() -- save start time
print '---------------------- ' + @targetschema + '.' + @table + ' ----------------------'
-- create the external data source
select @cmd = 'CREATE EXTERNAL DATA SOURCE AzureStorage WITH (TYPE = HADOOP, LOCATION = ''wasbs://' + replace(lower(@table), '_', '-') + '@' + @blobstore + ''', CREDENTIAL = AzureStorageCredential);'
print @cmd
print ''
exec(@cmd)
-- clear all rows in columns temp table (for previous table)
truncate table #columns
-- get all the column definitions for the target table
insert into #columns
select c.column_id, c.[name], t.[name], c.max_length, c.[precision], c.scale, c.is_nullable
from sys.columns c
join sys.types t on (c.user_type_id=t.user_type_id)
where object_id = @objectid
order by c.column_id
-- build external table definition
select @cmd = 'CREATE EXTERNAL TABLE [ASB].[' + @table + '] ('
-- process each column for the target table
select @crows = count(*) from #columns
select @crow = 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
while (@crow <= @crows)
begin
if (@colid <> 1) select @cmd = @cmd + ', '
select @cmd = @cmd + '[' + @column + '] ' + @type
if @type in ('char', 'varchar', 'nchar', 'nvarchar')
select @cmd = @cmd + '(' + case when @length < 0 then 'max' when left(@type, 1) = 'n' then cast(@length/2 as varchar(4)) else cast(@length as varchar(4)) end + ')' -- sys.columns.max_length is in bytes, so halve it for nchar/nvarchar
else if @type in ('numeric', 'decimal', 'real', 'float')
select @cmd = @cmd + '(' + cast(@precision as varchar(3)) + case when @type in ('decimal', 'numeric') then ', ' + cast(@scale as varchar(3)) else '' end + ')'
else if @type = 'datetime2'
select @cmd = @cmd + '(' + cast(@scale as varchar(3)) + ')'
select @cmd = @cmd + case when @nullable = 1 then ' NULL' else ' NOT NULL' end
select @crow = @crow + 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
end
select @cmd = @cmd + ') WITH ( LOCATION=''./'', DATA_SOURCE = AzureStorage, FILE_FORMAT = TextFileFormat, REJECT_TYPE = VALUE, REJECT_VALUE = 0 );'
declare @i int = 1
while (@i < len(@cmd))
begin
print substring(@cmd, @i, 1000) -- statements can exceed the capacity of a single print
select @i = @i + 1000
end
print ''
exec(@cmd)
-- get the distribution mechanism and column for the target table
select @distcol='', @disttype=distribution_policy_desc from sys.pdw_table_distribution_properties where object_id=@objectid
select @distcol=c.[name] from sys.pdw_column_distribution_properties d join sys.columns c on (d.object_id=c.object_id and d.column_id=c.column_id) where d.[object_id]=@objectid and distribution_ordinal=1
-- remove target table if it already exists
if exists(select * from sys.tables t join sys.schemas s on (t.schema_id=s.schema_id) where s.[name] = @targetschema and t.[name]=@table)
begin
select @cmd = 'DROP TABLE [' + @targetschema + '].[' + @table + ']'
exec(@cmd)
end
-- build simple CTAS statement (since column type and nullability done in external table)
select @cmd = 'CREATE TABLE [' + @targetschema + '].[' + @table + '] WITH (DISTRIBUTION = '+ case when ISNULL(@disttype, '') = '' then 'HEAP' else @disttype end + case when @distcol != '' then '([' + @distcol + '])' else '' end +
') AS SELECT * FROM [ASB].[' + @table + '] OPTION (LABEL = ''CTAS : Load [' + @targetschema + '].[' + @table + ']'');'
print @cmd
print ''
exec(@cmd)
-- Cleanup external objects
select @cmd = 'DROP EXTERNAL TABLE [ASB].[' + @table + ']'
print @cmd
exec(@cmd)
print 'DROP EXTERNAL DATA SOURCE AzureStorage'
DROP EXTERNAL DATA SOURCE AzureStorage
-- Output row count and elapsed load time for the current table
select @cmd = 'select COUNT_BIG(*) as ''Rows in [' + @targetschema + '].[' + @table + ']'', ' + cast(datediff(s, @start, getdate())/60.0 as varchar(40)) + ' as ''Minutes to Load'' from [' + @targetschema + '].[' + @table + ']'
exec(@cmd)
-- Increment to the next table
select @trow = @trow + 1
select @objectid=objectid, @table=[table] from #tables where rowid=@trow
end
drop table #tables
drop table #columns

View file

@@ -1,195 +0,0 @@
/***This Artifact belongs to the Data SQL Ninja Engineering Team***/
declare @sourceschema varchar(128) = 'SRC_POC_DM'
declare @targetschema varchar(128) = 'TARG_POC_DM'
declare @blobstore varchar(100) = '<account>.blob.core.windows.net'
set nocount on
-- ensure your data warehouse has a master key
-- CREATE MASTER KEY;
-- Use your blob storage key to provide SQL DW access to blob storage
if not exists(select * from [sys].[database_scoped_credentials] where [name]='AzureStorageCredential')
CREATE DATABASE SCOPED CREDENTIAL AzureStorageCredential WITH IDENTITY = 'SHARED ACCESS SIGNATURE', SECRET = 'your key here...=='
-- Create the file format definition
if not exists(select * from [sys].[external_file_formats] where [name]='TextFileFormat')
CREATE EXTERNAL FILE FORMAT TextFileFormat WITH (FORMAT_TYPE = DELIMITEDTEXT, FORMAT_OPTIONS (FIELD_TERMINATOR = '0x01', --STRING_DELIMITER = '',
USE_TYPE_DEFAULT = FALSE), DATA_COMPRESSION = 'org.apache.hadoop.io.compress.GzipCodec')
declare @objectid int, @table varchar(128), @colid int, @column varchar(128), @type varchar(128), @length smallint, @precision tinyint, @scale tinyint, @nullable bit, @colstr varchar(150)
declare @cmd varchar(max), @distcol varchar(128), @disttype varchar(50)
declare @trows int = 0, @trow int = 1, @crows int = 0, @crow int = 1, @start datetime, @tblrows bigint
-- Ensure target schema exists
if not exists(select * from sys.schemas where name = @targetschema)
begin
select @cmd = 'CREATE SCHEMA ' + @targetschema
exec(@cmd)
end
-- Check external table schema exists
if not exists(select * from sys.schemas where name = 'ASB')
exec('CREATE SCHEMA ASB')
-- cleanup of any previous failed run
IF OBJECT_ID('tempdb..#tables') IS NOT NULL
DROP TABLE #tables
IF OBJECT_ID('tempdb..#columns') IS NOT NULL
DROP TABLE #columns
create table #tables
(
rowid int not null,
objectid int not null,
[table] varchar(128) not null
)
WITH ( HEAP , DISTRIBUTION = ROUND_ROBIN )
create table #columns
(
colid int,
[column] varchar(128),
[type] varchar(128),
[length] smallint,
[precision] tinyint,
[scale] tinyint,
[nullable] bit
)
WITH ( HEAP , DISTRIBUTION = ROUND_ROBIN )
-- Set up to process all tables in the defined source schema
insert into #tables
select row_number() over (order by tb.name), object_id, tb.name
from sys.tables tb join sys.schemas s on (tb.schema_id=s.schema_id)
where s.name = @sourceschema
select @trows = count(*) from #tables
--select * from #tables
-- initial cleanup of any previous run - if an external table still exists, you will have to drop it first
if exists(select * from sys.external_data_sources where name='AzureStorage')
drop external data source AzureStorage
select @objectid=objectid, @table=[table] from #tables where rowid=@trow
while (@trow <= @trows)
begin
select @start = getdate() -- save start time
print '---------------------- ' + @targetschema + '.' + @table + ' ----------------------'
-- create the external data source
select @cmd = 'CREATE EXTERNAL DATA SOURCE AzureStorage WITH (TYPE = HADOOP, LOCATION = ''wasbs://' + replace(lower(@table), '_', '-') + '@' + @blobstore + ''', CREDENTIAL = AzureStorageCredential);'
print @cmd
print ''
exec(@cmd)
-- clear all rows in columns temp table (for previous table)
truncate table #columns
-- get all the column definitions for the target table
insert into #columns
select c.column_id, c.[name], t.[name], c.max_length, c.[precision], c.scale, c.is_nullable
from sys.columns c
join sys.types t on (c.user_type_id=t.user_type_id)
where object_id = @objectid
order by c.column_id
-- build external table definition
select @cmd = 'CREATE EXTERNAL TABLE [ASB].[' + @table + '] ('
-- process each column for the target table
select @crows = count(*) from #columns
select @crow = 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
while (@crow <= @crows)
begin
if (@colid <> 1) select @cmd = @cmd + ','
select @cmd = @cmd + '[' + @column + '] ' + case when @type in ('nvarchar', 'nchar') then 'nvarchar' else 'varchar' end + '(' +
case when @type in ('decimal', 'numeric', 'bigint', 'real', 'float', 'money') then '35'
when @type in ('int', 'smallmoney') then '14'
when @type in ('bit', 'tinyint', 'smallint') then '6'
when @type in ('char', 'varchar', 'nchar', 'nvarchar', 'binary', 'varbinary') then case when @length = -1 then 'MAX' when @length < 6 then '10' when @length > 3980 and left(@type,1)='n' then '4000' when @length > 7980 then '8000' else cast(@length+20 as varchar(5)) end -- handle null and add quotes (& embedded quotes)
when @type = 'uniqueidentifier' then '38'
else '50' end + ') NULL' -- dates and times @ 50 - image, text, xml, hierarchy and spatial data types not supported on DW
select @crow = @crow + 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
end
select @cmd = @cmd + ') WITH ( LOCATION=''./'', DATA_SOURCE = AzureStorage, FILE_FORMAT = TextFileFormat, REJECT_TYPE = VALUE, REJECT_VALUE = 0 );'
declare @i int = 1
while (@i < len(@cmd))
begin
print substring(@cmd, @i, 1000) -- statements can exceed the capacity of a single print
select @i = @i + 1000
end
print ''
exec(@cmd)
-- get the distribution mechanism and column for the target table
select @distcol='', @disttype=distribution_policy_desc from sys.pdw_table_distribution_properties where object_id=@objectid
select @distcol=c.[name] from sys.pdw_column_distribution_properties d join sys.columns c on (d.object_id=c.object_id and d.column_id=c.column_id) where d.[object_id]=@objectid and distribution_ordinal=1
-- remove target table if it already exists
if exists(select * from sys.tables t join sys.schemas s on (t.schema_id=s.schema_id) where s.[name] = @targetschema and t.[name]=@table)
begin
select @cmd = 'DROP TABLE [' + @targetschema + '].[' + @table + ']'
exec(@cmd)
end
-- build CTAS statement - looping through all of the columns to do a cast to the appropriate data type
select @cmd = 'CREATE TABLE [' + @targetschema + '].[' + @table + '] WITH (DISTRIBUTION = '+ case when ISNULL(@disttype, '') = '' then 'HEAP' else @disttype end + case when @distcol != '' then '([' + @distcol + '])' else '' end + ') AS SELECT '
select @crow = 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
while (@crow <= @crows)
begin
select @colstr = 'substring(['+@column+'], 2, LEN(['+@column+'])-2)' -- remove lead and tail quotes
if (@colid <> 1) select @cmd = @cmd + ', '
if (@nullable = 0) -- if this column is not nullable we have to give SQL DW the hint to make it NOT NULL - in theory we should error if there is a text value of null in the field, but that is just more code...
select @cmd = @cmd + 'ISNULL(('
if @type in ('char', 'varchar', 'nchar', 'nvarchar') -- remove escaped quotes and replace special line end characters with line feed
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null else cast(replace(replace(replace(' + @colstr + ', ''\"'', ''"''), char(31), char(10)), char(30), char(13)) as '+ @type +'('+ case when @length=-1 then 'max' else cast(@length as varchar(10)) end +')) end' + case when @nullable=0 then '), '''')' else '' end
else if @type in ('numeric', 'decimal')
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null else cast(' + @colstr + ' as ' + @type + '(' + cast(@precision as varchar(3)) + ',' + cast(@scale as varchar(3)) + ')) end' + case when @nullable=0 then '), 0.)' else '' end
else if @type in ('bigint', 'real', 'float', 'money', 'int', 'smallmoney', 'bit', 'tinyint', 'smallint')
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null else cast(' + @colstr + ' as ' + @type + ') end' + case when @nullable=0 then '), 0)' else '' end
else if @type in ('datetime', 'smalldatetime') -- 'date' is handled separately below (different minimum value)
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null when left(['+@column+'], 5) < ''"1753'' then cast(''1753-01-01 00:00:00'' as ' + @type + ') else cast(substring(' + @colstr + ', 1, (case when CHARINDEX(''.'', [' + @column + ']) != 0 then CHARINDEX(''.'', [' + @column + ']) else len([' + @column + ']) end)-2) as ' + @type + ') end' + case when @nullable=0 then '), cast(''1753-01-01 00:00:00'' as ' + @type + '))' else '' end
else if @type = 'date'
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null when left(['+@column+'], 5) < ''"0001'' then cast(''0001-01-01'' as date) else cast(' + @colstr + ' as date) end' + case when @nullable=0 then '), cast(''0001-01-01'' as date))' else '' end
else if @type = 'datetime2'
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null when left(['+@column+'], 5) < ''"0001'' then cast(''0001-01-01 00:00:00'' as datetime2(' + cast(@scale as varchar(3)) + ')) else cast(' + @colstr + ' as datetime2(' + cast(@scale as varchar(3)) + ')) end' + case when @nullable=0 then '), cast(''0001-01-01 00:00:00'' as datetime2(' + cast(@scale as varchar(3)) + ')))' else '' end
else if @type = 'uniqueidentifier'
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null else cast(' + @colstr + ' as ' + @type + ') end' + case when @nullable=0 then '), cast(''00000000-0000-0000-0000-000000000000'' as uniqueidentifier))' else '' end
else -- not sure any data types are left - if so, you need a null value for 'not null' instead of the 0
select @cmd = @cmd + 'case when [' + @column + '] = ''"null"'' then null else cast(' + @colstr + ' as ' + @type + ') end' + case when @nullable=0 then '), 0)' else '' end
select @cmd = @cmd + ' as ''' + @column + '''' -- add column name
select @crow = @crow + 1
select @colid = colid, @column = [column], @type = [type], @length = [length], @precision = [precision], @scale = [scale], @nullable = [nullable] from #columns where colid=@crow
end
select @cmd = @cmd + ' FROM [ASB].[' + @table + '] OPTION (LABEL = ''CTAS : Load [' + @targetschema + '].[' + @table + ']'');'
select @i = 1
while (@i < len(@cmd))
begin
print substring(@cmd, @i, 1000)
select @i = @i + 1000
end
print ''
exec(@cmd)
-- Cleanup external objects
select @cmd = 'DROP EXTERNAL TABLE [ASB].[' + @table + ']'
print @cmd
exec(@cmd)
print 'DROP EXTERNAL DATA SOURCE AzureStorage'
DROP EXTERNAL DATA SOURCE AzureStorage
-- Output row count and elapsed load time for the current table
select @cmd = 'select COUNT_BIG(*) as ''Rows in [' + @targetschema + '].[' + @table + ']'', ' + cast(datediff(s, @start, getdate())/60.0 as varchar(40)) + ' as ''Minutes to Load'' from [' + @targetschema + '].[' + @table + ']'
exec(@cmd)
-- Increment to the next table
select @trow = @trow + 1
select @objectid=objectid, @table=[table] from #tables where rowid=@trow
end
drop table #tables
drop table #columns

Binary file not shown.

View file

@@ -1,33 +0,0 @@
# /***This Artifact belongs to the Data SQL Ninja Engineering Team***/
# Auto-Generate Azure SQL DW Load – T-SQL Scripts
Script Author: Mitch van Huuksloot, Solution Architect, Data SQL Ninja Engineering Team
These scripts were developed to help with a large Azure SQL DW POC with one of the DMJ customers. Some of the assumptions made in the scripts are very specific to the scenario encountered at the customer. Feel free to adapt these scripts to your data loading scenario.
Each of the scripts creates the objects needed for a high-performance load from text files in blob storage into Azure SQL DW using Polybase. You can easily change the external data source to use Azure Data Lake as the source instead. How well the load parallelizes depends on the number of files provided. Our customer had many gzipped files that distributed well across the DW compute nodes (a few large gzipped files would not have distributed as well).
From a workflow perspective, we used SSMA for Oracle to generate the DW schema and then manually tweaked the data types in the script (the customer used the "number" data type indiscriminately, without scale or precision). The SSMA default conversion of number to float(53) needed to be adjusted, since that turns a precise type into an approximate one. Part of the manual schema editing was also deciding on distribution hash keys, which was an initial guess, since we didn't have a list of typical queries. If we had no idea what to hash on, we left the table round robin. For dimension tables, this would also be the appropriate time to make them replicated tables; we did not have row counts or sizes at that point, so we couldn't make that determination (the guidance is a maximum size of 2 GB).
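As a rough illustration of those distribution choices (the table and column names here are hypothetical, not from the POC), the three CTAS variants look like this:

```sql
-- Fact table with a guessed hash key
CREATE TABLE POC_DM.SALES_FACT
WITH (DISTRIBUTION = HASH(ACCT_PK_ID), CLUSTERED COLUMNSTORE INDEX)
AS SELECT * FROM ASB.SALES_FACT
OPTION (LABEL = 'CTAS : Load POC_DM.SALES_FACT');

-- No obvious hash key: leave the table round robin
CREATE TABLE POC_DM.EVENT_LOG
WITH (DISTRIBUTION = ROUND_ROBIN, CLUSTERED COLUMNSTORE INDEX)
AS SELECT * FROM ASB.EVENT_LOG;

-- Small dimension table (guidance: under ~2 GB): replicate it
CREATE TABLE POC_DM.DATE_DIM
WITH (DISTRIBUTION = REPLICATE, CLUSTERED COLUMNSTORE INDEX)
AS SELECT * FROM ASB.DATE_DIM;
```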
Before running the schema script on the DW, we put all the tables into "source" schemas (SRC_XXX). The source schema is the template the script uses to create the final tables in the target schema (including column types and distribution; note that no indexes other than the default clustered columnstore index are created). Once a table has been processed, just remove it from the source schema. We did not add this step to the script in case an error occurred that did not stop the script, but the drop could easily be added, as sketched below.
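A minimal sketch of that optional cleanup, using the script's own variables and added at the end of the per-table loop (this step is not in the shipped scripts):

```sql
-- Once a table has loaded successfully, drop its template from the source
-- schema so a re-run of the script skips it.
select @cmd = 'DROP TABLE [' + @sourceschema + '].[' + @table + ']'
print @cmd
exec(@cmd)
```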
In the POC, we had 72 tables split across 4 schemas, so we could load each schema separately with the script (note: to load multiple schemas in parallel with this script, you would have to give the AzureStorage object a unique name in each copy of the script).
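For example (hypothetical names), a copy of the script dedicated to one schema would use its own data source name everywhere the original uses AzureStorage:

```sql
-- Hypothetical: the copy loading POC_DM creates and drops its own data source
-- name so it does not collide with other copies running in parallel.
CREATE EXTERNAL DATA SOURCE AzureStorage_POC_DM
WITH (TYPE = HADOOP,
      LOCATION = 'wasbs://account-fact@<account>.blob.core.windows.net',
      CREDENTIAL = AzureStorageCredential);
```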
During the POC, there were two locations where the customer put the text files to be loaded in Azure Blob Storage. Initially, they put each table's files in its own container under the same storage account. Many of the tables had underscores in their names, but the corresponding container names used dashes instead of underscores, so the script replaces underscores with dashes automatically. Later in the POC, the customer started moving newly extracted tables into a single container with a subfolder naming scheme (note: blob storage does not actually have subfolders, just a prefix naming convention). There are therefore two script versions, one for each case. In the first case we need to create a separate external data source for each container (we couldn't get the root-container syntax to work), while in the second case we can use the same external data source for every table and specify the subfolder in the external table creation command.
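A minimal sketch of the second case, assuming a single container (here called extracts) and the ASB external schema the scripts already use; the column list is abbreviated:

```sql
-- One shared data source pointing at the single container...
CREATE EXTERNAL DATA SOURCE AzureStorageShared
WITH (TYPE = HADOOP,
      LOCATION = 'wasbs://extracts@<account>.blob.core.windows.net',
      CREDENTIAL = AzureStorageCredential);

-- ...and the per-table "subfolder" (really just a blob name prefix) supplied
-- in the external table LOCATION instead of in the data source.
CREATE EXTERNAL TABLE ASB.ACCOUNT_FACT
(
    ACCT_PK_ID varchar(35) NULL,
    ACCT_NAME  varchar(70) NULL   -- remaining columns omitted for brevity
)
WITH (LOCATION = '/account_fact/',
      DATA_SOURCE = AzureStorageShared,
      FILE_FORMAT = TextFileFormat,
      REJECT_TYPE = VALUE, REJECT_VALUE = 0);
```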
The V1 script generates a set of "classic" load statements, where the external table definition has the data types expected in each column, so the CTAS statement is just a simple "create table ... as select * ...". Also included is a single-table CTAS load script that documents every step required to load one or more text files into a table in Azure SQL DW.
During the POC, we started with the V1 script but quickly determined that it failed to load many of the customer tables for a variety of reasons: every column, even numeric ones, was quoted (which Polybase might have handled, except that null values were written as "null"); timestamps had more precision than Polybase would accept; embedded escaped quotes (\") were not handled by Polybase; embedded newline characters caused Polybase to error; and so on.
A strategy was developed to address some of the loading issues, which in some cases required the files to be re-extracted from the source system (hence the change during the POC of where the files were stored). The customer had originally selected the ASCII 01 (SOH) character as the field separator. This seemed like a good choice: being an infrequently used control character, it should not appear in any of the text fields. The hope was that Polybase would not have an issue with it, and it didn't, once we specified the value in the script in hex.
Given the relatively exotic column separator, there really wasn't a need for quotes around strings, but they were already in the generated files. We ended up removing the quote specification from the external file format definition, because embedded quotes in columns were causing Polybase errors. In the process we switched to reading everything from the text files as character columns (or Unicode where required) to reduce the number of Polybase issues we were seeing. Note that since everything is read into character columns and there may be embedded quotes in the strings, the character column lengths used are arbitrarily longer than the actual source column lengths (we settled on adding 20 characters).
Instead of staging the raw file in a temp table, we came up with a strategy of casting the columns to the appropriate data types inside the CTAS statement (later we discovered the ISNULL trick to preserve NOT NULL columns). The complexity of the CTAS statement grew over time to handle the many issues we ran into. For some tables the statements were longer than a single print command could handle, so they are printed in 1000-character chunks with no concept of an appropriate line break location.
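A simplified, single-column version of the expression the V2 script generates, using the ACCT_PK_ID column from the single-table example, shows the pattern: strip the lead and tail quotes, map the literal "null" to NULL, cast to the target type, and wrap the result in ISNULL so CTAS infers NOT NULL where the source column requires it:

```sql
SELECT ISNULL(
         (CASE WHEN [ACCT_PK_ID] = '"null"' THEN NULL
               ELSE CAST(SUBSTRING([ACCT_PK_ID], 2, LEN([ACCT_PK_ID]) - 2) AS bigint)
          END), 0) AS ACCT_PK_ID
FROM ASB.ACCOUNT_FACT;
```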
One issue that was very problematic was newlines embedded in column text. We had the customer regenerate the files substituting ASCII 31 (US) for ASCII 10 (LF) and ASCII 30 (RS) for ASCII 13 (CR), and we added code to reverse the replacement on load. They initially missed some columns that used both CR and LF as newlines, but eventually they got them all. One table had a SOH character that showed up in one of the columns, which was very problematic to hunt down: the error was a string truncation reported for column 85, but the problem column was actually column 32. We had the customer regenerate the files for that table with a different field separator, ASCII 28 (FS); that was a temporary script change that is not included in this package but would be easy to replicate.
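The reversal the V2 script builds for character columns looks like this when written out for a single column (again borrowing ACCT_NAME from the single-table example): unescape embedded quotes, then turn the substituted control characters back into real line breaks:

```sql
SELECT CAST(REPLACE(REPLACE(REPLACE(
           SUBSTRING([ACCT_NAME], 2, LEN([ACCT_NAME]) - 2),  -- strip lead/tail quotes
           '\"', '"'),                                       -- unescape embedded quotes
           CHAR(31), CHAR(10)),                              -- ASCII 31 (US) back to LF
           CHAR(30), CHAR(13))                               -- ASCII 30 (RS) back to CR
       AS varchar(50)) AS ACCT_NAME
FROM ASB.ACCOUNT_FACT;
```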
If the script fails, you will need to manually remove the external table definition it created (drop external table ...). We could remove these tables automatically in the script, but there is no way to know whether multiple copies of the script are running in parallel.
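For reference, the manual cleanup after a failure is just the following, substituting whichever table the script was processing:

```sql
DROP EXTERNAL TABLE [ASB].[ACCOUNT_FACT];   -- the table the failed run was on
DROP EXTERNAL DATA SOURCE AzureStorage;     -- only if the failed run left it behind
```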

Binary file not shown.

Binary data
Azure DW Load Scripts/READ ME.docx

Binary file not shown.

View file

@@ -1 +0,0 @@
/***This Artifact belongs to the Data SQL Ninja Engineering Team***/