Merge pull request #8 from riv/master

[NEW FEATURE] MSGHT Non-Incremental Pipeline Setup
This commit is contained in:
Kevin Lewis 2016-06-24 11:20:46 -07:00 коммит произвёл GitHub
Родитель 6fa96fe4e0 fd60427441
Коммит 8c275a4a1f
8 изменённых файлов: 167 добавлений и 2 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -251,3 +251,4 @@ paket-files/
.idea/
*.sln.iml
/ghinsights/DataFactory/ProductionEnvironment.json

Просмотреть файл

@ -0,0 +1,43 @@
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-08-01/Microsoft.DataFactory.Table.json",
"name": "MSEventDetail",
"properties": {
"type": "AzureBlob",
"linkedServiceName": "MSPrestagedEventsAzureStorage",
"typeProperties": {
"folderPath": "raw/{EventName}/v1/{Year}/{Month}",
"fileName": "{EventName}_{Year}_{Month}_{Day}.json.gz",
"partitionedBy": [
{
"name": "Year",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "yyyy"
}
},
{
"name": "Month",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "MM"
}
},
{
"name": "Day",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "dd"
}
}
]
},
"external": false,
"availability": {
"frequency": "Day",
"interval": 1
}
}
}

Просмотреть файл

@ -0,0 +1,39 @@
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-08-01/Microsoft.DataFactory.Pipeline.json",
"name": "MSGHTEventDetailPipeline",
"properties": {
"description": "Custom DotNetActivity pipeline to gather Microsoft GitHub MongoDB backups and unpack the separate events into a new container.",
"activities": [
{
"name": "MongoDbDumpTransformActivity",
"type": "DotNetActivity",
"inputs": [ { "name": "MSMongoDbDump" } ],
"outputs": [ { "name": "MSEventDetail" } ],
"typeProperties": {
"assemblyName": "GHInsights.DataFactory.dll",
"entryPoint": "GHInsights.DataFactory.MongoDbDumpTransformActivity",
"packageLinkedService": "GHInsightsAzureStorage",
"packageFile": "datafactory/GHInsights.DataFactory.zip",
"extendedProperties": {
"Year": "$$Text.Format('{0:yyyy}',SliceStart)",
"Month": "$$Text.Format('{0:MM}',SliceStart)",
"Day": "$$Text.Format('{0:dd}',SliceStart)"
}
},
"linkedServiceName": "BatchProcessor",
"policy": {
"concurrency": 6,
"executionPriorityOrder": "NewestFirst",
"retry": 0,
"timeout": "04:00:00"
},
"scheduler": {
"frequency": "Day",
"interval": 1
}
}
],
"start": "2016-06-13T00:00:00Z",
"end": "9999-09-09T00:00:00Z"
}
}

Просмотреть файл

@ -0,0 +1,11 @@
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-10-01/Microsoft.DataFactory.LinkedService.json",
"name": "MSGHTorrentAzureStorage",
"properties": {
"type": "AzureStorageSas",
"typeProperties": {
"sasUri": ""
}
}
}

Просмотреть файл

@ -0,0 +1,61 @@
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-08-01/Microsoft.DataFactory.Table.json",
"name": "MSMongoDbDump",
"properties": {
"type": "AzureBlob",
"linkedServiceName": "MSGHTorrentAzureStorage",
// "structure": [],
"typeProperties": {
"folderPath": "msght-azure-storage/downloads/",
"fileName": "mongo-dump-{Year}-{Month}-{Day}.tar.gz",
"partitionedBy": [
{
"name": "Year",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "yyyy"
}
},
{
"name": "Month",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "MM"
}
},
{
"name": "Day",
"value": {
"type": "DateTime",
"date": "SliceStart",
"format": "dd"
}
}
]
//,"format": {
// "type": "<Two formats are supported: TextFormat, AvroFormat. If the type is 'TextFormat', you can specify the following properties. The type 'AvroFormat' does not require any additional properties>",
// "columnDelimiter": "<The character used as a column separator in a file. This property is optional. The default value is comma (,)>",
// "rowDelimiter": "<The character used as a row separator in a file. This property is optional. The default value is any of the following: ('\n')>",
// "EscapeChar": "<The character used to escape any special character in the blob content. This property is optional. No default value>",
// "NullValue": "<The character used to represent null value in the blob content. This property is optional. The default value is '\n'>"
// }
},
"external": true,
"availability": {
"frequency": "Day",
"interval": 1
},
"policy": {
"validation": {
"minimumSizeMB": 0.1
},
"externalData": {
"retryInterval": "01:00:00",
"retryTimeout": "00:10:00",
"maximumRetry": 10
}
}
}
}

Просмотреть файл

@ -0,0 +1,10 @@
{
"$schema": "http://datafactories.schema.management.azure.com/schemas/2015-08-01/Microsoft.DataFactory.LinkedService.json",
"name": "MSPrestagedEventsAzureStorage",
"properties": {
"type": "AzureStorage",
"typeProperties": {
"connectionString": ""
}
}
}

Просмотреть файл

@ -54,7 +54,7 @@
"externalData": {
"retryInterval": "01:00:00",
"retryTimeout": "00:10:00",
"maximumRetry": 24
"maximumRetry": 10
}
}
}

Просмотреть файл

@ -55,7 +55,7 @@ SELECT GHInsights.USql.Utility.GetString(Data, "repo") AS Repo
,GHInsights.USql.Utility.GetString(Data, "base.user.login") AS BaseUserLogin
,GHInsights.USql.Utility.GetBoolean(Data, "base.user.site_admin") AS BaseUserSiteAdmin
,GHInsights.USql.Utility.GetString(Data, "base.user.type") AS BaseUserType
,GHInsights.USql.Utility.GetUsqlString(Data, "body") AS Body
,GHInsights.USql.Utility.GetUSqlString(Data, "body") AS Body
,GHInsights.USql.Utility.GetInteger(Data, "changed_files") AS ChangedFiles
,GHInsights.USql.Utility.GetDateTime(Data, "closed_at") AS ClosedAt
,GHInsights.USql.Utility.GetInteger(Data, "comments") AS Comments