From 5034703008d72a53e9452862ded5c41caf355083 Mon Sep 17 00:00:00 2001 From: Hauke Mallow Date: Tue, 7 Dec 2021 12:25:24 +0100 Subject: [PATCH] Benchmark queries for Kusto and Spark SQL Benchmark queries for Kusto and Spark SQL --- queries/kusto/query01.kql | 4 ++ queries/kusto/query02.kql | 4 ++ queries/kusto/query03.kql | 4 ++ queries/kusto/query04.kql | 4 ++ queries/kusto/query05.kql | 11 ++++ queries/kusto/query06.kql | 8 +++ queries/kusto/query07.kql | 6 ++ queries/kusto/query08.kql | 3 + queries/kusto/query09.kql | 4 ++ queries/kusto/query10.kql | 5 ++ queries/kusto/query11.kql | 11 ++++ queries/kusto/query12.kql | 4 ++ queries/kusto/query13.kql | 7 +++ queries/kusto/query14.kql | 5 ++ queries/kusto/query15.kql | 12 ++++ queries/kusto/query16.kql | 10 ++++ queries/kusto/query17.kql | 12 ++++ queries/kusto/query18.kql | 16 ++++++ queries/kusto/query19.kql | 19 +++++++ queries/sparksql/query01.sql | 7 +++ queries/sparksql/query02.sql | 6 ++ queries/sparksql/query03.sql | 9 +++ queries/sparksql/query04.sql | 10 ++++ queries/sparksql/query05.sql | 15 +++++ queries/sparksql/query06.sql | 16 ++++++ queries/sparksql/query07.sql | 18 ++++++ queries/sparksql/query08.sql | 10 ++++ queries/sparksql/query09.sql | 15 +++++ queries/sparksql/query10.sql | 21 +++++++ queries/sparksql/query11.sql | 27 +++++++++ queries/sparksql/query12.sql | 13 +++++ queries/sparksql/query13.sql | 15 +++++ queries/sparksql/query14.sql | 19 +++++++ queries/sparksql/query15.sql | 45 +++++++++++++++ queries/sparksql/query16.sql | 104 +++++++++++++++++++++++++++++++++++ queries/sparksql/query17.sql | 37 +++++++++++++ queries/sparksql/query18.sql | 36 ++++++++++++ queries/sparksql/query19.sql | 37 +++++++++++++ 38 files changed, 609 insertions(+) create mode 100644 queries/kusto/query01.kql create mode 100644 queries/kusto/query02.kql create mode 100644 queries/kusto/query03.kql create mode 100644 queries/kusto/query04.kql create mode 100644 queries/kusto/query05.kql create mode 100644 
queries/kusto/query06.kql create mode 100644 queries/kusto/query07.kql create mode 100644 queries/kusto/query08.kql create mode 100644 queries/kusto/query09.kql create mode 100644 queries/kusto/query10.kql create mode 100644 queries/kusto/query11.kql create mode 100644 queries/kusto/query12.kql create mode 100644 queries/kusto/query13.kql create mode 100644 queries/kusto/query14.kql create mode 100644 queries/kusto/query15.kql create mode 100644 queries/kusto/query16.kql create mode 100644 queries/kusto/query17.kql create mode 100644 queries/kusto/query18.kql create mode 100644 queries/kusto/query19.kql create mode 100644 queries/sparksql/query01.sql create mode 100644 queries/sparksql/query02.sql create mode 100644 queries/sparksql/query03.sql create mode 100644 queries/sparksql/query04.sql create mode 100644 queries/sparksql/query05.sql create mode 100644 queries/sparksql/query06.sql create mode 100644 queries/sparksql/query07.sql create mode 100644 queries/sparksql/query08.sql create mode 100644 queries/sparksql/query09.sql create mode 100644 queries/sparksql/query10.sql create mode 100644 queries/sparksql/query11.sql create mode 100644 queries/sparksql/query12.sql create mode 100644 queries/sparksql/query13.sql create mode 100644 queries/sparksql/query14.sql create mode 100644 queries/sparksql/query15.sql create mode 100644 queries/sparksql/query16.sql create mode 100644 queries/sparksql/query17.sql create mode 100644 queries/sparksql/query18.sql create mode 100644 queries/sparksql/query19.sql diff --git a/queries/kusto/query01.kql b/queries/kusto/query01.kql new file mode 100644 index 0000000..9910bd0 --- /dev/null +++ b/queries/kusto/query01.kql @@ -0,0 +1,4 @@ +Logs +| where Timestamp between(datetime(2014-03-08) .. 
12h) +| where Level == 'Warning' and Message has 'enabled' +| count diff --git a/queries/kusto/query02.kql b/queries/kusto/query02.kql new file mode 100644 index 0000000..e97d503 --- /dev/null +++ b/queries/kusto/query02.kql @@ -0,0 +1,4 @@ +Logs +| where Level == 'Error' +| where Message has 'safeArrayRankMismatch' +| count diff --git a/queries/kusto/query03.kql b/queries/kusto/query03.kql new file mode 100644 index 0000000..59bdac4 --- /dev/null +++ b/queries/kusto/query03.kql @@ -0,0 +1,4 @@ +Logs +| where Timestamp between(datetime(2014-03-8 03:00:00) .. 1h) +| where Source startswith 'im' and Message contains 'response' +| summarize Count=count() by Component diff --git a/queries/kusto/query04.kql b/queries/kusto/query04.kql new file mode 100644 index 0000000..1bf4c85 --- /dev/null +++ b/queries/kusto/query04.kql @@ -0,0 +1,4 @@ +Logs +| where Timestamp between(datetime(2014-03-08 03:00:00) .. 1h) +| where * has 'Internal' +| top 1000 by Timestamp diff --git a/queries/kusto/query05.kql b/queries/kusto/query05.kql new file mode 100644 index 0000000..bbc3d64 --- /dev/null +++ b/queries/kusto/query05.kql @@ -0,0 +1,11 @@ +Logs +| where ClientRequestId in ( +'d71ab629-ebaf-9380-5fe8-942541387ce5', +'6bb29a30-ce0d-1288-36f0-27dbd57d66b0', +'1f82e290-a7c4-ac84-7117-52209b3b9c91', +'ecc12181-8c5a-4f87-1ca3-712b4a82c4f0', +'d275a6f0-ba1d-22cf-b06b-6dac508ece4b', +'f0565381-29db-bf73-ca1b-319e80debe1c', +'54807a9a-e442-883f-6d8b-186c1c2a1041', +'f1d10647-fc31-dbc3-9e25-67f68a6fe194') +| count diff --git a/queries/kusto/query06.kql b/queries/kusto/query06.kql new file mode 100644 index 0000000..d5b15ee --- /dev/null +++ b/queries/kusto/query06.kql @@ -0,0 +1,8 @@ +Logs +| where Timestamp between(datetime(2014-03-08) .. 
10d) +| where Source in ('IMAGINEFIRST0', 'HAVINGCOLUMN182', 'THEREFORESTORE156', 'HOSTNODES207') +| extend LogType = case(Component in ('CLOUDREPORTSERVER', 'COMMON1', 'FABRICINTEGRATOR', 'REQUESTPROTECTION', 'DIRECTORYSERVICE', 'REPORTSERVERSERVICETRACE', 'ACONFIGURATION', 'EXPLORESERVICEWATCHDOG', 'COMMUNICATIONRUNTIME'), 'Security', + Component in ('REPORTNETWORKING', 'PUSHDATASERVICETRACE', 'HEALTHSERVICE', 'UTILS', 'PROVIDERSCOMMON'), 'Performance', + Component in ('WORKERSERVICECONTENT', 'XMLACOMMON', 'INTEGRATIONDATABASE', 'DATABASEMANAGEMENT'), 'Ingestion', + 'Other') +| summarize Count=count() by LogType diff --git a/queries/kusto/query07.kql b/queries/kusto/query07.kql new file mode 100644 index 0000000..1680291 --- /dev/null +++ b/queries/kusto/query07.kql @@ -0,0 +1,6 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 1h) +| where Level=='Error' and Message startswith 'exception' +| parse Message with 'Exception=' ExeptionType ';' * 'Message=' ExceptionMessage ';' * +| summarize Count=count() by ExeptionType, ExceptionMessage +| top 10 by Count diff --git a/queries/kusto/query08.kql b/queries/kusto/query08.kql new file mode 100644 index 0000000..8af7dab --- /dev/null +++ b/queries/kusto/query08.kql @@ -0,0 +1,3 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 6h) +| summarize Count=count() by Level diff --git a/queries/kusto/query09.kql b/queries/kusto/query09.kql new file mode 100644 index 0000000..2f2b862 --- /dev/null +++ b/queries/kusto/query09.kql @@ -0,0 +1,4 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 6h) +| summarize Count=count() by Level, Component +| top 50 by Count diff --git a/queries/kusto/query10.kql b/queries/kusto/query10.kql new file mode 100644 index 0000000..cbb46bb --- /dev/null +++ b/queries/kusto/query10.kql @@ -0,0 +1,5 @@ +Logs +| where Timestamp between(datetime(2014-03-08) .. 
3d) +| where Source == 'IMAGINEFIRST0' and Message has 'downloaded' +| summarize hint.shufflekey=ClientRequestId Count=count() by Level, Component,Node, ClientRequestId +| top 10 by Count diff --git a/queries/kusto/query11.kql b/queries/kusto/query11.kql new file mode 100644 index 0000000..5d5c97e --- /dev/null +++ b/queries/kusto/query11.kql @@ -0,0 +1,11 @@ +let top_nodes = + Logs + | where Timestamp between(datetime(2014-03-08 12:00) .. 6h) + | where Level == 'Error' + | summarize count() by Node + | top 10 by count_ + | project Node; +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 6h) +| where Node in (top_nodes) +| summarize count() by Level, Node diff --git a/queries/kusto/query12.kql b/queries/kusto/query12.kql new file mode 100644 index 0000000..116c5f4 --- /dev/null +++ b/queries/kusto/query12.kql @@ -0,0 +1,4 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 6h) +| where Level == 'Error' +| summarize dcount(ClientRequestId) by bin(Timestamp, 1h) diff --git a/queries/kusto/query13.kql b/queries/kusto/query13.kql new file mode 100644 index 0000000..fd504cf --- /dev/null +++ b/queries/kusto/query13.kql @@ -0,0 +1,7 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 6h) +| where Message startswith 'IngestionCompletionEvent' +| where Source in ('IMAGINEFIRST0', 'CLIMBSTEADY83', 'INTERNALFIRST79', 'WORKWITHIN77', 'ADOPTIONCUSTOMERS81', 'FIVENEARLY85', + 'WHATABOUT98', 'PUBLICBRAINCHILD89', 'WATCHPREVIEW91', 'LATERYEARS87', 'GUTHRIESSCOTT93', 'THISSTORING16') +| where Properties has 'parquet' +| summarize MaxRowCount= max(tolong(Properties.rowCount)), percentiles(totimespan(Properties.duration), 50, 90, 95) by Source diff --git a/queries/kusto/query14.kql b/queries/kusto/query14.kql new file mode 100644 index 0000000..bb471a2 --- /dev/null +++ b/queries/kusto/query14.kql @@ -0,0 +1,5 @@ +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 
1h) +| where Component == "DOWNLOADER" +| summarize DownloadRate=max(todouble(Properties.compressedSize) / ((totimespan(Properties.downloadDuration) / 1sec))) by Source +| top 10 by DownloadRate diff --git a/queries/kusto/query15.kql b/queries/kusto/query15.kql new file mode 100644 index 0000000..4d81430 --- /dev/null +++ b/queries/kusto/query15.kql @@ -0,0 +1,12 @@ +let Data = Logs + | where Source == 'IMAGINEFIRST0' + | where Timestamp between(datetime(2014-03-08 12:00) .. 5d) + | where Message startswith 'IngestionCompletionEvent'; +let TopNodesByCPU = Data +| summarize MaxCPU = max(totimespan(Properties.cpuTime)) by Node +| order by MaxCPU desc, Node desc +| take 10 +| project Node; +Data +| where Node in (TopNodesByCPU) +| summarize AverageProcessTimeInSeconds=avg(totimespan(Properties.cpuTime)) by bin(Timestamp, 5m), Node diff --git a/queries/kusto/query16.kql b/queries/kusto/query16.kql new file mode 100644 index 0000000..7449f6c --- /dev/null +++ b/queries/kusto/query16.kql @@ -0,0 +1,10 @@ +Logs +| where Timestamp between(datetime(2014-03-08) .. 3d) +| where Level == 'Error' +| where Message has 'ArrayTypeMismatch' +| where Source in ('IMAGINEFIRST0', 'CLIMBSTEADY83', 'INTERNALFIRST79', 'WORKWITHIN77', 'ADOPTIONCUSTOMERS81', 'FIVENEARLY85', + 'WHATABOUT98', 'PUBLICBRAINCHILD89', 'WATCHPREVIEW91', 'LATERYEARS87', 'GUTHRIESSCOTT93', 'THISSTORING16') +| top-nested of Source by max(1), + top-nested 3 of Node by NodeErrors=count(), + top-nested 3 of Component with others = 'Other components' by ComponentErrors=count() +| project Source, Node, Component, NodeErrors, ComponentErrors diff --git a/queries/kusto/query17.kql b/queries/kusto/query17.kql new file mode 100644 index 0000000..b7dc9fd --- /dev/null +++ b/queries/kusto/query17.kql @@ -0,0 +1,12 @@ +let Data = Logs + | where Source == 'IMAGINEFIRST0' + | where Timestamp between(datetime(2014-03-08 12:00) .. 
3d) + | where Message startswith 'IngestionCompletionEvent'; +let TopNodesByCPU = Data +| summarize MaxCPU = max(totimespan(Properties.cpuTime)) by Node +| order by MaxCPU desc, Node desc +| take 10 +| project Node; +Data +| where Node in (TopNodesByCPU) +| summarize AverageProcessTimeInSeconds=avg(totimespan(Properties.cpuTime)) by bin(Timestamp, 5m), Node diff --git a/queries/kusto/query18.kql b/queries/kusto/query18.kql new file mode 100644 index 0000000..f5e76e2 --- /dev/null +++ b/queries/kusto/query18.kql @@ -0,0 +1,16 @@ +let Data = +Logs +| where Timestamp between(datetime(2014-03-08 12:00) .. 3d) +| where Source == 'IMAGINEFIRST0'; +Data +| where Message startswith '$$IngestionCommand' +| parse Message with '$$IngestionCommand table=' Table ' ' * +| distinct hint.shufflekey=ClientRequestId ClientRequestId, Table +| join kind=inner hint.shufflekey=ClientRequestId ( + Data + | where Message startswith 'IngestionCompletionEvent' +) on ClientRequestId +| extend Format=tostring(Properties.format), RowCount=tolong(Properties.rowCount) +| top-nested of Format by max(1), + top-nested 10 of Table by Rows=sum(RowCount) +| project Format, Table, Rows diff --git a/queries/kusto/query19.kql b/queries/kusto/query19.kql new file mode 100644 index 0000000..a2e8da9 --- /dev/null +++ b/queries/kusto/query19.kql @@ -0,0 +1,19 @@ +let Data = + Logs + | where Timestamp between(datetime(2014-03-08 12:00) .. 
3d) + | where Source in ('IMAGINEFIRST0') + ; +let Downloading = + Data + | where Message startswith 'Downloading file path:' + | parse Message with * 'path:' path + | project DownloadClientRequestId = ClientRequestId, path; +let IngestionCompletion = + Data + | where Message startswith 'IngestionCompletionEvent' + | parse Message with * 'path:' path + | project CompleteClientRequestId = ClientRequestId, path; +Downloading +| join hint.shufflekey=path kind=inner (IngestionCompletion) on path +| where DownloadClientRequestId != CompleteClientRequestId +| count diff --git a/queries/sparksql/query01.sql b/queries/sparksql/query01.sql new file mode 100644 index 0000000..e8af204 --- /dev/null +++ b/queries/sparksql/query01.sql @@ -0,0 +1,7 @@ +select + count(*) +from + logs_tpc +where Timestamp between '2014-03-08 00:00:00' and '2014-03-08 12:00:00' + and Level = 'Warning' + and Message like '%enabled%' \ No newline at end of file diff --git a/queries/sparksql/query02.sql b/queries/sparksql/query02.sql new file mode 100644 index 0000000..cb0bf69 --- /dev/null +++ b/queries/sparksql/query02.sql @@ -0,0 +1,6 @@ +select + count(*) +from + logs_tpc +where + Level = 'Error' and lower (Message) like '%safearrayrankmismatch%' \ No newline at end of file diff --git a/queries/sparksql/query03.sql b/queries/sparksql/query03.sql new file mode 100644 index 0000000..e1e51e3 --- /dev/null +++ b/queries/sparksql/query03.sql @@ -0,0 +1,9 @@ +select + Component, count(*) +from + logs_tpc +where + Day = '2014-03-08 00:00:00' and + Timestamp between '2014-03-08 03:00:00' and '2014-03-08 04:00:00' + and Source like 'IM%' and Message ILIKE '%response%' +group by Component \ No newline at end of file diff --git a/queries/sparksql/query04.sql b/queries/sparksql/query04.sql new file mode 100644 index 0000000..1404aaa --- /dev/null +++ b/queries/sparksql/query04.sql @@ -0,0 +1,10 @@ +select + * +from + logs_tpc +where + Timestamp between '2014-03-08 03:00:00' and '2014-03-08 04:00:00' + and 
(lower (message) like "%internal%") + order by Timestamp + limit 1000 + \ No newline at end of file diff --git a/queries/sparksql/query05.sql b/queries/sparksql/query05.sql new file mode 100644 index 0000000..ed29cb0 --- /dev/null +++ b/queries/sparksql/query05.sql @@ -0,0 +1,15 @@ +select + count(*) +from + logs_tpc +where ClientRequestId in +( +'d71ab629-ebaf-9380-5fe8-942541387ce5', +'6bb29a30-ce0d-1288-36f0-27dbd57d66b0', +'1f82e290-a7c4-ac84-7117-52209b3b9c91', +'ecc12181-8c5a-4f87-1ca3-712b4a82c4f0', +'d275a6f0-ba1d-22cf-b06b-6dac508ece4b', +'f0565381-29db-bf73-ca1b-319e80debe1c', +'54807a9a-e442-883f-6d8b-186c1c2a1041', +'f1d10647-fc31-dbc3-9e25-67f68a6fe194') + \ No newline at end of file diff --git a/queries/sparksql/query06.sql b/queries/sparksql/query06.sql new file mode 100644 index 0000000..4c9ae30 --- /dev/null +++ b/queries/sparksql/query06.sql @@ -0,0 +1,16 @@ +select + case + when Component in ('CLOUDREPORTSERVER', 'COMMON1', 'FABRICINTEGRATOR', 'REQUESTPROTECTION', 'DIRECTORYSERVICE', 'REPORTSERVERSERVICETRACE', 'ACONFIGURATION', 'EXPLORESERVICEWATCHDOG', 'COMMUNICATIONRUNTIME') then 'Security' + when Component in ('REPORTNETWORKING', 'PUSHDATASERVICETRACE', 'HEALTHSERVICE', 'UTILS', 'PROVIDERSCOMMON') then 'Performance' + when Component in ('WORKERSERVICECONTENT', 'XMLACOMMON', 'INTEGRATIONDATABASE', 'DATABASEMANAGEMENT') then 'Ingestion' + else 'Other' + end as LogType, + count(*) +from + logs_tpc +where + Timestamp between '2014-03-08 00:00:00' and '2014-03-08 10:00:00' + and Source in ('IMAGINEFIRST0', 'HAVINGCOLUMN182', 'THEREFORESTORE156', 'HOSTNODES207') +group by + LogType + \ No newline at end of file diff --git a/queries/sparksql/query07.sql b/queries/sparksql/query07.sql new file mode 100644 index 0000000..9b34ce9 --- /dev/null +++ b/queries/sparksql/query07.sql @@ -0,0 +1,18 @@ +select + regexp_extract(Message, 'Exception=(.*);') as ExceptionType, + regexp_extract(Message, 'Message=(.*);') as ExceptionMessage, + count(*) as 
ExceptionCount +from + logs_tpc +where + Day = '2014-03-08 00:00:00' and + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 13:00:00' and + Level = 'Error' and + Message like 'Exception%' +group by + ExceptionType, + ExceptionMessage +order by + ExceptionCount desc +limit 10 + \ No newline at end of file diff --git a/queries/sparksql/query08.sql b/queries/sparksql/query08.sql new file mode 100644 index 0000000..220b9d5 --- /dev/null +++ b/queries/sparksql/query08.sql @@ -0,0 +1,10 @@ +select + Level, + count(*) as Count +from + logs_tpc +where + Day = '2014-03-08 00:00:00' and + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 18:00:00' +group by + Level \ No newline at end of file diff --git a/queries/sparksql/query09.sql b/queries/sparksql/query09.sql new file mode 100644 index 0000000..d5c0593 --- /dev/null +++ b/queries/sparksql/query09.sql @@ -0,0 +1,15 @@ +-- q9 +select + Level, + Component, + count(*) as Count +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 18:00:00' +group by + Level, + Component +order by + Count desc +limit 50 \ No newline at end of file diff --git a/queries/sparksql/query10.sql b/queries/sparksql/query10.sql new file mode 100644 index 0000000..d447235 --- /dev/null +++ b/queries/sparksql/query10.sql @@ -0,0 +1,21 @@ +--Q10 +select + Component, + Level, + Node, + ClientRequestId, + count(*) as Count +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-11 00:00:00' and + -- Source = 'IMAGINEFIRST0' and + lower(Message) like "%downloaded%" +group by + Component, + Level, + Node, + ClientRequestId +order by + Count desc +limit 10 \ No newline at end of file diff --git a/queries/sparksql/query11.sql b/queries/sparksql/query11.sql new file mode 100644 index 0000000..6a94526 --- /dev/null +++ b/queries/sparksql/query11.sql @@ -0,0 +1,27 @@ +-- query 11 +with top_nodes as ( +select + Node +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 
18:00:00' and + Level = 'Error' +group by + Node +order by + count(*) desc +limit 10 +) +select + Level, + Node, + count(*) as Count +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 18:00:00' and + Node in (select Node from top_nodes) +group by + Level, + Node \ No newline at end of file diff --git a/queries/sparksql/query12.sql b/queries/sparksql/query12.sql new file mode 100644 index 0000000..1768332 --- /dev/null +++ b/queries/sparksql/query12.sql @@ -0,0 +1,13 @@ +--Q12; +select + date_trunc("Hour", Timestamp) as bin, + count(distinct ClientRequestId) as dcount +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 18:00:00' + and Level = 'Error' +group by + bin +order by + bin diff --git a/queries/sparksql/query13.sql b/queries/sparksql/query13.sql new file mode 100644 index 0000000..e10340c --- /dev/null +++ b/queries/sparksql/query13.sql @@ -0,0 +1,15 @@ +with CPUTime as ( +select + Source, + json_tuple(Properties, 'rowCount', 'duration') as (rowCount, durationdt), + cast ((to_timestamp(durationdt , 'HH:mm:ss.SSSSSSS')) as double) as duration +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00'and '2014-03-08 18:00:00' + and lower(Message) like 'ingestioncompletionevent%' + and Source in ('IMAGINEFIRST0', 'CLIMBSTEADY83', 'INTERNALFIRST79', 'WORKWITHIN77', 'ADOPTIONCUSTOMERS81', 'FIVENEARLY85', 'WHATABOUT98', 'PUBLICBRAINCHILD89', 'WATCHPREVIEW91', 'LATERYEARS87', 'GUTHRIESSCOTT93', 'THISSTORING16') + and lower(Properties) like '%parquet%') +select Source, max(rowCount) as MaxRowCount, percentile(duration, 0.50) as p50, percentile(duration, 0.90) as p90, percentile(duration, 0.95) as p95 +from CPUTime +group by source diff --git a/queries/sparksql/query14.sql b/queries/sparksql/query14.sql new file mode 100644 index 0000000..8ca1810 --- /dev/null +++ b/queries/sparksql/query14.sql @@ -0,0 +1,19 @@ +with DownloadRates as ( +select + Source, + Properties, + json_tuple(Properties, 
'compressedSize', 'downloadDuration') as (compressedSize, downloadDuration ), + cast ((to_timestamp(downloadDuration , 'HH:mm:ss.SSSSSSS')) as double) as duration +from + logs_tpc +where + timestamp between '2014-03-08 12:00:00' and '2014-03-08 13:00:00' + and Component = 'DOWNLOADER' +) +select + Source, max(compressedSize/duration) as DownloadRate +from DownloadRates +group by + Source +order by DownloadRate desc +limit 10 diff --git a/queries/sparksql/query15.sql b/queries/sparksql/query15.sql new file mode 100644 index 0000000..7dbe3e2 --- /dev/null +++ b/queries/sparksql/query15.sql @@ -0,0 +1,45 @@ +create or replace temporary view Data as ( +select timestamp, node, properties +from + logs_tpc +where isNotNull(Properties) +-- and Source = 'IMAGINEFIRST0' + and Timestamp between '2014-03-08 12:00:00' and '2014-03-13 12:00:00' + and lower(Message) like 'ingestioncompletionevent%' +); +select + count(*) +from + logs_tpc +where + Day = '2014-03-08 00:00:00' and + Timestamp between '2014-03-08 00:00:00' and '2014-03-08 14:00:00' + and Level = 'Warning' and Message like '%enabled%'; + create or replace temporary view TopNodesByCPU as +(with CPUTime as ( +select + Node, + Properties, + json_tuple(Properties, 'cpuTime') as (cpuTimeDate), + cast ((to_timestamp(cpuTimeDate , 'HH:mm:ss.SSSSSSS')) as double) as cpuTime +from + data +) +select Node, max(cpuTime) as MaxCPU +from CPUTime +group by Node +order by Node desc, MaxCPU desc +limit 10); +with CPUTime as (select + Node, + cast (round (cast (timestamp as double) / 300L)*300 as timestamp) as bin, -- 5 min bin + json_tuple(Properties, 'cpuTime') as (cpuTimeDate), + cast ((to_timestamp(cpuTimeDate , 'HH:mm:ss.SSSSSSS')) as double) as cpuTime +from + data +where isNotNull(Properties) and Node in (select node from TopNodesByCPU)) +select Node, +bin, +avg(cpuTime) as AvgCPU +from CPUTime +group by Node, bin \ No newline at end of file diff --git a/queries/sparksql/query16.sql b/queries/sparksql/query16.sql new file mode 
100644 index 0000000..0ecb9ce --- /dev/null +++ b/queries/sparksql/query16.sql @@ -0,0 +1,104 @@ +--Q16; +with SourceMax as ( +select + Source, + count(*) as SourceCount +from + logs_tpc +where + Timestamp between '2014-03-08 00:00:00' and '2014-03-08 03:00:00' + and Level = 'Error' + and Source in ('IMAGINEFIRST0', 'CLIMBSTEADY83', 'INTERNALFIRST79', 'WORKWITHIN77', 'ADOPTIONCUSTOMERS81', 'FIVENEARLY85', 'WHATABOUT98', 'PUBLICBRAINCHILD89', 'WATCHPREVIEW91', 'LATERYEARS87', 'GUTHRIESSCOTT93', 'THISSTORING16') + and lower(Message) like '%arraytypemismatch%' +group by + Source +) +, TopNestedNodes as ( +select + * +from ( +select + *, + row_number() over (partition by Source order by NodeErrors desc) as rownum +from ( +select + Source, + Node, + count(*) as NodeErrors +from + logs_tpc +where + Timestamp between '2014-03-08 00:00:00' and '2014-03-08 03:00:00' + and Level = 'Error' + and lower(Message) like '%arraytypemismatch%' +group by + Node, + Source +order by + Source, + NodeErrors desc +) +) +where + rownum <= 3 +) +, TopNestedComponents as ( +select + Source, + Node, + case +-- added first due to aggregation error + when first(rownum) <= 3 then Component + else 'Other components' + end as Component, + sum(ComponentErrors) as Errors +from ( +select + *, + row_number() over (partition by Source,Node order by ComponentErrors desc) as rownum +from ( +select + Source, + Node, + Component, + count(*) as ComponentErrors +from + logs_tpc +where + Timestamp between '2014-03-08 00:00:00' and '2014-03-08 03:00:00' + and Level = 'Error' + and lower (Message) like '%arraytypemismatch%' +group by + Component, + Node, + Source +) +) +group by + Source, + Node, + Component +order by + Source, + Node, + Errors desc +) +select + S.Source, + N.Node, + N.NodeErrors, + C.Component, + C.Errors as ComponentErrors +from + TopNestedComponents C +inner join + SourceMax S + on C.Source = S.Source +inner join + TopNestedNodes N + on C.Node = N.Node + and C.Source = N.Source +order by + 
S.Source, + N.Node, + C.Component \ No newline at end of file diff --git a/queries/sparksql/query17.sql b/queries/sparksql/query17.sql new file mode 100644 index 0000000..f0e4023 --- /dev/null +++ b/queries/sparksql/query17.sql @@ -0,0 +1,37 @@ +create or replace temporary view Data as ( +select timestamp, node, properties +from + logs_tpc +where isNotNull(Properties) + and Source = 'IMAGINEFIRST0' + and Timestamp between '2014-03-08 12:00:00' and '2014-03-08 15:00:00' + and lower(Message) like 'ingestioncompletionevent%' +); +create or replace temporary view TopNodesByCPU as +(with CPUTime as ( +select + Node, + Properties, + json_tuple(Properties, 'cpuTime') as (cpuTimeDate), + cast ((to_timestamp(cpuTimeDate , 'HH:mm:ss.SSSSSSS')) as double) as cpuTime +from + data +) +select Node, max(cpuTime) as MaxCPU +from CPUTime +group by Node +order by Node desc, MaxCPU desc +limit 10); +with CPUTime as (select + Node, + cast (round (cast (timestamp as double) / 300L)*300 as timestamp) as bin, -- 5 min bin + json_tuple(Properties, 'cpuTime') as (cpuTimeDate), + cast ((to_timestamp(cpuTimeDate , 'HH:mm:ss.SSSSSSS')) as double) as cpuTime +from + data +where isNotNull(Properties) and Node in (select node from TopNodesByCPU)) +select Node, +bin, +avg(cpuTime) as AvgCPU +from CPUTime +group by Node, bin diff --git a/queries/sparksql/query18.sql b/queries/sparksql/query18.sql new file mode 100644 index 0000000..de94bc4 --- /dev/null +++ b/queries/sparksql/query18.sql @@ -0,0 +1,36 @@ +create or replace temporary view Data as ( +select + * +from + logs_tpc +where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 15:00:00' + and (Source = 'IMAGINEFIRST0' or Source ='PAPERWHITE113') +); +create or replace temporary view Data2 as ( +select + distinct regexp_extract(Message, 'table=(.*) ') as icTable, + ClientRequestId +from + Data l +where + lower(substring(l.Message, 3)) like 'ingestioncommand%' +); +create or replace temporary view Data3 as ( +select + 
json_tuple(l.Properties, 'format', 'rowCount') as (Format, rowCount), + icTable +from + Data l +inner join + Data2 d + on l.ClientRequestId = d.ClientRequestId +where + lower(l.Message) like 'ingestioncompletionevent%' +); +select Format, icTable, sum(rowCount) as rowCount +from + Data3 +group by + Format, + icTable diff --git a/queries/sparksql/query19.sql b/queries/sparksql/query19.sql new file mode 100644 index 0000000..6fe4826 --- /dev/null +++ b/queries/sparksql/query19.sql @@ -0,0 +1,37 @@ +--Q19; +with Data as ( +select + * +from + logs_tpc + where + Timestamp between '2014-03-08 12:00:00' and '2014-03-08 15:00:00' + and Source in ('IMAGINEFIRST0') +), +Downloading as ( +select + regexp_extract(Message, 'path:(.*)') as Path, + ClientRequestId as DownloadClientRequestId +from + Data +where + lower (Message) like 'downloading file path:%' +), +IngestionCompletion as ( +select + regexp_extract(Message, 'path:(.*)') as Path, + ClientRequestId as CompleteClientRequestId +from + Data +where + lower (Message) like 'ingestioncompletionevent%' +) +select + count(*) +from + Downloading d +inner join + IngestionCompletion ic + on d.Path = ic.path +where + DownloadClientRequestId <> CompleteClientRequestId