From b964868bbceecff168ed69bb84d8e4d90453a783 Mon Sep 17 00:00:00 2001 From: Dimitri Furman Date: Thu, 7 Jan 2021 16:48:52 -0500 Subject: [PATCH] Fixes, added instance CPU tip --- sqldb-tips/get-sqldb-tips.sql | 157 +++++++++++++++++++++++----------- 1 file changed, 108 insertions(+), 49 deletions(-) diff --git a/sqldb-tips/get-sqldb-tips.sql b/sqldb-tips/get-sqldb-tips.sql index da04065..70cc361 100644 --- a/sqldb-tips/get-sqldb-tips.sql +++ b/sqldb-tips/get-sqldb-tips.sql @@ -141,7 +141,13 @@ DECLARE @NotableNetworkEventsIntervalMinutes int = 60, -- 1380: Minimum duration of login considered too long -@NotableNetworkEventsSlowLoginThresholdMs int = 5000 +@NotableNetworkEventsSlowLoginThresholdMs int = 5000, + +-- 1390: Instance CPU percentage above which a reading is considered too high +@HighInstanceCPUThresholdPercent decimal(5,2) = 90, + +-- 1390: Duration in seconds that a high instance CPU period must exceed to be considered significant +@HighInstanceCPUMinThresholdSeconds int = 300 ; DECLARE @TipDefinition table ( @@ -179,9 +185,9 @@ IF EXISTS ( WHERE next_end_time IS NULL AND ( - rs.avg_cpu_percent > 98 + rs.avg_cpu_percent > 95 OR - rs.avg_instance_cpu_percent > 95 + rs.avg_instance_cpu_percent > 97 ) ) THROW 50010, 'CPU utilization is too high. 
Execute the script at a later time.', 1; @@ -192,47 +198,48 @@ IF DB_NAME() = 'master' AND @EngineEdition = 5 -- Define all tips INSERT INTO @TipDefinition (tip_id, tip_name, confidence_percent, tip_url) VALUES -(1000, 'Reduce MAXDOP on all replicas', 90, 'https://aka.ms/sqldbtips#1000'), -(1010, 'Reduce MAXDOP on primary', 90, 'https://aka.ms/sqldbtips#1010'), -(1020, 'Reduce MAXDOP on secondaries', 90, 'https://aka.ms/sqldbtips#1020'), -(1030, 'Use the latest database compatibility level', 70, 'https://aka.ms/sqldbtips#1030'), -(1040, 'Enable auto-create statistics', 95, 'https://aka.ms/sqldbtips#1040'), -(1050, 'Enable auto-update statistics', 95, 'https://aka.ms/sqldbtips#1050'), -(1060, 'Enable RCSI', 80, 'https://aka.ms/sqldbtips#1060'), -(1070, 'Enable Query Store', 90, 'https://aka.ms/sqldbtips#1070'), -(1071, 'Change Query Store operation mode to read-write', 90, 'https://aka.ms/sqldbtips#1071'), -(1072, 'Change Query Store capture mode from NONE to AUTO/ALL', 90, 'https://aka.ms/sqldbtips#1072'), -(1080, 'Disable AUTO_SHRINK', 99, 'https://aka.ms/sqldbtips#1080'), -(1100, 'Avoid GUID leading columns in btree indexes', 60, 'https://aka.ms/sqldbtips#1100'), -(1110, 'Enable FLGP auto-tuning', 95, 'https://aka.ms/sqldbtips#1110'), -(1120, 'Used data size is close to MAXSIZE', 80, 'https://aka.ms/sqldbtips#1120'), -(1130, 'Allocated data size is close to MAXSIZE', 60, 'https://aka.ms/sqldbtips#1130'), -(1140, 'Allocated data size is much larger than used data size', 50, 'https://aka.ms/sqldbtips#1140'), -(1150, 'Recent CPU throttling found', 90, 'https://aka.ms/sqldbtips#1150'), -(1160, 'Recent out of memory errors found', 80, 'https://aka.ms/sqldbtips#1160'), -(1165, 'Recent memory grant waits and timeouts found', 70, 'https://aka.ms/sqldbtips#1165'), -(1170, 'Nonclustered indexes with low reads found', 60, 'https://aka.ms/sqldbtips#1170'), -(1180, 'ROW or PAGE compression opportunities may exist', 65, 'https://aka.ms/sqldbtips#1180'), -(1190, 'Transaction 
log IO is close to limit', 70, 'https://aka.ms/sqldbtips#1190'), -(1200, 'Plan cache is bloated by single-use plans', 90, 'https://aka.ms/sqldbtips#1200'), -(1210, 'Missing indexes may be impacting performance', 70, 'https://aka.ms/sqldbtips#1210'), -(1220, 'Redo queue or a secondary replica is large', 60, 'https://aka.ms/sqldbtips#1220'), -(1230, 'Data IOPS are close to workload group limit', 70, 'https://aka.ms/sqldbtips#1230'), -(1240, 'Workload group IO governance impact is significant', 40, 'https://aka.ms/sqldbtips#1240'), -(1250, 'Data IOPS are close to resource pool limit', 70, 'https://aka.ms/sqldbtips#1250'), -(1260, 'Resouce pool IO governance impact is significant', 40, 'https://aka.ms/sqldbtips#1260'), -(1270, 'Persistent Version Store size is large', 70, 'https://aka.ms/sqldbtips#1270'), -(1280, 'Paused resumable index operations found', 90, 'https://aka.ms/sqldbtips#1280'), -(1290, 'Clustered columnstore candidates found', 50, 'https://aka.ms/sqldbtips#1290'), -(1300, 'Geo-replication state may be unhealthy', 70, 'https://aka.ms/sqldbtips#1300'), -(1310, 'Last partitions are not empty', 80, 'https://aka.ms/sqldbtips#1310'), -(1320, 'Top queries should be investigated and tuned', 90, 'https://aka.ms/sqldbtips#1320'), -(1330, 'Tempdb data allocated size is close to MAXSIZE', 70, 'https://aka.ms/sqldbtips#1330'), -(1340, 'Tempdb data used size is close to MAXSIZE', 95, 'https://aka.ms/sqldbtips#1340'), -(1350, 'Tempdb log allocated size is close to MAXSIZE', 80, 'https://aka.ms/sqldbtips#1350'), -(1360, 'Worker utilization is close to workload group limit', 80, 'https://aka.ms/sqldbtips#1360'), -(1370, 'Worker utilization is close to resource pool limit', 80, 'https://aka.ms/sqldbtips#1370'), -(1380, 'Notable network connectivity events found', 50, 'https://aka.ms/sqldbtips#1380') +(1000, 'Reduce MAXDOP on all replicas', 90, 'https://aka.ms/sqldbtipswiki#1000'), +(1010, 'Reduce MAXDOP on primary', 90, 'https://aka.ms/sqldbtipswiki#1010'), +(1020, 
'Reduce MAXDOP on secondaries', 90, 'https://aka.ms/sqldbtipswiki#1020'), +(1030, 'Use the latest database compatibility level', 70, 'https://aka.ms/sqldbtipswiki#1030'), +(1040, 'Enable auto-create statistics', 95, 'https://aka.ms/sqldbtipswiki#1040'), +(1050, 'Enable auto-update statistics', 95, 'https://aka.ms/sqldbtipswiki#1050'), +(1060, 'Enable RCSI', 80, 'https://aka.ms/sqldbtipswiki#1060'), +(1070, 'Enable Query Store', 90, 'https://aka.ms/sqldbtipswiki#1070'), +(1071, 'Change Query Store operation mode to read-write', 90, 'https://aka.ms/sqldbtipswiki#1071'), +(1072, 'Change Query Store capture mode from NONE to AUTO/ALL', 90, 'https://aka.ms/sqldbtipswiki#1072'), +(1080, 'Disable AUTO_SHRINK', 99, 'https://aka.ms/sqldbtipswiki#1080'), +(1100, 'Avoid GUID leading columns in btree indexes', 60, 'https://aka.ms/sqldbtipswiki#1100'), +(1110, 'Enable FLGP auto-tuning', 95, 'https://aka.ms/sqldbtipswiki#1110'), +(1120, 'Used data size is close to MAXSIZE', 80, 'https://aka.ms/sqldbtipswiki#1120'), +(1130, 'Allocated data size is close to MAXSIZE', 60, 'https://aka.ms/sqldbtipswiki#1130'), +(1140, 'Allocated data size is much larger than used data size', 50, 'https://aka.ms/sqldbtipswiki#1140'), +(1150, 'Recent CPU throttling found', 90, 'https://aka.ms/sqldbtipswiki#1150'), +(1160, 'Recent out of memory errors found', 80, 'https://aka.ms/sqldbtipswiki#1160'), +(1165, 'Recent memory grant waits and timeouts found', 70, 'https://aka.ms/sqldbtipswiki#1165'), +(1170, 'Nonclustered indexes with low reads found', 60, 'https://aka.ms/sqldbtipswiki#1170'), +(1180, 'ROW or PAGE compression opportunities may exist', 65, 'https://aka.ms/sqldbtipswiki#1180'), +(1190, 'Transaction log IO is close to limit', 70, 'https://aka.ms/sqldbtipswiki#1190'), +(1200, 'Plan cache is bloated by single-use plans', 90, 'https://aka.ms/sqldbtipswiki#1200'), +(1210, 'Missing indexes may be impacting performance', 70, 'https://aka.ms/sqldbtipswiki#1210'), +(1220, 'Redo queue or a secondary 
replica is large', 60, 'https://aka.ms/sqldbtipswiki#1220'), +(1230, 'Data IOPS are close to workload group limit', 70, 'https://aka.ms/sqldbtipswiki#1230'), +(1240, 'Workload group IO governance impact is significant', 40, 'https://aka.ms/sqldbtipswiki#1240'), +(1250, 'Data IOPS are close to resource pool limit', 70, 'https://aka.ms/sqldbtipswiki#1250'), +(1260, 'Resouce pool IO governance impact is significant', 40, 'https://aka.ms/sqldbtipswiki#1260'), +(1270, 'Persistent Version Store size is large', 70, 'https://aka.ms/sqldbtipswiki#1270'), +(1280, 'Paused resumable index operations found', 90, 'https://aka.ms/sqldbtipswiki#1280'), +(1290, 'Clustered columnstore candidates found', 50, 'https://aka.ms/sqldbtipswiki#1290'), +(1300, 'Geo-replication state may be unhealthy', 70, 'https://aka.ms/sqldbtipswiki#1300'), +(1310, 'Last partitions are not empty', 80, 'https://aka.ms/sqldbtipswiki#1310'), +(1320, 'Top queries should be investigated and tuned', 90, 'https://aka.ms/sqldbtipswiki#1320'), +(1330, 'Tempdb data allocated size is close to MAXSIZE', 70, 'https://aka.ms/sqldbtipswiki#1330'), +(1340, 'Tempdb data used size is close to MAXSIZE', 95, 'https://aka.ms/sqldbtipswiki#1340'), +(1350, 'Tempdb log allocated size is close to MAXSIZE', 80, 'https://aka.ms/sqldbtipswiki#1350'), +(1360, 'Worker utilization is close to workload group limit', 80, 'https://aka.ms/sqldbtipswiki#1360'), +(1370, 'Worker utilization is close to resource pool limit', 80, 'https://aka.ms/sqldbtipswiki#1370'), +(1380, 'Notable network connectivity events found', 50, 'https://aka.ms/sqldbtipswiki#1380'), +(1390, 'Instance CPU utilization is high', 60, 'https://aka.ms/sqldbtipswiki#1390') ; -- MAXDOP @@ -816,7 +823,7 @@ SELECT STRING_AGG( ', partition range: ', partition_range, ', partition range size (MB): ', FORMAT(partition_range_size_mb, 'N'), ', present compression type: ', present_compression_type, - ', new compression type: ', new_compression_type + ', suggested compression type: ', 
new_compression_type ) AS nvarchar(max)), @CRLF ) WITHIN GROUP (ORDER BY object_id, index_name, partition_range, partition_range_size_mb, new_compression_type) @@ -859,7 +866,7 @@ packed_log_rate_snapshot AS ( SELECT MIN(end_time) AS min_end_time, MAX(end_time) AS max_end_time, - AVG(avg_log_write_percent) AS avg_log_write_percent + MAX(avg_log_write_percent) AS max_log_write_percent FROM pre_packed_log_rate_snapshot WHERE high_log_rate_indicator = 1 GROUP BY grouping_helper @@ -867,16 +874,16 @@ GROUP BY grouping_helper log_rate_top_stat AS ( SELECT MAX(DATEDIFF(second, min_end_time, max_end_time)) AS top_log_rate_duration_seconds, - MAX(avg_log_write_percent) AS top_log_write_percent, + MAX(max_log_write_percent) AS top_log_write_percent, COUNT(1) AS count_high_log_write_intervals FROM packed_log_rate_snapshot ) INSERT INTO @DetectedTip (tip_id, details) SELECT 1190 AS tip_id, CONCAT( - 'In the last hour, there were ', count_high_log_write_intervals, + 'In the last hour, there were ', count_high_log_write_intervals, ' interval(s) with transaction log IO staying above ', @HighLogRateThresholdPercent, - '% of the limit for the service objective. The longest such interval lasted ', FORMAT(top_log_rate_duration_seconds, '#,0'), + '% of the service objective limit. The longest such interval lasted ', FORMAT(top_log_rate_duration_seconds, '#,0'), ' seconds, and the maximum log IO was ', FORMAT(top_log_write_percent, '#,0.00'), '%.' 
) AS details @@ -2264,6 +2271,58 @@ IF @@ROWCOUNT > 0 'SELECT * FROM ##tips_connectivity_event ORDER BY event_time DESC;' ) AS details; +-- High instance CPU: pack consecutive sys.dm_db_resource_stats readings above @HighInstanceCPUThresholdPercent into intervals, keep those longer than @HighInstanceCPUMinThresholdSeconds +WITH +instance_cpu_snapshot AS +( +SELECT end_time, + avg_instance_cpu_percent, + IIF(avg_instance_cpu_percent > @HighInstanceCPUThresholdPercent, 1, 0) AS high_instance_cpu_indicator +FROM sys.dm_db_resource_stats +WHERE @EngineEdition = 5 +), +pre_packed_instance_cpu_snapshot AS +( +SELECT end_time, + avg_instance_cpu_percent, + high_instance_cpu_indicator, + ROW_NUMBER() OVER (ORDER BY end_time) -- row number across all readings, in increasing chronological order + - + SUM(high_instance_cpu_indicator) OVER (ORDER BY end_time ROWS UNBOUNDED PRECEDING) -- running count of all readings where instance CPU exceeded the threshold + AS grouping_helper -- this difference remains constant while instance CPU is above the threshold, and can be used to collapse/pack an interval using aggregation +FROM instance_cpu_snapshot +), +packed_instance_cpu_snapshot AS +( +SELECT MIN(end_time) AS min_end_time, + MAX(end_time) AS max_end_time, + MAX(avg_instance_cpu_percent) AS max_instance_cpu_percent +FROM pre_packed_instance_cpu_snapshot +WHERE high_instance_cpu_indicator = 1 +GROUP BY grouping_helper +HAVING DATEDIFF(second, MIN(end_time), MAX(end_time)) > @HighInstanceCPUMinThresholdSeconds +), +instance_cpu_top_stat AS +( +SELECT MAX(DATEDIFF(second, min_end_time, max_end_time)) AS top_instance_cpu_duration_seconds, + MAX(max_instance_cpu_percent) AS top_instance_cpu_percent, + COUNT(1) AS count_high_instance_cpu_intervals +FROM packed_instance_cpu_snapshot +) +INSERT INTO @DetectedTip (tip_id, details) +SELECT 1390 AS tip_id, + CONCAT( + 'In the last hour, there were ', count_high_instance_cpu_intervals, + ' interval(s) with instance CPU utilization staying above ', @HighInstanceCPUThresholdPercent, + '% for at least ' , FORMAT(@HighInstanceCPUMinThresholdSeconds, '#,0'), + ' seconds. 
The longest such interval lasted ', FORMAT(top_instance_cpu_duration_seconds, '#,0'), + ' seconds, and the maximum instance CPU utilization was ', FORMAT(top_instance_cpu_percent, '#,0.00'), + '%.' + ) AS details +FROM instance_cpu_top_stat +WHERE count_high_instance_cpu_intervals > 0 +; + -- Return detected tips IF @JSONOutput = 0