Report for node down longer than 14 days

I have looked for multiple days...

even watched every youtube video i can find, but i can't find a swql query that will show me all node down for over 14 days...  due to the fact that we have devices that are not servers, it should work for 'IS SERVER=YES' or anyway to make sure I am just looking at down servers...

any help is appreciated...  I have also tried to create a custom report, but can only get the total duration in minutes to be 100805, and i cant change that number...

Parents
  • Here is a SWQL statement I picked up somewhere. I originally had it set for 60 minutes, but I believe it is set to a week now.--


    select n.nodeid, n.caption as [Device]
    -- shows the current status icon
    , '/Orion/images/StatusIcons/Small-' + n.StatusIcon AS [_IconFor_Device]
    -- makes a clickable link to the node details
    , n.DetailsUrl as [_linkfor_Device]
    -- shows the timestamp of the down event, if there is no timestamp then is says the event was greater than the number of days in your event retention settings
    , isnull(tostring(t2.[Down Event]),concat('Greater than ',(SELECT CurrentValue FROM Orion.Settings where settingid='SWNetPerfMon-Settings-Retain Events'),' days ago')) as [Down Event]
    -- shows the timestamp of the up event, unless the object is still down
    , isnull(tostring(t2.[Up Event]),'Still Down') as [Up Event]
    -- figures out the minutes between the down and up events, if the object is still down it counts from the down event to now, displays 99999 if we cannot accurately determine the original downtime, and
    , isnull(MINUTEDIFF(t2.[Down Event], isnull(t2.[Up Event],GETUTCDATE())),99999) as Minutes


    from orion.nodes n
    left join (SELECT
    -- Device nodeid used for our join
    StartTime.Nodes.NodeID

    -- Down Event time stamp in local time zone
    ,ToLocal(StartTime.EventTime) AS [Down Event]

    -- Up Event time stamp in local time zone
    ,(SELECT TOP 1
    ToLocal(EventTime) AS [EventTime]
    FROM Orion.Events AS [EndTime]
    -- picks the first up event that is newer than the down event for this node
    WHERE EndTime.EventTime >= StartTime.EventTime
    -- EventType 5 is a node up
    AND EndTime.EventType = 5
    AND EndTime.NetObjectID = StartTime.NetObjectID
    AND EventTime IS NOT NULL
    ORDER BY EndTime.EventTime
    ) AS [Up Event]

    -- This is the table we are querying
    FROM Orion.Events StartTime

    -- EventType 1 is a node down
    WHERE StartTime.EventType = 1

    ) t2 on n.NodeID = t2.nodeid


    -- this is how I catch nodes that are down but have aged out of the events table, and where I can only get nodes that are still down.
    where (n.status = 2 AND t2.[Up Event] IS NULL)


    -- If you want to filter the results to only show outages of a minimum duration uncomment the below line, this is set to a week.
    and MINUTEDIFF(isnull(t2.[Down Event],(GETUTCDATE()-30)), isnull(t2.[Up Event],GETUTCDATE())) > 10080


    -- if you want to use this query in a search box of the Custom Query resource uncomment the below line
    --and n.Caption like '%${SEARCH_STRING}%'


    order by t2.[down event] desc

  • Oh, but you wanted 2 weeks, just change 10080 to 20160 near the end of the statement. And to get only IsServer then change the last where line and add--  AND n.isserver = TRUE 

    select n.nodeid, n.caption as [Device]
    -- shows the current status icon
    , '/Orion/images/StatusIcons/Small-' + n.StatusIcon AS [_IconFor_Device]
    -- makes a clickable link to the node details
    , n.DetailsUrl as [_linkfor_Device]
    -- shows the timestamp of the down event, if there is no timestamp then is says the event was greater than the number of days in your event retention settings
    , isnull(tostring(t2.[Down Event]),concat('Greater than ',(SELECT CurrentValue FROM Orion.Settings where settingid='SWNetPerfMon-Settings-Retain Events'),' days ago')) as [Down Event]
    -- shows the timestamp of the up event, unless the object is still down
    , isnull(tostring(t2.[Up Event]),'Still Down') as [Up Event]
    -- figures out the minutes between the down and up events, if the object is still down it counts from the down event to now, displays 99999 if we cannot accurately determine the original downtime, and
    , isnull(MINUTEDIFF(t2.[Down Event], isnull(t2.[Up Event],GETUTCDATE())),99999) as Minutes


    from orion.nodes n
    left join (SELECT
    -- Device nodeid used for our join
    StartTime.Nodes.NodeID

    -- Down Event time stamp in local time zone
    ,ToLocal(StartTime.EventTime) AS [Down Event]

    -- Up Event time stamp in local time zone
    ,(SELECT TOP 1
    ToLocal(EventTime) AS [EventTime]
    FROM Orion.Events AS [EndTime]
    -- picks the first up event that is newer than the down event for this node
    WHERE EndTime.EventTime >= StartTime.EventTime
    -- EventType 5 is a node up
    AND EndTime.EventType = 5
    AND EndTime.NetObjectID = StartTime.NetObjectID
    AND EventTime IS NOT NULL
    ORDER BY EndTime.EventTime
    ) AS [Up Event]

    -- This is the table we are querying
    FROM Orion.Events StartTime

    -- EventType 1 is a node down
    WHERE StartTime.EventType = 1

    ) t2 on n.NodeID = t2.nodeid


    -- this is how I catch nodes that are down but have aged out of the events table, and where I can only get nodes that are still down.
    where (n.status = 2 AND t2.[Up Event] IS NULL AND n.IsServer=TRUE)


    -- If you want to filter the results to only show outages of a minimum duration uncomment the below line, this is set to a week.
    and MINUTEDIFF(isnull(t2.[Down Event],(GETUTCDATE()-30)), isnull(t2.[Up Event],GETUTCDATE())) > 20160


    -- if you want to use this query in a search box of the Custom Query resource uncomment the below line
    --and n.Caption like '%${SEARCH_STRING}%'


    order by t2.[down event] desc

  • Also, giving false positive in this query   showing CCEprd1 as 'Still Down' above, but this node is actually up... IDEAS?

  • In the Swql query it has n.status = 2, which should only include nodes that are down. What is the status in the database for the node? 

    SELECT NodeID, Caption, Status
    FROM Orion.Nodes
    Where Caption='CCEprd1'


    I'm actually having problems recently where the agent stops connecting on multiple servers that I am troubleshooting, where we don't get recent metrics but it shows the node as up. Maybe ping is working, but the agent can't connect?

  • It is also showing nodes up since jul 6...

  • try moving the t2.nodeid is not null statement to outside the parenthesis you currently have it in. Because you are grouping status =2 OR nodeid is not null its picking picking up all of the nodes that have a nodeid that also have an event in the events table. You'll need to ungroup that part of the where statement to get it to work the way you are intending.

  • I changed my script a while ago to use AND instead of OR there. But I bet I messed up the intent of the script when I did that. I'm wondering if I need to start from scratch on my own, or go through line by line again. 

  • But I guess changing it to AND makes sense. You want it where nodes are down and there is not an "Up Event"

  • If I move it ouside the parenthesis, it bypasses the AND statements showing PACS devices and BIOMED devices again...

  • Latest Updates to syntax

    ...

    select n.caption as [Device]
    -- shows the current status icon
    --, '/Orion/images/StatusIcons/Small-' + n.StatusIcon AS [_IconFor_Device]
    -- makes a clickable link to the node details
    , n.DetailsUrl as [_linkfor_Device]
    -- shows the timestamp of the down event, if there is no timestamp then is says the event was greater than the number of days in your event retention settings
    , isnull(tostring(t2.[Down Event]),concat('Greater than ',(SELECT CurrentValue FROM Orion.Settings where settingid='SWNetPerfMon-Settings-Retain Events'),' days ago')) as [Down Event]
    -- shows the timestamp of the up event, unless the object is still down
    , isnull(tostring(t2.[Up Event]),'Still Down') as [Up Event]
    -- figures out the minutes between the down and up events, if the object is still down it counts from the down event to now, displays 99999 if we cannot accurately determine the original downtime, and 
    , isnull(MINUTEDIFF(t2.[Down Event], isnull(t2.[Up Event],GETDATE())),99999) as Minutes
    
    
    from orion.nodes n
    left join (SELECT    
     -- Device nodeid used for our join   
     StartTime.Nodes.NodeID     
    
     -- Down Event time stamp in local time zone    
     ,ToLocal(StartTime.EventTime) AS [Down Event]    
      
     -- Up Event time stamp in local time zone    
     ,(SELECT TOP 1    
     ToLocal(EventTime) AS [EventTime]    
     FROM Orion.Events AS [EndTime]    
    -- picks the first up event that is newer than the down event for this node
     WHERE EndTime.EventTime >= StartTime.EventTime   
    -- EventType 5 is a node up 
     AND EndTime.EventType = 5    
     AND EndTime.NetObjectID = StartTime.NetObjectID    
     AND EventTime IS NOT NULL    
     ORDER BY EndTime.EventTime    
     ) AS [Up Event]    
      
    -- This is the table we are querying    
    FROM Orion.Events StartTime    
      
    -- EventType 1 is a node down
    WHERE StartTime.EventType = 1    
        
    ) t2 on n.NodeID = t2.nodeid
    -- This should cause only servers to show...
    AND n.IsServer = TRUE
    
    -- this is how I catch nodes that are down but have aged out of the events table
    where (n.status = 2 or t2.nodeid is not null)
    -- This should remove 'Biomed' devices
    AND n.CustomProperties.model <> 'Biomed'
    -- This should remove 'PACS' devices
    AND n.CustomProperties.INFRASTRUCTURE_TEAM <> 'PACS'
    
    
    -- If you want to filter the results to only show outages of a minimum duration uncomment the below line
    and MINUTEDIFF(isnull(t2.[Down Event],(GETUTCDATE()-30)), isnull(t2.[Up Event],GETUTCDATE())) >  20160
    
    
    order by t2.[down event] desc

  • can you paste a copy of the code after you move it out and see it bypassing those and statements?

Reply Children
  • Here is what i tried last...

    select n.caption as [Device]
    -- shows the current status icon
    --, '/Orion/images/StatusIcons/Small-' + n.StatusIcon AS [_IconFor_Device]
    -- makes a clickable link to the node details
    , n.DetailsUrl as [_linkfor_Device]
    -- shows the timestamp of the down event, if there is no timestamp then is says the event was greater than the number of days in your event retention settings
    , isnull(tostring(t2.[Down Event]),concat('Greater than ',(SELECT CurrentValue FROM Orion.Settings where settingid='SWNetPerfMon-Settings-Retain Events'),' days ago')) as [Down Event]
    -- shows the timestamp of the up event, unless the object is still down
    , isnull(tostring(t2.[Up Event]),'Still Down') as [Up Event]
    -- figures out the minutes between the down and up events, if the object is still down it counts from the down event to now, displays 99999 if we cannot accurately determine the original downtime, and 
    , isnull(MINUTEDIFF(t2.[Down Event], isnull(t2.[Up Event],GETDATE())),99999) as Minutes
    
    
    from orion.nodes n
    left join (SELECT    
     -- Device nodeid used for our join   
     StartTime.Nodes.NodeID     
    
     -- Down Event time stamp in local time zone    
     ,ToLocal(StartTime.EventTime) AS [Down Event]    
      
     -- Up Event time stamp in local time zone    
     ,(SELECT TOP 1    
     ToLocal(EventTime) AS [EventTime]    
     FROM Orion.Events AS [EndTime]    
    -- picks the first up event that is newer than the down event for this node
     WHERE EndTime.EventTime >= StartTime.EventTime   
    -- EventType 5 is a node up 
     AND EndTime.EventType = 5    
     AND EndTime.NetObjectID = StartTime.NetObjectID    
     AND EventTime IS NOT NULL    
     ORDER BY EndTime.EventTime    
     ) AS [Up Event]    
      
    -- This is the table we are querying    
    FROM Orion.Events StartTime  
    
      
    -- EventType 1 is a node down
    WHERE StartTime.EventType = 1    
        
    ) t2 on n.NodeID = t2.nodeid
    AND n.IsServer = TRUE
    
    -- this is how I catch nodes that are down but have aged out of the events table
    where (n.status = 2) AND (t2.[Up Event]is null)
    AND n.CustomProperties.model <> 'Biomed' 
    AND n.CustomProperties.INFRASTRUCTURE_TEAM <> 'PACS'
    
    
    -- If you want to filter the results to only show outages of a minimum duration uncomment the below line
    and MINUTEDIFF(isnull(t2.[Down Event],(GETUTCDATE()-30)), isnull(t2.[Up Event],GETUTCDATE())) >  20160
    
    
    -- if you want to use this query in a search box of the Custom Query resource uncomment the below line
    --and n.Caption like '%${SEARCH_STRING}%'
    
    
    order by t2.[down event] desc

  • Ah... with the last change I made it is working!!!  Thanks for your help!