
[Xen-API] [PATCH 2 of 2] Turn off the alerts METADATA_LUN_{HEALTHY, BROKEN} by default. They can be re-enabled by setting the key Pool.other_config:metadata_lun_alerts=true



# HG changeset patch
# User David Scott <dave.scott@xxxxxxxxxxxxx>
# Date 1257373330 0
# Node ID ec2f51bc7d3bb040954b96b45cd29117ed007567
# Parent  02fe2ca1d1f4e02b0921164299ff9e996c90413b
Turn off the alerts METADATA_LUN_{HEALTHY,BROKEN} by default. They can be 
re-enabled by setting the key Pool.other_config:metadata_lun_alerts=true.

The alerts should only be generated in two circumstances:
1. when there is a bug in the code generating them;
2. when the storage underlying the metadata LUN is broken.

I think they were being triggered by brief storage datapath glitches and then 
scaring people. Until we can investigate this a bit further, it seems better to 
turn them off by default.

Signed-off-by: David Scott <dave.scott@xxxxxxxxxxxxx>
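
For reference: to re-enable the alerts on a pool, the other-config key can be
set with the usual xe map-parameter syntax, something along the lines of

  xe pool-param-set uuid=<pool uuid> other-config:metadata_lun_alerts=true

(the exact invocation above is illustrative; the code below only checks that
the key's value is exactly the string "true", so any other value leaves the
alerts disabled).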

diff -r 02fe2ca1d1f4 -r ec2f51bc7d3b ocaml/xapi/redo_log_alert.ml
--- a/ocaml/xapi/redo_log_alert.ml      Wed Nov 04 22:22:09 2009 +0000
+++ b/ocaml/xapi/redo_log_alert.ml      Wed Nov 04 22:22:10 2009 +0000
@@ -22,13 +22,16 @@
 let raise_system_alert news =
  (* This code may block indefinitely while attempting to look up the pool UUID and send the alert, so do it in a separate thread *)
   ignore (Thread.create (fun () ->
-    debug "Raising system alert...";
+       debug "Processing redo log event: %s" news;
     let __context = Context.make "context" in
     let pool = Helpers.get_pool ~__context in
     let obj_uuid = Db.Pool.get_uuid ~__context ~self:pool in
-    debug "Pool UUID is %s" obj_uuid;
-    (try ignore (Xapi_message.create ~__context ~name:news ~priority:1L ~cls:`Pool ~obj_uuid ~body:"") with _ -> ());
-    debug "System alert raised"
+       let other_config = Db.Pool.get_other_config ~__context ~self:pool in
+       if List.mem_assoc Xapi_globs.redo_log_alert_key other_config && (List.assoc Xapi_globs.redo_log_alert_key other_config = "true") then begin
+      debug "Raising alert for pool UUID %s" obj_uuid;
+      (try ignore (Xapi_message.create ~__context ~name:news ~priority:1L ~cls:`Pool ~obj_uuid ~body:"") with _ -> ());
+         debug "Alert raised"
+       end else debug "Not raising alert because Pool.other_config:%s <> true" Xapi_globs.redo_log_alert_key;
   ) ())
 
 let loop () =
diff -r 02fe2ca1d1f4 -r ec2f51bc7d3b ocaml/xapi/redo_log_alert.mli
--- a/ocaml/xapi/redo_log_alert.mli     Wed Nov 04 22:22:09 2009 +0000
+++ b/ocaml/xapi/redo_log_alert.mli     Wed Nov 04 22:22:10 2009 +0000
@@ -13,5 +13,6 @@
  *)
 
 (** Runs forever waiting for the redo log's status to change i.e. for
-       it to fail or to recover, generating alerts on transitions.*)
+       it to fail or to recover, generating alerts on transitions if
+    Pool.other_config:metadata_lun_alerts is set to "true" *)
 val loop: unit -> unit
diff -r 02fe2ca1d1f4 -r ec2f51bc7d3b ocaml/xapi/xapi_globs.ml
--- a/ocaml/xapi/xapi_globs.ml  Wed Nov 04 22:22:09 2009 +0000
+++ b/ocaml/xapi/xapi_globs.ml  Wed Nov 04 22:22:10 2009 +0000
@@ -530,6 +530,9 @@
 (** The maximum permitted backoff delay, in seconds *)
 let redo_log_maximum_backoff_delay = 120
 
+(** Pool.other_config key which, when set to the value "true", enables generation of METADATA_LUN_{HEALTHY,BROKEN} alerts *)
+let redo_log_alert_key = "metadata_lun_alerts"
+
 (** Called from the SR.lvhd_stop_using_these_vdis_and_call_script *)
 let lvhd_script_hook = "lvhd-script-hook"
 
@@ -600,3 +603,4 @@
 
 
 let permanent_master_failure_retry_timeout = 5. *. 60. (* 5 minutes *)
+
3 files changed, 13 insertions(+), 5 deletions(-)
ocaml/xapi/redo_log_alert.ml  |   11 +++++++----
ocaml/xapi/redo_log_alert.mli |    3 ++-
ocaml/xapi/xapi_globs.ml      |    4 ++++


Attachment: xen-api.hg-2.patch
Description: Text Data

_______________________________________________
xen-api mailing list
xen-api@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/mailman/listinfo/xen-api
