[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-devel] [OSSTEST PATCH 19/21] starvation: Abandon jobs which are unreasonably delaying their flight
Sometimes, due to a shortage of available resources, a flight might be
delayed because a handful of jobs are waiting much longer than the
rest.  Add a heuristic which causes these jobs to be abandoned.

We consider ourselves starving if we are starving now, based on the
most optimistic start time seen in the last I.

Signed-off-by: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
---
 ts-hosts-allocate-Executive | 105 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/ts-hosts-allocate-Executive b/ts-hosts-allocate-Executive
index 8c9ddaf7..7ea3c4af 100755
--- a/ts-hosts-allocate-Executive
+++ b/ts-hosts-allocate-Executive
@@ -62,6 +62,8 @@ our %magictaskid;
 our $fi;
 our $jobinfo;
 our $harness_rev;
+our $starvation_p;
+our @abs_start_estimates;
 
 #---------- general utilities, setup, etc. ----------
 
@@ -114,12 +116,16 @@ END
     }
 
     $alloc_start_time = time // die $!;
+
+    $starvation_p =
+      hostalloc_starvation_parse_runvar($r{hostalloc_maxwait_starvation});
 }
 
 #---------- prepared sql statements ----------
 # all users of these must ->finish them afterwards, to avoid db deadlock
 
 our ($flagscheckq, $equivflagscheckq, $duration_estimator, $resprop_q,
+     $starvation_q,
      $alloc_findres_q, $alloc_shared_q, $alloc_sharing_slot_q,
      $claim_share_reuse_q, $claim_maxshare_q, $claim_rmshares_q,
      $claim_noshares_q, $claim_rmshare_q, $claim_setres_q,
@@ -146,6 +152,15 @@ END
             AND name = ?
 END
 
+    $starvation_q= $dbh_tests->prepare(<<END);
+        SELECT job, jobs.status, max(steps.finished)
+          FROM jobs
+     LEFT JOIN steps
+         USING (flight,job)
+         WHERE flight= ?
+      GROUP BY job, jobs.status
+END
+
     # for allocation
 
     $alloc_findres_q= $dbh_tests->prepare(<<END);
@@ -712,6 +727,88 @@ sub alloc_hosts () {
     logm("host allocation: all successful and recorded.");
 }
 
+sub most_optimistic ($$$) {
+    my ($best, $now, $period) = @_;
+    # Records that we have now estimated $best->{Start}.
+    # Returns the most optimistic absolute start time "in the last
+    # $period".  Returns undef if we don't have good data yet.
+
+    push @abs_start_estimates, { At => $now, Got => $best->{Start} + $now };
+
+    # Actually, what we do is prune all but the last entry from before
+    # $period, and we expect at least 4 estimates.  That ensures that
+    # the answer involves at least one estimate at least $period ago.
+    # Ie what we actually return is
+    #   Consider the most recent estimate which is at least $period
+    #   ago (the "oldest relevant"), and all subsequent estimates.
+    #   Answer is the most optimistic start time of all of those,
+    #   provided there are at least 4 of them.
+    my $is_old = sub { return $_[0]{At} <= $now - $period; };
+    my $need_estimates = 4;
+    while (@abs_start_estimates > $need_estimates &&
+           $is_old->($abs_start_estimates[1])) {
+        # estimates[1] is at least $period ago and more recent
+        # than $estimates[0], so $estimates[0] cannot be the
+        # oldest relevant and is indeed older than the oldest
+        # relevant.
+        shift @abs_start_estimates;
+    }
+
+    my $pr = sub {
+        my ($e) = @_;
+        printf(DEBUG ' %s (@%s)',
+               $e->{Got} - $now,
+               $e->{At} - $now);
+    };
+
+    print DEBUG "most_optimistic: all:";
+    my $optimist;
+    foreach (@abs_start_estimates) {
+        $pr->($_);
+        $optimist = $_ if !$optimist || $_->{Got} < $optimist->{Got};
+    }
+    print DEBUG "\n";
+    printf(DEBUG "most_optimistic: (period=%s):", $period);
+    $pr->($optimist);
+    print DEBUG "\n";
+
+    return undef unless @abs_start_estimates >= $need_estimates;
+
+    return $optimist->{Got};
+}
+
+sub starving ($) {
+    my ($best_start_abs) = @_;
+    return (0, 'runvar says never give up') unless %$starvation_p;
+    return (0, 'no estimate') unless defined $best_start_abs;
+    $starvation_q->execute($flight);
+    my $d=0;
+    my $w=0;
+    my $maxfin=0;
+    while (my ($j,$st,$fin) = $starvation_q->fetchrow_array()) {
+        if ($st eq 'preparing' ||
+            $st eq 'queued' ||
+            $st eq 'running') {
+            $w++;
+        } else {
+            $d++;
+            return (0, "job $j status $st but no step finished time!")
+                unless defined $fin;
+            $maxfin = $fin if $fin > $maxfin;
+        }
+    }
+    # we quit if the total time from the start of the flight
+    # to our expected finish is more than the total time so
+    # far (for the completed jobs) by the margin X and I
+    my $X = hostalloc_starvation_calculate_X($starvation_p, $w, $d);
+    return (0, 'X=inf') unless defined $X;
+    my $total_d = $maxfin - $fi->{started};
+    my $projected_me = $best_start_abs - $fi->{started};
+    my $m = "D=$d W=$w X=$X maxfin=$maxfin";
+    my $bad = $projected_me > $X * $total_d + $starvation_p->{I};
+    return ($bad, $m);
+}
+
 sub attempt_allocation {
     my $mayalloc;
     ($plan, $mayalloc) = @_;
@@ -772,6 +869,14 @@ sub attempt_allocation {
             if ($wait_sofar > $maxwait/2 &&
                 $wait_sofar + $best->{Start} > $maxwait) {
                 logm "timed out: $wait_sofar, $best->{Start}, $maxwait";
+            } elsif (%$starvation_p) {
+                my $est_abs = most_optimistic($best, $now, $starvation_p->{I});
+                my ($starving, $m) = starving($est_abs);
+                $starvation_q->finish();
+                if (!$starving) {
+                    print DEBUG "not starving: $m\n";
+                } else {
+                    logm "starving ($m)";
                     return 2;
                 }
             }
-- 
2.11.0


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
Lists.xenproject.org is hosted with RackSpace, monitoring our |