|
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index] [Xen-devel] [OSSTEST PATCH 19/21] starvation: Abandon jobs which are unreasonably delaying their flight
Sometimes, due to a shortage of available resources, a flight might be
delayed because a handful of jobs are waiting much longer than the
rest. Add a heuristic which causes these jobs to be abandoned.
We consider ourselves starving only if we are starving now, judged by the
most optimistic start time estimate seen in the last I (the interval
parameter I taken from the hostalloc_maxwait_starvation runvar).
Signed-off-by: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
---
ts-hosts-allocate-Executive | 105 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 105 insertions(+)
diff --git a/ts-hosts-allocate-Executive b/ts-hosts-allocate-Executive
index 8c9ddaf7..7ea3c4af 100755
--- a/ts-hosts-allocate-Executive
+++ b/ts-hosts-allocate-Executive
@@ -62,6 +62,8 @@ our %magictaskid;
our $fi;
our $jobinfo;
our $harness_rev;
+our $starvation_p;
+our @abs_start_estimates;
#---------- general utilities, setup, etc. ----------
@@ -114,12 +116,16 @@ END
}
$alloc_start_time = time // die $!;
+
+ $starvation_p =
+ hostalloc_starvation_parse_runvar($r{hostalloc_maxwait_starvation});
}
#---------- prepared sql statements ----------
# all users of these must ->finish them afterwards, to avoid db deadlock
our ($flagscheckq, $equivflagscheckq, $duration_estimator, $resprop_q,
+ $starvation_q,
$alloc_findres_q, $alloc_shared_q, $alloc_sharing_slot_q,
$claim_share_reuse_q, $claim_maxshare_q, $claim_rmshares_q,
$claim_noshares_q, $claim_rmshare_q, $claim_setres_q,
@@ -146,6 +152,15 @@ END
AND name = ?
END
+ $starvation_q= $dbh_tests->prepare(<<END);
+ SELECT job, jobs.status, max(steps.finished)
+ FROM jobs
+ LEFT JOIN steps
+ USING (flight,job)
+ WHERE flight= ?
+ GROUP BY job, jobs.status
+END
+
# for allocation
$alloc_findres_q= $dbh_tests->prepare(<<END);
@@ -712,6 +727,88 @@ sub alloc_hosts () {
logm("host allocation: all successful and recorded.");
}
+sub most_optimistic ($$$) {
+ my ($best, $now, $period) = @_;
+ # Records the estimate $best->{Start} (an offset from $now) made at $now.
+ # Returns the earliest absolute start time estimated "in the last
+ # $period", or undef while fewer than 4 estimates are recorded.
+
+ push @abs_start_estimates, { At => $now, Got => $best->{Start} + $now };
+
+ # Pruning: while more than 4 estimates are held and the second-oldest
+ # was itself made at least $period ago, drop the oldest. The list is
+ # never pruned below 4 entries, so it can still hold several entries
+ # older than $period, or none that old if estimates arrive quickly.
+ # Intent: keep the most recent estimate which is at least $period
+ # ago (the "oldest relevant") and all subsequent estimates; the
+ # answer is the most optimistic start time among those kept,
+ # provided there are at least 4 of them.
+ my $is_old = sub { return $_[0]{At} <= $now - $period; };
+ my $need_estimates = 4;
+ while (@abs_start_estimates > $need_estimates &&
+ $is_old->($abs_start_estimates[1])) {
+ # $abs_start_estimates[1] is at least $period old and newer
+ # than $abs_start_estimates[0] (entries are pushed in call
+ # order), so [0] is older than the "oldest relevant"
+ # estimate and can be discarded.
+ shift @abs_start_estimates;
+ }
+
+ my $pr = sub { # debug-print one estimate, both times relative to $now
+ my ($e) = @_;
+ printf(DEBUG ' %s (@%s)',
+ $e->{Got} - $now,
+ $e->{At} - $now);
+ };
+
+ print DEBUG "most_optimistic: all:";
+ my $optimist;
+ foreach (@abs_start_estimates) {
+ $pr->($_);
+ $optimist = $_ if !$optimist || $_->{Got} < $optimist->{Got};
+ }
+ print DEBUG "\n";
+ printf(DEBUG "most_optimistic: (period=%s):", $period);
+ $pr->($optimist);
+ print DEBUG "\n";
+
+ return undef unless @abs_start_estimates >= $need_estimates; # too little data
+
+ return $optimist->{Got};
+}
+
+sub starving ($) {
+ my ($best_start_abs) = @_; # absolute start time estimate, or undef
+ return (0, 'runvar says never give up') unless %$starvation_p;
+ return (0, 'no estimate') unless defined $best_start_abs;
+ $starvation_q->execute($flight); # not finished here; see the ->finish rule for prepared statements
+ my $d=0; # count of jobs in any other status (treated as done)
+ my $w=0; # count of jobs whose status is preparing, queued or running
+ my $maxfin=0; # latest step-finished time among the done jobs
+ while (my ($j,$st,$fin) = $starvation_q->fetchrow_array()) {
+ if ($st eq 'preparing' ||
+ $st eq 'queued' ||
+ $st eq 'running') {
+ $w++;
+ } else {
+ $d++;
+ return (0, "job $j status $st but no step finished time!")
+ unless defined $fin;
+ $maxfin = $fin if $fin > $maxfin;
+ }
+ }
+ # Starving if the time from flight start to our estimated start
+ # exceeds X times the time the done jobs took (flight start to
+ # latest finish) plus I. Returns (flag, explanatory message).
+ my $X = hostalloc_starvation_calculate_X($starvation_p, $w, $d);
+ return (0, 'X=inf') unless defined $X; # undef X is reported as infinite: never starving
+ my $total_d = $maxfin - $fi->{started};
+ my $projected_me = $best_start_abs - $fi->{started};
+ my $m = "D=$d W=$w X=$X maxfin=$maxfin";
+ my $bad = $projected_me > $X * $total_d + $starvation_p->{I};
+ return ($bad, $m);
+}
+
sub attempt_allocation {
my $mayalloc;
($plan, $mayalloc) = @_;
@@ -772,6 +869,14 @@ sub attempt_allocation {
if ($wait_sofar > $maxwait/2
&& $wait_sofar + $best->{Start} > $maxwait) {
logm "timed out: $wait_sofar, $best->{Start}, $maxwait";
+ } elsif (%$starvation_p) {
+ my $est_abs = most_optimistic($best, $now, $starvation_p->{I});
+ my ($starving, $m) = starving($est_abs);
+ $starvation_q->finish();
+ if (!$starving) {
+ print DEBUG "not starving: $m\n";
+ } else {
+ logm "starving ($m)";
return 2;
}
}
--
2.11.0
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxxx
https://lists.xenproject.org/mailman/listinfo/xen-devel
|
![]() |
Lists.xenproject.org is hosted with RackSpace, monitoring our |