[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Xen-changelog] [xen stable-4.8] xenstored: apply a write transaction rate limit



commit 4cd02a2513dc224e343eaa8a88418a14ade092b3
Author:     Ian Jackson <ian.jackson@xxxxxxxxxxxxx>
AuthorDate: Tue Mar 7 16:09:12 2017 +0000
Commit:     Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
CommitDate: Wed Apr 5 15:09:08 2017 +0100

    xenstored: apply a write transaction rate limit
    
    This avoids a rogue client being about to stall another client (eg the
    toolstack) indefinitely.
    
    This is XSA-206.
    
    Signed-off-by: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
    
    Backported to 4.8 (not entirely trivial).
    
    Reported-by: Juergen Gross <jgross@xxxxxxxx>
    Signed-off-by: George Dunlap <george.dunlap@xxxxxxxxxx>
    Acked-by: Ian Jackson <Ian.Jackson@xxxxxxxxxxxxx>
---
 tools/xenstore/Makefile                |   3 +-
 tools/xenstore/xenstored_core.c        |   9 ++
 tools/xenstore/xenstored_core.h        |   6 +
 tools/xenstore/xenstored_domain.c      | 215 +++++++++++++++++++++++++++++++++
 tools/xenstore/xenstored_domain.h      |  25 ++++
 tools/xenstore/xenstored_transaction.c |   5 +
 6 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile
index 36b6fd4..9cb54de 100644
--- a/tools/xenstore/Makefile
+++ b/tools/xenstore/Makefile
@@ -32,6 +32,7 @@ XENSTORED_OBJS_$(CONFIG_FreeBSD) = xenstored_posix.o
 XENSTORED_OBJS_$(CONFIG_MiniOS) = xenstored_minios.o
 
 XENSTORED_OBJS += $(XENSTORED_OBJS_y)
+LDLIBS_xenstored += -lrt
 
 ifneq ($(XENSTORE_STATIC_CLIENTS),y)
 LIBXENSTORE := libxenstore.so
@@ -73,7 +74,7 @@ endif
 $(XENSTORED_OBJS): CFLAGS += $(CFLAGS_libxengnttab)
 
 xenstored: $(XENSTORED_OBJS)
-       $(CC) $^ $(LDFLAGS) $(LDLIBS_libxenevtchn) $(LDLIBS_libxengnttab) 
$(LDLIBS_libxenctrl) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
+       $(CC) $^ $(LDFLAGS) $(LDLIBS_libxenevtchn) $(LDLIBS_libxengnttab) 
$(LDLIBS_libxenctrl) $(LDLIBS_xenstored) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
 
 xenstored.a: $(XENSTORED_OBJS)
        $(AR) cr $@ $^
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index 3df977b..d14f096 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -358,6 +358,7 @@ static void initialize_fds(int sock, int *p_sock_pollfd_idx,
                           int *ptimeout)
 {
        struct connection *conn;
+       struct wrl_timestampt now;
 
        if (fds)
                memset(fds, 0, sizeof(struct pollfd) * current_array_size);
@@ -377,8 +378,11 @@ static void initialize_fds(int sock, int 
*p_sock_pollfd_idx,
                xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle),
                                        POLLIN|POLLPRI);
 
+       wrl_gettime_now(&now);
+
        list_for_each_entry(conn, &connections, list) {
                if (conn->domain) {
+                       wrl_check_timeout(conn->domain, now, ptimeout);
                        if (domain_can_read(conn) ||
                            (domain_can_write(conn) &&
                             !list_empty(&conn->out_list)))
@@ -833,6 +837,7 @@ static void delete_node_single(struct connection *conn, 
struct node *node)
                corrupt(conn, "Could not delete '%s'", node->name);
                return;
        }
+
        domain_entry_dec(conn, node);
 }
 
@@ -972,6 +977,7 @@ static void do_write(struct connection *conn, struct 
buffered_data *in)
        }
 
        add_change_node(conn->transaction, name, false);
+       wrl_apply_debit_direct(conn);
        fire_watches(conn, in, name, false);
        send_ack(conn, XS_WRITE);
 }
@@ -1003,6 +1009,7 @@ static void do_mkdir(struct connection *conn, struct 
buffered_data *in)
                        return;
                }
                add_change_node(conn->transaction, name, false);
+               wrl_apply_debit_direct(conn);
                fire_watches(conn, in, name, false);
        }
        send_ack(conn, XS_MKDIR);
@@ -1129,6 +1136,7 @@ static void do_rm(struct connection *conn, struct 
buffered_data *in)
 
        if (_rm(conn, node, name)) {
                add_change_node(conn->transaction, name, true);
+               wrl_apply_debit_direct(conn);
                fire_watches(conn, in, name, true);
                send_ack(conn, XS_RM);
        }
@@ -1205,6 +1213,7 @@ static void do_set_perms(struct connection *conn, struct 
buffered_data *in)
        }
 
        add_change_node(conn->transaction, name, false);
+       wrl_apply_debit_direct(conn);
        fire_watches(conn, in, name, false);
        send_ack(conn, XS_SET_PERMS);
 }
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index ecc614f..9e9d960 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -33,6 +33,12 @@
 #include "list.h"
 #include "tdb.h"
 
+#define MIN(a, b) (((a) < (b))? (a) : (b))
+
+typedef int32_t wrl_creditt;
+#define WRL_CREDIT_MAX (1000*1000*1000)
+/* ^ satisfies non-overflow condition for wrl_xfer_credit */
+
 struct buffered_data
 {
        struct list_head list;
diff --git a/tools/xenstore/xenstored_domain.c 
b/tools/xenstore/xenstored_domain.c
index 5de93d4..012dfe6 100644
--- a/tools/xenstore/xenstored_domain.c
+++ b/tools/xenstore/xenstored_domain.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <time.h>
 
 #include "utils.h"
 #include "talloc.h"
@@ -74,6 +75,10 @@ struct domain
 
        /* number of watch for this domain */
        int nbwatch;
+
+       /* write rate limit */
+       wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
+       struct wrl_timestampt wrl_timestamp;
 };
 
 static LIST_HEAD(domains);
@@ -206,6 +211,8 @@ static int destroy_domain(void *_domain)
 
        fire_watches(NULL, domain, "@releaseDomain", false);
 
+       wrl_domain_destroy(domain);
+
        return 0;
 }
 
@@ -253,6 +260,9 @@ void handle_event(void)
 bool domain_can_read(struct connection *conn)
 {
        struct xenstore_domain_interface *intf = conn->domain->interface;
+
+       if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
+               return false;
        return (intf->req_cons != intf->req_prod);
 }
 
@@ -284,6 +294,8 @@ static struct domain *new_domain(void *context, unsigned 
int domid,
        domain->domid = domid;
        domain->path = talloc_domain_path(domain, domid);
 
+       wrl_domain_new(domain);
+
        list_add(&domain->list, &domains);
        talloc_set_destructor(domain, destroy_domain);
 
@@ -751,6 +763,209 @@ int domain_watch(struct connection *conn)
                : 0;
 }
 
+static wrl_creditt wrl_config_writecost      = WRL_FACTOR;
+static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR;
+static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR;
+static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR;
+static wrl_creditt wrl_config_newdoms_dburst =
+                                WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR;
+
+long wrl_ntransactions;
+
+static long wrl_ndomains;
+static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
+
+void wrl_gettime_now(struct wrl_timestampt *now_wt)
+{
+       struct timespec now_ts;
+       int r;
+
+       r = clock_gettime(CLOCK_MONOTONIC, &now_ts);
+       if (r)
+               barf_perror("Could not find time (clock_gettime failed)");
+
+       now_wt->sec = now_ts.tv_sec;
+       now_wt->msec = now_ts.tv_nsec / 1000000;
+}
+
+static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
+                           wrl_creditt *credit, wrl_creditt credit_ceil)
+       /*
+        * Transfers zero or more credit from "debit" to "credit".
+        * Transfers as much as possible while maintaining
+        * debit >= debit_floor and credit <= credit_ceil.
+        * (If that's violated already, does nothing.)
+        *
+        * Sufficient conditions to avoid overflow, either of:
+        *  |every argument| <= 0x3fffffff
+        *  |every argument| <= 1E9
+        *  |every argument| <= WRL_CREDIT_MAX
+        * (And this condition is preserved.)
+        */
+{
+       wrl_creditt xfer = MIN( *debit      - debit_floor,
+                               credit_ceil - *credit      );
+       if (xfer > 0) {
+               *debit -= xfer;
+               *credit += xfer;
+       }
+}
+
+void wrl_domain_new(struct domain *domain)
+{
+       domain->wrl_credit = 0;
+       wrl_gettime_now(&domain->wrl_timestamp);
+       wrl_ndomains++;
+       /* Steal up to DBURST from the reserve */
+       wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
+                       &domain->wrl_credit, wrl_config_dburst);
+}
+
+void wrl_domain_destroy(struct domain *domain)
+{
+       wrl_ndomains--;
+       /*
+        * Don't bother recalculating domain's credit - this just
+        * means we don't give the reserve the ending domain's credit
+        * for time elapsed since last update.
+        */
+       wrl_xfer_credit(&domain->wrl_credit, 0,
+                       &wrl_reserve, wrl_config_dburst);
+}
+
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
+{
+       /*
+        * We want to calculate
+        *    credit += (now - timestamp) * RATE / ndoms;
+        * But we want it to saturate, and to avoid floating point.
+        * To avoid rounding errors from constantly adding small
+        * amounts of credit, we only add credit for whole milliseconds.
+        */
+       long seconds      = now.sec -  domain->wrl_timestamp.sec;
+       long milliseconds = now.msec - domain->wrl_timestamp.msec;
+       long msec;
+       int64_t denom, num;
+       wrl_creditt surplus;
+
+       seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */
+       msec = seconds * 1000 + milliseconds;
+
+       if (msec < 0)
+                /* shouldn't happen with CLOCK_MONOTONIC */
+               msec = 0;
+
+       /* 32x32 -> 64 cannot overflow */
+       denom = (int64_t)msec * wrl_config_rate;
+       num  =  (int64_t)wrl_ndomains * 1000;
+       /* denom / num <= 1E6 * wrl_config_rate, so with
+          reasonable wrl_config_rate, denom / num << 2^64 */
+
+       /* at last! */
+       domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num,
+                                 WRL_CREDIT_MAX );
+       /* (maybe briefly violating the DBURST cap on wrl_credit) */
+
+       /* maybe take from the reserve to make us nonnegative */
+       wrl_xfer_credit(&wrl_reserve,        0,
+                       &domain->wrl_credit, 0);
+
+       /* return any surplus (over DBURST) to the reserve */
+       surplus = 0;
+       wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
+                       &surplus,            WRL_CREDIT_MAX);
+       wrl_xfer_credit(&surplus,     0,
+                       &wrl_reserve, wrl_config_gburst);
+       /* surplus is now implicitly discarded */
+
+       domain->wrl_timestamp = now;
+
+       trace("wrl: dom %4d %6ld  msec  %9ld credit   %9ld reserve"
+             "  %9ld discard\n",
+             domain->domid,
+             msec,
+             (long)domain->wrl_credit, (long)wrl_reserve,
+             (long)surplus);
+}
+
+void wrl_check_timeout(struct domain *domain,
+                      struct wrl_timestampt now,
+                      int *ptimeout)
+{
+       uint64_t num, denom;
+       int wakeup;
+
+       wrl_credit_update(domain, now);
+
+       if (domain->wrl_credit >= 0)
+               /* not blocked */
+               return;
+
+       if (!*ptimeout)
+               /* already decided on immediate wakeup,
+                  so no need to calculate our timeout */
+               return;
+
+       /* calculate  wakeup = now + -credit / (RATE / ndoms); */
+
+       /* credit cannot go more -ve than one transaction,
+        * so the first multiplication cannot overflow even 32-bit */
+       num   = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains;
+       denom = wrl_config_rate;
+
+       wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
+       if (*ptimeout==-1 || wakeup < *ptimeout)
+               *ptimeout = wakeup;
+
+       trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
+             domain->domid,
+             (long)domain->wrl_credit, (long)wrl_reserve,
+             wakeup);
+}
+
+void wrl_apply_debit_actual(struct domain *domain)
+{
+       struct wrl_timestampt now;
+
+       if (!domain)
+               /* sockets escape the write rate limit */
+               return;
+
+       wrl_gettime_now(&now);
+       wrl_credit_update(domain, now);
+
+       domain->wrl_credit -= wrl_config_writecost;
+       trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
+             domain->domid,
+             (long)domain->wrl_credit, (long)wrl_reserve);
+}
+
+void wrl_apply_debit_direct(struct connection *conn)
+{
+       if (!conn)
+               /* some writes are generated internally */
+               return;
+
+       if (conn->transaction)
+               /* these are accounted for when the transaction ends */
+               return;
+
+       if (!wrl_ntransactions)
+               /* we don't conflict with anyone */
+               return;
+
+       wrl_apply_debit_actual(conn->domain);
+}
+
+void wrl_apply_debit_trans_commit(struct connection *conn)
+{
+       if (wrl_ntransactions <= 1)
+               /* our own transaction appears in the counter */
+               return;
+
+       wrl_apply_debit_actual(conn->domain);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff --git a/tools/xenstore/xenstored_domain.h 
b/tools/xenstore/xenstored_domain.h
index 2554423..cec341e 100644
--- a/tools/xenstore/xenstored_domain.h
+++ b/tools/xenstore/xenstored_domain.h
@@ -65,4 +65,29 @@ void domain_watch_inc(struct connection *conn);
 void domain_watch_dec(struct connection *conn);
 int domain_watch(struct connection *conn);
 
+/* Write rate limiting */
+
+#define WRL_FACTOR   1000 /* for fixed-point arithmetic */
+#define WRL_RATE      200
+#define WRL_DBURST     10
+#define WRL_GBURST   1000
+#define WRL_NEWDOMS     5
+
+struct wrl_timestampt {
+       time_t sec;
+       int msec;
+};
+
+extern long wrl_ntransactions;
+
+void wrl_gettime_now(struct wrl_timestampt *now_ts);
+void wrl_domain_new(struct domain *domain);
+void wrl_domain_destroy(struct domain *domain);
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now);
+void wrl_check_timeout(struct domain *domain,
+                       struct wrl_timestampt now,
+                       int *ptimeout);
+void wrl_apply_debit_direct(struct connection *conn);
+void wrl_apply_debit_trans_commit(struct connection *conn);
+
 #endif /* _XENSTORED_DOMAIN_H */
diff --git a/tools/xenstore/xenstored_transaction.c 
b/tools/xenstore/xenstored_transaction.c
index 84cb0bf..5059a11 100644
--- a/tools/xenstore/xenstored_transaction.c
+++ b/tools/xenstore/xenstored_transaction.c
@@ -120,6 +120,7 @@ static int destroy_transaction(void *_transaction)
 {
        struct transaction *trans = _transaction;
 
+       wrl_ntransactions--;
        trace_destroy(trans, "transaction");
        if (trans->tdb)
                tdb_close(trans->tdb);
@@ -183,6 +184,7 @@ void do_transaction_start(struct connection *conn, struct 
buffered_data *in)
        talloc_steal(conn, trans);
        talloc_set_destructor(trans, destroy_transaction);
        conn->transaction_started++;
+       wrl_ntransactions++;
 
        snprintf(id_str, sizeof(id_str), "%u", trans->id);
        send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
@@ -218,6 +220,9 @@ void do_transaction_end(struct connection *conn, struct 
buffered_data *in)
                        send_error(conn, EAGAIN);
                        return;
                }
+
+               wrl_apply_debit_trans_commit(conn);
+
                if (!replace_tdb(trans->tdb_name, trans->tdb)) {
                        send_error(conn, errno);
                        return;
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.8

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxx
https://lists.xenproject.org/xen-changelog

 


Rackspace

Lists.xenproject.org is hosted with RackSpace, monitoring our
servers 24x7x365 and backed by RackSpace's Fanatical Support®.