From 5aed8d6e20d1848f6818e649905df84e9cee34ae Mon Sep 17 00:00:00 2001
From: Ian Jackson <ian.jackson@eu.citrix.com>
Date: Sat, 18 Mar 2017 16:44:46 +0000
Subject: [PATCH 01/23] xenstored: apply a write transaction rate limit

This avoids a rogue client being about to stall another client (eg the
toolstack) indefinitely.

This is XSA-206.

Reported-by: Juergen Gross <jgross@suse.com>
Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
---
 tools/xenstore/Makefile                |   3 +-
 tools/xenstore/xenstored_core.c        |   9 ++
 tools/xenstore/xenstored_core.h        |   6 +
 tools/xenstore/xenstored_domain.c      | 215 +++++++++++++++++++++++++++++++++
 tools/xenstore/xenstored_domain.h      |  25 ++++
 tools/xenstore/xenstored_transaction.c |   5 +
 6 files changed, 262 insertions(+), 1 deletion(-)

diff --git a/tools/xenstore/Makefile b/tools/xenstore/Makefile
index 1b4a494..289e427 100644
--- a/tools/xenstore/Makefile
+++ b/tools/xenstore/Makefile
@@ -30,6 +30,7 @@ XENSTORED_OBJS_$(CONFIG_FreeBSD) = xenstored_posix.o
 XENSTORED_OBJS_$(CONFIG_MiniOS) = xenstored_minios.o
 
 XENSTORED_OBJS += $(XENSTORED_OBJS_y)
+LDLIBS_xenstored += -lrt
 
 ifneq ($(XENSTORE_STATIC_CLIENTS),y)
 LIBXENSTORE := libxenstore.so
@@ -78,7 +79,7 @@ init-xenstore-domain: init-xenstore-domain.o $(LIBXENSTORE)
 	$(CC) $^ $(LDFLAGS) $(LDLIBS_libxenctrl) $(LDLIBS_libxenguest) $(LDLIBS_libxenstore) -o $@ $(APPEND_LDFLAGS)
 
 xenstored: $(XENSTORED_OBJS)
-	$(CC) $^ $(LDFLAGS) $(LDLIBS_libxenctrl) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
+	$(CC) $^ $(LDFLAGS) $(LDLIBS_libxenctrl) $(LDLIBS_xenstored) $(SOCKET_LIBS) -o $@ $(APPEND_LDFLAGS)
 
 xenstored.a: $(XENSTORED_OBJS)
 	$(AR) cr $@ $^
diff --git a/tools/xenstore/xenstored_core.c b/tools/xenstore/xenstored_core.c
index 25a548d..9dd06b1 100644
--- a/tools/xenstore/xenstored_core.c
+++ b/tools/xenstore/xenstored_core.c
@@ -356,6 +356,7 @@ static void initialize_fds(int sock, int *p_sock_pollfd_idx,
 			   int *ptimeout)
 {
 	struct connection *conn;
+	struct wrl_timestampt now;
 
 	if (fds)
 		memset(fds, 0, sizeof(struct pollfd) * current_array_size);
@@ -375,8 +376,11 @@ static void initialize_fds(int sock, int *p_sock_pollfd_idx,
 		xce_pollfd_idx = set_fd(xc_evtchn_fd(xce_handle),
 					POLLIN|POLLPRI);
 
+	wrl_gettime_now(&now);
+
 	list_for_each_entry(conn, &connections, list) {
 		if (conn->domain) {
+			wrl_check_timeout(conn->domain, now, ptimeout);
 			if (domain_can_read(conn) ||
 			    (domain_can_write(conn) &&
 			     !list_empty(&conn->out_list)))
@@ -809,6 +813,7 @@ static void delete_node_single(struct connection *conn, struct node *node)
 		corrupt(conn, "Could not delete '%s'", node->name);
 		return;
 	}
+
 	domain_entry_dec(conn, node);
 }
 
@@ -948,6 +953,7 @@ static void do_write(struct connection *conn, struct buffered_data *in)
 	}
 
 	add_change_node(conn->transaction, name, false);
+	wrl_apply_debit_direct(conn);
 	fire_watches(conn, name, false);
 	send_ack(conn, XS_WRITE);
 }
@@ -972,6 +978,7 @@ static void do_mkdir(struct connection *conn, const char *name)
 			return;
 		}
 		add_change_node(conn->transaction, name, false);
+		wrl_apply_debit_direct(conn);
 		fire_watches(conn, name, false);
 	}
 	send_ack(conn, XS_MKDIR);
@@ -1097,6 +1104,7 @@ static void do_rm(struct connection *conn, const char *name)
 
 	if (_rm(conn, node, name)) {
 		add_change_node(conn->transaction, name, true);
+		wrl_apply_debit_direct(conn);
 		fire_watches(conn, name, true);
 		send_ack(conn, XS_RM);
 	}
@@ -1172,6 +1180,7 @@ static void do_set_perms(struct connection *conn, struct buffered_data *in)
 	}
 
 	add_change_node(conn->transaction, name, false);
+	wrl_apply_debit_direct(conn);
 	fire_watches(conn, name, false);
 	send_ack(conn, XS_SET_PERMS);
 }
diff --git a/tools/xenstore/xenstored_core.h b/tools/xenstore/xenstored_core.h
index 8c853c9..a8b2a0e 100644
--- a/tools/xenstore/xenstored_core.h
+++ b/tools/xenstore/xenstored_core.h
@@ -30,6 +30,12 @@
 #include "list.h"
 #include "tdb.h"
 
+#define MIN(a, b) (((a) < (b))? (a) : (b))
+
+typedef int32_t wrl_creditt;
+#define WRL_CREDIT_MAX (1000*1000*1000)
+/* ^ satisfies non-overflow condition for wrl_xfer_credit */
+
 struct buffered_data
 {
 	struct list_head list;
diff --git a/tools/xenstore/xenstored_domain.c b/tools/xenstore/xenstored_domain.c
index dcd6581..3cf5c75 100644
--- a/tools/xenstore/xenstored_domain.c
+++ b/tools/xenstore/xenstored_domain.c
@@ -21,6 +21,7 @@
 #include <unistd.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <time.h>
 
 #include "utils.h"
 #include "talloc.h"
@@ -73,6 +74,10 @@ struct domain
 
 	/* number of watch for this domain */
 	int nbwatch;
+
+	/* write rate limit */
+	wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
+	struct wrl_timestampt wrl_timestamp;
 };
 
 static LIST_HEAD(domains);
@@ -205,6 +210,8 @@ static int destroy_domain(void *_domain)
 
 	fire_watches(NULL, "@releaseDomain", false);
 
+	wrl_domain_destroy(domain);
+
 	return 0;
 }
 
@@ -252,6 +259,9 @@ void handle_event(void)
 bool domain_can_read(struct connection *conn)
 {
 	struct xenstore_domain_interface *intf = conn->domain->interface;
+
+	if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
+		return false;
 	return (intf->req_cons != intf->req_prod);
 }
 
@@ -283,6 +293,8 @@ static struct domain *new_domain(void *context, unsigned int domid,
 	domain->domid = domid;
 	domain->path = talloc_domain_path(domain, domid);
 
+	wrl_domain_new(domain);
+
 	list_add(&domain->list, &domains);
 	talloc_set_destructor(domain, destroy_domain);
 
@@ -746,6 +758,209 @@ int domain_watch(struct connection *conn)
 		: 0;
 }
 
+static wrl_creditt wrl_config_writecost      = WRL_FACTOR;
+static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR;
+static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR;
+static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR;
+static wrl_creditt wrl_config_newdoms_dburst =
+	                         WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR;
+
+long wrl_ntransactions;
+
+static long wrl_ndomains;
+static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
+
+void wrl_gettime_now(struct wrl_timestampt *now_wt)
+{
+	struct timespec now_ts;
+	int r;
+
+	r = clock_gettime(CLOCK_MONOTONIC, &now_ts);
+	if (r)
+		barf_perror("Could not find time (clock_gettime failed)");
+
+	now_wt->sec = now_ts.tv_sec;
+	now_wt->msec = now_ts.tv_nsec / 1000000;
+}
+
+static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
+			    wrl_creditt *credit, wrl_creditt credit_ceil)
+	/*
+	 * Transfers zero or more credit from "debit" to "credit".
+	 * Transfers as much as possible while maintaining
+	 * debit >= debit_floor and credit <= credit_ceil.
+	 * (If that's violated already, does nothing.)
+	 *
+	 * Sufficient conditions to avoid overflow, either of:
+	 *  |every argument| <= 0x3fffffff
+	 *  |every argument| <= 1E9
+	 *  |every argument| <= WRL_CREDIT_MAX
+	 * (And this condition is preserved.)
+	 */
+{
+	wrl_creditt xfer = MIN( *debit      - debit_floor,
+			        credit_ceil - *credit      );
+	if (xfer > 0) {
+		*debit -= xfer;
+		*credit += xfer;
+	}
+}
+
+void wrl_domain_new(struct domain *domain)
+{
+	domain->wrl_credit = 0;
+	wrl_gettime_now(&domain->wrl_timestamp);
+	wrl_ndomains++;
+	/* Steal up to DBURST from the reserve */
+	wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
+			&domain->wrl_credit, wrl_config_dburst);
+}
+
+void wrl_domain_destroy(struct domain *domain)
+{
+	wrl_ndomains--;
+	/*
+	 * Don't bother recalculating domain's credit - this just
+	 * means we don't give the reserve the ending domain's credit
+	 * for time elapsed since last update.
+	 */
+	wrl_xfer_credit(&domain->wrl_credit, 0,
+			&wrl_reserve, wrl_config_dburst);
+}
+
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
+{
+	/*
+	 * We want to calculate
+	 *    credit += (now - timestamp) * RATE / ndoms;
+	 * But we want it to saturate, and to avoid floating point.
+	 * To avoid rounding errors from constantly adding small
+	 * amounts of credit, we only add credit for whole milliseconds.
+	 */
+	long seconds      = now.sec -  domain->wrl_timestamp.sec;
+	long milliseconds = now.msec - domain->wrl_timestamp.msec;
+	long msec;
+	int64_t denom, num;
+	wrl_creditt surplus;
+
+	seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */
+	msec = seconds * 1000 + milliseconds;
+
+	if (msec < 0)
+                /* shouldn't happen with CLOCK_MONOTONIC */
+		msec = 0;
+
+	/* 32x32 -> 64 cannot overflow */
+	denom = (int64_t)msec * wrl_config_rate;
+	num  =  (int64_t)wrl_ndomains * 1000;
+	/* denom / num <= 1E6 * wrl_config_rate, so with
+	   reasonable wrl_config_rate, denom / num << 2^64 */
+
+	/* at last! */
+	domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num,
+				  WRL_CREDIT_MAX );
+	/* (maybe briefly violating the DBURST cap on wrl_credit) */
+
+	/* maybe take from the reserve to make us nonnegative */
+	wrl_xfer_credit(&wrl_reserve,        0,
+			&domain->wrl_credit, 0);
+
+	/* return any surplus (over DBURST) to the reserve */
+	surplus = 0;
+	wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
+			&surplus,            WRL_CREDIT_MAX);
+	wrl_xfer_credit(&surplus,     0,
+			&wrl_reserve, wrl_config_gburst);
+	/* surplus is now implicitly discarded */
+
+	domain->wrl_timestamp = now;
+
+	trace("wrl: dom %4d %6ld  msec  %9ld credit   %9ld reserve"
+	      "  %9ld discard\n",
+	      domain->domid,
+	      msec,
+	      (long)domain->wrl_credit, (long)wrl_reserve,
+	      (long)surplus);
+}
+			
+void wrl_check_timeout(struct domain *domain,
+		       struct wrl_timestampt now,
+		       int *ptimeout)
+{
+	uint64_t num, denom;
+	int wakeup;
+
+	wrl_credit_update(domain, now);
+
+	if (domain->wrl_credit >= 0)
+		/* not blocked */
+		return;
+
+	if (!*ptimeout)
+		/* already decided on immediate wakeup,
+		   so no need to calculate our timeout */
+		return;
+
+	/* calculate  wakeup = now + -credit / (RATE / ndoms); */
+
+	/* credit cannot go more -ve than one transaction,
+	 * so the first multiplication cannot overflow even 32-bit */
+	num   = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains;
+	denom = wrl_config_rate;
+
+	wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
+	if (*ptimeout==-1 || wakeup < *ptimeout)
+		*ptimeout = wakeup;
+
+	trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
+	      domain->domid,
+	      (long)domain->wrl_credit, (long)wrl_reserve,
+	      wakeup);
+}
+
+void wrl_apply_debit_actual(struct domain *domain)
+{
+	struct wrl_timestampt now;
+
+	if (!domain)
+		/* sockets escape the write rate limit */
+		return;
+
+	wrl_gettime_now(&now);
+	wrl_credit_update(domain, now);
+
+	domain->wrl_credit -= wrl_config_writecost;
+	trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
+	      domain->domid,
+	      (long)domain->wrl_credit, (long)wrl_reserve);
+}
+
+void wrl_apply_debit_direct(struct connection *conn)
+{
+	if (!conn)
+		/* some writes are generated internally */
+		return;
+
+	if (conn->transaction)
+		/* these are accounted for when the transaction ends */
+		return;
+
+	if (!wrl_ntransactions)
+		/* we don't conflict with anyone */
+		return;
+
+	wrl_apply_debit_actual(conn->domain);
+}
+
+void wrl_apply_debit_trans_commit(struct connection *conn)
+{
+	if (wrl_ntransactions <= 1)
+		/* our own transaction appears in the counter */
+		return;
+
+	wrl_apply_debit_actual(conn->domain);
+}
+
 /*
  * Local variables:
  *  c-file-style: "linux"
diff --git a/tools/xenstore/xenstored_domain.h b/tools/xenstore/xenstored_domain.h
index 83488ed..bdc4044 100644
--- a/tools/xenstore/xenstored_domain.h
+++ b/tools/xenstore/xenstored_domain.h
@@ -65,4 +65,29 @@ void domain_watch_inc(struct connection *conn);
 void domain_watch_dec(struct connection *conn);
 int domain_watch(struct connection *conn);
 
+/* Write rate limiting */
+
+#define WRL_FACTOR   1000 /* for fixed-point arithmetic */
+#define WRL_RATE      200
+#define WRL_DBURST     10
+#define WRL_GBURST   1000
+#define WRL_NEWDOMS     5
+
+struct wrl_timestampt {
+	time_t sec;
+	int msec;
+};
+
+extern long wrl_ntransactions;
+
+void wrl_gettime_now(struct wrl_timestampt *now_ts);
+void wrl_domain_new(struct domain *domain);
+void wrl_domain_destroy(struct domain *domain);
+void wrl_credit_update(struct domain *domain, struct wrl_timestampt now);
+void wrl_check_timeout(struct domain *domain,
+                       struct wrl_timestampt now,
+                       int *ptimeout);
+void wrl_apply_debit_direct(struct connection *conn);
+void wrl_apply_debit_trans_commit(struct connection *conn);
+
 #endif /* _XENSTORED_DOMAIN_H */
diff --git a/tools/xenstore/xenstored_transaction.c b/tools/xenstore/xenstored_transaction.c
index d0e4739..a4b328f 100644
--- a/tools/xenstore/xenstored_transaction.c
+++ b/tools/xenstore/xenstored_transaction.c
@@ -116,6 +116,7 @@ static int destroy_transaction(void *_transaction)
 {
 	struct transaction *trans = _transaction;
 
+	wrl_ntransactions--;
 	trace_destroy(trans, "transaction");
 	if (trans->tdb)
 		tdb_close(trans->tdb);
@@ -179,6 +180,7 @@ void do_transaction_start(struct connection *conn, struct buffered_data *in)
 	talloc_steal(conn, trans);
 	talloc_set_destructor(trans, destroy_transaction);
 	conn->transaction_started++;
+	wrl_ntransactions++;
 
 	snprintf(id_str, sizeof(id_str), "%u", trans->id);
 	send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
@@ -213,6 +215,9 @@ void do_transaction_end(struct connection *conn, const char *arg)
 			send_error(conn, EAGAIN);
 			return;
 		}
+
+		wrl_apply_debit_trans_commit(conn);
+
 		if (!replace_tdb(trans->tdb_name, trans->tdb)) {
 			send_error(conn, errno);
 			return;
-- 
2.1.4

