From 2f634be5d8c29c56a16a6ddfcc13cbc3c2c92ef7 Mon Sep 17 00:00:00 2001 From: Salvatore Bonaccorso Date: Sun, 3 Dec 2017 10:51:49 +0100 Subject: [PATCH] xen/time: do not decrease steal time after live migration on xen Closes: #871608 --- debian/changelog | 2 + ...decrease-steal-time-after-live-migra.patch | 200 ++++++++++++++++++ debian/patches/series | 1 + 3 files changed, 203 insertions(+) create mode 100644 debian/patches/bugfix/all/xen-time-do-not-decrease-steal-time-after-live-migra.patch diff --git a/debian/changelog b/debian/changelog index 28013c577..5a85a9c6f 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,6 +1,8 @@ linux (4.14.2-2) UNRELEASED; urgency=medium * Add ABI reference for 4.14.0-1 + * xen/time: do not decrease steal time after live migration on xen + (Closes: #871608) -- Salvatore Bonaccorso Sun, 03 Dec 2017 10:18:39 +0100 diff --git a/debian/patches/bugfix/all/xen-time-do-not-decrease-steal-time-after-live-migra.patch b/debian/patches/bugfix/all/xen-time-do-not-decrease-steal-time-after-live-migra.patch new file mode 100644 index 000000000..b5382d09f --- /dev/null +++ b/debian/patches/bugfix/all/xen-time-do-not-decrease-steal-time-after-live-migra.patch @@ -0,0 +1,200 @@ +From: Dongli Zhang +Date: Wed, 1 Nov 2017 09:46:33 +0800 +Subject: xen/time: do not decrease steal time after live migration on xen +Origin: https://git.kernel.org/linus/5e25f5db6abb96ca8ee2aaedcb863daa6dfcc07a +Bug-Debian: https://bugs.debian.org/871608 + +After guest live migration on xen, steal time in /proc/stat +(cpustat[CPUTIME_STEAL]) might decrease because steal returned by +xen_steal_lock() might be less than this_rq()->prev_steal_time which is +derived from previous return value of xen_steal_clock(). + +For instance, steal time of each vcpu is 335 before live migration. + +cpu 198 0 368 200064 1962 0 0 1340 0 0 +cpu0 38 0 81 50063 492 0 0 335 0 0 +cpu1 65 0 97 49763 634 0 0 335 0 0 +cpu2 38 0 81 50098 462 0 0 335 0 0 +cpu3 56 0 107 50138 374 0 0 335 0 0 + +After live migration, steal time is reduced to 312. + +cpu 200 0 370 200330 1971 0 0 1248 0 0 +cpu0 38 0 82 50123 500 0 0 312 0 0 +cpu1 65 0 97 49832 634 0 0 312 0 0 +cpu2 39 0 82 50167 462 0 0 312 0 0 +cpu3 56 0 107 50207 374 0 0 312 0 0 + +Since runstate times are cumulative and cleared during xen live migration +by xen hypervisor, the idea of this patch is to accumulate runstate times +to global percpu variables before live migration suspend. Once guest VM is +resumed, xen_get_runstate_snapshot_cpu() would always return the sum of new +runstate times and previously accumulated times stored in global percpu +variables. + +Comment above HYPERVISOR_suspend() has been removed as it is inaccurate: +the call can return an error code (e.g., possibly -EPERM in the future). + +Similar and more severe issue would impact prior linux 4.8-4.10 as +discussed by Michael Las at +https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest, +which would overflow steal time and lead to 100% st usage in top command +for linux 4.8-4.10. A backport of this patch would fix that issue. + +[boris: added linux/slab.h to driver/xen/time.c, slightly reformatted + commit message] + +References: https://0xstubs.org/debugging-a-flaky-cpu-steal-time-counter-on-a-paravirtualized-xen-guest +Signed-off-by: Dongli Zhang +Reviewed-by: Boris Ostrovsky +Signed-off-by: Boris Ostrovsky +--- + drivers/xen/manage.c | 7 ++--- + drivers/xen/time.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++-- + include/xen/xen-ops.h | 1 + + 3 files changed, 73 insertions(+), 7 deletions(-) + +diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c +index c425d03d37d2..8835065029d3 100644 +--- a/drivers/xen/manage.c ++++ b/drivers/xen/manage.c +@@ -72,18 +72,15 @@ static int xen_suspend(void *data) + } + + gnttab_suspend(); ++ xen_manage_runstate_time(-1); + xen_arch_pre_suspend(); + +- /* +- * This hypercall returns 1 if suspend was cancelled +- * or the domain was merely checkpointed, and 0 if it +- * is resuming in a new domain. +- */ + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_gfn(xen_start_info) + : 0); + + xen_arch_post_suspend(si->cancelled); ++ xen_manage_runstate_time(si->cancelled ? 1 : 0); + gnttab_resume(); + + if (!si->cancelled) { +diff --git a/drivers/xen/time.c b/drivers/xen/time.c +index ac5f23fcafc2..8c46f555d82a 100644 +--- a/drivers/xen/time.c ++++ b/drivers/xen/time.c +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -19,6 +20,8 @@ + /* runstate info updated by Xen */ + static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); + ++static DEFINE_PER_CPU(u64[4], old_runstate_time); ++ + /* return an consistent snapshot of 64-bit time/counter value */ + static u64 get64(const u64 *p) + { +@@ -47,8 +50,8 @@ static u64 get64(const u64 *p) + return ret; + } + +-static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, +- unsigned int cpu) ++static void xen_get_runstate_snapshot_cpu_delta( ++ struct vcpu_runstate_info *res, unsigned int cpu) + { + u64 state_time; + struct vcpu_runstate_info *state; +@@ -66,6 +69,71 @@ static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, + (state_time & XEN_RUNSTATE_UPDATE)); + } + ++static void xen_get_runstate_snapshot_cpu(struct vcpu_runstate_info *res, ++ unsigned int cpu) ++{ ++ int i; ++ ++ xen_get_runstate_snapshot_cpu_delta(res, cpu); ++ ++ for (i = 0; i < 4; i++) ++ res->time[i] += per_cpu(old_runstate_time, cpu)[i]; ++} ++ ++void xen_manage_runstate_time(int action) ++{ ++ static struct vcpu_runstate_info *runstate_delta; ++ struct vcpu_runstate_info state; ++ int cpu, i; ++ ++ switch (action) { ++ case -1: /* backup runstate time before suspend */ ++ if (unlikely(runstate_delta)) ++ pr_warn_once("%s: memory leak as runstate_delta is not NULL\n", ++ __func__); ++ ++ runstate_delta = kmalloc_array(num_possible_cpus(), ++ sizeof(*runstate_delta), ++ GFP_ATOMIC); ++ if (unlikely(!runstate_delta)) { ++ pr_warn("%s: failed to allocate runstate_delta\n", ++ __func__); ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ xen_get_runstate_snapshot_cpu_delta(&state, cpu); ++ memcpy(runstate_delta[cpu].time, state.time, ++ sizeof(runstate_delta[cpu].time)); ++ } ++ ++ break; ++ ++ case 0: /* backup runstate time after resume */ ++ if (unlikely(!runstate_delta)) { ++ pr_warn("%s: cannot accumulate runstate time as runstate_delta is NULL\n", ++ __func__); ++ return; ++ } ++ ++ for_each_possible_cpu(cpu) { ++ for (i = 0; i < 4; i++) ++ per_cpu(old_runstate_time, cpu)[i] += ++ runstate_delta[cpu].time[i]; ++ } ++ ++ break; ++ ++ default: /* do not accumulate runstate time for checkpointing */ ++ break; ++ } ++ ++ if (action != -1 && runstate_delta) { ++ kfree(runstate_delta); ++ runstate_delta = NULL; ++ } ++} ++ + /* + * Runstate accounting + */ +diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h +index 218e6aae5433..09072271f122 100644 +--- a/include/xen/xen-ops.h ++++ b/include/xen/xen-ops.h +@@ -32,6 +32,7 @@ void xen_resume_notifier_unregister(struct notifier_block *nb); + bool xen_vcpu_stolen(int vcpu); + void xen_setup_runstate_info(int cpu); + void xen_time_setup_guest(void); ++void xen_manage_runstate_time(int action); + void xen_get_runstate_snapshot(struct vcpu_runstate_info *res); + u64 xen_steal_clock(int cpu); + +-- +2.15.1 + diff --git a/debian/patches/series b/debian/patches/series index 167fb4731..de08c5d2e 100644 --- a/debian/patches/series +++ b/debian/patches/series @@ -81,6 +81,7 @@ bugfix/all/kbuild-include-addtree-remove-quotes-before-matching-path.patch bugfix/all/i40e-i40evf-organize-and-re-number-feature-flags.patch bugfix/all/i40e-fix-flags-declaration.patch bugfix/all/apparmor-fix-oops-in-audit_signal_cb-hook.patch +bugfix/all/xen-time-do-not-decrease-steal-time-after-live-migra.patch # Miscellaneous features