All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
From: Sean Christopherson <seanjc@google.com>
To: Oliver Upton <oupton@google.com>
Cc: kvm@vger.kernel.org, kvmarm@lists.cs.columbia.edu,
	Paolo Bonzini <pbonzini@redhat.com>,
	Marc Zyngier <maz@kernel.org>, Peter Shier <pshier@google.com>,
	Jim Mattson <jmattson@google.com>,
	David Matlack <dmatlack@google.com>,
	Ricardo Koller <ricarkol@google.com>,
	Jing Zhang <jingzhangos@google.com>,
	Raghavendra Rao Anata <rananta@google.com>,
	James Morse <james.morse@arm.com>,
	Alexandru Elisei <alexandru.elisei@arm.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	linux-arm-kernel@lists.infradead.org,
	Andrew Jones <drjones@redhat.com>
Subject: Re: [PATCH v5 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK
Date: Fri, 30 Jul 2021 17:48:46 +0000	[thread overview]
Message-ID: <YQQ7fuOhSoJVfkYn@google.com> (raw)
In-Reply-To: <20210729173300.181775-2-oupton@google.com>

On Thu, Jul 29, 2021, Oliver Upton wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 916c976e99ab..e052c7afaac4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +/*
> + * Returns true if realtime and TSC values were written back to the caller.
> + * Returns false if a clock triplet cannot be obtained, such as if the host's
> + * realtime clock is not based on the TSC.
> + */
> +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> +				      u64 *realtime_ns, u64 *tsc)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
>  	unsigned long flags;
> -	u64 ret;
> +	bool ret = false;
>  
>  	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>  	if (!ka->use_master_clock) {
>  		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		return false;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
> @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>       get_cpu();
>
>       if (__this_cpu_read(cpu_tsc_khz)) {
> +             struct timespec64 ts;
> +             u64 tsc_val;
> +
>               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                  &hv_clock.tsc_shift,
>                                  &hv_clock.tsc_to_system_mul);
> -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> +
> +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                     *tsc = tsc_val;
> +                     ret = true;
> +             }
> +
> +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);

This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
returns false.  This also straight up fails to compile on 32-bit kernels because
kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

A gross way to resolve both issues would be (see below regarding 'data'):

	if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		data->host_tsc = rdtsc();

		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);

		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}


>       } else

Not your code, but this needs braces.

> -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>       put_cpu();
>
>       return ret;
>  } 

...

> @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>  
> +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_clock_data data;
> +
> +	memset(&data, 0, sizeof(data));
> +
> +	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> +				      &data.host_tsc))
> +		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +
> +	if (kvm->arch.use_master_clock)
> +		data.flags |= KVM_CLOCK_TSC_STABLE;

I know nothing about the actual behavior, but this appears to have a potential
race (though it came from the existing code).  get_kvmclock_and_realtime() checks
arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
that's functionally ok, it's a needless complication.

Fixing that weirdness would also provide an opportunity to clean up the API,
e.g. the boolean return is not exactly straightforward.  The simplest approach
would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
instead of multiple params.  Then that helper can set the flags accordingly, thus
avoiding the question of whether or not there's a race and eliminating the boolean
return.  E.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..872a53a7c467 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-/*
- * Returns true if realtime and TSC values were written back to the caller.
- * Returns false if a clock triplet cannot be obtained, such as if the host's
- * realtime clock is not based on the TSC.
- */
-static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
-                                     u64 *realtime_ns, u64 *tsc)
+static void get_kvmclock_and_realtime(struct kvm *kvm,
+                                     struct kvm_clock_data *data)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned long flags;
-       bool ret = false;
 
        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
-               return false;
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               return;
        }
+       data->flags |= KVM_CLOCK_TSC_STABLE;
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
@@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
        get_cpu();
 
        if (__this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
                struct timespec64 ts;
-               u64 tsc_val;
+
+               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+               } else
+#endif
+               data->host_tsc = rdtsc();
 
                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
 
-               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
-                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
-                       *tsc = tsc_val;
-                       ret = true;
-               }
-
-               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
-       } else
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+       } else {
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
 
        put_cpu();
-
-       return ret;
 }
 
 u64 get_kvmclock_ns(struct kvm *kvm)
 {
-       u64 kvmclock_ns, realtime_ns, tsc;
+       struct kvm_clock_data data;
 
-       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
-       return kvmclock_ns;
+       /*
+        * Zero flags as it's accessed RMW, leave everything else uninitialized
+        * as clock is always written and no other fields are consumed.
+        */
+       data.flags = 0;
+
+       get_kvmclock_and_realtime(kvm, &data);
+       return data.clock;
 }
 
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
 
        memset(&data, 0, sizeof(data));
 
-       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
-                                     &data.host_tsc))
-               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
-
-       if (kvm->arch.use_master_clock)
-               data.flags |= KVM_CLOCK_TSC_STABLE;
+       get_kvmclock_and_realtime(kvm, &data);
 
        if (copy_to_user(argp, &data, sizeof(data)))
                return -EFAULT;

> +
> +	if (copy_to_user(argp, &data, sizeof(data)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	struct kvm_clock_data data;
> +	u64 now_raw_ns;
> +
> +	if (copy_from_user(&data, argp, sizeof(data)))
> +		return -EFAULT;
> +
> +	if (data.flags & ~KVM_CLOCK_REALTIME)
> +		return -EINVAL;

Huh, this an odd ABI, the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
The existing code has the same behavior, so apparently it's ok, just odd.

> +
> +	/*
> +	 * TODO: userspace has to take care of races with VCPU_RUN, so
> +	 * kvm_gen_update_masterclock() can be cut down to locked
> +	 * pvclock_update_vm_gtod_copy().
> +	 */
> +	kvm_gen_update_masterclock(kvm);
> +
> +	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	if (data.flags & KVM_CLOCK_REALTIME) {
> +		u64 now_real_ns = ktime_get_real_ns();
> +
> +		/*
> +		 * Avoid stepping the kvmclock backwards.
> +		 */
> +		if (now_real_ns > data.realtime)
> +			data.clock += now_real_ns - data.realtime;
> +	}
> +
> +	if (ka->use_master_clock)
> +		now_raw_ns = ka->master_kernel_ns;
> +	else
> +		now_raw_ns = get_kvmclock_base_ns();
> +	ka->kvmclock_offset = data.clock - now_raw_ns;
> +	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> +	return 0;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_SET_CLOCK: {

The curly braces can be dropped, both here and in KVM_GET_CLOCK.

>  	}
>  	case KVM_GET_CLOCK: {

...

>  	}

WARNING: multiple messages have this Message-ID (diff)
From: Sean Christopherson <seanjc@google.com>
To: Oliver Upton <oupton@google.com>
Cc: kvm@vger.kernel.org, kvmarm@lists.cs.columbia.edu,
	Paolo Bonzini <pbonzini@redhat.com>,
	Marc Zyngier <maz@kernel.org>, Peter Shier <pshier@google.com>,
	Jim Mattson <jmattson@google.com>,
	David Matlack <dmatlack@google.com>,
	Ricardo Koller <ricarkol@google.com>,
	Jing Zhang <jingzhangos@google.com>,
	Raghavendra Rao Anata <rananta@google.com>,
	James Morse <james.morse@arm.com>,
	Alexandru Elisei <alexandru.elisei@arm.com>,
	Suzuki K Poulose <suzuki.poulose@arm.com>,
	linux-arm-kernel@lists.infradead.org,
	Andrew Jones <drjones@redhat.com>
Subject: Re: [PATCH v5 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK
Date: Fri, 30 Jul 2021 17:48:46 +0000	[thread overview]
Message-ID: <YQQ7fuOhSoJVfkYn@google.com> (raw)
In-Reply-To: <20210729173300.181775-2-oupton@google.com>

On Thu, Jul 29, 2021, Oliver Upton wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 916c976e99ab..e052c7afaac4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +/*
> + * Returns true if realtime and TSC values were written back to the caller.
> + * Returns false if a clock triplet cannot be obtained, such as if the host's
> + * realtime clock is not based on the TSC.
> + */
> +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> +				      u64 *realtime_ns, u64 *tsc)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
>  	unsigned long flags;
> -	u64 ret;
> +	bool ret = false;
>  
>  	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>  	if (!ka->use_master_clock) {
>  		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		return false;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
> @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>       get_cpu();
>
>       if (__this_cpu_read(cpu_tsc_khz)) {
> +             struct timespec64 ts;
> +             u64 tsc_val;
> +
>               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                  &hv_clock.tsc_shift,
>                                  &hv_clock.tsc_to_system_mul);
> -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> +
> +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                     *tsc = tsc_val;
> +                     ret = true;
> +             }
> +
> +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);

This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
returns false.  This also straight up fails to compile on 32-bit kernels because
kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

A gross way to resolve both issues would be (see below regarding 'data'):

	if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		data->host_tsc = rdtsc();

		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);

		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}


>       } else

Not your code, but this needs braces.

> -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>       put_cpu();
>
>       return ret;
>  } 

...

> @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>  
> +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_clock_data data;
> +
> +	memset(&data, 0, sizeof(data));
> +
> +	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> +				      &data.host_tsc))
> +		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +
> +	if (kvm->arch.use_master_clock)
> +		data.flags |= KVM_CLOCK_TSC_STABLE;

I know nothing about the actual behavior, but this appears to have a potential
race (though it came from the existing code).  get_kvmclock_and_realtime() checks
arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
that's functionally ok, it's a needless complication.

Fixing that weirdness would also provide an opportunity to clean up the API,
e.g. the boolean return is not exactly straightforward.  The simplest approach
would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
instead of multiple params.  Then that helper can set the flags accordingly, thus
avoiding the question of whether or not there's a race and eliminating the boolean
return.  E.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..872a53a7c467 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-/*
- * Returns true if realtime and TSC values were written back to the caller.
- * Returns false if a clock triplet cannot be obtained, such as if the host's
- * realtime clock is not based on the TSC.
- */
-static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
-                                     u64 *realtime_ns, u64 *tsc)
+static void get_kvmclock_and_realtime(struct kvm *kvm,
+                                     struct kvm_clock_data *data)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned long flags;
-       bool ret = false;
 
        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
-               return false;
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               return;
        }
+       data->flags |= KVM_CLOCK_TSC_STABLE;
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
@@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
        get_cpu();
 
        if (__this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
                struct timespec64 ts;
-               u64 tsc_val;
+
+               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+               } else
+#endif
+               data->host_tsc = rdtsc();
 
                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
 
-               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
-                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
-                       *tsc = tsc_val;
-                       ret = true;
-               }
-
-               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
-       } else
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+       } else {
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
 
        put_cpu();
-
-       return ret;
 }
 
 u64 get_kvmclock_ns(struct kvm *kvm)
 {
-       u64 kvmclock_ns, realtime_ns, tsc;
+       struct kvm_clock_data data;
 
-       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
-       return kvmclock_ns;
+       /*
+        * Zero flags as it's accessed RMW, leave everything else uninitialized
+        * as clock is always written and no other fields are consumed.
+        */
+       data.flags = 0;
+
+       get_kvmclock_and_realtime(kvm, &data);
+       return data.clock;
 }
 
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
 
        memset(&data, 0, sizeof(data));
 
-       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
-                                     &data.host_tsc))
-               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
-
-       if (kvm->arch.use_master_clock)
-               data.flags |= KVM_CLOCK_TSC_STABLE;
+       get_kvmclock_and_realtime(kvm, &data);
 
        if (copy_to_user(argp, &data, sizeof(data)))
                return -EFAULT;

> +
> +	if (copy_to_user(argp, &data, sizeof(data)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	struct kvm_clock_data data;
> +	u64 now_raw_ns;
> +
> +	if (copy_from_user(&data, argp, sizeof(data)))
> +		return -EFAULT;
> +
> +	if (data.flags & ~KVM_CLOCK_REALTIME)
> +		return -EINVAL;

Huh, this an odd ABI, the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
The existing code has the same behavior, so apparently it's ok, just odd.

> +
> +	/*
> +	 * TODO: userspace has to take care of races with VCPU_RUN, so
> +	 * kvm_gen_update_masterclock() can be cut down to locked
> +	 * pvclock_update_vm_gtod_copy().
> +	 */
> +	kvm_gen_update_masterclock(kvm);
> +
> +	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	if (data.flags & KVM_CLOCK_REALTIME) {
> +		u64 now_real_ns = ktime_get_real_ns();
> +
> +		/*
> +		 * Avoid stepping the kvmclock backwards.
> +		 */
> +		if (now_real_ns > data.realtime)
> +			data.clock += now_real_ns - data.realtime;
> +	}
> +
> +	if (ka->use_master_clock)
> +		now_raw_ns = ka->master_kernel_ns;
> +	else
> +		now_raw_ns = get_kvmclock_base_ns();
> +	ka->kvmclock_offset = data.clock - now_raw_ns;
> +	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> +	return 0;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_SET_CLOCK: {

The curly braces can be dropped, both here and in KVM_GET_CLOCK.

>  	}
>  	case KVM_GET_CLOCK: {

...

>  	}

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

WARNING: multiple messages have this Message-ID (diff)
From: Sean Christopherson <seanjc@google.com>
To: Oliver Upton <oupton@google.com>
Cc: kvm@vger.kernel.org, Marc Zyngier <maz@kernel.org>,
	Peter Shier <pshier@google.com>,
	Raghavendra Rao Anata <rananta@google.com>,
	David Matlack <dmatlack@google.com>,
	Paolo Bonzini <pbonzini@redhat.com>,
	kvmarm@lists.cs.columbia.edu,
	linux-arm-kernel@lists.infradead.org,
	Jim Mattson <jmattson@google.com>
Subject: Re: [PATCH v5 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK
Date: Fri, 30 Jul 2021 17:48:46 +0000	[thread overview]
Message-ID: <YQQ7fuOhSoJVfkYn@google.com> (raw)
In-Reply-To: <20210729173300.181775-2-oupton@google.com>

On Thu, Jul 29, 2021, Oliver Upton wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 916c976e99ab..e052c7afaac4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +/*
> + * Returns true if realtime and TSC values were written back to the caller.
> + * Returns false if a clock triplet cannot be obtained, such as if the host's
> + * realtime clock is not based on the TSC.
> + */
> +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> +				      u64 *realtime_ns, u64 *tsc)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
>  	unsigned long flags;
> -	u64 ret;
> +	bool ret = false;
>  
>  	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>  	if (!ka->use_master_clock) {
>  		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		return false;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
> @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>       get_cpu();
>
>       if (__this_cpu_read(cpu_tsc_khz)) {
> +             struct timespec64 ts;
> +             u64 tsc_val;
> +
>               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                  &hv_clock.tsc_shift,
>                                  &hv_clock.tsc_to_system_mul);
> -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> +
> +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                     *tsc = tsc_val;
> +                     ret = true;
> +             }
> +
> +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);

This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
returns false.  This also straight up fails to compile on 32-bit kernels because
kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

A gross way to resolve both issues would be (see below regarding 'data'):

	if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		data->host_tsc = rdtsc();

		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);

		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}


>       } else

Not your code, but this needs braces.

> -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>       put_cpu();
>
>       return ret;
>  } 

...

> @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>  
> +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_clock_data data;
> +
> +	memset(&data, 0, sizeof(data));
> +
> +	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> +				      &data.host_tsc))
> +		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +
> +	if (kvm->arch.use_master_clock)
> +		data.flags |= KVM_CLOCK_TSC_STABLE;

I know nothing about the actual behavior, but this appears to have a potential
race (though it came from the existing code).  get_kvmclock_and_realtime() checks
arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
that's functionally ok, it's a needless complication.

Fixing that weirdness would also provide an opportunity to clean up the API,
e.g. the boolean return is not exactly straightforward.  The simplest approach
would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
instead of multiple params.  Then that helper can set the flags accordingly, thus
avoiding the question of whether or not there's a race and eliminating the boolean
return.  E.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..872a53a7c467 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-/*
- * Returns true if realtime and TSC values were written back to the caller.
- * Returns false if a clock triplet cannot be obtained, such as if the host's
- * realtime clock is not based on the TSC.
- */
-static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
-                                     u64 *realtime_ns, u64 *tsc)
+static void get_kvmclock_and_realtime(struct kvm *kvm,
+                                     struct kvm_clock_data *data)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned long flags;
-       bool ret = false;
 
        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
-               return false;
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               return;
        }
+       data->flags |= KVM_CLOCK_TSC_STABLE;
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
@@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
        get_cpu();
 
        if (__this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
                struct timespec64 ts;
-               u64 tsc_val;
+
+               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+               } else
+#endif
+               data->host_tsc = rdtsc();
 
                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
 
-               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
-                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
-                       *tsc = tsc_val;
-                       ret = true;
-               }
-
-               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
-       } else
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+       } else {
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
 
        put_cpu();
-
-       return ret;
 }
 
 u64 get_kvmclock_ns(struct kvm *kvm)
 {
-       u64 kvmclock_ns, realtime_ns, tsc;
+       struct kvm_clock_data data;
 
-       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
-       return kvmclock_ns;
+       /*
+        * Zero flags as it's accessed RMW, leave everything else uninitialized
+        * as clock is always written and no other fields are consumed.
+        */
+       data.flags = 0;
+
+       get_kvmclock_and_realtime(kvm, &data);
+       return data.clock;
 }
 
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
 
        memset(&data, 0, sizeof(data));
 
-       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
-                                     &data.host_tsc))
-               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
-
-       if (kvm->arch.use_master_clock)
-               data.flags |= KVM_CLOCK_TSC_STABLE;
+       get_kvmclock_and_realtime(kvm, &data);
 
        if (copy_to_user(argp, &data, sizeof(data)))
                return -EFAULT;

> +
> +	if (copy_to_user(argp, &data, sizeof(data)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	struct kvm_clock_data data;
> +	u64 now_raw_ns;
> +
> +	if (copy_from_user(&data, argp, sizeof(data)))
> +		return -EFAULT;
> +
> +	if (data.flags & ~KVM_CLOCK_REALTIME)
> +		return -EINVAL;

Huh, this an odd ABI, the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
The existing code has the same behavior, so apparently it's ok, just odd.

> +
> +	/*
> +	 * TODO: userspace has to take care of races with VCPU_RUN, so
> +	 * kvm_gen_update_masterclock() can be cut down to locked
> +	 * pvclock_update_vm_gtod_copy().
> +	 */
> +	kvm_gen_update_masterclock(kvm);
> +
> +	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	if (data.flags & KVM_CLOCK_REALTIME) {
> +		u64 now_real_ns = ktime_get_real_ns();
> +
> +		/*
> +		 * Avoid stepping the kvmclock backwards.
> +		 */
> +		if (now_real_ns > data.realtime)
> +			data.clock += now_real_ns - data.realtime;
> +	}
> +
> +	if (ka->use_master_clock)
> +		now_raw_ns = ka->master_kernel_ns;
> +	else
> +		now_raw_ns = get_kvmclock_base_ns();
> +	ka->kvmclock_offset = data.clock - now_raw_ns;
> +	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> +	return 0;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_SET_CLOCK: {

The curly braces can be dropped, both here and in KVM_GET_CLOCK.

>  	}
>  	case KVM_GET_CLOCK: {

...

>  	}
_______________________________________________
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm

  reply	other threads:[~2021-07-30 17:48 UTC|newest]

Thread overview: 81+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-07-29 17:32 [PATCH v5 00/13] KVM: Add idempotent controls for migrating system counter state Oliver Upton
2021-07-29 17:32 ` Oliver Upton
2021-07-29 17:32 ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-30 17:48   ` Sean Christopherson [this message]
2021-07-30 17:48     ` Sean Christopherson
2021-07-30 17:48     ` Sean Christopherson
2021-07-30 18:24     ` Oliver Upton
2021-07-30 18:24       ` Oliver Upton
2021-07-30 18:24       ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 02/13] KVM: x86: Refactor tsc synchronization code Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-30 18:08   ` Sean Christopherson
2021-07-30 18:08     ` Sean Christopherson
2021-07-30 18:08     ` Sean Christopherson
2021-08-03 21:18     ` Oliver Upton
2021-08-03 21:18       ` Oliver Upton
2021-08-03 21:18       ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 03/13] KVM: x86: Expose TSC offset controls to userspace Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-30 18:34   ` Sean Christopherson
2021-07-30 18:34     ` Sean Christopherson
2021-07-30 18:34     ` Sean Christopherson
2021-07-29 17:32 ` [PATCH v5 04/13] tools: arch: x86: pull in pvclock headers Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 05/13] selftests: KVM: Add test for KVM_{GET,SET}_CLOCK Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 06/13] selftests: KVM: Fix kvm device helper ioctl assertions Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 07/13] selftests: KVM: Add helpers for vCPU device attributes Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 08/13] selftests: KVM: Introduce system counter offset test Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 09/13] KVM: arm64: Allow userspace to configure a vCPU's virtual offset Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-30 10:12   ` Marc Zyngier
2021-07-30 10:12     ` Marc Zyngier
2021-07-30 10:12     ` Marc Zyngier
2021-08-02 23:27     ` Oliver Upton
2021-08-02 23:27       ` Oliver Upton
2021-08-02 23:27       ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 10/13] selftests: KVM: Add support for aarch64 to system_counter_offset_test Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 11/13] KVM: arm64: Provide userspace access to the physical counter offset Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-30 11:08   ` Marc Zyngier
2021-07-30 11:08     ` Marc Zyngier
2021-07-30 11:08     ` Marc Zyngier
2021-07-30 15:22     ` Oliver Upton
2021-07-30 15:22       ` Oliver Upton
2021-07-30 15:22       ` Oliver Upton
2021-07-30 16:17       ` Marc Zyngier
2021-07-30 16:17         ` Marc Zyngier
2021-07-30 16:17         ` Marc Zyngier
2021-07-30 16:48         ` Oliver Upton
2021-07-30 16:48           ` Oliver Upton
2021-07-30 16:48           ` Oliver Upton
2021-08-04  6:59           ` Oliver Upton
2021-08-04  6:59             ` Oliver Upton
2021-08-04  6:59             ` Oliver Upton
2021-07-29 17:32 ` [PATCH v5 12/13] selftests: KVM: Test physical counter offsetting Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:32   ` Oliver Upton
2021-07-29 17:33 ` [PATCH v5 13/13] selftests: KVM: Add counter emulation benchmark Oliver Upton
2021-07-29 17:33   ` Oliver Upton
2021-07-29 17:33   ` Oliver Upton
2021-07-29 17:45   ` Andrew Jones
2021-07-29 17:45     ` Andrew Jones
2021-07-29 17:45     ` Andrew Jones

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YQQ7fuOhSoJVfkYn@google.com \
    --to=seanjc@google.com \
    --cc=alexandru.elisei@arm.com \
    --cc=dmatlack@google.com \
    --cc=drjones@redhat.com \
    --cc=james.morse@arm.com \
    --cc=jingzhangos@google.com \
    --cc=jmattson@google.com \
    --cc=kvm@vger.kernel.org \
    --cc=kvmarm@lists.cs.columbia.edu \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=maz@kernel.org \
    --cc=oupton@google.com \
    --cc=pbonzini@redhat.com \
    --cc=pshier@google.com \
    --cc=rananta@google.com \
    --cc=ricarkol@google.com \
    --cc=suzuki.poulose@arm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.