[Intel-gfx] freedesktop bug id: 100548, bisected to sched/clock commit
Lofstedt, Marta
marta.lofstedt at intel.com
Tue Apr 18 14:10:07 UTC 2017
> -----Original Message-----
> From: Peter Zijlstra [mailto:peterz at infradead.org]
> Sent: Thursday, April 13, 2017 4:24 PM
> To: Martin Peres <martin.peres at linux.intel.com>
> Cc: Lofstedt, Marta <marta.lofstedt at intel.com>;
> pasha.tatashin at oracle.com; intel-gfx at lists.freedesktop.org; Thomas
> Gleixner <tglx at linutronix.de>
> Subject: Re: freedesktop bug id: 100548, bisected to sched/clock commit
>
> On Thu, Apr 13, 2017 at 03:30:25PM +0300, Martin Peres wrote:
> > On 13/04/17 14:48, Peter Zijlstra wrote:
> > > On Wed, Apr 12, 2017 at 05:49:53PM +0300, Martin Peres wrote:
> > >
> > > > Good to know. Is there a way to disable this behaviour, as a
> > > > workaround for our CI system until a proper fix lands? We already
> > > > pushed locally the revert for this patch, but that may affect
> > > > other platforms which do not exhibit the problem.
> > >
> > > Blergh, so the patch is correct, but the __gtod_offset calculation
> > > is fed with absolute crap numbers due to 'circumstances' and then
> > > using it ends up being worse than not using it.
> >
> > Thanks for taking this bug seriously!
>
> I've not actually dug out a Core2 machine, so I have only tested this by
> poking random values into the TSC MSR on an otherwise 'good' machine.
>
> Could you give it a go to see if it works for you?
Sorry Peter, I still see the regression on the Core2 machine with your patch.
/Marta
>
> Thomas, how much hate?
>
> ---
> Subject: sched/clock,x86/tsc: Improve clock continuity for stable->unstable transition
> From: Peter Zijlstra <peterz at infradead.org>
> Date: Thu Apr 13 14:56:44 CEST 2017
>
> Marta reported that commit:
>
> 7b09cc5a9deb ("sched/clock: Fix broken stable to unstable transfer")
>
> appeared to have broken things on a Core2Duo machine. While that patch is
> in fact correct, it exposes a problem with commit:
>
> 5680d8094ffa ("sched/clock: Provide better clock continuity")
>
> where we hoped that the TSC would not make big jumps after SMP bringup.
> Of course, the TSC had to prove us wrong: Core2 comes up with a
> semi-stable TSC that only goes wonky once we probe the idle drivers,
> because Core2 stops the TSC on idle.
>
> Now we could of course delay the final switch to stable for longer, but it
> would be better to entirely remove the assumption that the TSC doesn't
> make big jumps and improve things all-round.
>
> So instead we have the clocksource watchdog call a special function when it
> finds the TSC is still good (there is a race: it could have gone bad between
> us determining it is still good and calling our function; do we care?).
>
> This function then updates __gtod_offset using sane values; that offset is
> exactly the value needed for clock continuity when the clock is later
> marked unstable.
>
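To make the continuity argument concrete (an illustrative sketch, not part
of the patch): while the clock is stable, sched_clock_cpu() returns roughly
sched_clock() + __sched_clock_offset; once unstable, it falls back to a
value based on tick_gtod + __gtod_offset. Keeping the two equal at the
switch-over point is the condition the patch maintains:

        /*
         * Illustrative only: the continuity condition. Here "clock" stands
         * for the raw sched_clock()/TSC value and "gtod" for ktime_get_ns();
         * sched_clock_tick_stable() below is the real thing.
         */
        __gtod_offset = (clock + __sched_clock_offset) - gtod;

The point of the watchdog hook is that this offset gets computed from
values sampled while the TSC is known to be good, rather than after it has
already gone bad.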
> Cc: Pavel Tatashin <pasha.tatashin at oracle.com>
> Cc: Martin Peres <martin.peres at linux.intel.com>
> Reported-by: "Lofstedt, Marta" <marta.lofstedt at intel.com>
> Fixes: 5680d8094ffa ("sched/clock: Provide better clock continuity")
> Signed-off-by: Peter Zijlstra (Intel) <peterz at infradead.org>
> ---
>  arch/x86/kernel/tsc.c       | 12 ++++++++++
>  include/linux/clocksource.h |  1 +
>  include/linux/sched/clock.h |  2 +-
>  kernel/sched/clock.c        | 50 ++++++++++++++++++++++++--------------------
>  kernel/time/clocksource.c   |  3 ++
>  5 files changed, 45 insertions(+), 23 deletions(-)
>
> --- a/arch/x86/kernel/tsc.c
> +++ b/arch/x86/kernel/tsc.c
> @@ -374,6 +374,8 @@ static int __init tsc_setup(char *str)
>                 tsc_clocksource_reliable = 1;
>         if (!strncmp(str, "noirqtime", 9))
>                 no_sched_irq_time = 1;
> +       if (!strcmp(str, "unstable"))
> +               mark_tsc_unstable("boot parameter");
>         return 1;
> }
>
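(An aside on the hunk above, not part of the patch description: with this
applied, booting with "tsc=unstable" on the kernel command line forces the
TSC unstable from the start, which should make the unstable path easy to
exercise on machines whose TSC is otherwise fine:

        # append to the kernel command line, then check dmesg; expect
        # something along the lines of:
        #   Marking TSC unstable due to boot parameter
        linux ... tsc=unstable

given the reason string the hunk passes to mark_tsc_unstable().)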
> @@ -1127,6 +1129,15 @@ static void tsc_cs_mark_unstable(struct clocksource *cs)
>         pr_info("Marking TSC unstable due to clocksource watchdog\n");
> }
>
> +static void tsc_cs_tick_stable(struct clocksource *cs)
> +{
> +       if (tsc_unstable)
> +               return;
> +
> +       if (using_native_sched_clock())
> +               sched_clock_tick_stable();
> +}
> +
> /*
>  * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc()
>  */
> @@ -1140,6 +1151,7 @@ static struct clocksource clocksource_tsc = {
>         .archdata       = { .vclock_mode = VCLOCK_TSC },
>         .resume         = tsc_resume,
>         .mark_unstable  = tsc_cs_mark_unstable,
> +       .tick_stable    = tsc_cs_tick_stable,
> };
>
> void mark_tsc_unstable(char *reason)
> --- a/include/linux/clocksource.h
> +++ b/include/linux/clocksource.h
> @@ -96,6 +96,7 @@ struct clocksource {
>         void (*suspend)(struct clocksource *cs);
>         void (*resume)(struct clocksource *cs);
>         void (*mark_unstable)(struct clocksource *cs);
> +       void (*tick_stable)(struct clocksource *cs);
>
>         /* private: */
> #ifdef CONFIG_CLOCKSOURCE_WATCHDOG
> --- a/include/linux/sched/clock.h
> +++ b/include/linux/sched/clock.h
> @@ -63,8 +63,8 @@ extern void clear_sched_clock_stable(void);
> */
> extern u64 __sched_clock_offset;
>
> -
> extern void sched_clock_tick(void);
> +extern void sched_clock_tick_stable(void);
> extern void sched_clock_idle_sleep_event(void);
> extern void sched_clock_idle_wakeup_event(u64 delta_ns);
>
> --- a/kernel/sched/clock.c
> +++ b/kernel/sched/clock.c
> @@ -152,25 +152,15 @@ static void __clear_sched_clock_stable(void)
> {
>         struct sched_clock_data *scd = this_scd();
>
> -       /*
> -        * Attempt to make the stable->unstable transition continuous.
> -        *
> -        * Trouble is, this is typically called from the TSC watchdog
> -        * timer, which is late per definition. This means the tick
> -        * values can already be screwy.
> -        *
> -        * Still do what we can.
> -        */
> -       __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);
> +       if (!sched_clock_stable())
> +               return;
>
>         printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
>                         scd->tick_gtod, __gtod_offset,
>                         scd->tick_raw, __sched_clock_offset);
>
>         tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
> -
> -       if (sched_clock_stable())
> -               schedule_work(&sched_clock_work);
> +       schedule_work(&sched_clock_work);
> }
>
> void clear_sched_clock_stable(void)
> @@ -347,21 +337,37 @@ void sched_clock_tick(void)
> {
>         struct sched_clock_data *scd;
>
> +       if (sched_clock_stable())
> +               return;
> +
> +       if (unlikely(!sched_clock_running))
> +               return;
> +
>         WARN_ON_ONCE(!irqs_disabled());
>
> -       /*
> -        * Update these values even if sched_clock_stable(), because it can
> -        * become unstable at any point in time at which point we need some
> -        * values to fall back on.
> -        *
> -        * XXX arguably we can skip this if we expose tsc_clocksource_reliable
> -        */
>         scd = this_scd();
>         scd->tick_raw = sched_clock();
>         scd->tick_gtod = ktime_get_ns();
> +       sched_clock_local(scd);
> +}
>
> -       if (!sched_clock_stable() && likely(sched_clock_running))
> -               sched_clock_local(scd);
> +void sched_clock_tick_stable(void)
> +{
> +       u64 gtod, clock;
> +
> +       if (!sched_clock_stable())
> +               return;
> +
> +       /*
> +        * Called under watchdog_lock.
> +        *
> +        * The watchdog just found this TSC to (still) be stable, so now is a
> +        * good moment to update our __gtod_offset. Because once we find the
> +        * TSC to be unstable, any computation will be computing crap.
> +        */
> +       gtod = ktime_get_ns();
> +       clock = sched_clock();
> +       __gtod_offset = (clock + __sched_clock_offset) - gtod;
> }
>
> /*
> --- a/kernel/time/clocksource.c
> +++ b/kernel/time/clocksource.c
> @@ -233,6 +233,9 @@ static void clocksource_watchdog(unsigned long data)
>                         continue;
>                 }
>
> +               if (cs == curr_clocksource && cs->tick_stable)
> +                       cs->tick_stable(cs);
> +
>                 if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
>                     (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
>                     (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
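Pulling the pieces together, the call chain on a healthy watchdog tick ends
up as follows (a sketch of the patched flow; the hook only fires for the
currently selected clocksource):

        clocksource_watchdog()              /* timer callback, watchdog_lock held */
            -> cs->tick_stable(cs)          /* only when cs == curr_clocksource */
                -> tsc_cs_tick_stable()     /* bails out if the TSC is already unstable */
                    -> sched_clock_tick_stable()  /* resamples and refreshes __gtod_offset */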