/*
 * This file contains a copy of cputime_adjust() and related functions from
 * linux-next (next-20150610), together with stubs and a driver to verify
 * and demonstrate some problems in the scaling mechanism as well as some
 * concurrency problems.
 *
 * The copied functions are instrumented with a call to emulate_preemption()
 * so that the concurrency problem can easily be demonstrated; other than
 * that they are identical to the ones in the kernel.
 *
 * Compile with: gcc -o testadjust testadjust.c
 * Run with:     ./testadjust
 * And expect:
 * 0.0 sum_exec=100000000000 utime=0   stime=1   => 0.0 tot=10000 user=0     sys=10000 =====> OK
 * 0.1 sum_exec=101000000000 utime=100 stime=1   => 0.1 tot=20000 user=10000 sys=10000 =====> FAIL
 *
 * 1.0 sum_exec=100000000000 utime=1   stime=0   => 1.0 tot=10000 user=10000 sys=0     =====> OK
 * 1.1 sum_exec=101000000000 utime=1   stime=100 => 1.1 tot=20000 user=10000 sys=10000 =====> FAIL
 *
 * 2.0 sum_exec=100000000000 utime=1   stime=1   => 2.0 tot=10000 user=5000  sys=5000  =====> OK
 * 2.1 sum_exec=101000000000 utime=100 stime=1   => 2.1 tot=15000 user=10000 sys=5000  =====> FAIL
 *
 * 3.0 sum_exec=100000000000 utime=1   stime=1   => <>
 * 3.1 sum_exec=101000000000 utime=100 stime=1   => 3.1 tot=10100 user=10000 sys=100   =====> OK
 * <>                                               3.0 tot=15000 user=10000 sys=5000  =====> FAIL
 *
 */

#include <stdio.h>
#include <stdint.h>

/* Use (or don't use) the fix proposed by peterz on lkml */
#define PETERZ_FIX 0

/*****************************************************************************
 *
 * The code below is stubs and implementations of macros and functions used
 * by cputime_adjust() and friends.
 *
 *****************************************************************************
 */

typedef uint32_t u32;
typedef uint64_t u64;
typedef uint64_t cputime_t;

/* Some macro definitions used by cputime_adjust() and friends, copied or stubbed */
#define __force

#define swap(a, b) \
	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

/* No real concurrency, so we can use a trivial implementation. */
#define READ_ONCE(x) (x)

/* Structs copied from kernel code */
struct cputime {
	cputime_t utime;
	cputime_t stime;
};

struct task_cputime {
	cputime_t utime;
	cputime_t stime;
	unsigned long long sum_exec_runtime;
};

/* We are running in a standard glibc execution environment, so just rely on
 * the standard 64 bit arithmetic. */
u64 div_u64(u64 a, u64 b)
{
	return a / b;
}

/* We have no real concurrency in the test, so a simple non-atomic version
 * of cmpxchg is good enough. Like the real cmpxchg() it returns the
 * previous value of *counter. */
cputime_t cmpxchg_cputime(cputime_t *counter, cputime_t old, cputime_t new)
{
	cputime_t ret = *counter;

	(void)old;
	*counter = new;
	return ret;
}

/* Convert nanoseconds to jiffies, this assumes HZ=100 */
cputime_t nsecs_to_cputime(u64 ns)
{
	return (cputime_t)(ns / (1000000000 / 100));
}

void emulate_preemption(int preemption_position);
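/*
 * Note on units (worked out from the stubs above): with the HZ=100
 * assumption, one cputime unit is 10 ms, so
 * nsecs_to_cputime(100 * 10^9 ns) = 10000 and
 * nsecs_to_cputime(101 * 10^9 ns) = 10100. Those are the "tot" values a
 * correct adjustment should report for the test cases listed in the
 * header comment.
 */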
/*****************************************************************************
 *
 * The code below is copied verbatim from the kernel, except:
 *
 * - The additional call to emulate_preemption() from cputime_adjust()
 * - The code within #if PETERZ_FIX in cputime_adjust() is a fix proposed by peterz
 *
 *****************************************************************************
 */

/*
 * Perform (stime * rtime) / total, but avoid multiplication overflow by
 * losing precision when the numbers are big.
 */
static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
	u64 scaled;

	for (;;) {
		/* Make sure "rtime" is the bigger of stime/rtime */
		if (stime > rtime)
			swap(rtime, stime);

		/* Make sure 'total' fits in 32 bits */
		if (total >> 32)
			goto drop_precision;

		/* Does rtime (and thus stime) fit in 32 bits? */
		if (!(rtime >> 32))
			break;

		/* Can we just balance rtime/stime rather than dropping bits? */
		if (stime >> 31)
			goto drop_precision;

		/* We can grow stime and shrink rtime and try to make them both fit */
		stime <<= 1;
		rtime >>= 1;
		continue;

drop_precision:
		/* We drop from rtime, it has more bits than stime */
		rtime >>= 1;
		total >>= 1;
	}

	/*
	 * Make sure gcc understands that this is a 32x32->64 multiply,
	 * followed by a 64/32->64 divide.
	 */
	scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
	return (__force cputime_t) scaled;
}

/*
 * Atomically advance counter to the new value. Interrupts, vcpu
 * scheduling, and scaling inaccuracies can cause cputime_advance
 * to be occasionally called with a new value smaller than counter.
 * Let's enforce atomicity.
 *
 * Normally a caller will only go through this loop once, or not
 * at all in case a previous caller updated counter the same jiffy.
 */
static void cputime_advance(cputime_t *counter, cputime_t new)
{
	cputime_t old;

	while (new > (old = READ_ONCE(*counter)))
		cmpxchg_cputime(counter, old, new);
}

/*
 * Adjust tick based cputime random precision against scheduler
 * runtime accounting.
 */
static void cputime_adjust(struct task_cputime *curr,
			   struct cputime *prev,
			   cputime_t *ut, cputime_t *st)
{
	cputime_t rtime, stime, utime;

	/*
	 * Tick based cputime accounting depend on random scheduling
	 * timeslices of a task to be interrupted or not by the timer.
	 * Depending on these circumstances, the number of these interrupts
	 * may be over or under-optimistic, matching the real user and system
	 * cputime with a variable precision.
	 *
	 * Fix this by scaling these tick based values against the total
	 * runtime accounted by the CFS scheduler.
	 */
	rtime = nsecs_to_cputime(curr->sum_exec_runtime);

	/*
	 * Update userspace visible utime/stime values only if actual execution
	 * time is bigger than already exported. Note that can happen, that we
	 * provided bigger values due to scaling inaccuracy on big numbers.
	 */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = curr->stime;
	utime = curr->utime;

	if (utime == 0) {
		stime = rtime;
	} else if (stime == 0) {
		utime = rtime;
	} else {
		cputime_t total = stime + utime;

		stime = scale_stime((__force u64)stime,
				    (__force u64)rtime, (__force u64)total);
#if PETERZ_FIX
		/* PeterZ's proposed fix */
		if (stime < prev->stime)
			stime = prev->stime;
#endif
		emulate_preemption(0);
		utime = rtime - stime;
	}

	cputime_advance(&prev->stime, stime);
	cputime_advance(&prev->utime, utime);

out:
	*ut = prev->utime;
	*st = prev->stime;
}
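/*
 * Worked example of the scaling problem (test case 0 above): step 0.0 has
 * utime == 0, so prev->stime is advanced all the way to rtime = 10000.
 * In step 0.1 rtime = 10100, utime = 100, stime = 1, so scale_stime()
 * yields stime = 1 * 10100 / 101 = 100 and utime = 10100 - 100 = 10000.
 * cputime_advance() never lets a counter go backwards, so prev->stime
 * stays at 10000 while prev->utime jumps to 10000, and the reported total
 * becomes 20000 even though only 10100 cputime units were actually spent.
 */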
/*****************************************************************************
 *
 * The code below is the actual test driver. The tests struct holds a number
 * of test cases, each with a sequence of inputs to cputime_adjust(); after
 * each step we verify that the reported system plus user time equals the
 * provided sum_exec_runtime.
 *
 *****************************************************************************
 */

#define NSEC_PER_SEC 1000000000ULL

static const struct {
	unsigned long long sum_exec_runtime;
	cputime_t utime;
	cputime_t stime;
	int preempt;
} tests[][10] = {
	{{ .sum_exec_runtime = 100*NSEC_PER_SEC, .utime = 0,   .stime = 1 },
	 { .sum_exec_runtime = 101*NSEC_PER_SEC, .utime = 100, .stime = 1 },
	 { .sum_exec_runtime = 0 },
	},
	{{ .sum_exec_runtime = 100*NSEC_PER_SEC, .utime = 1,   .stime = 0 },
	 { .sum_exec_runtime = 101*NSEC_PER_SEC, .utime = 1,   .stime = 100 },
	 { .sum_exec_runtime = 0 },
	},
	{{ .sum_exec_runtime = 100*NSEC_PER_SEC, .utime = 1,   .stime = 1 },
	 { .sum_exec_runtime = 101*NSEC_PER_SEC, .utime = 100, .stime = 1 },
	 { .sum_exec_runtime = 0 },
	},
	{{ .sum_exec_runtime = 100*NSEC_PER_SEC, .utime = 1,   .stime = 1, .preempt = 1 },
	 { .sum_exec_runtime = 101*NSEC_PER_SEC, .utime = 100, .stime = 1 },
	 { .sum_exec_runtime = 0 },
	}
};

/* The global state of the test machinery */
static int testno;
static int step;
static struct cputime prev;

/* Execute the current testno/step and validate the results */
void do_step(void)
{
	struct task_cputime t;
	cputime_t ut, st;
	int tno, s;

	tno = testno;
	s = step;
	t.sum_exec_runtime = tests[tno][s].sum_exec_runtime;
	t.utime = tests[tno][s].utime;
	t.stime = tests[tno][s].stime;

	printf("%d.%d sum_exec=%-10llu utime=%-5llu stime=%-5llu => ", tno, s,
	       (unsigned long long)t.sum_exec_runtime,
	       (unsigned long long)t.utime,
	       (unsigned long long)t.stime);

	cputime_adjust(&t, &prev, &ut, &st);

	printf(" %d.%d tot=%-10llu user=%-5llu sys=%-5llu", tno, s,
	       (unsigned long long)(ut + st),
	       (unsigned long long)ut,
	       (unsigned long long)st);

	if ((ut + st) != nsecs_to_cputime(t.sum_exec_runtime))
		printf(" =====> FAIL\n");
	else
		printf(" =====> OK\n");
}

/* Called from the preemption point in cputime_adjust(). If the current
 * step is marked .preempt, run the next step in the middle of this one
 * to emulate a preempting task, marking the interleaving with "<>". */
void emulate_preemption(int pos)
{
	if ((1 << pos) && tests[testno][step].preempt) {
		printf("<>\n");
		step++;
		do_step();
		printf("<>%38s", "");
	}
}

int main(void)
{
	for (testno = 0; testno < (int)(sizeof(tests)/sizeof(tests[0])); testno++) {
		prev.utime = 0;
		prev.stime = 0;
		step = 0;
		printf("\n");
		while (tests[testno][step].sum_exec_runtime != 0) {
			do_step();
			step++;
		}
	}
	return 0;
}
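/*
 * Worked example of the concurrency problem (test case 3 above): step 3.0
 * computes stime = scale_stime(1, 10000, 2) = 5000 and is then "preempted"
 * before utime is derived. The nested step 3.1 runs cputime_adjust() to
 * completion against the still-empty prev, reporting tot = 10100 (OK) and
 * leaving prev = { utime = 10000, stime = 100 }. When step 3.0 resumes it
 * sets utime = 10000 - 5000 = 5000; cputime_advance() then raises
 * prev->stime to 5000 but cannot lower prev->utime below 10000, so the
 * resumed step reports tot = 15000 against an rtime of only 10000 (FAIL).
 */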