/*  time-schedule.c

    Programme to test how long a context switch takes.

    Copyright (C) 1998  Richard Gooch

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

    Richard Gooch may be reached by email at  rgooch@atnf.csiro.au
    The postal address is:
      Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
*/

/*
 *   This programme will determine the context switch (scheduling) overhead on
 *   a system. It takes into account SMP machines. True context switches are
 *   measured.
 *
 *
 *   Written by      Richard Gooch   15-SEP-1998
 *   Last updated by Richard Gooch   25-SEP-1998
 */
#include <unistd.h>
#ifdef _POSIX_THREADS
#  ifndef _REENTRANT
#    define _REENTRANT
#  endif
#  ifndef _POSIX_THREAD_SAFE_FUNCTIONS
#    define _POSIX_THREAD_SAFE_FUNCTIONS
#  endif
#  include <pthread.h>
#endif
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
#include <signal.h>
#include <sched.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>

#ifndef __KARMA__
#  define mt_num_processors() 1   /*  Set to the number of processors   */
#  define ERRSTRING sys_errlist[errno]
#  define FALSE 0
#  define TRUE  1
#else
#  include <karma.h>
#  include <karma_mt.h>
#endif


#define MAX_ITERATIONS      1000

static unsigned int hog_other_cpus ();
static void run_yielder (int use_threads, int read_fd);
static void *yielder_main (void *arg);
static void s_term_handler ();
static void run_low_priority (unsigned int num, int read_fd);
static unsigned long compute_median (unsigned long values[MAX_ITERATIONS],
				     unsigned long max_value);
static unsigned int get_run_queue_size ();
static unsigned long get_num_switches ();
static void use_fpu_value (double val);


static volatile unsigned int sched_count = 0;
/*  For yielder  */
static int pipe_read_fd = -1;
static int pipe_write_fd = -1;
static pid_t child = -1;


int main (int argc, char **argv)
{
    int use_threads = FALSE;
    int use_pipe = FALSE;
    int no_test = FALSE;
    int frob_fpu = FALSE;
    int read_fd = -1;
    int write_fd = -1;
    int num_low_priority = -1;
    int i, j;
    int fds[2];
    unsigned int count, num_yields, run_queue_size1, run_queue_size2, num_hogs;
    unsigned long median, switches1, num_switches, num_overhead_switches;
    signed long overhead, total_diffs;
    signed long min_diff =  1000000000;
    signed long max_diff = -1000000000;
    double dcount = 0.0;
    unsigned long diffs[MAX_ITERATIONS];
    struct timeval before, after;
    static char *usage =
	"time-schedule [-h] [-thread] [-notest] [-pipe] [-fpu] [num_running]";

    /*  First create pipe used to sychronise low priority processes  */
    if (pipe (fds) != 0)
    {
	fprintf (stderr, "Error creating pipe\t%s\n", ERRSTRING);
	exit (1);
    }
    read_fd = fds[0];
    pipe_write_fd = fds[1];
    for (count = 1; count < argc; ++count)
    {
	if (strcmp (argv[count], "-thread") == 0)
	{
#ifdef _POSIX_THREADS
	    use_threads = TRUE;
#else
	    fprintf (stderr, "POSIX threads not available\n");
#endif
	}
	else if (strcmp (argv[count], "-pipe") == 0) use_pipe = TRUE;
	else if (strcmp (argv[count], "-notest") == 0) no_test = TRUE;
	else if (strcmp (argv[count], "-fpu") == 0) frob_fpu = TRUE;
	else if ( isdigit (argv[count][0]) )
	    num_low_priority = atoi (argv[count]);
	else
	{
	    fprintf (stderr,
		     "Programme to time context switches (schedules)\n");
	    fprintf (stderr,
		     "(C) 1998  Richard Gooch <rgooch@atnf.csiro.au>\n");
	    fprintf (stderr, "Usage:\t%s\n", usage);
	    fprintf (stderr, "\t-thread\t\tswitch threads not processes\n");
	    fprintf (stderr, "\t-pipe\t\tuse pipes not sched_yield()\n");
	    fprintf (stderr,
		     "\t-fpu\t\tpollute the FPU after each switch in main\n");
	    fprintf (stderr, "\tnum_running\tnumber of extra processes\n");
	    exit (0);
	}
    }
    if (no_test)
    {
	if (num_low_priority > 0) run_low_priority (num_low_priority, read_fd);
	while (TRUE) pause ();
    }
    if (geteuid () == 0)
    {
	struct sched_param sp;

	memset (&sp, 0, sizeof sp);
	sp.sched_priority = 10;
	if (sched_setscheduler (0, SCHED_FIFO, &sp) != 0)
	{
	    fprintf (stderr, "Error changing to RT class\t%s\n", ERRSTRING);
	    exit (1);
	}
	if (mlockall (MCL_CURRENT | MCL_FUTURE) != 0)
	{
	    fprintf (stderr, "Error locking pages\t%s\n", ERRSTRING);
	    exit (1);
	}
    }
    else fprintf (stderr, "Not running with RT priority!\n");
    /*  Give shell and login programme time to finish up and get off the run
	queue  */
    usleep (200000);
    if (use_pipe)
    {
	if (pipe (fds) != 0)
	{
	    fprintf (stderr, "Error creating pipe\t%s\n", ERRSTRING);
	    exit (1);
	}
	pipe_read_fd = fds[0];
	write_fd = fds[1];
    }
    num_hogs = hog_other_cpus ();
    /*  Determine overhead. Do it in a loop=2. The first iteration should warm
	the cache, the second will compute the overhead  */
    for (j = 0; j < 2; ++j)
    {
	switches1 = get_num_switches ();
	gettimeofday (&before, NULL);
	for (i = 0; i < 20; ++i)
	{
	    if (use_pipe)
	    {
		char ch = 0;

		write (pipe_write_fd, &ch, 1);
		read (read_fd, &ch, 1);
	    }
	    else sched_yield ();
	    if (frob_fpu) ++dcount;
	}
	gettimeofday (&after, NULL);
	num_overhead_switches = get_num_switches () - switches1;
	overhead = 1000000 * (after.tv_sec - before.tv_sec);
	overhead += after.tv_usec - before.tv_usec;
    }
    use_fpu_value (dcount);
    if (num_low_priority > 0) run_low_priority (num_low_priority, read_fd);
    /*  Set up for the benchmark  */
    run_yielder (use_threads, read_fd);
    memset (diffs, 0, sizeof diffs);
    run_queue_size1 = get_run_queue_size ();
    total_diffs = 0;
    switches1 = get_num_switches ();
    /*  Benchmark!  */
    for (count = 0; count < MAX_ITERATIONS; ++count)
    {
	signed long diff;

	gettimeofday (&before, NULL);
	/*  Generate 20 context switches  */
	for (i = 0; i < 10; ++i)
	{
	    if (use_pipe)
	    {
		char ch = 0;

		write (write_fd, &ch, 1);
		read (read_fd, &ch, 1);
	    }
	    else sched_yield ();
	    if (frob_fpu) dcount += 1.0;
	}
	gettimeofday (&after, NULL);
	diff = 1000000 * (after.tv_sec - before.tv_sec);
	diff += after.tv_usec - before.tv_usec;
	diffs[count] = diff;
	total_diffs += diff;
	if (diff < min_diff) min_diff = diff;
	if (diff > max_diff) max_diff = diff;
    }
    num_yields = sched_count;
    run_queue_size2 = get_run_queue_size ();
    num_switches = get_num_switches () - switches1;
    if (!use_threads) kill (child, SIGTERM);
    fprintf (stderr, "Started %u hog processes\n", num_hogs);
    fprintf (stderr, "Syscall%s overhead: %.1f us\n", frob_fpu ? "/FPU" : "", (double) overhead / 20.0);
    if (switches1 > 0)
	fprintf (stderr, "Num switches during overhead check: %lu\n",
		 num_overhead_switches);
    fprintf (stderr, "Minimum scheduling latency: %.1f (%.1f) us\n",
	     (double) min_diff / 20.0, (double) (min_diff - overhead) / 20.0);
    median = compute_median (diffs, max_diff);
    fprintf (stderr, "Median scheduling latency:  %.1f (%.1f) us\n",
	     (double) median / 20.0, (double) (median - overhead) / 20.0);
    fprintf (stderr, "Average scheduling latency: %.1f (%.1f) us\n",
	     (double) total_diffs / (double) MAX_ITERATIONS / 20.0,
	     (double) (total_diffs - overhead * MAX_ITERATIONS) /
	     (double) MAX_ITERATIONS / 20.0);
    fprintf (stderr, "Maximum scheduling latency: %.1f (%.1f) us\n",
	     (double) max_diff / 20.0, (double) (max_diff - overhead) / 20.0);
    fprintf (stderr, "Run queue size: %u, %u\n",
	     run_queue_size1, run_queue_size2);
    use_fpu_value (dcount);
    if (use_threads) fprintf (stderr, "Number of yields: %u\n", num_yields);
    if (num_switches > 0) fprintf (stderr, "Num switches: %lu\n",num_switches);
    /*  Finish up  */
    kill (0, SIGTERM);
    return (0);
}   /*  End Function main  */


static unsigned int hog_other_cpus ()
/*  [SUMMARY] Hog other CPUs with a high-priority job.
    [RETURNS] The number of hogged CPUs.
*/
{
    unsigned int count;

    for (count = mt_num_processors (); count > 1; --count)
    {
	switch ( fork () )
	{
	  case 0:
	    /*  Child  */
	    while (TRUE);
	    break;
	  case -1:
	    /*  Error  */
	    fprintf (stderr, "Error forking\t%s\n", ERRSTRING);
	    kill (0, SIGTERM);
	    break;
	  default:
	    /*  Parent  */
	    break;
	}
    }
    return mt_num_processors () - 1;
}   /*  End Function hog_other_cpus  */

static void run_yielder (int use_threads, int read_fd)
/*  [SUMMARY] Run other process which will continuously yield.
    <use_threads> If TRUE, the yielding process is just a thread.
    <read_fd> The pipe to read the synchronisation byte from.
    [RETURNS] Nothing.
*/
{
    char ch;
    struct sigaction new_action;
#ifdef _POSIX_THREADS
    pthread_t thread;

    if (use_threads)
    {
	if (pthread_create (&thread, NULL, yielder_main, NULL) != 0)
	{
	    fprintf (stderr, "Error creating thread\t%s\n", ERRSTRING);
	    kill (0, SIGTERM);
	}
	read (read_fd, &ch, 1);
	return;
    }
#endif
    switch ( child = fork () )
    {
      case 0:
	/*  Child  */
	break;
      case -1:
	/*  Error  */
	fprintf (stderr, "Error forking\t%s\n", ERRSTRING);
	kill (0, SIGTERM);
	break;
      default:
	/*  Parent  */
	read (read_fd, &ch, 1);
	return;
	/*break;*/
    }
    memset (&new_action, 0, sizeof new_action);
    sigemptyset (&new_action.sa_mask);
    new_action.sa_handler = s_term_handler;
    if (sigaction (SIGTERM, &new_action, NULL) != 0)
    {
	fprintf (stderr, "Error setting SIGTERM handler\t%s\n", ERRSTRING);
	exit (1);
    }
    yielder_main (NULL);
}   /*  End Function run_yielder  */

static void *yielder_main (void *arg)
/*  [SUMMARY] Yielder function.
    <arg> An arbirtrary pointer. Ignored.
    [RETURNS] NULL.
*/
{
    char ch = 0;

    sched_count = 0;
    write (pipe_write_fd, &ch, 1);
    while (TRUE)    
    {
	if (pipe_read_fd >= 0)
	{
	    read (pipe_read_fd, &ch, 1);
	    write (pipe_write_fd, &ch, 1);
	}
	else sched_yield ();
	++sched_count;
    }
    return (NULL);
}   /*  End Function yielder_main  */

static void s_term_handler ()
{
    fprintf (stderr, "Number of yields: %u\n", sched_count);
    exit (0);
}   /*  End Function s_term_handler  */

static void run_low_priority (unsigned int num, int read_fd)
/*  [SUMMARY] Run low priority processes.
    <num> Number of processes.
    <read_fd> The pipe to read the synchronisation byte from.
    [RETURNS] Nothing.
*/
{
    char ch = 0;

    for (; num > 0; --num)
    {
	switch ( fork () )
	{
	  case 0:
	    /*  Child  */
	    if (geteuid () == 0)
	    {
		struct sched_param sp;
		
		memset (&sp, 0, sizeof sp);
		sp.sched_priority = 0;
		if (sched_setscheduler (0, SCHED_OTHER, &sp) != 0)
		{
		    fprintf (stderr,
			     "Error changing to SCHED_OTHER class\t%s\n",
			     ERRSTRING);
		    exit (1);
		}
	    }
	    if (nice (20) != 0)
	    {
		fprintf (stderr, "Error nicing\t%s\n", ERRSTRING);
		kill (0, SIGTERM);
	    }
	    write (pipe_write_fd, &ch, 1);
	    while (TRUE) sched_yield ();
	    break;
	  case -1:
	    /*  Error  */
	    fprintf (stderr, "Error forking\t%s\n", ERRSTRING);
	    kill (0, SIGTERM);
	    break;
	  default:
	    /*  Parent  */
	    read (read_fd, &ch, 1);
	    break;
	}
    }
}   /*  End Function run_low_priority  */

static unsigned long compute_median (unsigned long values[MAX_ITERATIONS],
				     unsigned long max_value)
/*  [SUMMARY] Compute the median from an array of values.
    <values> The array of values.
    <max_value> The maximum value in the array.
    [RETURNS] The median value.
*/
{
    unsigned long count;
    unsigned long median = 0;
    unsigned long peak = 0;
    unsigned long *table;

    /*  Crude but effective  */
    if ( ( table = calloc (max_value + 1, sizeof *table) ) == NULL )
    {
	fprintf (stderr, "Error allocating median table\n");
	exit (1);
    }
    for (count = 0; count < MAX_ITERATIONS; ++count)
    {
	++table[ values[count] ];
    }
    /*  Now search for peak. Position of peak is median  */
    for (count = 0; count < max_value + 1; ++count)
    {
	if (table[count] < peak) continue;
	peak = table[count];
	median = count;
    }
    return (median);
}   /*  End Function compute_median  */

static unsigned int get_run_queue_size ()
/*  [SUMMARY] Compute the current size of the run queue.
    [RETURNS] The length of the run queue.
*/
{
    int dummy_i;
    unsigned int length = 0;
    FILE *fp;
    DIR *dp;
    struct dirent *de;
    char txt[64], dummy_str[64];

    if ( ( dp = opendir ("/proc") ) == NULL ) return (0);
    while ( ( de = readdir (dp) ) != NULL )
    {
	if ( !isdigit (de->d_name[0]) ) continue;
	sprintf (txt, "/proc/%s/stat", de->d_name);
	if ( ( fp = fopen (txt, "r") ) == NULL ) return (length);
	fscanf (fp, "%d %s %s", &dummy_i, dummy_str, txt);
	if (txt[0] == 'R') ++length;
	fclose (fp);
    }
    closedir (dp);
    return (length);
}   /*  End Function get_run_queue_size  */

static unsigned long get_num_switches ()
/*  [SUMMARY] Get the number of context switches.
    [RETURNS] The number of context switches on success, else 0.
*/
{
    unsigned long val;
    FILE *fp;
    char line[256], name[64];

    if ( ( fp = fopen ("/proc/stat", "r") ) == NULL ) return (0);
    while (fgets (line, sizeof line, fp) != NULL)
    {
	sscanf (line, "%s %lu", name, &val);
	if (strcasecmp (name, "ctxt") != 0) continue;
	fclose (fp);
	return (val);
    }
    fclose (fp);
    return (0);
}   /*  End Function get_num_switches  */

static void use_fpu_value (double val)
/*  [SUMMARY] Dummy function to consume FPU value. Fools compiler.
    <val> The value.
    [RETURNS] Nothing.
*/
{
}   /*  End Function use_fpu_value  */
