#ifndef __bw_mgmt_h__ /* { */
#define __bw_mgmt_h__
/*
 * bw_mgmt.h - kernel bandwidth management defines and structures
 *
 * Copyright (C) 1999, Cobalt Networks, Inc.
 * All rights reserved.
 *
 * The bandwidth management layer uses a two level scheduling
 * model, where an I/O scheduling layer runs occasionally and
 * pushes requests to the I/O active layer.  I/O is only done
 * from the I/O active layer.
 */

#ifdef USER_TEST
/*
 * For unit test model, we need to fool the kernel include files
 * Look for "USER_TEST" for tacky tricks.
 *
 * _I386_CURRENT_H is defined before any kernel header is included.
 * NOTE(review): presumably this pre-empts the include guard of the i386
 * <asm/current.h> so that header contributes nothing -- confirm against
 * the kernel tree in use.  "current" is then declared here as an
 * ordinary external variable.
 */
#define _I386_CURRENT_H

extern struct task_struct *current;
#endif

#include <linux/autoconf.h>
#if CONFIG_MODVERSIONS == 1 && ! defined(MODVERSIONS)
#define MODVERSIONS
#endif

#ifdef MODVERSIONS
#include <linux/modversions.h>
#endif

#include <linux/wait.h>
#include <linux/types.h>
#include <linux/uio.h>
#include <linux/time.h>
#include <linux/timer.h>
#include <linux/sched.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/socket.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/poll.h>
#include <bw_user.h>
#include <linux/bw_iface.h>
#include <asm/atomic.h>

/*
 * Building with the single global lock (BIGNASTYLOCK) also defines
 * BW_SKIP_LOCKS, and does so before "bw_trace.h" is included.
 * NOTE(review): presumably bw_trace.h uses BW_SKIP_LOCKS to compile out
 * the finer-grained lock macros -- confirm there.
 */
#ifdef BIGNASTYLOCK
#define BW_SKIP_LOCKS
#endif

#include "bw_trace.h"

/* Fallback definition of NULL, used only if no earlier header supplied one. */
#ifndef NULL
#define NULL ((void *) 0)
#endif


                    
/*
 * Optional HACKTASTIC debugging hook.
 *
 * With HACKTASTIC defined, HACKTASTIC_SET stores current->processor in
 * hacktastic_flag and HACKTASTIC_UNSET stores -1.  Without it, both
 * macros expand to nothing.  They are used inside NASTY_LOCK() and
 * NASTY_UNLOCK() below.
 *
 * Note the asymmetric semicolons: HACKTASTIC_SET *begins* with ';'
 * (terminating whatever precedes it and leaving its own assignment
 * unterminated), while HACKTASTIC_UNSET *ends* with ';'.  The
 * NASTY_LOCK()/NASTY_UNLOCK() bodies are written to match, so do not
 * "tidy" one side without the other.
 */
#ifdef HACKTASTIC
extern int hacktastic_flag;
extern void (* hacktastic_func)( void );
#define HACKTASTIC_SET  ; hacktastic_flag=current->processor
#define HACKTASTIC_UNSET hacktastic_flag=-1;
#else
#define HACKTASTIC_SET 
#define HACKTASTIC_UNSET
#endif

/*
 * "Big nasty lock" support: one global spinlock, bw_bignastylock.
 *
 * With BIGNASTYLOCK defined:
 *   NASTY_LOCK_DECL       declares the local flags word (bw_nasty_flags);
 *                         use it as "NASTY_LOCK_DECL;" in a declaration list.
 *   NASTY_LOCK_FLAGS      names that flags word.
 *   NASTY_LOCK_FLAGS_REF  is its address (unparenthesized "&bw_nasty_flags").
 *   NASTY_LOCK(flags)     expands to BW_LOG_LOCK0() on the global lock
 *                         followed by HACKTASTIC_SET.
 *   NASTY_UNLOCK(flags)   expands to HACKTASTIC_UNSET followed by
 *                         BW_LOG_UNLOCK0() on the global lock.
 *   BW_GFP                is GFP_ATOMIC.  NOTE(review): presumably because
 *                         allocations can occur with the spinlock held --
 *                         confirm against the callers.
 *
 * BW_LOG_LOCK0/BW_LOG_UNLOCK0 are not defined in this header.  No ';'
 * is written after BW_LOG_LOCK0(...) in NASTY_LOCK: the statement is
 * terminated either by the leading ';' of HACKTASTIC_SET or by the ';'
 * that follows HACKTASTIC_SET in the macro body.
 *
 * Without BIGNASTYLOCK every macro is empty (NASTY_LOCK_FLAGS_REF is
 * NULL) and BW_GFP is GFP_KERNEL.
 */
#ifdef BIGNASTYLOCK

extern spinlock_t bw_bignastylock;

#define NASTY_LOCK_DECL unsigned long bw_nasty_flags
#define NASTY_LOCK_FLAGS bw_nasty_flags
#define NASTY_LOCK_FLAGS_REF &bw_nasty_flags

#define NASTY_LOCK( flags )  do { BW_LOG_LOCK0(&bw_bignastylock, flags)\
                                  HACKTASTIC_SET; } while(0)
#define NASTY_UNLOCK( flags ) do { HACKTASTIC_UNSET \
                                   BW_LOG_UNLOCK0(&bw_bignastylock, flags); } while(0)

#define BW_GFP GFP_ATOMIC
#else
#define NASTY_LOCK_DECL
#define NASTY_LOCK_FLAGS
#define NASTY_LOCK_FLAGS_REF NULL

#define NASTY_LOCK( flags )  
#define NASTY_UNLOCK( flags )  
#define BW_GFP GFP_KERNEL
#endif


/*
 * We do all of our time calculations in a native counter that we
 * expect to be free running (more or less).  For now, this is
 * based off the jiffy time.  We don't scale the time into a
 * uniform unit (eg microsec) because we can't index by
 * time % power_of_two.  (Bad things happen at the +/- boundary:
 * eg. -10,000 % 16 != (0 % 16) - 1.)
 */

/* Ticks per second of the native clock (the kernel's HZ). */
#define BW_CLOCKRATE	HZ
/* Length, in clock ticks, of one I/O scheduling period (see BW_TIME_TO_SCHEDIND). */
#define BW_IOSCHED_RATE	(8)
/* Length, in clock ticks, of one I/O period (see BW_TIME_TO_IOIND). */
#define BW_IO_RATE	(1)
/* Clock ticks per minute (see BW_TIME_TO_MININD). */
#define BW_MIN_RATE	(BW_CLOCKRATE * 60)

/* Current time in native units: the jiffies counter. */
#define BW_CUR_TIME	(jiffies)

/* Native time already is jiffies, so both conversions are the identity. */
#define BW_IOTIME_TO_JIFFY(time)  (time)
#define BW_SCHEDTIME_TO_JIFFY(time) (time)
/* Divide an I/O slot number by BW_IOSCHED_RATE (integer division). */
#define BW_IOSLOT_TOSCHED(slot) ((slot) / BW_IOSCHED_RATE)

/*
 * Timer-adjustment selectors.  NOTE(review): presumably the values
 * accepted by the adj_timer argument of bw_do_some_active() -- verify
 * against the implementation.
 */
#define BW_IO_ADJUST_TIMER	0x1
#define BW_IO_DONT_ADJUST_TIMER	0x2

/* Fetch the struct bw_info hanging off socket s (its bw_mgmt member). */
#define BW_SOCK_MGMT(s) ((struct bw_info *) ((s)->bw_mgmt))

/* 
 * We xfer the minimum of this or the request size.
 */
#define BW_MIN_IOSIZE	512

/*
 * We keep fine grain statistics on BW_NUMIO_CNT BW_IO_RATE periods.
 * This is a divisor, so a sacred power of two is indicated.  Also,
 * this must be bigger than the burst window to keep new I/O from
 * starving queued requests.
 *
 * With BW_IOSCHED_RATE == 8 this works out to BW_NUMIO_CNT == 32 and
 * BW_MAX_BURST == 16.
 */
#define BW_NUMIO_CNT (BW_IOSCHED_RATE << 2)
#define BW_MIN_BURST 2
#define BW_MAX_BURST (BW_NUMIO_CNT >> 1)

/*
 * We keep a max of 16 minutes of history on bandwidth usage.  
 * This is a sacred power of two, and also enables a 15 minute
 * bandwidth average.
 */
#define BW_NUMMINUTES	16

/*
 * This is the number of scheduler statistics buckets.  We only
 * need two - double buffering suffices to smooth the scheduler.
 */
#define BW_SCHEDIO_CNT	2

/*
 * Max overalloc prevents an application from polling
 * to store up bandwidth for later use.  64k lets a
 * maximum UDP packet be processed with nonblocking I/O.
 */
#define BW_MAXOVERALLOC (64 * 1024)

/*
 * Value that BW_INFO_MAGIC_VALID() compares against bw_info.magic
 * in DEBUG builds.
 */
#define BW_INFO_MAGIC 0x8c4ba9df

/*
 * The bw_info structure is added to the end of every socket
 * structure to make the bandwidth limit lookup efficient.
 *
 * XXX - if you add fields here, you will need to figure out the
 *     semantics of 'dup'.  For now, the bw_io_info struct is
 *     cloned by hand because of reference counts and linked lists.
 */
/*
 * Per-socket bandwidth management state.  BW_SOCK_MGMT() retrieves a
 * pointer to this structure from a socket's bw_mgmt member.
 */
struct bw_info {
    unsigned int magic; /* only magic if debugging; see BW_INFO_MAGIC_VALID */
    struct socket *bw_sock; /* NOTE(review): presumably the owning socket - confirm */
    /*
     * This is the array of type information for this socket
     * (one struct bw_limit_id per limit type).
     */
    struct bw_limit_id bw_type_id[N_BW_TYPES];
    /*
     * We wait here when we need to during a synchronous poll.
     * The wait queue types differ by kernel version: 0x20400 is
     * kernel 2.4.0, from which wait_queue_head_t is used.
     */
#if LINUX_VERSION_CODE >= 0x20400
    wait_queue_head_t	bw_waitq;
    wait_queue_head_t	*bw_waitqp;
#else
    struct wait_queue	*bw_waitq;
    struct wait_queue	**bw_waitqp;
#endif
    struct bw_ioreq	*bw_poll_req;	/* link to the polling ioreq */

    int bw_flags;	/* BW_*VALID, BW_MSGONLY, BW_POLLPEND, BW_UNLIMIT_* bits */
    int bw_proto;	/* Remember protocol type and */
    int bw_family;	/* family for network address */
    int bw_savederror;	/* NOTE(review): presumably an error held for later return - confirm */

    /*
     * each regulated type has a hash entry to locate the 
     * current regulation information.
     */
    struct bw_io_info {
	struct bw_limit_hash *bw_limit[N_BW_TYPES];	/* I/O limits */

	atomic_t bw_prealloc;	/* number of bytes guaranteed to this socket */
	int bw_pending;		/* number of bytes in pending I/O */
    } bw_io[2];   /* one for read, one for write */

    /*
     * If a particular type isn't limited, we add it to a
     * unlimit linked list - this means that we don't have
     * to scan every socket whenever a limit is adjusted.
     *
     * We don't have a read/write distinction because this
     * is only a convenience.  If either is unlimited, we
     * will recheck both if a new limit is added.
     *
     * The list is doubly linked (forw/back), with one pair of
     * links per limit type.
     */
    struct bw_info *bw_unlimit_forw[N_BW_TYPES];
    struct bw_info *bw_unlimit_back[N_BW_TYPES];
};

/*
 * BW_INFO_MAGIC_VALID(bwi): in DEBUG builds, true when bwi->magic holds
 * BW_INFO_MAGIC.  In non-DEBUG builds it is the constant 1 and bwi is
 * not evaluated at all.
 */
#ifdef DEBUG
#define BW_INFO_MAGIC_VALID( bwi )  ((bwi)->magic == BW_INFO_MAGIC)
#else
#define BW_INFO_MAGIC_VALID( bwi ) (1)
#endif

/*
 * Flags values in the bw_info structure (bw_info.bw_flags).
 *
 * Bits 0..2 are the per-type "valid" bits and bits 21..23 the per-type
 * "no limit found" bits; in both groups the bit order is IP, UID, GID.
 *
 *   bw_type_valid(bwi, type)    nonzero when bit (1 << type) is set in
 *                               bwi->bw_flags.
 *   bw_type_unlimit(bwi, type)  nonzero when bit (BW_UNLIMIT << type) is
 *                               set in bwi->bw_flags.
 *
 * Both macros fully parenthesize their arguments, so expressions such
 * as "p + 1" or "a | b" may be passed safely.  Each argument is
 * evaluated exactly once.
 */
#define BW_IPVALID	0x1
#define BW_UIDVALID	0x2
#define BW_GIDVALID	0x4
#define bw_type_valid(bwi, type)	((bwi)->bw_flags & (1 << (type)))

#define BW_MSGONLY	0x10000		/* Socket is message oriented */ 
#define BW_POLLPEND	0x20000		/* Socket has poll pending */ 

#define BW_UNLIMIT	0x200000	/* Base for no limit flags */
#define BW_UNLIMIT_IP	0x200000	/* no limit found for connection */
#define BW_UNLIMIT_UID	0x400000	/* no limit found for connection */
#define BW_UNLIMIT_GID	0x800000	/* no limit found for connection */
#define bw_type_unlimit(bwi, type)	((bwi)->bw_flags & (BW_UNLIMIT << (type)))

/* Every "no limit found" bit. */
#define BW_UNLIMIT_ALL	(BW_UNLIMIT_IP|BW_UNLIMIT_UID|BW_UNLIMIT_GID)
/*
 * NOTE(review): by its name, the set of flags that must not be carried
 * over when a bw_info is duplicated -- confirm against the dup code.
 */
#define BW_NODUP_FLAGS	(BW_UNLIMIT_ALL | BW_POLLPEND)

/*
 * Most of the statistics in the limiter are tracked with timestamp
 * counters, or pairs of counters.  We check the time, and if it
 * matches our expected value, we use/update the counter.  If the
 * time doesn't match, the counter is stale, and we use zero, and
 * reset the time.
 *
 * This lets us use a circular buffer of counters, and we never need
 * to worry about time based decay.  If the network is lightly loaded
 * some buckets can be very stale, but under load everything is right.
 * In either event, we don't need periodic interrupts to clean up our
 * statistics.
 */

/*
 * bw_time_cnt : this tracks I/O that has completed
 */
/*
 * A timestamped counter.  bw_do_incr_cnt() and bw_do_num_cnt() treat
 * the count as valid only while bwcnt_time equals the caller's
 * time / rate.
 */
struct bw_time_cnt {
    bwl_time_t bwcnt_time;	/* period stamp (time / rate) the count belongs to */
    int bwcnt_cnt;		/* amount accumulated during that period */
};

/*
 * The limit hash has an entry for every limit, and includes 
 * information about every currently pending request.
 */
struct bw_limit_hash {
    struct bw_limit_id bwh_lim_id;	/* identity of this limit */
    struct bw_limit_hash *bwh_hashnext;	/* NOTE(review): presumably the hash chain link - confirm */

    int		bwh_flags;		/* Tracks bwlim_params */
    int		bwh_lim_refcnt;		/* NOTE(review): presumably a reference count - confirm */
    int		bwh_active_ios;
    struct bw_time_cnt bwh_active_cnt;

    int		bwh_bps;		/* bytes per sec */
    /* NOTE(review): no member named bwhcnt_current is visible in this header */
    int		bwh_bp_bckt;	/* bytes per bwhcnt_current */
    int		bwh_bp_sched;	/* bytes per bwhcnt_sched */

    bwl_totcnt_t	bwh_lim_tot_bytes;
    bwl_totcnt_t	bwh_lim_max_bytes;
    bwl_totcnt_t	bwh_interval_bytes;

    bwl_time_t 	bwh_lim_rearm_time;
    bwl_time_t 	bwh_overlim_time;

    int		bwh_lim_overlim_bps; /* limit if tot_bytes > max_bytes */

    int bwh_burst_time;		/* number of ticks that we overalloc */
    int bwh_burst_count;		/* byte_count for bursting */

    /*
     * bwhcnt_sched is the count of bytes already consumed
     * in this scheduling interval.  
     *
     * bwhcnt_minutes has BW_NUMMINUTES buckets; BW_TIME_TO_MININD()
     * yields an index in that range.
     */
    struct bw_time_cnt	bwhcnt_sched[BW_SCHEDIO_CNT];
    struct bw_time_cnt	bwhcnt_minutes[BW_NUMMINUTES];
    struct proc_dir_entry *bwh_proc_ent;	/* NOTE(review): presumably this limit's /proc entry - confirm */
};

/*
 * A bunch of macros to convert clock times into indices
 * for the various arrays above.
 */
/*
 * Each macro converts t to bwl_time_t, divides by the length of one
 * period in clock ticks and reduces the result modulo the number of
 * buckets:
 *
 *   BW_TIME_TO_IOIND     BW_IO_RATE periods,      BW_NUMIO_CNT buckets
 *   BW_TIME_TO_SCHEDIND  BW_IOSCHED_RATE periods, BW_SCHEDIO_CNT buckets
 *   BW_TIME_TO_MININD    BW_MIN_RATE periods,     BW_NUMMINUTES buckets
 */
#define BW_TIME_TO_IOIND(t)	\
		((((bwl_time_t) (t)) / BW_IO_RATE) % BW_NUMIO_CNT)

#define BW_TIME_TO_SCHEDIND(t)	\
		((((bwl_time_t) (t)) / BW_IOSCHED_RATE) % BW_SCHEDIO_CNT)

#define BW_TIME_TO_MININD(t)	\
		((((bwl_time_t) (t)) / BW_MIN_RATE) % BW_NUMMINUTES)


/*
 * Usage information
 * We can't use a decay model (like load average) since that
 * represents the smoothed average over the last time period.
 * For example, if a connection peaks at 100% of the limit,
 * the next time slice should still make 100% available, but
 * a decay model would show that the link was still 90% used.
 * Even worse, the decay model would require periodic updates
 * for every limit, or a compound update when active. Yuck.
 *
 * So, we have to actually do the math.  Here we keep a circular
 * array of buckets for the last N time periods, and when we
 * perform an I/O operation, we update the counter and timestamp.
 * We calculate the current usage by examining the previous
 * time slot (if valid) and the current slot.
 *
 * Note - we let time flow "backwards" - time will wrap eventually.
 */

/* Locks the bucket count updates in the limit structures.
 * This never needs to be locked explicitly, as it is taken care of in
 * bw_num_cnt and bw_incr_cnt. */
extern spinlock_t bw_cntlock;

/*
 * Call-site wrappers: forward to bw_do_num_cnt()/bw_do_incr_cnt() and
 * append the caller's __FILE__ and __LINE__, which those helpers hand
 * on to the lock logging macros.
 */
#define bw_num_cnt(bckt, time, rate) \
        bw_do_num_cnt((bckt), (time), (rate), __FILE__, __LINE__)
#define bw_incr_cnt(bckt, time, incr, rate) \
        bw_do_incr_cnt((bckt), (time), (incr), (rate), __FILE__, __LINE__)
/*
 * Read a timestamped counter.
 *
 * bckt  - counter bucket to read
 * time  - clock time of interest
 * rate  - clock ticks per period; time is divided by it before comparing
 * file, line - call site, passed through to the lock logging macros
 *
 * Returns bckt->bwcnt_cnt when the bucket's stamp equals time / rate,
 * and 0 otherwise.  The bucket is examined between BW_LOG_LOCK_FULL
 * and BW_LOG_UNLOCK_FULL on bw_cntlock.
 */
static inline int
bw_do_num_cnt(struct bw_time_cnt *bckt, bwl_time_t time, int rate, char *file, int line)
{
    int result = 0;
    bwl_time_t stamp;
    BW_LOCK_FLAGS_DECL;

    BW_LOG_LOCK_FULL( &bw_cntlock, BW_LOCK_FLAGS, file, line );
    stamp = time / rate;
    if (bckt->bwcnt_time == stamp) {
	/* bucket is current: report what it has accumulated */
	result = bckt->bwcnt_cnt;
    }
    BW_LOG_UNLOCK_FULL( &bw_cntlock, BW_LOCK_FLAGS, file, line );

    return result;
}


/*
 * Add to a timestamped counter.
 *
 * bckt  - counter bucket to update
 * time  - clock time of the update
 * incr  - amount to add
 * rate  - clock ticks per period; time is divided by it before comparing
 * file, line - call site, passed through to the lock logging macros
 *
 * If the bucket's stamp equals time / rate, incr is added to the count.
 * Otherwise the bucket is restamped and its count restarts at incr.
 * The update runs between BW_LOG_LOCK_FULL and BW_LOG_UNLOCK_FULL on
 * bw_cntlock.
 */
static inline void
bw_do_incr_cnt(struct bw_time_cnt *bckt, bwl_time_t time, int incr, int rate,
	    char * file, int line)
{
    bwl_time_t stamp;
    BW_LOCK_FLAGS_DECL;

    BW_LOG_LOCK_FULL( &bw_cntlock, BW_LOCK_FLAGS, file, line);
    stamp = time / rate;
    if (bckt->bwcnt_time != stamp) {
	/* stale bucket: start a fresh period */
	bckt->bwcnt_time = stamp;
	bckt->bwcnt_cnt = incr;
    }
    else {
	bckt->bwcnt_cnt += incr;
    }
    BW_LOG_UNLOCK_FULL( &bw_cntlock, BW_LOCK_FLAGS, file, line );
}


struct bw_ioreq;
/*
 * Callback type stored in bw_ioreq.bw_iofunc and accepted by bw_q_io().
 * It receives the request, a length, an opaque parameter block and a
 * pointer to the caller's lock flags word, and returns an int.
 * NOTE(review): the meaning of len and of the return value is not
 * visible in this header -- see the implementations.
 */
typedef int (*bw_iofunc_ptr_t)(struct bw_ioreq *, int len, void *params,
			       unsigned long *lock_flags );

/*
 * There is a pending_io structure per active I/O request.  This is
 * allocated on the stack, (except eventually for poll/select) and
 * is used to manage all facets of ongoing I/O.
 */
/*
 * State of one I/O request.  The per-slot arrays below have
 * BW_NUMIO_CNT / BW_IOSCHED_RATE entries (4 with the constants defined
 * in this header).
 */
struct bw_ioreq {
    int bw_magic;			/* only magic if debugging; see BW_CHECK_REQ */
    struct bw_info	*bw_bwi;	/* bandwidth info this request refers to */
    enum bw_iotype	bw_rw;
    atomic_t		bw_prealloc;	/* B/W allocated to this request */
    /* 0x20400 is kernel 2.4.0, from which wait_queue_head_t is used */
#if LINUX_VERSION_CODE >= 0x20400
    wait_queue_head_t	bw_sleep;
    wait_queue_head_t	*bw_sleep_qpp;
#else
    struct wait_queue	*bw_sleep;
    struct wait_queue	**bw_sleep_qpp;
#endif
    atomic_t		bw_totallen;	/* down counter of bytes to schedule */
    int			bw_lastalloc;	/* Last I/O alloc for I/O q order */
    int			bw_total_ioc;	/* down counter of I/O completed */
    int			bw_error_ret;	/* I/O status to return to user */
    struct msghdr *bw_msg;
    struct iovec *bw_iov_curbound;
    struct iovec *bw_iov_end;
    struct iovec bw_saved_iov;

    void		*bw_allocbase;	/* kmalloc base of I/O buffer */

    void 		*bw_iopoll;	/* flag for polled I/O mode */
    int			bw_poll_len;	/* number of bytes collected in poll */

    bw_iofunc_ptr_t	bw_iofunc;	/* I/O callback */
    void	*bw_ioparams;		/* parameters to above */
    int			bw_cur_iolen[BW_NUMIO_CNT/BW_IOSCHED_RATE];
    struct bw_ioreq	*bw_ioactive[BW_NUMIO_CNT/BW_IOSCHED_RATE];
#ifdef DEBUG
    /* DEBUG-only bookkeeping; same dimension as the arrays above */
    bwl_time_t		bw_io_set[BW_NUMIO_CNT/BW_IOSCHED_RATE];
    bwl_time_t		bw_io_slot[BW_NUMIO_CNT/BW_IOSCHED_RATE];
#endif
    struct bw_ioreq	*bw_iosched;	/* NOTE(review): presumably the scheduling-queue link - confirm */
 
    spinlock_t bw_busylock;

}; 



/*
 * The I/O request queues are typical head/tail queues.
 * Fast insert/dequeue.  Slow remove from middle.
 */
struct bw_queue {
    struct bw_ioreq *bwq_head;	/* first request in the queue */
    /*
     * Pointer to a request pointer.  NOTE(review): presumably the
     * address of the last link, so appending needs no traversal --
     * confirm against the queue code.
     */
    struct bw_ioreq **bwq_tail;
};

/* Value BW_CHECK_REQ() expects to find in bw_ioreq.bw_magic. */
#define BW_REQ_MAGIC 0xA15CE1D

/*
 * BW_CHECK_REQ(req): in DEBUG builds, ASSERT that req->bw_magic equals
 * BW_REQ_MAGIC; otherwise expands to nothing.  ASSERT is defined
 * further down, which is fine because macros expand at the point of
 * use.  bw_verify_active() is declared for DEBUG builds only.
 */
#ifdef DEBUG
int bw_verify_active(int needlock);
#define BW_CHECK_REQ(req)	ASSERT((req)->bw_magic == BW_REQ_MAGIC)
#else
#define BW_CHECK_REQ(req)
#endif

/*
 * Linux's own ASSERT macro became incompatible with ours, so any
 * definition inherited from the headers above is dropped and replaced.
 *
 * ASSERT(x):
 *   DEBUG defined   - if x is false, print the failed expression with
 *                     its file and line via printk() and then panic().
 *   DEBUG undefined - expands to nothing; x is not evaluated.
 *
 * The DEBUG form is a complete if/else pair rather than a lone "if".
 * That way an "else" written after an ASSERT() can never silently
 * attach to the assertion's own "if", while every existing use, with
 * or without a trailing semicolon, still compiles.
 */
#undef ASSERT

#ifdef DEBUG
#define ASSERT(x) 						\
    if (x) {							\
    } else {							\
	printk("assert %s failed, file  %s, line %d\n",		\
			#x, __FILE__, __LINE__); 		\
	panic("halting");					\
    }
#else
#define ASSERT(x)
#endif /* DEBUG */


/*
 * Fallback min(), supplied only when no earlier header defined one.
 * Yields x when x is less than y and y otherwise.  It is a plain
 * macro, so whichever argument is selected gets evaluated twice.
 */
#ifndef min
#define min(x, y)	((y) > (x) ? (x) : (y))
#endif

/*
 * Locking and wait-queue shims.
 *
 * In the USER_TEST build the irq-saving lock primitives are replaced
 * by empty macros, and bw_add_wait_queue maps to __add_wait_queue.
 * In a normal build only bw_add_wait_queue is defined, mapping to
 * add_wait_queue.
 *
 * The rwlock release primitive in the kernel is
 * write_unlock_irqrestore().  The original header stubbed the
 * non-existent name write_lock_irqrestore instead, which left the real
 * unlock call live while the matching lock call was a no-op.  Both
 * names are stubbed now; the old one is kept in case anything refers
 * to it.
 */
#ifdef USER_TEST
#undef write_lock_irqsave
#define write_lock_irqsave(lock,flags)
#undef write_lock_irqrestore
#define write_lock_irqrestore(lock,flags)
#undef write_unlock_irqrestore
#define write_unlock_irqrestore(lock,flags)

#undef spin_lock_irqsave
#define spin_lock_irqsave(lock, flags)
#undef spin_unlock_irqrestore
#define spin_unlock_irqrestore(lock, flags)

#define bw_add_wait_queue __add_wait_queue
#else
#define bw_add_wait_queue add_wait_queue
#endif


/*
 * Function declarations.
 *
 * All of these are defined outside this header.  They are kept in
 * their original order; the short group labels are inferred from the
 * function names only, so consult the implementation files for the
 * exact semantics.  (bw_update_io used to be declared twice in a row;
 * the redundant copy has been dropped.)
 */

/* Per-socket (struct bw_info) entry points. */
void bw_update_io(struct bw_info *, enum bw_iotype, int nalloc, int num_io);
int bw_new_ent(struct bw_info *, enum bw_types, enum bw_iotype);
void bw_delete_ent(struct bw_info *, enum bw_types, enum bw_iotype);

struct bw_info *bw_setip(struct bw_info *, u_int8_t protocol, u_int32_t addr,
		    u_int16_t port);

/* Request (struct bw_ioreq) setup and iovec handling. */
int bw_io_check(struct bw_ioreq *bwreq, int start_bias, int *slotp);
void bw_init_ioreq(struct bw_ioreq *bwreq, struct bw_info *bwi, int rw,
                struct msghdr *m, int len);
void bw_iov_pullup(struct bw_ioreq *bwreq, int nplanned, int nio);
int bw_iov_copyin(struct bw_ioreq *bwreq, unsigned long *flags);
void bw_iov_dealloc(struct bw_ioreq *bwreq, unsigned long *flags);

/* Limit creation and removal. */
int bw_new_limit(struct bwlim_params *bwp);
int bw_delete_limit(struct bwlim_params *bwp);

struct bw_limit_hash	*bw_create(struct bw_limit_id *bwid);
int	bw_delete(struct bw_limit_id *bwid);
int	bw_copyin_iov(struct bw_ioreq *);

/* Request lifecycle. */
void bw_delete_ioreq(struct bw_ioreq *, struct bw_info *);
void bw_io_update(struct bw_ioreq *bwreq, int nalloc, int num_io);
void bw_iov_front(struct bw_ioreq *bwreq, int nbytes);
void bw_io_complete(struct bw_ioreq *bwreq);
void bw_io_active(struct bw_ioreq *bwreq);
void bw_io_inactive(struct bw_ioreq *bwreq);

int bw_q_io(struct bw_ioreq *bwreq, bw_iofunc_ptr_t iofunc, void *ioparams);

/* Queue processing. */
void bw_cancel_io(struct bw_ioreq *bwreq);
int bw_do_some_active(int adj_timer, int end_bias, unsigned long *lock_flags);
void bw_do_q_active(struct bw_ioreq *, int slot, int cur_len);
void bw_do_q_sched(struct bw_ioreq *);

void bw_set_burst_calc(struct bw_limit_hash *);
void bw_io_short_io(struct bw_info *, enum bw_iotype rw, int nfree);

/* Initialisation, timers and /proc support. */
void bw_mgmt_init(void);
void bw_timers_init(void);
void bw_timer_drain(unsigned long * lock_flags);
void bw_proc_disable(void);
void bw_proc_newip(struct bw_limit_hash *bw_limit);
void bw_proc_delip(struct bw_limit_hash *bw_limit);

/*
 * BW_TRACK_READ is not supported yet.  Stop the build with an explicit
 * diagnostic instead of relying on a stray-token syntax error.
 */
#ifdef BW_TRACK_READ
#error "BW_TRACK_READ will not work yet - check out every ifdef and fix stuff"
#endif

#endif /* } __bw_mgmt_h__ */
// LICENSE:
// This software is subject to the terms of the GNU GENERAL 
// PUBLIC LICENSE Version 2, June 1991
