#ifdef MODULE
#if LINUX_VERSION_CODE >= 0x20400
#include <linux/module.h>
#endif
#include <linux/modversions.h>
#endif

/*
 * bw_mgmt.c : calculate the amount of I/O available on a socket.
 *
 * Copyright (C) 1999-2001, Sun Microsystem, Inc.
 * All rights reserved.
 *
 * These routines calculate the amount of I/O, and update the
 * counters to reflect actual I/O performed.  This is also an
 * intermediate routine between the bw_abstract layer, and the
 * bw_hash and bw_timer modules.
 */

#include <bw_mgmt.h>

/*
 * Forward declarations for helpers that are defined further down in
 * this file but used before their definition.
 */
void bw_update_universe(enum bw_iotype rw, int actual);
void bw_io_done(struct bw_limit_hash *limit, int actual);
void bw_io_unalloc(struct bw_limit_hash *, int my_num, int final_num,
		int my_slot, int final_slot, int time_bias);

/*
 * Debug print that is compiled out: the printk() call sits behind a
 * constant-false `if', so it is parsed but never executed.
 *
 * NOTE(review): because this expands to a bare `if', a DPRINTF() used
 * as the sole body of an `if' that has an `else' would capture that
 * `else'.  No such use exists in this file today.
 */
#define DPRINTF if (0) printk
/*
 * Global per-minute counters for everything.  The first index is the
 * I/O direction (enum bw_iotype), the second the minute slot; see
 * bw_update_universe().
 */
struct bw_time_cnt bw_total_mincnt[2][BW_NUMMINUTES];

/*
 * Lock for all limit updates.  Technically, this creates a
 * bottleneck on SMP, but the lock duration is small, and if
 * we had a lock per limit, we would need to lock and unlock
 * them _everywhere_ (worst case 9 times per partial I/O).
 *
 * XXX this also protects the hash list.  Same reason as above,
 * but somewhat more tacky.
 */
extern spinlock_t bw_limitlock;

#ifdef BIGNASTYLOCK
/*
 * A lock to be put around the entirety of bwmgmt just to
 * make sure it is all safe.  Only defined when BIGNASTYLOCK is set;
 * it is not referenced directly in this file.
 */
spinlock_t bw_bignastylock;

#endif

#if defined( DEBUG ) && BW_TRACE_LEVEL>0

/*
 * Trace log storage, only built for DEBUG kernels with tracing
 * enabled.  Not referenced directly in this file; presumably used by
 * the BW_ENTER_FUNC/BW_EXIT_FUNC or BW_LOG_* macros -- TODO confirm.
 */
struct bw_trace_log_t bw_trace_log[BW_TRACE_LOG_LEN];
int bw_trace_log_pos = 0;
unsigned int bw_trace_log_stamp = 0;

#endif

/*
 * bw_set_bckt_limit - precompute the cooked rate values for a limit.
 *
 * limit: limit structure to fill in
 * bps:   configured rate for the limit
 *
 * Stores the per-bucket value (bwh_bp_bckt) and the per-scheduling-
 * window value (bwh_bp_sched) so that the performance path can size
 * each I/O without repeating the divisions.
 */
void
bw_set_bckt_limit(struct bw_limit_hash *limit, int bps)
{
    BW_ENTER_FUNC();

    DPRINTF("bw_set_bckt_limit: limit 0x%x, bps %d\n", (int) limit, bps);

    /* rate scaled down to one BW_IO_RATE interval */
    limit->bwh_bp_bckt = bps * BW_IO_RATE / BW_CLOCKRATE;

    /* one scheduling window spans (BW_IOSCHED_RATE / BW_IO_RATE) buckets */
    limit->bwh_bp_sched = (BW_IOSCHED_RATE / BW_IO_RATE) * limit->bwh_bp_bckt;

    DPRINTF("    bwh_bp_bckt %d, bwh_bp_sched %d\n",
		limit->bwh_bp_bckt, limit->bwh_bp_sched);

    BW_EXIT_FUNC();
}

/*
 * bw_set_burst_calc - derive the burst window (bwh_burst_time) of a limit.
 *
 * The burst window is an artifact of earlier implementations: a
 * configurable window that allows an overcommitment of bandwidth.
 *
 * A real calculation is only done when both a burst count and a rate
 * have been configured, the limit is not currently over its maximum,
 * and the per-bucket rate is non-zero (it is the divisor).  In every
 * other case the window is set to BW_MIN_BURST.  A calculated window
 * is clamped to the range BW_MIN_BURST .. BW_MAX_BURST.
 */
void
bw_set_burst_calc(struct bw_limit_hash *limit)
{
    int burst_ok;

    BW_ENTER_FUNC();

    burst_ok = (limit->bwh_flags & BWPARAM_SETBURST)
	    && (limit->bwh_flags & BWPARAM_SETBPS)
	    && ! (limit->bwh_flags & BWPARAM_OVERMAX)
	    && limit->bwh_bp_bckt;

    if (burst_ok) {
	limit->bwh_burst_time = limit->bwh_burst_count / limit->bwh_bp_bckt;

	/* clamp into the permitted range */
	if (limit->bwh_burst_time < BW_MIN_BURST) {
	    limit->bwh_burst_time = BW_MIN_BURST;
	} else if (limit->bwh_burst_time >= BW_MAX_BURST) {
	    limit->bwh_burst_time = BW_MAX_BURST;
	}
    } else {
	limit->bwh_burst_time = BW_MIN_BURST;
    }

    BW_EXIT_FUNC();
}

/*
 * bw_check_overmax - switch a limit to its over-limit rate once the
 * byte count for the interval exceeds the programmed maximum.
 *
 * Nothing happens unless a maximum has been configured
 * (BWPARAM_SETMAX) and bwh_interval_bytes is above bwh_lim_max_bytes.
 * Otherwise the limit is flagged BWPARAM_OVERMAX, the rate becomes the
 * configured over-limit rate (or 0 when none was set), the burst
 * window is recalculated and the time of the transition is recorded.
 *
 * This functionality has been deferred to phase 2.
 */
inline void
bw_check_overmax(struct bw_limit_hash *limit)
{
    int new_bps = 0;

    BW_ENTER_FUNC();

    if ( ! (limit->bwh_flags & BWPARAM_SETMAX)
		|| limit->bwh_interval_bytes <= limit->bwh_lim_max_bytes) {
	BW_EXIT_FUNC();
	return;
    }

    /* flag first: bw_set_burst_calc() below looks at BWPARAM_OVERMAX */
    limit->bwh_flags |= BWPARAM_OVERMAX;

    if (limit->bwh_flags & BWPARAM_SETOVERBPS) {
	new_bps = limit->bwh_lim_overlim_bps;
    }
    bw_set_bckt_limit(limit, new_bps);

    bw_set_burst_calc(limit);

    limit->bwh_overlim_time = BW_CUR_TIME;

    BW_EXIT_FUNC();
}

/*
 * bw_check_stillover is the companion piece to bw_check_overmax().
 * If a limit is flagged over its maximum (BWPARAM_OVERMAX), a rearm
 * time has been configured (BWPARAM_SETTIME) and that time has been
 * reached, the normal rate is reinstalled, the burst window is
 * recalculated and the interval byte count starts again from zero.
 *
 * BWPARAM_OVERMAX is cleared *before* bw_set_burst_calc() runs,
 * because that routine falls back to BW_MIN_BURST whenever the flag
 * is still set; clearing it afterwards would leave the reopened limit
 * with the minimum burst window instead of the configured one.
 *
 * Again, this is phase 2 functionality.
 */
inline void
bw_check_stillover(struct bw_limit_hash *limit)
{
    BW_ENTER_FUNC();
    if ((limit->bwh_flags & (BWPARAM_OVERMAX|BWPARAM_SETTIME))
		    == (BWPARAM_OVERMAX|BWPARAM_SETTIME)
		&& limit->bwh_lim_rearm_time <= BW_CUR_TIME) {
	/* back to the normally configured rate */
	bw_set_bckt_limit(limit, limit->bwh_bps);

	/* must precede the burst calculation, see above */
	limit->bwh_flags &= ~BWPARAM_OVERMAX;
	bw_set_burst_calc(limit);

	limit->bwh_interval_bytes = 0;
    }
    BW_EXIT_FUNC();
}

/*
 * bw_new_limit() - this is the backend to the set limit ioctl.
 * We crack the bwlim_params parameter structure, then apply the
 * various underlying routines to the data.
 *
 * bwp: parameters of the limit.  For BW_IP ids the address is
 *      converted with htonl() and the mask clamped to 32 *in place*,
 *      so the caller's structure is modified.
 *
 * Each BWPARAM_SET* bit present in bwp->bwp_flags copies the matching
 * value into the limit and records the same bit in the limit's flags.
 *
 * Returns 0 on success, -EINVAL for an unknown id type or when
 * bw_create() does not return a limit.
 */
int
bw_new_limit(struct bwlim_params *bwp)
{
    struct bw_limit_hash *bwhp;
    BW_LOCK_FLAGS_DECL;
    
    BW_ENTER_FUNC();
    /*
     * Do hash specific stuff here.
     */
    switch (bwp->bwp_bwid.bwid_type) {
    case BW_IP:

	bwp->bwp_bwid.bwid_addr = htonl(bwp->bwp_bwid.bwid_addr);

	if (bwp->bwp_bwid.bwid_mask > 32)
	    bwp->bwp_bwid.bwid_mask = 32;

	DPRINTF("bw_new_limit: setting limit for ip 0x%x to %d bps\n",
		bwp->bwp_bwid.bwid_addr, bwp->bwp_bps);
	break;

    case BW_UID:
    case BW_GID:
	/* nothing to normalise for uid/gid ids */
	break;
    default:
	printk("bw_new_limit: unknown bandwidth ID type %d\n",
		bwp->bwp_bwid.bwid_type);
	BW_EXIT_FUNC();
	return -EINVAL;
    }

    /*
     * Once we create the limit, we can look it up, so lock
     * it until we get valid data into the hash table
     */
    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    if ((bwhp = bw_create(&bwp->bwp_bwid)) == NULL) {
	BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);
	BW_EXIT_FUNC();
	return -EINVAL;
    }

    bwhp->bwh_lim_id = bwp->bwp_bwid;

    if (bwp->bwp_flags & BWPARAM_SETBPS) {
	bwhp->bwh_bps = bwp->bwp_bps;
	bwhp->bwh_flags |= BWPARAM_SETBPS;
	bw_set_bckt_limit(bwhp, bwp->bwp_bps);
	bw_set_burst_calc(bwhp);
    }

    if (bwp->bwp_flags & BWPARAM_SETMAX) {
	bwhp->bwh_lim_max_bytes = bwp->bwp_max_bytes;
	bwhp->bwh_flags |= BWPARAM_SETMAX;
    }

    if (bwp->bwp_flags & BWPARAM_SETBURST) {
	bwhp->bwh_burst_count = bwp->bwp_burst;
	bwhp->bwh_flags |= BWPARAM_SETBURST;
	/* the burst window depends on the count just stored */
	bw_set_burst_calc(bwhp);
    }

    if (bwp->bwp_flags & BWPARAM_SETTIME) {
	bwhp->bwh_lim_rearm_time = bwp->bwp_rearm_time;
	bwhp->bwh_flags |= BWPARAM_SETTIME;
    }

    if (bwp->bwp_flags & BWPARAM_SETOVERBPS) {
	bwhp->bwh_lim_overlim_bps = bwp->bwp_overlim_bps;
	/*
	 * Record that an over-limit rate exists.  This used to set
	 * BWPARAM_SETTIME, which both hid the over-limit rate from
	 * bw_check_overmax() and wrongly claimed that a rearm time
	 * had been configured.
	 */
	bwhp->bwh_flags |= BWPARAM_SETOVERBPS;
    }

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
    return 0;
}

/*
 * bw_delete_limit() - remove the limit identified by bwp->bwp_bwid.
 *
 * bwp: parameters naming the limit.  For BW_IP ids the address is
 *      converted with htonl() and the mask clamped to 32 *in place*,
 *      mirroring what bw_new_limit() does before creating a limit.
 *
 * Returns -EINVAL for an unknown id type, otherwise whatever
 * bw_delete() returns.  bw_delete() is called with bw_limitlock held.
 */
int
bw_delete_limit(struct bwlim_params *bwp)
{
    int ret;
    BW_LOCK_FLAGS_DECL;
    
    BW_ENTER_FUNC();
    /*
     * Do hash specific stuff here.
     */
    switch (bwp->bwp_bwid.bwid_type) {
    case BW_IP:

	bwp->bwp_bwid.bwid_addr = htonl(bwp->bwp_bwid.bwid_addr);

	if (bwp->bwp_bwid.bwid_mask > 32)
	    bwp->bwp_bwid.bwid_mask = 32;

	DPRINTF("bw_delete_limit: deleting limit for ip 0x%x\n",
		bwp->bwp_bwid.bwid_addr);
	break;

    case BW_UID:
    case BW_GID:
	/* nothing to normalise for uid/gid ids */
	break;
    default:
	printk("bw_delete_limit: unknown bandwidth ID type %d\n",
		bwp->bwp_bwid.bwid_type);
	BW_EXIT_FUNC();
	return -EINVAL;
    }

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    ret = bw_delete(&bwp->bwp_bwid);

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    BW_EXIT_FUNC();
    return ret;
}

/*
 * bw_setip - record the IP identity of a connection and attach it to
 * the matching entries via bw_new_ent().
 *
 * bwi:      per-connection bandwidth info; NULL is tolerated
 * protocol: protocol number stored in the id
 * addr:     address stored in the id (always with a mask of 32)
 * port:     port stored in the id
 *
 * If an IP identity was already valid, its entries are removed first.
 * The whole update runs under bw_limitlock.
 *
 * Returns bwi, or NULL when bwi is NULL.
 */
struct bw_info *
bw_setip(struct bw_info *bwi, u_int8_t protocol, u_int32_t addr, u_int16_t port)
{
    BW_LOCK_FLAGS_DECL;

    BW_ENTER_FUNC();

    if (bwi == NULL) {
	BW_EXIT_FUNC();
	return NULL;
    }

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /* detach from whatever the previous address was attached to */
    if (bw_type_valid(bwi, BW_IP)) {
	bw_delete_ent(bwi, BW_IP, BW_WRIO);
#ifdef BW_TRACK_READ
	bw_delete_ent(bwi, BW_IP, BW_RDIO);
#endif
    }

    DPRINTF("bw_setip: newip addr 0x%x\n", addr);

    /* install the new identity: a single host, hence the /32 mask */
    bwi->bw_type_id[BW_IP].bwid_addr = addr;
    bwi->bw_type_id[BW_IP].bwid_port = port;
    bwi->bw_type_id[BW_IP].bwid_protocol = protocol;
    bwi->bw_type_id[BW_IP].bwid_mask = 32;
    bwi->bw_type_id[BW_IP].bwid_type = BW_IP;

    bwi->bw_flags |= BW_IPVALID;

    bw_new_ent(bwi, BW_IP, BW_WRIO);

#ifdef DEBUG
    {
	/* dump the write-side limit of every valid id type */
	int t;

	for (t = 0; t < N_BW_TYPES; t++) {
	    struct bw_limit_hash *wr_lim;

	    if ( ! bw_type_valid(bwi, t))
		continue;

	    wr_lim = bwi->bw_io[BW_WRIO].bw_limit[t];
	    DPRINTF("bw_setip: limit 0x%x\n", (int) wr_lim);
	    if (wr_lim) {
		DPRINTF("limit is %d bps, %d b per bucket, %d b per sched\n", 
		    wr_lim->bwh_bps, wr_lim->bwh_bp_bckt, wr_lim->bwh_bp_sched);
	    }
	}
    }
#endif

#ifdef BW_TRACK_READ
    bw_new_ent(bwi, BW_IP, BW_RDIO);
#endif

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    BW_EXIT_FUNC();
    return bwi;
}

/*
 * bw_setugid - set the user ID and group ID for a socket and attach
 * the socket to the matching entries via bw_new_ent().
 *
 * bwi: per-connection bandwidth info; NULL is tolerated
 * uid: user id to record
 * gid: group id to record
 *
 * Follows the same sequence as bw_setip(): under bw_limitlock, the
 * entries of an id that was already valid are removed while the old
 * id is still in place, then the new id is stored and attached.
 * Earlier code tested BW_IP validity here and overwrote the ids and
 * valid flags before removing the old entries.
 *
 * NOTE(review): assumes bw_delete_ent() only needs to be called for
 * an id type that was valid, as bw_setip() does -- confirm against
 * the hash module.
 *
 * Returns bwi, or NULL when bwi is NULL.
 */
struct bw_info *
bw_setugid(struct bw_info *bwi, uid_t uid, gid_t gid)
{
    BW_LOCK_FLAGS_DECL;
    
    BW_ENTER_FUNC();
    if ( ! bwi)
    {
	BW_EXIT_FUNC();
	return NULL;
    }

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /* drop the entries belonging to the previous group id, if any */
    if (bw_type_valid(bwi, BW_GID)) {
	bw_delete_ent(bwi, BW_GID, BW_WRIO);
#ifdef BW_TRACK_READ
	bw_delete_ent(bwi, BW_GID, BW_RDIO);
#endif
    }

    /* drop the entries belonging to the previous user id, if any */
    if (bw_type_valid(bwi, BW_UID)) {
	bw_delete_ent(bwi, BW_UID, BW_WRIO);
#ifdef BW_TRACK_READ
	bw_delete_ent(bwi, BW_UID, BW_RDIO);
#endif
    }

    /* only now replace the identities */
    bwi->bw_type_id[BW_UID].bwid_type = BW_UID;
    bwi->bw_type_id[BW_UID].bwid_uid = uid;
    bwi->bw_type_id[BW_GID].bwid_type= BW_GID;
    bwi->bw_type_id[BW_GID].bwid_gid = gid;

    bwi->bw_flags |= BW_UIDVALID|BW_GIDVALID;

#ifdef BW_TRACK_READ
    bw_new_ent(bwi, BW_UID, BW_RDIO);
    bw_new_ent(bwi, BW_GID, BW_RDIO);
#endif

    bw_new_ent(bwi, BW_UID, BW_WRIO);
    bw_new_ent(bwi, BW_GID, BW_WRIO);
    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    BW_EXIT_FUNC();
    return bwi;
}

/*
 * bw_update_io is called after an I/O request finishes.
 * We update the actual counters that track I/O, and if we
 * had a short I/O, we return the extra to our prealloc pool.
 */
void
bw_io_update(struct bw_ioreq *bwreq, int nalloc, int num_io)
{
    enum bw_iotype type = bwreq->bw_rw;
    struct bw_info *bwi;
    struct bw_io_info *bwio_inf;
    int i;
    BW_LOCK_FLAGS_DECL;

    BW_ENTER_FUNC();

    bwreq->bw_error_ret += num_io; /* this is the only place where bw_error_ret
				    * gets incrememted and bw_io_update is never
				    * called syncronosly.  should be SMP safe
				    */
    bwreq->bw_total_ioc -= num_io; /* ditto here */

    bwi = bwreq->bw_bwi;

    bwio_inf = &bwi->bw_io[type];

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);
    bw_update_universe(bwreq->bw_rw, num_io);

    for (i = 0; i< N_BW_TYPES; i++) {
	if (bw_type_valid(bwi, i)) {
	    bw_io_done(bwio_inf->bw_limit[i], num_io);
	}
    }
    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /*
     * If the I/O was short, return the excess to our pool
     */
    if (nalloc != num_io) {
	atomic_add( nalloc - num_io , &bwio_inf->bw_prealloc );
    }

    BW_EXIT_FUNC();
}

/*
 * bw_io_short_io : return surplus bandwidth to the pools.
 * This is called when a socket is closed with bandwidth
 * pre-allocated.  Rather than let the bandwidth evaporate,
 * it is handed back to every limit attached to the connection.
 *
 * bwi:   connection whose limits get the bandwidth back
 * rw:    I/O direction
 * nfree: number of bytes to return
 *
 * NOTE(review): the stated intent is to credit the *next* I/O sched
 * window, and the slot passed down is computed one BW_IOSCHED_RATE
 * ahead.  bw_io_unalloc() however picks the window from its time_bias
 * argument, which is 0 here -- confirm which window is meant.
 *
 * XXX called with bw_limitlock held
 */
void
bw_io_short_io(struct bw_info *bwi, enum bw_iotype rw, int nfree)
{
    struct bw_io_info *io_inf;
    int next_slot;
    int t;

    BW_ENTER_FUNC();

    io_inf = &bwi->bw_io[rw];

    next_slot = BW_TIME_TO_IOIND(BW_CUR_TIME + BW_IOSCHED_RATE);

    for (t = 0; t < N_BW_TYPES; t++) {
	if ( ! bw_type_valid(bwi, t))
	    continue;

	/* give back all of nfree: the final amount kept is 0 */
	bw_io_unalloc(io_inf->bw_limit[t], nfree, 0,
		    next_slot, next_slot, 0);
    }

    BW_EXIT_FUNC();
}

/*
 * bw_io_alloc: Calculate the number of bytes available on this
 * limit during the scheduling window that starts after start_bias.
 * If the request fits, allocate the bandwidth and return the slot
 * that we want to be scheduled.  Otherwise allocate anything we
 * can and set the slot to '-1'.
 *
 * limit:      limit to allocate from; NULL means "unlimited"
 * nwanted:    number of bytes the caller would like
 * start_bias: offset added to the current time to pick the window
 * slotp:      if non-NULL, receives the I/O slot, or -1 when nothing
 *             was allocated
 *
 * Returns the number of bytes allocated.  With a NULL limit this is
 * always nwanted.
 *
 * NOTE(review): when the window is already over-committed,
 * sched_alloc (and therefore the return value) can be negative;
 * this is passed through unchanged.
 *
 * This is the per limit backend to bw_io_check() below.
 */
int
bw_io_alloc(struct bw_limit_hash *limit, int nwanted,
		int start_bias, int *slotp)
{
    bwl_time_t base_time;
    bwl_time_t end_time, target_time;
    struct bw_time_cnt *sched_bkt;
    int sched_alloc;
    int cur_alloc = 0;
    int num_users;
    int cur_sched;

    BW_ENTER_FUNC();

    base_time = BW_CUR_TIME + start_bias;

    if ( ! limit) {
	/*
	 * If we have no limit, caller can do as
	 * much as he wants whenever he wants.
	 */
	if (slotp)
	    *slotp = BW_TIME_TO_IOIND(base_time);
	BW_EXIT_FUNC();
	return nwanted;
    }
    DPRINTF("bw_io_alloc: limit 0x%x, start %d, want %d bytes\n",
	    (int) limit,  start_bias, nwanted);

#if 0
    /*
     * We rearm the overlimit interval timer *before* we check
     * a particular request.
     *
     * XXX this should be off the performance path
     *
     * disabled until rev 2 + thinking
     */
    bw_check_stillover(limit);
#endif

    /* bytes already handed out in the target window vs. its budget */
    sched_bkt = &limit->bwhcnt_sched[BW_TIME_TO_SCHEDIND(base_time)];
    cur_sched = bw_num_cnt(sched_bkt, base_time, BW_IOSCHED_RATE);
    sched_alloc = limit->bwh_bp_sched - cur_sched;

    /*
     * If there are others waiting on this bandwidth limit,
     * then ensure that this user shares fairly.
     */
    num_users = limit->bwh_active_ios;
    if (num_users) {
	/* users already counted in this window have had their share */
	num_users -= bw_num_cnt(&limit->bwh_active_cnt, base_time,
			BW_IOSCHED_RATE);
	if (num_users <= 0)
	    num_users = 1;

	bw_incr_cnt(&limit->bwh_active_cnt, base_time, 1, BW_IOSCHED_RATE);

	sched_alloc = sched_alloc / num_users;
    }

    /*
     * At this point, sched_alloc is the number of bytes in
     * this window available for the current request
     */
    cur_alloc = min(nwanted, sched_alloc);

    if (cur_alloc) {
	if (slotp) {

	    /*
	     * We ensure that the slot we return is in the current
	     * scheduling window, which avoids enqueuing it twice.
	     */
	    end_time = base_time | (BW_IOSCHED_RATE - 1);

	    /*
	     * Offset into the window by the buckets already used.
	     * bwh_bp_bckt is zero for a zero (or very small) rate;
	     * dividing by it would trap, so in that case the slot
	     * stays at the start of the window.
	     */
	    target_time = base_time;
	    if (limit->bwh_bp_bckt)
		target_time += BW_IOTIME_TO_JIFFY(cur_sched / limit->bwh_bp_bckt);

	    if (target_time > end_time)
		target_time = end_time;
	    else if (target_time < base_time)
		target_time = base_time;

	    *slotp = BW_TIME_TO_IOIND(target_time);
	}
	bw_incr_cnt(sched_bkt, base_time, cur_alloc, BW_IOSCHED_RATE);
    }
    else if (slotp)
	*slotp = -1;

    DPRINTF("bw_io_alloc: alloc %d, slot %d\n", cur_alloc, slotp ? *slotp : -1);
    BW_EXIT_FUNC();
    return cur_alloc;
}

/*
 * bw_io_unalloc : return bytes to the pool.
 *
 * limit:      limit to credit; NULL is ignored
 * my_num:     bytes that were allocated on this limit
 * final_num:  bytes that are actually being kept
 * my_slot:    slot obtained at allocation time; -1 means nothing was
 *             ever allocated, so there is nothing to adjust
 * final_slot: not used by this routine
 * time_bias:  offset added to the current time to pick the window
 *
 * The difference (my_num - final_num) is subtracted from the window's
 * scheduled byte counter.
 */
void
bw_io_unalloc(struct bw_limit_hash *limit, int my_num, int final_num,
		int my_slot, int final_slot, int time_bias)
{
    bwl_time_t when;
    struct bw_time_cnt *window;
    int excess;

    BW_ENTER_FUNC();

    if (limit && my_slot != -1) {
	excess = my_num - final_num;

	/* a zero difference leaves the counter alone */
	if (excess != 0) {
	    when = BW_CUR_TIME + time_bias;

	    window = &limit->bwhcnt_sched[BW_TIME_TO_SCHEDIND(when)];
	    bw_incr_cnt(window, when, -excess, BW_IOSCHED_RATE);
	}
    }

    BW_EXIT_FUNC();
}

/* Running totals of all I/O performed, one per I/O direction. */
bwl_totcnt_t bw_total_count[2];

/*
 * bw_update_universe - account completed I/O in the global counters.
 *
 * rw:     I/O direction; must be below 2, it indexes both global arrays
 * actual: number of bytes transferred
 *
 * Adds `actual' to the global counter of the current minute and to
 * the running total for the direction.  bw_io_update() calls this
 * with bw_limitlock held.
 */
void
bw_update_universe(enum bw_iotype rw, int actual)
{
    int minute_slot;

    BW_ENTER_FUNC();

    ASSERT(rw < 2);

    minute_slot = BW_TIME_TO_MININD(BW_CUR_TIME);

    bw_incr_cnt(&bw_total_mincnt[rw][minute_slot], BW_CUR_TIME,
		actual, BW_MIN_RATE);

    bw_total_count[rw] += actual;

    BW_EXIT_FUNC();
}

/*
 * bw_io_done : we did some I/O and need to track it on one limit.
 *
 * limit:  limit to charge; NULL is ignored
 * actual: number of bytes transferred
 *
 * Adds `actual' to the limit's counter for the current minute, to its
 * total byte count and to its byte count for the current interval.
 */
void
bw_io_done(struct bw_limit_hash *limit, int actual)
{
    bwl_time_t base_time;
    int min_index;
    
    BW_ENTER_FUNC();
    if ( ! limit)
    {
	BW_EXIT_FUNC();
	return;
    }

    /*
     * First, add the bytes to the minute counter
     *
     * Note : only the minute counters accurately track io.
     * The rest of the counters are just used for bandwidth
     * calculations, so they are locally accurate, but not
     * globally.  For example, we allocate in one time frame,
     * and transmit at another, so we update alloc and the
     * actual in different windows.  We can't scan back to
     * correct the actual count for the alloc window because
     * we would need to track a lot more data.  Also, it would
     * be wrong - the bytes left _now_, not when they were
     * allocated.  So we update now. 
     *
     * The fact that this does not matter is left as an
     * exercise to the reader.
     */
    base_time = BW_CUR_TIME;
    min_index = BW_TIME_TO_MININD(base_time);

    /*
     * No lock is taken here because bw_limitlock is already
     * acquired in bw_io_update.
     */
    bw_incr_cnt(&limit->bwhcnt_minutes[min_index], base_time,
		actual, BW_MIN_RATE);

    /* per the original author: only incremented here, thus SMP safe */
    limit->bwh_lim_tot_bytes += actual;
    limit->bwh_interval_bytes += actual;

#if 0
    /*
     * Check overlimit after loop to avoid weird effects in
     * loop if we set bwh_bp_bckt == 0;
     *
     * XXX - this should be off the performance path
     *
     * Delayed for phase 2 and more thinking
     *
     * The mask must be parenthesised: `==' binds tighter than `&',
     * so without the parentheses the flags would be and-ed with the
     * result of the comparison instead of being compared.
     */
    if ((limit->bwh_flags & (BWPARAM_OVERMAX|BWPARAM_SETMAX)) == BWPARAM_SETMAX)
	    bw_check_overmax(limit);
#endif
    BW_EXIT_FUNC();
}

/*
 * bw_io_check - given an I/O request, return the number of
 *	bandwidth limited bytes available in the window at
 *	the time start_bias offset from now.  Also return the
 *	slot when that amount of bandwidth is available.
 *
 * bwreq:      the request; bw_totallen is the amount wanted and
 *             bw_prealloc what is already allocated to it
 * start_bias: offset added to the current time to pick the window
 * slotp:      if non-NULL, receives the slot, or -1 when the I/O
 *             should be deferred
 *
 * Returns the number of bytes now allocated to the request (which is
 * also stored back into bwreq->bw_prealloc).
 *
 * bw_prealloc is sampled exactly once.  Reading it separately for the
 * "already covered" test and for the amount still needed left a
 * window in which the counter could change between the two reads; the
 * amount needed could then come out as zero, the allocation loop
 * would be skipped, and the correction loop would read nalloc[] and
 * slot[] uninitialised.  Both arrays are additionally preset to "no
 * allocation" so the correction loop is harmless for an id type that
 * was not visited by the allocation loop.
 */
int
bw_io_check(struct bw_ioreq *bwreq, int start_bias, int *slotp)
{
    struct bw_info	*bwi = bwreq->bw_bwi;
    struct bw_io_info	*bwio;
    int		nalloc[N_BW_TYPES];
    int		slot[N_BW_TYPES];
    int		num_io = atomic_read( &bwreq->bw_totallen );
    int		min_lim, max_slot;
    struct bw_limit_hash *lim;
    int cur_alloc, need_new, i;
    BW_LOCK_FLAGS_DECL;

    BW_ENTER_FUNC();
    bwio = &bwi->bw_io[bwreq->bw_rw];

    cur_alloc = atomic_read( &bwreq->bw_prealloc );

    /*
     * If we already have bandwidth allocated for this,
     * send the data
     */
    if (cur_alloc >= num_io) {
	DPRINTF("bw_io_check: prealloc hit - want %d, have %d\n", num_io,
		cur_alloc);
	if (slotp)
	    *slotp = BW_TIME_TO_IOIND(BW_CUR_TIME + start_bias);
	BW_EXIT_FUNC();
	return num_io;
    }

    /* strictly positive from here on, see the test above */
    min_lim = need_new = num_io - cur_alloc;
    max_slot = -1;

    /* "nothing allocated": bw_io_unalloc() ignores a slot of -1 */
    for (i = 0; i < N_BW_TYPES; i++) {
	nalloc[i] = 0;
	slot[i] = -1;
    }

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /*
     * scan each limit to find out what we can do
     */
    for (i = 0; i < N_BW_TYPES; i++) {

	if ( ! bw_type_valid(bwi, i))
	    continue;

	lim = bwio->bw_limit[i];

	if (lim && lim->bwh_flags & BWPARAM_DELETE) {
	    /*
	     * Disconnect the bwreq and bwinfo
	     */
	    lim->bwh_active_ios--;

	    bw_delete_ent(bwi, i, bwreq->bw_rw);

	    bwio->bw_limit[i] = NULL;

	    /*
	     * NOTE(review): this test used to repeat the same
	     * comparison twice; a second, different counter was
	     * probably intended -- confirm before relying on
	     * this as the only condition for freeing.
	     */
	    if (lim->bwh_active_ios == 0)
		kfree(lim);

	    /* look the id up again and attach to what is found */
	    bw_new_ent(bwi, i, bwreq->bw_rw);

	    lim = bwio->bw_limit[i];

	    if (lim)
		lim->bwh_active_ios++;
	}

	nalloc[i] = bw_io_alloc(lim, need_new, start_bias, &slot[i]);

	/* the request is bound by the latest slot ... */
	if (slot[i] > max_slot)
	    max_slot = slot[i];

	/* ... and by the smallest allocation of all limits */
	if (nalloc[i] < min_lim)
	    min_lim = nalloc[i];
    }
    cur_alloc += min_lim;

    /*
     * If the request is too small, we adjust the slot to say
     * that we will defer this I/O.  We still keep any allocation
     * we got on all the limits to ensure forward progress.
     */
    if (cur_alloc < num_io &&
		(bwi->bw_flags & BW_MSGONLY || cur_alloc < BW_MIN_IOSIZE))
	max_slot = -1;

    /*
     * Now go back and correct for min vs. attempt.
     */
    for (i = 0; i< N_BW_TYPES; i++) {
	if (bw_type_valid(bwi, i)
		&& (nalloc[i] > min_lim || slot[i] != max_slot)) {
	    bw_io_unalloc(bwio->bw_limit[i], nalloc[i], min_lim,
			slot[i], max_slot, start_bias);
	}
    }

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    bwreq->bw_lastalloc = min_lim;

    if (slotp)
	*slotp = max_slot;

    atomic_set( &bwreq->bw_prealloc, cur_alloc );
    BW_EXIT_FUNC();
    return cur_alloc;
}

/*
 * bw_q_io : the asynchronous guts.  We take an I/O request, an I/O
 * function and its parameters, and schedule the request to occur
 * over the next few cycles.
 *
 * bwreq:    the request to schedule
 * iofunc:   function stored in the request as bw_iofunc
 * ioparams: opaque argument stored in the request as bw_ioparams
 *
 * Always returns 1.
 */
int
bw_q_io(struct bw_ioreq *bwreq, bw_iofunc_ptr_t iofunc, void *ioparams)
{
    int granted;
    int when;

    BW_ENTER_FUNC();

    bwreq->bw_ioparams = ioparams;
    bwreq->bw_iofunc = iofunc;

    /* ask for bandwidth one scheduling window from now */
    granted = bw_io_check(bwreq, BW_IOSCHED_RATE, &when);
    if (granted != 0 && when != -1) {
	bw_do_q_active(bwreq, when, granted);
    }

    /*
     * Anything still outstanding goes to the I/O scheduler.
     *
     * XXX - race condition if the I/O gets performed before
     *       we get here and look at bw_totallen.  do_active()
     *	     will re-queue if a short I/O happens, and we could
     *	     be adding to the queue as well.
     *
     * Per the original author this is tolerated for now: the
     * BW_IOSCHED_RATE bias means there are at least 80 milliseconds
     * between bw_do_q_active() and the check below.
     *
     * A possible cure: have bw_io_check decrement bw_totallen
     * rather than bw_do_q_active(), and have bw_io_update
     * increment when necessary, rather than bw_iov_pullup().
     * The check below could then move above bw_do_q_active(),
     * which would eliminate the race.
     */
    if ( atomic_read( &bwreq->bw_totallen) ) {
	bw_do_q_sched(bwreq);
    }

    BW_EXIT_FUNC();

    return 1;
}

/*
 * bw_io_active : reference count an I/O request.
 * this is here rather than bw_init_ioreq because we only do this
 * if we need to do async I/O.  It could fold into bw_iov_copyin,
 * but that is there, and this is here.  Also, poll ooperations
 * call us, but not bw_iov_copyin().
 */
void
bw_io_active(struct bw_ioreq *bwreq)
{
    struct bw_info	*bwi = bwreq->bw_bwi;
    struct bw_io_info *bwio;
    int i;
    BW_LOCK_FLAGS_DECL;
    
    BW_ENTER_FUNC();

    bwio = &bwi->bw_io[bwreq->bw_rw];

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);
    for (i = 0; i < N_BW_TYPES; i++) {
	if (bw_type_valid(bwi, i) && bwio->bw_limit[i])
		bwio->bw_limit[i]->bwh_active_ios++;
    }
    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
}

/*
 * Release the limits attached to a previously active request
 *
 */
void
bw_io_inactive(struct bw_ioreq *bwreq)
{
    struct bw_info	*bwi = bwreq->bw_bwi;
    struct bw_io_info *bwio;
    int i;
    BW_LOCK_FLAGS_DECL;
    
    BW_ENTER_FUNC();
    bwio = &bwi->bw_io[bwreq->bw_rw];
    DPRINTF("bw_io_inactive: req 0x%x done\n", (int) bwreq);
    
    
    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    for (i = 0; i < N_BW_TYPES; i++) {
	if (bw_type_valid(bwi, i) && bwio->bw_limit[i])
		bwio->bw_limit[i]->bwh_active_ios--;
    }
    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS); 
    BW_EXIT_FUNC();
}
// LICENSE:
// This software is subject to the terms of the GNU GENERAL 
// PUBLIC LICENSE Version 2, June 1991
