/*
 * bw_timer.c : back end timers and timer driven routines.
 *
 * Copyright (C) 1999-2001, Sun Microsystems, Inc.
 * All rights reserved.
 *
 * These are the I/O scheduler timer and active I/O routines.
 * They process their various queues, and perform scheduling
 * or I/O respectively.  The scheduler calls into bw_mgmt to
 * calculate available bandwidth, and the active I/O layer
 * uses bw_mgmt to update statistics.
 */
#include <bw_mgmt.h>
#include <linux/wait.h>
/*
 * The array of active I/O requests to be handled by
 * the timer loop.  There is one queue per I/O slot; the
 * matching bit in bw_next_active_mask is set while a
 * slot's queue is non-empty.
 */
struct bw_queue bw_iolist[BW_NUMIO_CNT];

/*
 * The schedule list is more generic.  We need to
 * do an insertion sort based on the bandwidth
 * allocation, so generic head/tail is not useful
 *
 * Tail size is the bw_lastalloc of the last request
 * appended at the tail, so we can do a fast enqueue
 * when a new request sorts at or after the tail.
 */
struct bw_queue bw_sched_list;
int bw_sched_tailsize;

/*
 * Debug trace output, disabled by the constant-false test.
 * Note the expansion is a bare "if (0) printk": do not use it as the
 * body of an unbraced if that has an else, or the else will bind
 * to the hidden "if (0)".
 */
#define DPRINTF if (0) printk

/*
 * Integer counter - this is the last real time our I/O timer ran.
 * It is not 100% accurate - we do not interrupt if there is no I/O.
 *
 * ... also it is only set/incremented at interrupt time so accesses
 * to it are restricted to one processor and are serialized
 */
bwl_time_t bw_current_count;

/*
 * bw_active_next_slot is the slot the active timer is currently armed
 * for, or -1 when no active timer is pending.
 * bw_next_active_mask has bit i set while bw_iolist[i] is non-empty.
 */
int bw_active_next_slot = -1;	/* Sweep hand for where we think we are */
unsigned int bw_next_active_mask;

/* NOTE(review): not referenced by the code in this file - confirm users. */
int next_sched;

/*
 * bw_active_timer runs bw_do_active() and bw_sched_timer runs
 * bw_do_sched(); the handlers are installed by bw_timers_init().
 */
struct timer_list bw_active_timer;
struct timer_list bw_sched_timer;

/*
 * bw_ioactive_lock is taken in this file around updates of bw_iolist[],
 * bw_next_active_mask and bw_current_count; bw_iosched_lock is taken
 * around updates of bw_sched_list and its timer.
 */
spinlock_t bw_ioactive_lock;
spinlock_t bw_iosched_lock;

/*
 * Circular arithmetic helpers.
 *
 * BW_SLOTDELTA: number of slots from cur forward to next, modulo the
 *               size of the slot ring.
 * BW_TIMEMIN:   the earlier of two times.
 * BW_TIMEMAX:   the later of two times.
 *
 * The time comparisons look at the sign of the difference rather than
 * comparing the values directly, so they keep working when the time
 * counter wraps.  On a tie both return cur.  Arguments are evaluated
 * more than once - pass side-effect free expressions only.
 */
#define BW_SLOTDELTA(cur, next)	(((unsigned) ((next) - (cur))) % BW_NUMIO_CNT)
#define BW_TIMEMIN(cur, next)	(((int) ((next) - (cur))) >= 0 ? (cur) : (next))
#define BW_TIMEMAX(cur, next)	(((int) ((cur) - (next))) >= 0 ? (cur) : (next))

/*
 * Debug-only sanity checks expanded at the top of BW_Q_ENQ: walk the
 * queue through 'link' and ASSERT that bwreq is not already on it,
 * then BW_CHECK_REQ the new request and, when the queue is non-empty,
 * the current head.  It declares a local (tbreq), which is why it is
 * expanded first inside BW_Q_ENQ's block.  Expands to nothing unless
 * DEBUG is defined.
 */
#ifdef DEBUG
#define DBG_ENQ(bwq, bwreq, link)			\
	struct bw_ioreq *tbreq;				\
							\
	for (tbreq = (bwq)->bwq_head; tbreq; tbreq = tbreq->link) \
	    ASSERT(tbreq != bwreq);			\
							\
	BW_CHECK_REQ(bwreq);				\
	if ( ! BW_Q_EMPTY(bwq))				\
	    BW_CHECK_REQ((bwq)->bwq_head);
#else
#define DBG_ENQ(bwq, bwreq, link)
#endif

/*
 * Append bwreq to the tail of bwq, chaining through the member 'link'.
 *
 * bwq_tail always points at the pointer that terminates the list
 * (bwq_head when the list is empty, otherwise the last element's
 * link), so the append is: store bwreq through the tail pointer,
 * terminate bwreq's own link, then advance the tail to that link.
 */
#define BW_Q_ENQ(bwq, bwreq, link)			\
    ({							\
	DBG_ENQ(bwq, bwreq, link);			\
	*(bwq)->bwq_tail = (bwreq);			\
	(bwreq)->link = NULL;				\
	(bwq)->bwq_tail = &(bwreq)->link;		\
	DPRINTF("BW_Q_ENQ q 0x%x, req 0x%x via link 0x%x\n", (int) bwq,	\
		    (int) bwreq, (int) &(bwreq)->link);		\
    })

/* True when the queue has no elements. */
#define BW_Q_EMPTY(bwq)	((bwq)->bwq_head == NULL)

/*
 * Reset a queue to the empty state: no head, and the tail pointer
 * aimed back at the head pointer so the next enqueue stores there.
 *
 * Wrapped in do { } while (0) so the two assignments behave as one
 * statement, including as the body of an unbraced if/else.
 */
#define BW_Q_SET_EMPTY(bwq)					\
    do {							\
	(bwq)->bwq_head = NULL;					\
	(bwq)->bwq_tail = &(bwq)->bwq_head;			\
    } while (0)

/*
 * Remove the element at the head of bwq and store it in bwreq
 * (bwreq is set to NULL when the queue is empty).  'link' is the
 * member used to chain elements on this queue.
 *
 * If the removed element was also the tail the queue is reset to
 * empty, otherwise the head advances to the next element.  The
 * removed element's link is cleared.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement
 * and is safe as the body of an unbraced if/else.
 */
#define BW_Q_DEQ(bwq, bwreq, link)					\
    do {								\
	(bwreq) = (bwq)->bwq_head;					\
	DPRINTF("BW_Q_DEQ q 0x%x, req 0x%x via link 0x%x\n", (int) bwq,	\
		    (int) bwreq, (int) &(bwreq)->link);			\
	if (bwreq) {							\
	    if ((bwq)->bwq_tail == &(bwreq)->link) {			\
		BW_Q_SET_EMPTY(bwq);					\
	    }								\
	    else {							\
		(bwq)->bwq_head = (bwreq)->link;			\
		BW_CHECK_REQ((bwq)->bwq_head);				\
	    }								\
	    (bwreq)->link = NULL;					\
	    BW_CHECK_REQ(bwreq);					\
	}								\
    } while (0)

/*
 * Unlink bwreq from anywhere on bwq, chaining through the member
 * 'link'.  Evaluates to 1 if bwreq was found and removed, 0 if it
 * was not on the queue.
 *
 * The walk keeps a pointer to the pointer that references the
 * current element, so unlinking is a single store and the tail can
 * be pulled back when the last element is removed.  The removed
 * element's own link is left untouched.
 *
 * Every use of the bwreq argument is parenthesized so that an
 * expression (not just a plain identifier) can be passed.
 */
#define BW_Q_DELETE(bwq, bwreq, link)					\
    ({									\
	struct bw_ioreq **bqpp;						\
	int found = 0;							\
									\
	DPRINTF("BW_Q_DELETE q 0x%x, req 0x%x via link 0x%x\n",		\
		(int) (bwq), (int) (bwreq), (int) &(bwreq)->link);	\
	for (bqpp = &(bwq)->bwq_head;					\
		    (*bqpp) && *bqpp != (bwreq);			\
		    bqpp = &(*bqpp)->link)				\
	    BW_CHECK_REQ(*bqpp);					\
									\
	if ((*bqpp) == (bwreq)) {					\
	    (*bqpp) = (bwreq)->link;					\
	    if ((bwq)->bwq_tail == &(bwreq)->link)			\
		(bwq)->bwq_tail = bqpp;					\
	    found = 1;							\
	}								\
	found == 1;							\
    })

/* Forward declarations for routines used before their definition. */
void bw_set_active_timer(void);
void bw_do_q_sched(struct bw_ioreq *bwreq);

/*
 * Queue bwreq on the active I/O list for 'slot' with cur_len bytes
 * to transfer.
 *
 * cur_len is subtracted from the request's bw_totallen and
 * bw_prealloc counters and recorded in bw_cur_iolen[] at index
 * BW_IOSLOT_TOSCHED(slot).  That entry must be zero on entry: a
 * non-zero value means the request is already queued for that index
 * and we panic.  The request is then appended to bw_iolist[slot],
 * the slot is marked in bw_next_active_mask and the active timer is
 * re-evaluated, all under bw_ioactive_lock.
 */
void
bw_do_q_active(struct bw_ioreq *bwreq, int slot, int cur_len)
{
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();
    BW_CHECK_REQ(bwreq);

    atomic_sub( cur_len, &bwreq->bw_totallen );
    atomic_sub( cur_len, &bwreq->bw_prealloc );

    ASSERT(BW_IOSLOT_TOSCHED(slot) < 4 && BW_IOSLOT_TOSCHED(slot) >= 0);

    if (bwreq->bw_cur_iolen[BW_IOSLOT_TOSCHED(slot)] != 0) {
#ifdef DEBUG
	DPRINTF("bw_do_q_active: req 0x%x, slot %d, SCHED %d, now 0x%x, prev armed 0x%x, prev slot %d\n",
		(int) bwreq, slot, BW_IOSLOT_TOSCHED(slot), (int) BW_CUR_TIME,
		(int) bwreq->bw_io_set[BW_IOSLOT_TOSCHED(slot)],
		bwreq->bw_io_slot[BW_IOSLOT_TOSCHED(slot)]);
#endif
	panic("bw_do_q_active: re-queue\n");
    }

    bwreq->bw_cur_iolen[BW_IOSLOT_TOSCHED(slot)] = cur_len;

#ifdef DEBUG
    /* Remember when and where we queued, for the re-queue report above. */
    bwreq->bw_io_set[BW_IOSLOT_TOSCHED(slot)] = BW_CUR_TIME;
    bwreq->bw_io_slot[BW_IOSLOT_TOSCHED(slot)] = slot;
#endif

    BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

    /*
     * If there are no Q's active, bw_current_count could be invalid,
     * which breaks many calculations.
     */
    if ( ! bw_next_active_mask) {
	DPRINTF("1 t was %d, now %d\n", BW_TIME_TO_IOIND(bw_current_count),
			BW_TIME_TO_IOIND(BW_CUR_TIME));
	bw_current_count = BW_CUR_TIME;
    }

    BW_Q_ENQ(&bw_iolist[slot], bwreq, bw_ioactive[BW_IOSLOT_TOSCHED(slot)]);
    DPRINTF("bw_do_q_active: 0x%x poll %d in slot %d, curslot %d, curtime %d\n",
		(int) bwreq, (int) bwreq->bw_iopoll, slot,
		BW_TIME_TO_IOIND(bw_current_count),
		BW_TIME_TO_IOIND(BW_CUR_TIME));

    /*
     * Unsigned constant: shifting a signed 1 into the top bit
     * (the highest slot) is undefined behaviour.
     */
    bw_next_active_mask |= 1U << slot;
    DPRINTF("0x%x 1\n", bw_next_active_mask);

    bw_set_active_timer();
    ASSERT(bw_verify_active(0));
    BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
}

/*
 * Adjust the I/O timer to fire at the next non-empty bucket.
 * We always track from bw_current_count, not BW_CUR_TIME so
 * we don't have to worry about canceling a timer event that
 * is behind schedule
 *
 * With work pending, the next non-empty slot at or after the current
 * slot is found (wrapping to the lowest non-empty slot if there is
 * none), and the timer is (re)armed if nothing is armed or the new
 * slot comes sooner than the armed one.  bw_current_count is then
 * pulled to the earlier of that event and real time.  With no work
 * pending, any armed timer is deleted and bw_current_count is reset
 * to real time.
 *
 * Every caller in this file holds bw_ioactive_lock.
 */
void
bw_set_active_timer(void)
{
    int cur_slot = BW_TIME_TO_IOIND(bw_current_count);
    int tmp_mask, next_slot;
    bwl_time_t timeout;
    int delta;

    BW_ENTER_FUNC();

    if (bw_next_active_mask) {
	DPRINTF("0x%x 2\n", bw_next_active_mask);
	/*
	 * Keep only the slots at or after cur_slot.  The all-ones
	 * value must be unsigned: left-shifting -1 is undefined.
	 */
	tmp_mask = bw_next_active_mask & (~0U << cur_slot);
	DPRINTF("0x%x 3\n", tmp_mask);
	ASSERT(bw_verify_active(0));

	if (tmp_mask)
	    next_slot = ffs(tmp_mask);
	else next_slot = ffs(bw_next_active_mask);

	/*
	 * ffs() numbers bits starting at 1 (0 means "no bit set",
	 * which cannot happen here because the mask is non-zero),
	 * so convert to a 0-based slot index.
	 */
	next_slot--;

	/*
	 * If we are going to run this queue, there
	 * better be something there.
	 */
	ASSERT(bw_iolist[next_slot].bwq_head);

	delta = BW_SLOTDELTA(cur_slot, next_slot);

	/*
	 * There are weird effects if we do slot based math on the delta
	 * and realtime, because realtime may have passed us by, so the
	 * sign cannot be used to detect wraps in the circular buffer.
	 *
	 * Instead, we convert the delta we know into a time, and use
	 * that to schedule the interrupt.
	 */
	timeout = BW_IOTIME_TO_JIFFY(delta) + bw_current_count;

	/*
	 * We want to see if the next timer event needs to be adjusted.
	 * This happens if there is no pending timer, or if the new event
	 * is sooner than the currently armed timer (bw_active_next_slot).
	 */
	if (bw_active_next_slot == -1 ||
		    delta < BW_SLOTDELTA(cur_slot, bw_active_next_slot)) {
	    bw_active_next_slot = next_slot;

	    ASSERT(bw_verify_active(0));

		/* this is superfluous because in UP it is a
		 * tautology and in MP it can not be guaranteed.
		 * besides the scheduler wont let things happen
		 * in the past
		 */
#if 0
	    ASSERT(timeout > BW_CUR_TIME
		? timeout == BW_TIMEMAX(timeout, BW_CUR_TIME)
		: BW_CUR_TIME == BW_TIMEMAX(timeout, BW_CUR_TIME));
#endif

	    /* Never arm the timer for a time that has already passed. */
	    mod_timer(&bw_active_timer, BW_TIMEMAX(timeout, BW_CUR_TIME));
	}
	ASSERT(bw_verify_active(0));

	/*
	 * Pull time to the min of the realtime and the next event.  This
	 * prevents the delta between real time and current time from
	 * getting so large that new events are scheduled in the past.
	 */
	DPRINTF("3 t was %d, now %d\n", BW_TIME_TO_IOIND(bw_current_count),
		    BW_TIME_TO_IOIND(BW_TIMEMIN(timeout, BW_CUR_TIME)));
	    /* this is superfluous because in UP it is a
	     * tautology and in MP it can not be guaranteed.
	     * besides the scheduler wont let things happen
	     * in the past
	     */

#if 0
	ASSERT(timeout < BW_CUR_TIME
	    ? timeout == BW_TIMEMIN(timeout, BW_CUR_TIME)
	    : BW_CUR_TIME == BW_TIMEMIN(timeout, BW_CUR_TIME));
#endif
	bw_current_count = BW_TIMEMIN(timeout, BW_CUR_TIME);
	ASSERT(bw_verify_active(0));
    }
    else {

	    /*
	     * No events are pending - clean up stray stuff
	     */
	ASSERT(bw_verify_active(0));

	if (bw_active_next_slot != -1)
	{
	    del_timer(&bw_active_timer);
	}
	bw_active_next_slot = -1;
	DPRINTF("4 t was %d, now %d\n", BW_TIME_TO_IOIND(bw_current_count),
		    BW_TIME_TO_IOIND(BW_CUR_TIME));
	bw_current_count = BW_CUR_TIME;
	ASSERT(bw_verify_active(0));

    }
    BW_EXIT_FUNC();
}

/*
 * do an I/O request from the front of the queue now
 * We do at most one request, but we will scan forward
 * up to the end_bias parameter.
 *
 * adj_timer:  when BW_IO_ADJUST_TIMER, the active timer is
 *             re-evaluated if this request was the last one in its
 *             slot.
 * end_bias:   how far past bw_current_count to look for a non-empty
 *             slot, stepping by BW_IO_RATE.
 * lock_flags: passed through untouched to the request's bw_iofunc.
 *
 * Returns 0 if no request was found or if the slot that was serviced
 * is now empty; non-zero if that slot still has requests queued.
 */
int
bw_do_some_active(int adj_timer, int end_bias, unsigned long * lock_flags)
{
    struct bw_ioreq *bwreq;
    int slot;
    int io_ret;
    int q_empty = 0;
    int num_io;
    int cur_len;
    int cur_bias = 0;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();
    BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    ASSERT(bw_verify_active(0));

next_slot:
    slot = BW_TIME_TO_IOIND(bw_current_count + cur_bias);

    ASSERT(BW_IOSLOT_TOSCHED(slot) < 4 && BW_IOSLOT_TOSCHED(slot) >= 0);

    BW_Q_DEQ(&bw_iolist[slot], bwreq, bw_ioactive[BW_IOSLOT_TOSCHED(slot)]);
    DPRINTF("bw_do_some_active: req 0x%x in slot %d\n", (int) bwreq, slot);

    if ( ! bwreq) {
	/* Nothing in this slot - look further ahead if allowed. */
	cur_bias += BW_IO_RATE;
	if (cur_bias < end_bias)
	    goto next_slot;

	/* 1U: a signed 1 shifted into the top bit is undefined. */
	ASSERT((bw_next_active_mask & (1U << slot)) == 0);
	ASSERT(bw_verify_active(0));
	BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
	BW_EXIT_FUNC();
	return 0;
    }

    spin_lock( &bwreq->bw_busylock );

    BW_CHECK_REQ(bwreq);

    q_empty = BW_Q_EMPTY(&bw_iolist[slot]);

    if (q_empty) {
	/* Slot drained: clear its bit so the timer skips it. */
	bw_next_active_mask &= ~(1U << slot);
	DPRINTF("0x%x 4\n", bw_next_active_mask);
	if (adj_timer == BW_IO_ADJUST_TIMER)
	    bw_set_active_timer();
    }

    /*
     * Claim the pending length and zero it while still under the
     * lock; bw_do_q_active() panics on a non-zero entry.
     */
    cur_len = bwreq->bw_cur_iolen[BW_IOSLOT_TOSCHED(slot)];
    bwreq->bw_cur_iolen[BW_IOSLOT_TOSCHED(slot)] = 0;

    BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

    bw_iov_front(bwreq, cur_len);

    io_ret = bwreq->bw_iofunc(bwreq, cur_len, bwreq->bw_ioparams, lock_flags);

    BW_CHECK_REQ(bwreq);
    ASSERT(bw_verify_active(1));

    /* A negative return is an error code, i.e. no bytes moved. */
    num_io = io_ret >= 0 ? io_ret : 0;
    bw_io_update(bwreq, cur_len, num_io);


    DPRINTF("bw_do_some_active: iofunc ret %d, bw_total_ioc %d\n",
		io_ret, bwreq->bw_total_ioc);

    /*
     * If we have finished the I/O, do the wakeup.
     *
     * XXXX after bw_io_complete() finishes, the bwreq will be
     *      invalid for polled I/O operations.
     *
     * NOTE(review): bwreq->bw_busylock is still released below after
     * bw_io_complete() - confirm that is safe for polled requests.
     */
    if ( ! bwreq->bw_total_ioc) {
	bw_io_complete(bwreq);
    }
    else if (io_ret < 0 && io_ret != -EWOULDBLOCK) {
	/*
	 * If we have already transferred data, we need to
	 * tell the user that number, then give him this
	 * error on the next call.  Gross.
	 *
	 * Note that we expect EWOULDBLOCK.
	 */
	DPRINTF("bw_do_some_active: error %d returned cur_ret %d\n",
		io_ret, bwreq->bw_error_ret);
	if (bwreq->bw_error_ret > 0)
	    bwreq->bw_bwi->bw_savederror = io_ret;
	else bwreq->bw_error_ret = io_ret;

	bw_cancel_io(bwreq);
	bw_io_complete(bwreq);
    }
    else {
	/*
	 * "Normal" case.  I/O completed, either fully or partially.
	 * We know there is I/O remaining, because bw_total_ioc != 0.
	 *
	 * Update the IOV based on the completed I/O
	 */
	bw_iov_pullup(bwreq, cur_len, num_io);

	/*
	 * The only question is whether we had a short I/O when we
	 * have been removed from the I/O sched Q, in which case
	 * we need to be put back on the Q.  We detect this by the
	 * fact that there is still I/O to schedule, but it is equal
	 * to our shortfall.
	 */
	if (atomic_read( &bwreq->bw_totallen ) &&
	    atomic_read( &bwreq->bw_totallen ) == cur_len - num_io) {
	    DPRINTF("bw_do_some_active: partial I/O on 0x%x - %d more bytes\n",
		    (int) bwreq, atomic_read( &bwreq->bw_totallen) );
	    bw_do_q_sched(bwreq);
	}
    }

    spin_unlock( &bwreq->bw_busylock );

    ASSERT(bw_verify_active(1));
    BW_EXIT_FUNC();
    return ! q_empty;
}


/*
 * The IO request bombed out - cancel any pending
 * IO timer events for this request.
 *
 * The request is removed from the schedule list (deleting the
 * schedule timer if that list ends up empty) and from every active
 * slot queue it is found on.  If the slot the active timer is armed
 * for becomes empty, the active timer is re-evaluated.
 */
void
bw_cancel_io(struct bw_ioreq *bwreq)
{
    int i, actslot;
    int new_intr = 0;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();
    BW_CHECK_REQ(bwreq);
    DPRINTF("bw_cancel_io: req 0x%x\n", (int) bwreq);

    /*
     * Nuke from the iosched queue first so it won't burp
     * and enter another request on the active queue.
     */

    BW_LOG_LOCK(&bw_iosched_lock, BW_LOCK_FLAGS);
    /* Not being on the schedule list is fine - ignore the result. */
    (void) BW_Q_DELETE(&bw_sched_list, bwreq, bw_iosched);
    if (BW_Q_EMPTY(&bw_sched_list))
	del_timer(&bw_sched_timer);

    BW_LOG_UNLOCK(&bw_iosched_lock, BW_LOCK_FLAGS);


    BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

    /*
     * Now, scan the active queues to see if this request is
     * there.  A bit of a performance hog (32 lists), but
     * cancel operations should be infrequent.  If not, we
     * can add code to remember the slot and cancel directly.
     */
    for (i = 0; i < BW_NUMIO_CNT; i++) {
	actslot = BW_IOSLOT_TOSCHED(i);

	if (BW_Q_DELETE(&bw_iolist[i], bwreq, bw_ioactive[actslot])) {
	    DPRINTF("bw_cancel_io: slot %d\n", i);

	    if (BW_Q_EMPTY(&bw_iolist[i])) {
		/* 1U: a signed 1 shifted into the top bit is undefined. */
		bw_next_active_mask &= ~(1U << i);
		DPRINTF("0x%x 5\n", bw_next_active_mask);
		if (bw_active_next_slot == i) {
		    DPRINTF("bw_cancel_io: rearm timer\n");
		    new_intr = 1;
		}
	    }
	}
    }

    /* The armed slot went empty - pick the next one (or disarm). */
    if (new_intr)
	bw_set_active_timer();

    BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
}

/*
 * Consistency check of the active I/O queues.
 *
 * Verifies that every non-empty slot queue has its bit set in
 * bw_next_active_mask and that every request on every slot queue
 * carries BW_REQ_MAGIC.  The first problem found is reported with
 * printk.
 *
 * needlock: non-zero if bw_ioactive_lock must be taken here; pass 0
 *           when the caller already holds it.
 *
 * Returns 1 if the queues look sane, 0 otherwise.
 */
int
bw_verify_active(int needlock)
{
    struct bw_ioreq *bwreq;
    int i, actslot;
    int sane = 1;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();
    /*
     * scan the active queues to see if they are sane.
     */
    if (needlock)
	BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

    for (i = 0; i < BW_NUMIO_CNT; i++) {
	actslot = BW_IOSLOT_TOSCHED(i);

	bwreq = bw_iolist[i].bwq_head;
	/* 1U: a signed 1 shifted into the top bit is undefined. */
	if (bwreq && (bw_next_active_mask & (1U << i)) == 0) {
	    printk("bw_verify_active: slot %d non-empty, but non-active\n", i);
	    sane = 0;
	    goto out;
	}

	for ( ; bwreq; bwreq = bwreq->bw_ioactive[actslot])
	{
	    if (bwreq->bw_magic != BW_REQ_MAGIC) {
		printk("bw_verify_active: bad magic req 0x%x, slot %d\n",
			    (int) bwreq, i);
		sane = 0;
		goto out;
	    }
	}

    }

out:
    /* Single exit so the lock is dropped exactly once on every path. */
    if (needlock)
	BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
    return sane;
}
 

/*
 * Handler for bw_active_timer.
 *
 * Marks the timer as no longer armed, then services the active
 * queues one time step at a time: drain the slot at bw_current_count,
 * and if bw_current_count has not yet reached BW_CUR_TIME advance it
 * by BW_IO_RATE and go again.  Finally the timer is re-evaluated for
 * whatever work remains.  The argument is unused.
 */
void
bw_do_active(unsigned long tmp)
{
    int more;
    BW_LOCK_FLAGS_DECL
    NASTY_LOCK_DECL;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();

    bw_active_next_slot = -1;

    ASSERT(bw_verify_active(1));
    DPRINTF(" <<<<< ");

    /*
     * Work through every queued I/O between the last time we ran
     * and now.
     */
    for (;;) {
	int caught_up;

	/* Empty the current slot without touching the timer. */
	do {
	    more = bw_do_some_active(BW_IO_DONT_ADJUST_TIMER, 0,
			NASTY_LOCK_FLAGS_REF);
	} while (more != 0);

	/* bw_current_count is only examined and stepped under the lock */
	BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
	caught_up = (bw_current_count == BW_CUR_TIME);
	if ( ! caught_up)
	    bw_current_count += BW_IO_RATE;
	BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

	if (caught_up)
	    break;
    }
    ASSERT(bw_verify_active(1));

    /* Arm the timer for the next pending slot, if there is one. */
    BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    bw_set_active_timer();
    BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    DPRINTF(" >>>>> ");

    ASSERT(bw_verify_active(1));
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
}

/*
 * Straight forward queue of active requests.  We need to
 * scan every ioreq each timer tick since we don't know
 * what limit has forced us to wait.
 *
 * (The choice of having a bandwidth pool and waking up
 * the sleepers when the pool opened was considered and
 * rejected - we would still need to scan the bandwidth
 * limits to know who to wake, and we are likely to have
 * more limits than requests.)
 *
 * bw_do_q_sched() puts bwreq on bw_sched_list, which is kept in
 * ascending bw_lastalloc order.  A request that sorts at or after
 * the tail (or any request when the list is empty) is appended
 * directly; otherwise the list is walked to find the insertion
 * point.  The schedule timer is armed when the list goes from empty
 * to non-empty.  bwreq must not already be on the list.
 */

void
bw_do_q_sched(struct bw_ioreq *bwreq)
{
    /*
     * jiffies is an unsigned long; holding the expiry in an int
     * would truncate it on 64-bit and, once negative, make the
     * round-down below round the wrong way.
     */
    unsigned long time_tmp;
    int was_empty;
    struct bw_ioreq **bwreqpp;
    struct bw_ioreq *tbreq;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();
    BW_CHECK_REQ(bwreq);

    BW_LOG_LOCK(&bw_iosched_lock, BW_LOCK_FLAGS);

    /* Sanity: every queued request is valid and none of them is bwreq. */
    for (tbreq = bw_sched_list.bwq_head; tbreq; tbreq = tbreq->bw_iosched) {
	BW_CHECK_REQ(tbreq);
	ASSERT(tbreq != bwreq);
    }

    was_empty = BW_Q_EMPTY(&bw_sched_list);

#if LINUX_VERSION_CODE >= 0x20400
    if (was_empty && bw_sched_timer.list.prev)
	printk("bw_do_q_sched: empty q with armed timer\n");
#else
    if (was_empty && bw_sched_timer.prev)
	printk("bw_do_q_sched: empty q with armed timer\n");

#endif
    /*
     * XXX technically, we don't need to check BW_Q_EMPTY.
     * when we drain the Q, we set bw_sched_tailsize = 0;
     */
    if (was_empty || bwreq->bw_lastalloc >= bw_sched_tailsize) {
	BW_Q_ENQ(&bw_sched_list, bwreq, bw_iosched);
	bw_sched_tailsize = bwreq->bw_lastalloc;

	if (was_empty) {
	    /*
	     * Arm the timer, rounding down to ensure full window.
	     */
	    time_tmp = jiffies + BW_SCHEDTIME_TO_JIFFY(BW_IOSCHED_RATE);
	    time_tmp = (time_tmp / BW_IOSCHED_RATE) * BW_IOSCHED_RATE;

	    bw_sched_timer.expires = time_tmp;
	    add_timer(&bw_sched_timer);
	}
    }
    else {
	/*
	 * We lost on the two easy cases.  We need to
	 * scale the queue and insert where appropriate.
	 * Follow the bouncing pointer.....
	 */
	for (bwreqpp = &bw_sched_list.bwq_head; *bwreqpp;
		    bwreqpp = &(*bwreqpp)->bw_iosched) {
	    if (bwreq->bw_lastalloc <= (*bwreqpp)->bw_lastalloc) {
		/* Link in ahead of the first entry that is not smaller. */
		bwreq->bw_iosched = *bwreqpp;
		BW_CHECK_REQ(*bwreqpp);
		*bwreqpp = bwreq;
		goto unlock_out;
	    }
	}
	/*
	 * It seems like this can't happen - we didn't belong at the
	 * tail of the queue, but the search fell off the end.  We get
	 * here if the request that had the previous bw_sched_tailsize
	 * limit got cancelled.  Just do the ENQ, update tailsize, and
	 * get out (we know the queue is non-empty from previous test).
	 */
	BW_Q_ENQ(&bw_sched_list, bwreq, bw_iosched);
	bw_sched_tailsize = bwreq->bw_lastalloc;
    }

unlock_out:
    BW_LOG_UNLOCK(&bw_iosched_lock, BW_LOCK_FLAGS);
    BW_CHECK_REQ(bwreq);
    BW_EXIT_FUNC();
}

void
bw_do_sched(unsigned long tmp)
{
    struct bw_ioreq *bwreq, *old_list;
    int cur_len, slot;
    BW_LOCK_FLAGS_DECL
    NASTY_LOCK_DECL;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    DPRINTF(" [[[[[ ");
    ASSERT(bw_verify_active(1));
    BW_LOG_LOCK(&bw_iosched_lock, BW_LOCK_FLAGS);

#if LINUX_VERSION_CODE >= 0x20400
    if (bw_sched_timer.list.prev)
	printk("bw_do_sched: called with timer armed\n");
#else
    if (bw_sched_timer.prev)
	printk("bw_do_sched: called with timer armed\n");

#endif
    old_list = bw_sched_list.bwq_head;
    BW_Q_SET_EMPTY(&bw_sched_list);
    bw_sched_tailsize = 0;

    BW_LOG_UNLOCK(&bw_iosched_lock, BW_LOCK_FLAGS);

    while (old_list) {
	bwreq = old_list;
	old_list = bwreq->bw_iosched;
	spin_lock( &bwreq->bw_busylock ); 

	BW_CHECK_REQ(bwreq);

	slot = BW_IOSLOT_TOSCHED(
		    BW_TIME_TO_IOIND(BW_CUR_TIME + BW_IOSCHED_RATE));

	/*
	 * If there is nothing already active in the target time
	 * slot, calculate a new I/O request.  A request can happen
	 * if the scheduling processing crosses a clock tick boundary.
	 */
	if (bwreq->bw_cur_iolen[slot] == 0) {

	    cur_len = bw_io_check(bwreq, BW_IOSCHED_RATE, &slot);

	    if (slot != -1)
		bw_do_q_active(bwreq, slot, cur_len);
	}
	/*
	 * If we have more work to do, plop this back on the sched q.
	 */
	if (atomic_read(&bwreq->bw_totallen) )
	    bw_do_q_sched(bwreq);
	spin_unlock( &bwreq->bw_busylock ); 
    }
    ASSERT(bw_verify_active(1));
    DPRINTF(" ]]]]] ");
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
}

/*
 * One-time setup for this file: empty every active slot queue and
 * the schedule list, initialise both spinlocks, and initialise the
 * two timers with their handlers (bw_do_active and bw_do_sched).
 * Neither timer is armed here.
 */
void
bw_timers_init()
{
    int slot;

    BW_ENTER_FUNC();

    /* Queues start out empty. */
    for (slot = 0; slot < BW_NUMIO_CNT; slot++) {
	BW_Q_SET_EMPTY(&bw_iolist[slot]);
    }
    BW_Q_SET_EMPTY(&bw_sched_list);

    /* Locks. */
    spin_lock_init(&bw_ioactive_lock);
    spin_lock_init(&bw_iosched_lock);

    /* Active I/O timer. */
    init_timer(&bw_active_timer);
    bw_active_timer.function = bw_do_active;

    /* Scheduler timer. */
    init_timer(&bw_sched_timer);
    bw_sched_timer.function = bw_do_sched;

    BW_EXIT_FUNC();
}

/*
 * We are being unloaded - clear the universe
 *
 * XXX - this is broken because Linux doesn't let us return
 * an error from unload, and it is possible that we can block
 * on I/O with no way to clean up.
 *
 * lock_flags is handed through to bw_do_some_active().
 *
 * Returns once the schedule list is empty, or early if a signal is
 * pending after a sleep (work may then still be queued).
 */
void
bw_timer_drain(unsigned long *lock_flags)
{
    struct bw_ioreq *bwreq;
    struct bw_limit_hash *limit;
    int i;
#if LINUX_VERSION_CODE < 0x20400
    struct wait_queue *wait = NULL;
#else
    wait_queue_head_t wait;
#endif
    BW_LOCK_FLAGS_DECL

    /*
     * Initialise the wait queue head only after all declarations,
     * so no declaration follows a statement.
     */
#if LINUX_VERSION_CODE >= 0x20400
    init_waitqueue_head( &wait );
#endif

    BW_ENTER_FUNC();
    /*
     * 1) Cancel the timers
     * 2) drain the active I/O,
     * 3) Scan the io sched list delete all limits
     * 4) push the I/O sched loop
     * 5) drain the active queue again.
     *
     * If the list isn't empty (eg. a protocol flow controls us)
     * sleep a while and try again.  If we get an interrupt, let
     * the user try the modunload again later.
     */
try_again:
    BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
    if (bw_active_next_slot != -1)
	del_timer(&bw_active_timer);
    BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

    BW_LOG_LOCK(&bw_iosched_lock, BW_LOCK_FLAGS);
    del_timer(&bw_sched_timer);
    BW_LOG_UNLOCK(&bw_iosched_lock, BW_LOCK_FLAGS);

    /*
     * Do all active I/O.  _DO_NOT_ rearm timer.
     */
    while (bw_do_some_active(BW_IO_DONT_ADJUST_TIMER, BW_NUMIO_CNT, lock_flags) != 0)
	;

    /*
     * Nuke limits
     */
    bwreq = bw_sched_list.bwq_head;
    if (bwreq) {
	for ( ; bwreq; bwreq = bwreq->bw_iosched) {
	    for (i = 0; i < N_BW_TYPES; i++) {
		/* Drop this request's reference on each write limit. */
		limit = bwreq->bw_bwi->bw_io[BW_WRIO].bw_limit[i];
		if (limit) {
		    limit->bwh_active_ios--;
		    bwreq->bw_bwi->bw_io[BW_WRIO].bw_limit[i] = NULL;
		}

#ifdef BW_TRACK_READ
		/* ... and on each read limit when reads are tracked. */
		limit = bwreq->bw_bwi->bw_io[BW_RDIO].bw_limit[i];

		if (limit) {
		    limit->bwh_active_ios--;
		    bwreq->bw_bwi->bw_io[BW_RDIO].bw_limit[i] = NULL;
		}
#endif
	    }
	}

	/*
	 * process I/O q.  The timer argument is unused by the handler.
	 */
	bw_do_sched(42);

	/* The scheduling pass may have re-armed the active timer. */
	BW_LOG_LOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);
	if (bw_active_next_slot != -1)
	    del_timer(&bw_active_timer);
	BW_LOG_UNLOCK(&bw_ioactive_lock, BW_LOCK_FLAGS);

	/*
	 * Do all active I/O.  _DO_NOT_ rearm timer.
	 */
	while (bw_do_some_active(BW_IO_DONT_ADJUST_TIMER, BW_NUMIO_CNT, lock_flags) != 0)
	    ;

	if (bw_sched_list.bwq_head) {
	    /*
	     * Oh no - still have work to do before we can be unloaded
	     * sleep a while
	     */
	    interruptible_sleep_on_timeout(&wait, 10);
	    if (signal_pending(current)) {
		BW_EXIT_FUNC();
		return;
	    }
	    goto try_again;
	}
    }
    BW_EXIT_FUNC();
}
// LICENSE:
// This software is subject to the terms of the GNU GENERAL 
// PUBLIC LICENSE Version 2, June 1991
