/*
 * bw_abstract.c : pseudo abstraction layer between the bw_sock.c
 * interface and the bandwidth management subsystem.
 *
 * Copyright (C) 1999-2001, Sun Microsystems, Inc.
 * All rights reserved.
 *
 * The routines here are registered with bw_sock using the
 * bw_iface_funcs structure defined at the bottom of this file.
 * These routines are called before, after, or inplace of the
 * main protocol code.
 *
 * This should be cleaned up somewhat.
 */

#ifdef MODULE
#include <linux/module.h>
#include <linux/modversions.h>
#endif 

#include <bw_mgmt.h>


/*
 * Debug print helper: expands to a printk() call behind a
 * constant-false test, so the call is compiled but never executed.
 * Because the expansion is a bare `if`, an `else` written directly
 * after a DPRINTF(...) statement binds to this hidden `if`.
 */
#define DPRINTF if (0) printk

/* Set to 1 at the end of bw_init_all(); cleared by cleanup_module(). */
static int bw_init = 0;
/* Module parameter; init_module() takes a use count when this is 0. */
int bw_allow_unload = 0;
/*
 * Number of live bw_info structures: incremented in
 * bw_do_newconnection(), decremented in bw_do_delconnection().
 */
atomic_t bw_numconnections;

#ifdef MODULE
MODULE_AUTHOR("Sun Cobalt <support@cobalt.com>");
 
#ifdef CONFIG_SMP
#define COBALT_DESC "Sun Cobalt Bandwidth Limiting Module (SMP)"
#else
#define COBALT_DESC "Sun Cobalt Bandwidth Limiting Module"
#endif

MODULE_DESCRIPTION(COBALT_DESC);
MODULE_PARM(bw_allow_unload, "i");

/*
 * Module entry point.  Runs the common initialization, prints the
 * banner and, in non-DEBUG builds, takes a module use count unless
 * the bw_allow_unload parameter is non-zero.  Always returns 0.
 */
int
init_module(void)
{
    extern void bw_init_all(void);

    bw_init_all();

    printk("%s\n", COBALT_DESC);
    printk("Copyright (c) 1999-2001, Sun Microsystems, Inc.\n");
    printk("All rights reserved\n");

#ifndef DEBUG
    /* bw_allow_unload == 0: hold a reference on ourselves. */
    if ( ! bw_allow_unload) {
	MOD_INC_USE_COUNT;
    }
#endif

    return 0;
}

/*
 * Module exit point.
 *
 * XXX cleanup_module is broken because we can error out of
 * bw_timer_drain(), but Linux doesn't let cleanup() fail.
 *
 * XXX There is also a nasty race condition when you uninstall and
 * then install a new module: the bw_info structs will still be
 * initialized on open sockets and will suddenly start misbehaving
 * badly.
 *
 * For testing purposes, unload works.
 *
 * The right solution seems to involve can_unload_module(),
 * but that is a new and unknown invention.
 */
void
cleanup_module(void)
{
    NASTY_LOCK_DECL;
    /*
     * Unregister from the socket layer to eliminate future
     * requests, then wait for all pending I/O requests
     * to finish (schedq == 0), then scan the socket list
     * and release the universe.
     */
    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();

    bw_sock_unregister_mgmt();
    /* The lock flags are passed by reference to the drain routine. */
    bw_timer_drain( NASTY_LOCK_FLAGS_REF );
    bw_proc_disable();

    /* Allow a later bw_init_all() to run the setup again. */
    bw_init = 0;

    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
}
#endif /* MODULE */

/*
 * bw_limitlock is taken (via BW_LOG_LOCK) in this file around the
 * bw_new_ent()/bw_delete_ent() calls and the BW_POLLPEND flag update.
 */
spinlock_t bw_limitlock;
/* Initialized in bw_init_all(); not otherwise used in this file. */
spinlock_t bw_cntlock;

/* Defined at the bottom of this file; registered in bw_init_all(). */
extern struct bw_iface_funcs bw_iface_funcs;

#ifdef HACKTASTIC
/*
 * Debug hook installed into hacktastic_func by bw_init_all() when
 * HACKTASTIC is defined: prints a marker and dumps the trace log.
 */
static void bw_hacktastic_func( void )
{
    printk("bw sched horked \n" );
    BW_DUMP_TRACE_LOG();
}
#endif

/*
 * bw_init_all performs the one-time setup of the bandwidth manager.
 * init_module() calls it in a modular build; otherwise bw_sock calls
 * it.  Once bw_init has been set, further calls return immediately.
 */
void bw_init_all(void)
{
    extern void bw_proc_init(void);
    extern void bw_timers_init(void);

    if (bw_init != 0)
	return;

#ifdef BIGNASTYLOCK
    spin_lock_init(&bw_bignastylock);
#endif

#if defined( DEBUG ) && BW_TRACE_LEVEL>0
    memset( &bw_trace_log, 0x0, sizeof( bw_trace_log ) );
#endif

#ifdef HACKTASTIC
    hacktastic_func = bw_hacktastic_func;
#endif

    /* Locks and counters first, then the subsystems that use them. */
    spin_lock_init(&bw_limitlock);
    spin_lock_init(&bw_cntlock);
    atomic_set( &bw_numconnections, 0 );

    bw_proc_init();
    bw_timers_init();

    /* Hand our entry points to the socket layer. */
    bw_sock_register_mgmt(&bw_iface_funcs);

    bw_init = 1;

    BW_INSERT_TRACE( "bw_init_all finnished",1 );
    BW_DUMP_TRACE_LOG();
}

void bw_do_delconnection(struct socket *sock);

int
bw_do_newconnection(struct socket *sock, int protocol, int family)
{
    struct bw_info *bwi;

    BW_ENTER_FUNC();

    DPRINTF("bw_newconnection: sock 0x%x, obwi 0x%x\n", (int) sock,
		(int) sock->bw_mgmt);

    /*
     * When an accept() happens, we already have a bw_info for
     * this socket.  Need to clean up the mess.
     */
    if (sock->bw_mgmt)
	bw_do_delconnection(sock);

    bwi = (struct bw_info *) kmalloc(sizeof(*bwi), BW_GFP );

    if ( ! bwi)
    {
	BW_EXIT_FUNC();
	return -ENOMEM;
    }

    atomic_inc( &bw_numconnections );

    memset(bwi, 0, sizeof(*bwi));

    atomic_set( &bwi->bw_io[0].bw_prealloc, 0 );
    atomic_set( &bwi->bw_io[1].bw_prealloc, 0 );

    bwi->bw_sock = sock;
    bwi->bw_waitqp = &bwi->bw_waitq;
#if LINUX_VERSION_CODE < 0x20400
    init_waitqueue( bwi->bw_waitqp );
#else   
    init_waitqueue_head(bwi->bw_waitqp);
#endif
    bwi->bw_proto = protocol;
    bwi->bw_family = family;

    if (sock->type == SOCK_DGRAM || sock->type == SOCK_SEQPACKET)
	bwi->bw_flags |= BW_MSGONLY;

#ifdef DEBUG
    bwi->magic = BW_INFO_MAGIC;
#endif

    sock->bw_mgmt = bwi;
    BW_EXIT_FUNC();
    return 0;
}
/*
 * Locked wrapper around bw_do_newconnection(); this is the entry
 * registered in bw_iface_funcs.  Returns whatever the worker returns.
 */
int
bw_newconnection(struct socket *sock, int protocol, int family)
{
    int status;
    NASTY_LOCK_DECL;

    NASTY_LOCK(NASTY_LOCK_FLAGS);
    BW_ENTER_FUNC();

    status = bw_do_newconnection( sock, protocol, family );

    BW_EXIT_FUNC();
    NASTY_UNLOCK(NASTY_LOCK_FLAGS);

    return status;
}
/*
 * Copy the limit identity of `bwi` into the freshly created `nbwi`.
 *
 * A plain structure assignment is not possible because of the linked
 * lists and reference counts involved, so only the type ids and the
 * duplicable flags are copied; bw_new_ent() is then called for every
 * valid type so the lower layers can do the rest.
 */
void bw_copy_bw_info( struct bw_info *nbwi, struct bw_info *bwi )
{
    enum bw_types type;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /* Flags listed in BW_NODUP_FLAGS are not inherited. */
    nbwi->bw_flags = bwi->bw_flags & ~BW_NODUP_FLAGS;
    memcpy(nbwi->bw_type_id, bwi->bw_type_id, sizeof(bwi->bw_type_id));

    type = 0;
    while (type < N_BW_TYPES) {
	if (bw_type_valid(nbwi, type)) {
	    bw_new_ent(nbwi, type, BW_WRIO);
#ifdef BW_TRACK_READ
	    bw_new_ent(nbwi, type, BW_RDIO);
#endif
	}
	type++;
    }

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);
    BW_EXIT_FUNC();
}

#if LINUX_VERSION_CODE >= 0x20400
/* this is needed in 2.4 because there is no dup op. */
int bw_pre_accept(struct socket *sock, struct socket *newsock,   
		  int flags)
{
    struct bw_info *bwi, *nbwi;
    int ret;
    NASTY_LOCK_DECL;


    if ( ! (bwi = sock->bw_mgmt))
	return 0;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();

#ifdef DEBUG
    if ( ! BW_INFO_MAGIC_VALID( bwi ) )
    {
	printk( "ack!!!\n" );
	BW_EXIT_FUNC();
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	return 0;
    }
/*    else
    {
	printk (":)\n" );
	} */
#endif
    ret = bw_do_newconnection( newsock, bwi->bw_proto, bwi->bw_family );
    if( ret < 0 )
    {
	BW_EXIT_FUNC();
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	return ret;
    }
    nbwi = newsock->bw_mgmt;

    bw_copy_bw_info(nbwi, bwi );
    
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    return 0;
}
#endif

int
bw_dupconnection(struct socket *newsock, struct socket *sock)
{
    struct bw_info *bwi, *nbwi;
    int ret;
    NASTY_LOCK_DECL;
    
    if ( ! (bwi = sock->bw_mgmt))
	return 0;
    
    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    ret = bw_do_newconnection(newsock, bwi->bw_proto, bwi->bw_family);
    if (ret < 0)
    {
	BW_EXIT_FUNC();
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	return ret;
    }

    nbwi = newsock->bw_mgmt;

    bw_copy_bw_info(nbwi, bwi );

    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    
    return 0;
}

/*
 * Detach and free the bw_info attached to `sock`, if any.  A pending
 * poll request is cancelled first, then any preallocation is handed
 * to bw_io_short_io() and every valid limit entry is deleted.  The
 * caller is expected to hold the NASTY lock (see bw_delconnection()).
 */
void
bw_do_delconnection(struct socket *sock)
{
    struct bw_info *bwi;
    enum bw_iotype rw;
    enum bw_types type;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();

    DPRINTF("bw_delconnection: sock 0x%x, bwi 0x%x\n", (int) sock,
		(int) sock->bw_mgmt);

    bwi = sock->bw_mgmt;
    if (bwi == NULL) {
	BW_EXIT_FUNC();
	return;
    }

    /* Cancel an outstanding poll request without waking anyone. */
    if ((bwi->bw_flags & BW_POLLPEND) != 0) {
	struct bw_ioreq *pending = bwi->bw_poll_req;

	DPRINTF("bw_delbwi: canceling poll req 0x%x\n", (int) pending);

	pending->bw_sleep_qpp = NULL;
	bw_cancel_io(pending);
	bw_io_complete(pending);
    }

    /* Taken only here, because bw_io_complete does its own locking. */
    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

#ifdef BW_TRACK_READ
    for (rw = 0; rw < 2; rw++)
#else
    rw = BW_WRIO;
#endif
    {
	if (atomic_read( &bwi->bw_io[rw].bw_prealloc) != 0)
	    bw_io_short_io(bwi, rw, atomic_read(&bwi->bw_io[rw].bw_prealloc) );

	type = 0;
	while (type < N_BW_TYPES) {
	    if (bw_type_valid(bwi, type))
		bw_delete_ent(bwi, type, rw);
	    type++;
	}
    }

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    kfree(bwi);
    sock->bw_mgmt = NULL;
    atomic_dec( &bw_numconnections );

    BW_EXIT_FUNC();
}

/*
 * Locked wrapper around bw_do_delconnection(); this is the entry
 * registered in bw_iface_funcs.
 */
void bw_delconnection( struct socket *sock )
{
    NASTY_LOCK_DECL;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    bw_do_delconnection( sock );
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
}

/*
 * bw_setaddr is called from bw_sock when we know the identity
 * of a particular connection.  This happens after an accept(),
 * or a connect.
 *
 * Does nothing when the socket has no bw_info.  Only AF_INET is
 * handled: the local address and port are passed to bw_setip().
 */
void
bw_setaddr(struct socket *sock)
{
    struct bw_info *bwi;
    NASTY_LOCK_DECL;

    if ( ! (bwi = sock->bw_mgmt))
	return;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    switch (bwi->bw_family) {
    case AF_INET:
	/*
	 * NOTE(review): htonl()/htons() are applied to fields read
	 * from the sock; confirm the byte order bw_setip() expects.
	 */
	bw_setip(bwi, bwi->bw_proto, htonl(sock->sk->rcv_saddr),
			htons(sock->sk->sport));
	break;
    default:
	/*
	 * Other families are ignored.  The explicit break is
	 * required: a label must be followed by a statement.
	 */
	break;
    }
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    return;
}

#if LINUX_VERSION_CODE < 0x20200 
/*
 * Poll/select - select is pre-2.2, but the ideas are the same.
 * If the protocol layer is limiting the connection, we let that
 * stand.  If we have a lower limit, we try to enforce it.
 *
 * sock      - socket being selected on
 * sel_type  - SEL_IN, SEL_OUT or SEL_EX
 * wait      - select table (not used here)
 * proto_ret - result already produced by the protocol's select
 *
 * Returns proto_ret unchanged when the socket is not managed, when
 * sel_type is not SEL_IN/SEL_OUT, or when the protocol already said
 * "not ready".  Otherwise:
 *   - message-only sockets: 0 if bw_check_io() grants less than the
 *     queued byte count, the ioctl error if the queue-size query
 *     fails, proto_ret if the whole message fits;
 *   - other sockets: the result of bw_check_io() for one byte.
 *
 * NOTE(review): the original body did not compile (`ret` was declared
 * in an inner scope), released the NASTY lock without taking it, and
 * had two unreachable `else if` branches that read like the stream
 * case.  This version is a reconstruction of the apparent intent;
 * confirm against a pre-2.2 build before relying on it.
 */
int
bw_post_select(struct socket *sock, int sel_type, select_table *wait, int proto_ret)
{
    struct bw_info *bwi = BW_SOCK_MGMT(sock);
    enum bw_iotype rw;
    unsigned int qcmd;
    unsigned long tmp = 0;
    int ret = proto_ret;
    int nio;
    NASTY_LOCK_DECL;

    if (sel_type == SEL_EX || ! bwi)
	return proto_ret;

    if (sel_type == SEL_OUT) {
	rw = BW_WRIO;
	qcmd = TIOCOUTQ;
    }
    else if (sel_type == SEL_IN) {
	rw = BW_RDIO;
	qcmd = TIOCINQ;
    }
    else
	return proto_ret;

    /* The protocol is already limiting the connection: let it stand. */
    if (proto_ret <= 0)
	return proto_ret;

    NASTY_LOCK( NASTY_LOCK_FLAGS );

    if (bwi->bw_flags & BW_MSGONLY) {
	int qret;

	/*
	 * Ask the protocol how many bytes are queued.  The lock is
	 * dropped around the call into the protocol layer.
	 * NOTE(review): &tmp is a kernel address handed to an ioctl
	 * handler; confirm the handler accepts that on this kernel.
	 */
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	qret = sock->real_ops->ioctl(sock, qcmd, (unsigned long) &tmp);
	NASTY_LOCK( NASTY_LOCK_FLAGS );

	if (qret < 0)
	    ret = qret;
	else {
	    /* A message is all-or-nothing: it must fit entirely. */
	    nio = bw_check_io(bwi, rw, tmp);
	    if ((unsigned long) nio < tmp)
		ret = 0;
	}
    }
    else {
	/* Stream: ready as soon as a single byte may move. */
	ret = bw_check_io(bwi, rw, 1);
    }

    NASTY_UNLOCK( NASTY_LOCK_FLAGS );

    DPRINTF("bw_post_select returning %d\n", ret);
    return ret;
}
#else /* LINUX_VERSION_CODE >= 0x20200 */

/*
 * Backing store for one pending poll request.  bw_make_poll_io()
 * allocates this structure with `len` extra bytes after it and points
 * iov.iov_base at those trailing bytes.
 */
struct bw_poll {
    struct bw_ioreq bwio;	/* the queued request; bw_iopoll points back here */
    struct msghdr m;		/* message header passed to bw_init_ioreq() */
    struct iovec iov;		/* the single iovec referenced by m.msg_iov */
};

/*
 * I/O callback queued by bw_make_poll_io() for poll requests.  No data
 * is moved: the granted `len` is added to the request's bw_poll_len
 * and reported back as fully consumed.  `p` and `lock_flags` are
 * unused.
 */
int
bw_do_poll_ok(struct bw_ioreq *bwio, int len, void *p, unsigned long *lock_flags)
{
    BW_ENTER_FUNC();

    DPRINTF("bw_do_poll_ok: adding %d bytes to 0x%x\n", len, (int) bwio);
    bwio->bw_poll_len = bwio->bw_poll_len + len;

    BW_EXIT_FUNC();
    return len;
}


#define BW_NOWAIT 0
#define BW_DOWAIT 1

/*
 * Queue an asynchronous "poll" request for `len` bytes of bandwidth
 * in direction `rw` on behalf of `bwi`.
 *
 * At most one poll request exists per bw_info; BW_POLLPEND marks it,
 * so the call is a no-op while one is pending.  It is also a no-op
 * when `len` is negative or when the allocation fails.
 *
 * wait - BW_DOWAIT to have completion wake bwi->bw_waitqp,
 *        BW_NOWAIT for a request nobody sleeps on.
 */
void
bw_make_poll_io(struct bw_info *bwi, int len, enum bw_iotype rw, int wait)
{
    struct bw_poll *bwp;
    BW_LOCK_FLAGS_DECL

    BW_ENTER_FUNC();

    /*
     * A negative length would make the allocation below smaller than
     * struct bw_poll, and the memset() would then write past it.
     */
    if (len < 0 || (bwi->bw_flags & BW_POLLPEND))
    {
	BW_EXIT_FUNC();
	return;
    }

    /* One allocation: the bw_poll header followed by `len` data bytes. */
    bwp = (struct bw_poll *) kmalloc(sizeof(*bwp) + len, BW_GFP );

    if ( ! bwp)
    {
	BW_EXIT_FUNC();
	return;
    }

    BW_LOG_LOCK(&bw_limitlock, BW_LOCK_FLAGS);

    /*
     * The allocation may have slept (depending on BW_GFP), so
     * we recheck just in case
     */
    if (bwi->bw_flags & BW_POLLPEND) {
	BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);
	kfree(bwp);
	BW_EXIT_FUNC();
	return;
    }

    bwi->bw_flags |= BW_POLLPEND;

    BW_LOG_UNLOCK(&bw_limitlock, BW_LOCK_FLAGS);

    DPRINTF("bw_make_poll_io: request 0x%xx looking for %d bytes\n",
	    (int) &bwp->bwio, len);

    memset(bwp, 0, sizeof(*bwp));
    bwp->m.msg_iov = &bwp->iov;
    bwp->m.msg_iovlen = 1;
    bwp->iov.iov_base = (char *) (bwp + 1);	/* bytes right after the header */
    bwp->iov.iov_len = len;

    bw_init_ioreq(&bwp->bwio, bwi, rw, &bwp->m, len);

    /*
     * bw_init_ioreq clears bwio, so do this here
     */
    bwp->bwio.bw_iopoll = bwp;
    bwi->bw_poll_req = &bwp->bwio;

    if (wait == BW_DOWAIT)
	bwp->bwio.bw_sleep_qpp = bwi->bw_waitqp;
    else DPRINTF("bw_make_poll_io: async req 0x%x for %d bytes\n",
		(int) &bwp->bwio, len);


    bw_io_active(&bwp->bwio);
    bw_q_io(&bwp->bwio, bw_do_poll_ok, bwp);
    BW_EXIT_FUNC();
}
/*
 * bw_post_poll - fold bandwidth limits into the protocol's poll mask
 * (2.2+ counterpart of the select hook; same idea: a protocol-side
 * "not ready" stands, and a lower bandwidth limit is enforced on top).
 *
 * file, wait - passed straight to poll_wait() on the bw_info queue
 * sock       - socket being polled
 * prot_ret   - event mask already computed by the protocol
 *
 * Returns prot_ret with:
 *   - POLLERR added when a saved error is waiting on the bw_info;
 *   - the write bits cleared when no write bandwidth is available;
 *   - (BW_TRACK_READ only) the read bits and POLLHUP cleared when
 *     read bandwidth is insufficient.
 * Whenever bits are cleared a poll request is queued with
 * bw_make_poll_io() so that the waiter gets woken later.
 */
unsigned int
bw_post_poll(struct file *file, struct socket *sock,
		struct poll_table_struct *wait, unsigned int prot_ret)
{
    struct bw_info *bwi = BW_SOCK_MGMT(sock);
    struct bw_ioreq bwio;
    int nio;
    int nwrite = 10;
    NASTY_LOCK_DECL;
#ifdef BW_TRACK_READ
    int nread = 1;
#endif

    if ( ! bwi)
	return prot_ret;


    poll_wait(file, bwi->bw_waitqp, wait);

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    /*
     * Check to see if we have bandwidth available.  If not, we
     * need to wake ourselves up later.
     */

    /*
     * XXX - if POLLMSG ever is implemented, and is set in the
     * return from the protocol layer, we may need to fold it
     * into the datagram case below.
     */
    if (bwi->bw_savederror) {
	DPRINTF("bw_post_poll setting error\n");
	prot_ret |= POLLERR;
    }

    if (prot_ret & (POLLOUT|POLLWRNORM|POLLWRBAND)) {
	DPRINTF("bw_post_poll: ioreq 0x%x\n", (int) &bwio);
	bw_init_ioreq(&bwio, bwi, BW_WRIO, NULL, nwrite);
	nio = bw_io_check(&bwio, 0, NULL);
	bw_delete_ioreq(&bwio, bwi);
	if ( ! nio) {
	    prot_ret &= ~(POLLOUT|POLLWRNORM|POLLWRBAND);
	    bw_make_poll_io(bwi, nwrite, BW_WRIO, BW_DOWAIT);
	}
    }

    if ((prot_ret & (POLLIN|POLLPRI|POLLHUP|POLLRDNORM|POLLRDBAND)) == 0)
	goto out;

#ifdef BW_TRACK_READ
    /*
     * For a datagram, we look to see how much there is to read
     * and only pass success when the user could read that much.
     * The lock is dropped around the call into the protocol layer.
     */
    if (bwi->bw_flags & BW_MSGONLY) {
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	/* The ioctl argument is an unsigned long; do not truncate to int. */
	(void) sock->real_ops->ioctl(sock, TIOCINQ, (unsigned long) &nread);
	NASTY_LOCK( NASTY_LOCK_FLAGS );
    }

    /* This is the read-side check, so account against BW_RDIO. */
    bw_init_ioreq(&bwio, bwi, BW_RDIO, NULL, nread);
    nio = bw_io_check(&bwio, 0, NULL);
    bw_delete_ioreq(&bwio, bwi);

    if (nio < nread) {
	/*
	 * XXX - We clear POLLHUP if there is data to deliver and the
	 * limit prevents us from delivering it.  This should keep an
	 * application from processing the close before the final data.
	 */
	prot_ret &= ~(POLLIN|POLLPRI|POLLHUP|POLLRDNORM|POLLRDBAND);

	/*
	 * bw_make_poll_io() refuses to queue a second request and
	 * sets BW_POLLPEND itself once a request really exists.
	 * Setting the flag here would mark a request as pending even
	 * when the allocation failed, leaving bw_poll_req stale.
	 */
	bw_make_poll_io(bwi, nread, BW_RDIO, BW_DOWAIT);
    }

#endif

out:
    DPRINTF("bw_sock_poll returning %d, wait was 0x%x\n", prot_ret, (int) wait);
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    return prot_ret;
}
#endif

/*
 * ioctl checks for requests that we process, and dealt with them,
 * and also fixes up ioctls that ask about bandwidth.
 */
int
bw_post_ioctl(struct socket *sock, unsigned int cmd,
		unsigned long arg, int proto_ret)
{
    struct bw_info *bwi = BW_SOCK_MGMT(sock);
    struct bwlim_params bwp;
    struct bw_ioreq bwio;
    int bw_ret;
    unsigned long tmp;
    int ret;
    NASTY_LOCK_DECL;

    if ( ! bwi)
	return proto_ret;

    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();
    switch (cmd) {
    case BW_IOC_SETPARAM: {

#ifndef USER_TEST
	if( !capable( CAP_SYS_ADMIN ) )
	{
	    BW_EXIT_FUNC();
	    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	    return -EPERM;
	}
#endif

	if (copy_from_user((void *) &bwp, (void *) arg, sizeof(bwp)))
	{
	    BW_EXIT_FUNC();
	    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	    return -EINVAL;
	}
	
	ret = bw_new_limit(&bwp);
	BW_EXIT_FUNC();
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	return ret;
    }

    case BW_IOC_DELETE: {
#ifndef USER_TEST
	if( !capable( CAP_SYS_ADMIN ) )
	{
	    BW_EXIT_FUNC();
	    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	    return -EPERM;
	}
#endif

	if (copy_from_user((void *) &bwp, (void *) arg, sizeof(bwp)))
	{
	    BW_EXIT_FUNC();
	    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	    return -EINVAL;
	}

	ret = bw_delete_limit(&bwp);
	BW_EXIT_FUNC();
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	return ret;
    }

    case TIOCOUTQ:
	if (proto_ret < 0)
	    break;
	copy_from_user((void *) &tmp, (void *) arg, sizeof(tmp));
	DPRINTF("bw_post_ioctl: ioreq 0x%x\n", (int) &bwio);
	bw_init_ioreq(&bwio, bwi, BW_WRIO, NULL, tmp);
	bw_ret = bw_io_check(&bwio, 0, NULL);
	bw_delete_ioreq(&bwio, bwi);

	if (bw_ret < tmp) {
	    /*
	     * If there is data to read, hang an async request
	     * to be bandwidth.  This is even stranger than poll
	     * because there isn't even someone to wakeup.
	     */
	    DPRINTF("bw_post_ioctl: %s did TIOCOUTQ, avail %d, limit: %d\n",
		    current->comm, (int) tmp, bw_ret);

	    if (bwi->bw_flags & BW_MSGONLY) {
		bw_make_poll_io(bwi, tmp - bw_ret, BW_WRIO, BW_NOWAIT);
		tmp = 0;
	    }
	    else {
		bw_make_poll_io(bwi, min(tmp - bw_ret, BW_MIN_IOSIZE),
			BW_WRIO, BW_NOWAIT);

		tmp = bw_ret;
	    }
	    copy_to_user((void *) arg, (void *) &tmp, sizeof(tmp));
	}
	break;

#ifdef BW_TRACK_READ
    case TIOCINQ:
	if (proto_ret < 0)
	    break;
	copy_from_user((void *) &tmp, (void *) arg, sizeof(tmp));
	bw_init_ioreq(&bwio, bwi, BW_RDIO, NULL, tmp);
	bw_ret = bw_io_check(&bwio, 0, NULL);
	bw_delete_ioreq(&bwio, bwi);

	if (bw_ret < tmp) {
	    bw_make_poll_io(bwi, tmp, BW_RDIO, BW_NOWAIT);
	    tmp = (bwi->bw_flags & BW_MSGONLY) ? 0 : bw_ret;
	    copy_to_user((void *) arg, (void *) &tmp, sizeof(tmp));
	}
	break;
#endif
    }

    DPRINTF("bw_post_ioctl returning %d\n", proto_ret);
    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    return proto_ret;
}

/*
 * Version glue for the extra sendmsg parameters.
 *
 *   BW_SEND_DECL      declares the version-specific extras (used both
 *                     as locals and as bw_sock_sendparams members)
 *   bw_sendflags      where the send flags live
 *   BW_SET_SNDPARAMS  copies the extras from locals into a params struct
 *   BW_GET_SNDPARAMS  copies them back out into locals
 *   BW_SR_NONBLOCK    non-zero if the request must not block
 */
#if LINUX_VERSION_CODE >= 0x20200

#define BW_SEND_DECL    struct scm_cookie *scm;
#define bw_sendflags	m->msg_flags

#define BW_SET_SNDPARAMS(dest)		(dest)->scm = scm
#define BW_GET_SNDPARAMS(src)		scm = (src)->scm

/* 2.2+: only the message flags matter; `nonblk` is not evaluated. */
#define BW_SR_NONBLOCK(flags, nonblk)	((flags) & (MSG_DONTWAIT|MSG_PEEK))

#else /* LINUX_VERSION_CODE < 0x20200 */

#define BW_SEND_DECL    int nonblock; int bw_sendflags;
#define BW_SR_NONBLOCK(flags, nonblk)		\
		((nonblk) || ((flags) & (MSG_DONTWAIT|MSG_PEEK)))

/* Requests replayed from the saved parameters always get nonblock = 1. */
#define BW_SET_SNDPARAMS(dest)	\
	    ((dest)->bw_sendflags = bw_sendflags, (dest)->nonblock = 1)

#define BW_GET_SNDPARAMS(src)	\
	    (bw_sendflags = (src)->bw_sendflags, nonblock = (src)->nonblock)

#endif

/*
 * Parameters saved by bw_do_sendmsg() and handed to
 * bw_sock_dosendmsg() as its opaque `p` argument.
 */
struct bw_sock_sendparams {
    struct socket *sock;	/* socket the queued send belongs to */
    BW_SEND_DECL;		/* version-specific extras, see BW_SEND_DECL */
};

/*
 * bw_sock_dosendmsg is the routine that bw_timer.c uses to actually
 * do the I/O.  bw_do_sendmsg stuffs bw_sock_dosendmsg in the bw_ioreq,
 * and the timer routine calls bw_sock_dosendmsg when bandwidth is
 * available.  The bw_sock_sendparams structure (defined earlier in
 * this file) carries the version-specific extras so that both 2.0 and
 * 2.2 work.
 *
 * bwio       - request whose bw_msg is to be sent
 * len        - number of bytes to hand to the protocol
 * p          - the struct bw_sock_sendparams saved by bw_do_sendmsg
 * lock_flags - flags of the NASTY lock, which is released around the
 *              protocol call and re-taken afterwards
 *
 * Returns whatever the protocol's sendmsg returns.
 */
int
bw_sock_dosendmsg(struct bw_ioreq *bwio, int len, void *p, unsigned long *lock_flags)
{
    struct bw_sock_sendparams *params = (struct bw_sock_sendparams *) p;
    struct socket *sock;
    int rc;
    int saved_gfp;
    unsigned long saved_seg;
    BW_SEND_DECL;

    BW_ENTER_FUNC();

    sock = params->sock;
    BW_GET_SNDPARAMS(params);

    DPRINTF("bw_dosendmsg: sock 0x%x, err %d, len %d msg_iovlen %d, iovlen %d\n",
		(int) sock, sock->sk->err,
		len, bwio->bw_msg->msg_iovlen,
		bwio->bw_msg->msg_iov->iov_len);

    DPRINTF("	base 0x%x, limit 0x%x\n",
		(int) bwio->bw_msg->msg_iov->iov_base,
		(int) current->addr_limit.seg);

    /* Force atomic allocation in the protocol for this call. */
    saved_gfp = sock->sk->allocation;
    sock->sk->allocation = GFP_ATOMIC;

    /*
     * Tacky: protocols bounds-check buffers against current's address
     * limit, and the data was already copied into kernel space, so
     * the limit is widened for the duration of the call.
     */
    saved_seg = current->addr_limit.seg;
    current->addr_limit.seg = 0xffffffff;

    NASTY_UNLOCK(*lock_flags);
    rc = sock->real_ops->sendmsg(sock, bwio->bw_msg, len, BW_SEND_OUT);
    NASTY_LOCK(*lock_flags);

    /* Put both settings back as they were. */
    current->addr_limit.seg = saved_seg;
    sock->sk->allocation = saved_gfp;

    DPRINTF("bw_sock_dosendmsg: ret %d\n", rc);

    BW_EXIT_FUNC();
    return rc;
}


#define BW_RET_TO_COUNT(ret) ((ret) > 0 ? (ret) : 0)

/*
 * bw_do_sendmsg is called from bw_sock.c to do I/O.
 *
 * sock      - socket to send on
 * m         - message to send
 * total_len - number of bytes in the message
 * (the remaining parameters come from BW_SEND_PARAMS, which is
 *  defined outside this file)
 *
 * Unmanaged sockets go straight to the protocol's sendmsg.  Otherwise:
 *   1. a saved error from an earlier queued send is returned once;
 *   2. if the whole message fits the current allocation it is sent
 *      immediately;
 *   3. otherwise the table in the body applies: streams may send a
 *      first part, non-blocking datagrams get -EWOULDBLOCK, and the
 *      remainder is copied in, queued to the timer layer and slept on.
 *
 * Returns the protocol's return value, a saved or queued error, or
 * -ERESTARTSYS when the sleep is interrupted by a signal and no
 * other error was recorded.
 */
int
bw_do_sendmsg(struct socket *sock, struct msghdr *m, int total_len,
		BW_SEND_PARAMS)
{
    int ret = 0;
    struct bw_info *bwi = BW_SOCK_MGMT(sock);
    struct bw_ioreq bwreq;
    int bw_nwrite;
    struct bw_sock_sendparams bws_params;
    int saved_msg_flags;
    int slot;
    NASTY_LOCK_DECL;
    
    if ( ! bwi)
	return sock->real_ops->sendmsg(sock, m, total_len, BW_SEND_OUT);
    
    NASTY_LOCK( NASTY_LOCK_FLAGS );
    BW_ENTER_FUNC();

    /* Deliver (and clear) an error left behind by an earlier send. */
    if (bwi->bw_savederror) {
	ret = bwi->bw_savederror;
	bwi->bw_savederror = 0;
	DPRINTF("bw_do_sendmsg: returning saved error %d to %16s\n",
			ret, current->comm);
	if (ret == -EPIPE && (bw_sendflags & MSG_NOSIGNAL) == 0)
	    send_sig(SIGPIPE, current, 0);
	/* bwreq has not been initialized yet, so skip free_out. */
	goto ret_out;
    }
    DPRINTF("bw_do_sendmsg: start ioreq 0x%x\n", (int) &bwreq);
    bw_init_ioreq(&bwreq, bwi, BW_WRIO, m, total_len);

    bw_nwrite = bw_io_check(&bwreq, 0, &slot);

    /*
     * If it fits, do the I/O and update the
     * stats based on the protocol ret value.
     */
    if (bw_nwrite >= total_len) {

	atomic_sub( bw_nwrite, &bwreq.bw_totallen );
	atomic_sub( bw_nwrite, &bwreq.bw_prealloc );

	/* The lock is dropped around the call into the protocol. */
	NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	ret = sock->real_ops->sendmsg(sock, m, total_len, BW_SEND_OUT);
	NASTY_LOCK( NASTY_LOCK_FLAGS );
	DPRINTF("bw_sock_sendmsg fast returning %d\n", ret);

	bw_io_update(&bwreq, bw_nwrite, BW_RET_TO_COUNT(ret));

	goto free_out;
    }

    DPRINTF("bw_sock_sendmsg: limited - wanted %d bytes, alloc %d, slot %d\n",
	    total_len, bw_nwrite, slot);
    /*
     * OK - it doesn't fit. Refer to Table:
     * If slot != -1, we have a big enough window to send something.
     *
     *		 | Stream                  |  Message
     * ----------+-------------------------+-------------------
     * Non-block | do partial I/O & return | Return EWOULDBLOCK
     * ----------+-------------------------+-------------------
     * Blocking  | do initial I/O & q rest | q all.
     * ----------+-------------------------+-------------------
     */
    if ((bwi->bw_flags & BW_MSGONLY) == 0) {
	/*
	 * If the current alloc is over the threshhold, or if this
	 * is a non-blocking request, send the current window
	 */
	if (slot != -1 || BW_SR_NONBLOCK(bw_sendflags, nonblock)) {
	    atomic_sub( bw_nwrite, &bwreq.bw_totallen );
	    atomic_sub( bw_nwrite, &bwreq.bw_prealloc );
	    bw_iov_front(&bwreq, bw_nwrite);
	    /*
	     * Stream - do partial/initial
	     */
	    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
	    ret = sock->real_ops->sendmsg(sock, m, bw_nwrite, BW_SEND_OUT);
	    NASTY_LOCK( NASTY_LOCK_FLAGS );

	    bw_io_update(&bwreq, bw_nwrite, BW_RET_TO_COUNT(ret));

	    /*
	     * if the protocol is non-blocking, or it only processed
	     * a partial transfer, let the application deal with it
	     */
	    if (BW_SR_NONBLOCK(bw_sendflags, nonblock) || ret != bw_nwrite)
		goto free_out;

	    bw_iov_pullup(&bwreq, bw_nwrite, ret);
	}
    }
    else if (BW_SR_NONBLOCK(bw_sendflags, nonblock)) { /* nonblock datagram */
	ret = -EWOULDBLOCK;
	goto free_out;
    }

    DPRINTF("bw_sendmsg: bw_iov_copyin\n");

    /*
     * If we can't copyin the message, we have an interesting
     * error return case.  We know the user memory is accessible
     * since sys_sendmsg() verified it for us.  So we died on
     * memory allocation.  The question is whether we have done
     * a partial I/O yet, which only happens with streams.
     */
    if (bw_iov_copyin(&bwreq, NASTY_LOCK_FLAGS_REF) == 0) {
	/* Streams keep `ret` as is (0, or the partial count sent). */
	if (bwi->bw_flags & BW_MSGONLY)
	    ret = -ENOBUFS;
	goto free_out;
    }
    /*
     * Ok, we did what we could.  Queue the rest
     *
     * First, we need to clear out any pending poll operations.
     * it may not make sense to have both poll and I/O on the same
     * FD, but it is a side effect of the predictive poll/limit
     * code above.  Since we will use the same fields for the
     * I/O scheduling as the poll, we nuke the poll.
     */

    /* removed lock as bw_io_complete does locking itself */
    if (bwi->bw_flags & BW_POLLPEND) {
	DPRINTF("bw_do_sendmsg: canceling poll req 0x%x\n",
	    (int) bwi->bw_poll_req);
	bwi->bw_poll_req->bw_sleep_qpp = NULL; /* skip wakeup */
	bw_cancel_io(bwi->bw_poll_req);
	bw_io_complete(bwi->bw_poll_req);
    }

    bws_params.sock = sock;
    BW_SET_SNDPARAMS(&bws_params);

    /*
     * It is possible that a TCP connection is not yet established,
     * in which case the first output will block waiting for the
     * handshake to complete.  This is OK above, but would be a
     * big problem if it happened in the timer based I/O layer. 
     *
     * So...We set the nonblocking io flag on all requests run out
     * of the timer layer.  This fixes any similar problem.
     *
     * Similarly, MSG_NOSIGNAL inhibits random death.
     */
    /*
     * NOTE(review): saved_msg_flags is only consulted for the SIGPIPE
     * decision below; bw_sendflags is never restored from it in this
     * function.  Confirm that no caller relies on the original flags.
     */
    saved_msg_flags = bw_sendflags;
    bw_sendflags |= MSG_DONTWAIT | MSG_NOSIGNAL;
    bwreq.bw_sleep_qpp = &bwreq.bw_sleep;

    DPRINTF("bw_sendmsg: handing request to timer layer\n");
    bw_io_active(&bwreq);
    bw_q_io(&bwreq, bw_sock_dosendmsg, &bws_params);
    ASSERT(bw_verify_active(1));
    /*
     * NOTE(review): the lock is released before going to sleep; a
     * completion that runs between the unlock and the sleep could
     * presumably be missed.  Confirm how the timer layer handles it.
     */
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );
    interruptible_sleep_on(&bwreq.bw_sleep);
    NASTY_LOCK( NASTY_LOCK_FLAGS );
    ASSERT(bw_verify_active(1));

    if (signal_pending(current)) {
	DPRINTF("bw_sendmsg: sleep finished with signal\n");
	/*
	 * I/O was interrupted - clean up pending I/O.
	 */
	bw_cancel_io(&bwreq);
	if (bwreq.bw_error_ret == 0)
	    bwreq.bw_error_ret = -ERESTARTSYS;
    }
    ret = bwreq.bw_error_ret;

    if (ret < 0) {
	DPRINTF("bw_do_sendmsg: error return %d to %16s\n", ret, current->comm);
	/* The caller's original flags decide whether SIGPIPE is raised. */
	if (ret == -EPIPE && (saved_msg_flags & MSG_NOSIGNAL) == 0)
	    send_sig(SIGPIPE, current, 0);
    }
    else if (bwi->bw_savederror) {
	DPRINTF("bw_do_sendmsg: partial return %d, saved error %d to %16s\n",
		ret, bwi->bw_savederror, current->comm);
    }

    /*
     * Back from sleeping
     */

    bw_io_inactive(&bwreq);

    bw_iov_dealloc(&bwreq, NASTY_LOCK_FLAGS_REF);

    DPRINTF("bw_sendmsg: timer I/O complete - ret %d\n", ret);

free_out:
    /* Every path that initialized bwreq releases it here. */
    bw_delete_ioreq(&bwreq, bwi);

ret_out:

#if 0
    /*
     * try some I/O if we think we are sane
     *
     * Need to see if this is an improvement.
     */
    if (BW_SOCK_MGMT(sock))
	bw_do_some_active(BW_IO_ADJUST_TIMER, BW_IOSCHED_RATE);
#endif
    DPRINTF("bw_sock_sendmsg final ret %d\n", ret);

    BW_EXIT_FUNC();
    NASTY_UNLOCK( NASTY_LOCK_FLAGS );

    return ret;
}

/*
 * Receive hook.  For now only output is limited, so this passes the
 * request straight to the protocol's recvmsg and returns its result.
 *
 * Limiting receives is trickier: the byte count seems to be needed in
 * every case, otherwise it is hard to know how many bandwidth bytes
 * to deliver or allocate.  If bandwidth is available but no bytes
 * are, we would have to wait for bytes to show up - polling is bad,
 * so it would be good to arrange for a wakeup instead.
 */
int
bw_do_recvmsg(struct socket *sock, struct msghdr *m, int total_len,
	    BW_RECV_PARAMS)
{
    int nrecv = sock->real_ops->recvmsg(sock, m, total_len, BW_RECV_OUT);

    /* DPRINTF("bw_sock_recvmsg returning %d\n", nrecv); */

#if 0
    /*
     * try some I/O if we think we are sane
     */
    if (BW_SOCK_MGMT(sock))
	bw_do_some_active(BW_IO_ADJUST_TIMER, BW_IOSCHED_RATE);
#endif

    return nrecv;
}


void
bw_io_complete(struct bw_ioreq *bwreq)
{
    DPRINTF("bw_io_complete: request 0x%x done\n", (int) bwreq);
    /*
     * if this is a poll request, we give the caller
     * his allocation, and clean up the memory.
     */
    BW_ENTER_FUNC();
    if (bwreq->bw_sleep_qpp)
	wake_up_interruptible(bwreq->bw_sleep_qpp);

    if (bwreq->bw_iopoll) {
	DPRINTF("bw_io_complete: request 0x%x done\n", (int) bwreq);
	DPRINTF("bw_io_complete: poll finished : giving %d bytes\n",
			bwreq->bw_poll_len);

	atomic_add( bwreq->bw_poll_len,
		    &bwreq->bw_bwi->bw_io[bwreq->bw_rw].bw_prealloc );
	bw_io_inactive(bwreq);
	bwreq->bw_bwi->bw_flags &= ~BW_POLLPEND;

	/* XXX - may need SIGIO sock_wake_async(bwreq->bw_bwi->bw_sock, 0); */

	kfree(bwreq->bw_iopoll);
    }
    BW_EXIT_FUNC();
}

/*
 * Entry points handed to the socket layer by bw_init_all() through
 * bw_sock_register_mgmt().  The initializer is positional, so the
 * order must match the member order of struct bw_iface_funcs, which
 * is declared outside this file.
 */
struct bw_iface_funcs bw_iface_funcs = {
	bw_newconnection,
	bw_delconnection,
	bw_setaddr,
	bw_dupconnection,
	/* select hook before 2.2, poll hook from 2.2 on */
#if LINUX_VERSION_CODE < 0x20200
	bw_post_select,
#else
	bw_post_poll,
#endif
	bw_post_ioctl,
	bw_do_sendmsg,
	bw_do_recvmsg,
	/* 2.4 has no dup op, so accept is hooked instead */
#if LINUX_VERSION_CODE >= 0x20400
	bw_pre_accept,
#endif
};

// LICENSE:
// This software is subject to the terms of the GNU GENERAL 
// PUBLIC LICENSE Version 2, June 1991
