Subversion Repositories planix.SVN

Rev

Blame | Last modification | View Log | RSS feed

/*
 *      © 2005-2010 coraid
 *      ATA-over-Ethernet (AoE) storage initiator
 */

#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "ureg.h"
#include "../port/error.h"
#include "../port/netif.h"
#include "etherif.h"
#include "../ip/ip.h"
#include "../port/aoe.h"

/* eventlog's format string is its first argument */
#pragma varargck argpos eventlog        1

/*
 * dprint: log via eventlog only when the file-level debug flag is set.
 * NOTE(review): this expands to a bare if/else followed by a semicolon,
 * so it must be used as a complete statement of its own.
 */
#define dprint(...)     if(debug) eventlog(__VA_ARGS__); else USED(debug);
/* uprint: format into up->genbuf (the expansion already ends in a semicolon) */
#define uprint(...)     snprint(up->genbuf, sizeof up->genbuf, __VA_ARGS__);

/* table sizes and limits */
enum {
        Maxunits        = 0xff,
        Maxframes       = 128,
        Ndevlink        = 6,            /* entries in Aoedev.dltab */
        Nea             = 6,            /* ethernet addresses per Devlink (eatab) */
        Nnetlink        = 6,            /* entries in the netlinks tables */
};

/*
 * Qid path layout:
 *      bits  0-3       file type (TYPE)
 *      bits  4-11      unit index (UNIT)
 *      bits 12-15      devlink index (L)
 */
#define TYPE(q)         ((ulong)(q).path & 0xf)
#define UNIT(q)         (((ulong)(q).path>>4) & 0xff)
#define L(q)            (((ulong)(q).path>>12) & 0xf)
#define QID(u, t)       ((u)<<4 | (t))
/*
 * the link index has to sit above the 8-bit unit field (shift 12, not 8),
 * otherwise it overlaps the unit bits and L()/UNIT() decode garbage.
 */
#define Q3(l, u, t)     ((l)<<12 | QID(u, t))
#define UP(d)           ((d)->flag & Dup)
/*
 * would like this to depend on the chan (srb).
 * not possible in the current structure.
 * (the second argument is accepted but ignored.)
 */
#define Nofail(d, s)    ((d)->flag & Dnofail)

#define MS2TK(t)        ((t)/MS2HZ)

/*
 * qid types for the file tree (stored in the low bits of the
 * qid path, see TYPE) followed by assorted driver constants.
 */
enum {
        Qzero,
        Qtopdir         = 1,
        Qtopbase,
        Qtopctl         = Qtopbase,
        Qtoplog,
        Qtopend,

        Qunitdir,
        Qunitbase,
        Qctl            = Qunitbase,
        Qdata,
        Qconfig,
        Qident,

        Qdevlinkdir,
        Qdevlinkbase,
        Qdevlink        = Qdevlinkbase,
        Qdevlinkend,

        /* number of plain files in the top and devlink directories */
        Qtopfiles       = Qtopend-Qtopbase,
        Qdevlinkfiles   = Qdevlinkend-Qdevlinkbase,

        /* event log ring: Nevents slots of Eventlen bytes each */
        Eventlen        = 256,
        Nevents         = 64,                   /* must be power of 2 */

        Fread           = 0,
        Fwrite,
        /* reserved frame tags; newtag never hands these out */
        Tfree           = -1,                   /* frame slot is unused */
        Tmgmt,

        /*
         * round trip bounds, timeouts, in ticks.
         * timeouts should be long enough that rebooting
         * the coraid (which usually takes under two minutes)
         * doesn't trigger a timeout.
         */
        Rtmax           = MS2TK(320),
        Rtmin           = MS2TK(20),
        Maxreqticks     = 4*60*HZ,              /* was 45*HZ */

        /* byte count the sweeper falls back to when jumbo frames fail */
        Dbcnt           = 1024,

        /* NOTE(review): presumably ATA command codes; not referenced in this part of the file */
        Crd             = 0x20,
        Crdext          = 0x24,
        Cwr             = 0x30,
        Cwrext          = 0x34,
        Cid             = 0xec,
};

/* direction argument handed to rw() */
enum {
        Read,
        Write,
};

/*
 * unified set of flags
 * a Netlink + Aoedev must both be jumbo capable
 * to send jumbograms to that interface.
 * bit positions match the order of flagname[].
 */
enum {
        /* sync with ahci.h */
        Dllba   = 1<<0,
        Dsmart  = 1<<1,
        Dpower  = 1<<2,
        Dnop    = 1<<3,
        Datapi  = 1<<4,
        Datapi16= 1<<5,

        /* aoe specific */
        Dup     = 1<<6,
        Djumbo  = 1<<7,
        Dnofail = 1<<8,
};

/* printable names for the flag bits above; index i names bit 1<<i */
static char *flagname[] = {
        "llba",
        "smart",
        "power",
        "nop",
        "atapi",
        "atapi16",

        "up",
        "jumbo",
        "nofail",
};

/*
 * one local ethernet interface used for aoe traffic.
 * a slot with cc == nil is treated as unused.
 */
typedef struct {
        ushort  flag;
        uint    lostjumbo;      /* bumped by the sweeper when a jumbo frame times out */
        int     datamtu;

        Chan    *cc;
        Chan    *dc;            /* frames are sent with bwrite on this channel */
        Chan    *mtu;           /* open early to prevent bind issues. */
        char    path[Maxpath];
        uchar   ea[Eaddrlen];   /* used as the source address of outgoing frames */
} Netlink;

/*
 * path from one Netlink to a target; holds the remote
 * ethernet addresses reachable through that interface.
 */
typedef struct {
        Netlink *nl;
        int     nea;                    /* entries of eatab in use */
        ulong   eaidx;                  /* rotating index used by pickea */
        uchar   eatab[Nea][Eaddrlen];
        ulong   npkt;                   /* frames sent, including resends */
        ulong   resent;                 /* frames resent */
        ushort  flag;

        ulong   rttavg;                 /* smoothed round trip time (converted with TK2MS for display) */
        ulong   mintimer;               /* lower bound applied to samples in rtupdate */
} Devlink;

/*
 * one queued i/o request.  atarw carves it into frames;
 * the submitter sleeps on the embedded Rendez until srbready.
 */
typedef struct Srb Srb;
struct Srb {
        Rendez;
        Srb     *next;  /* link in the Aoedev head/tail queue */
        int     shared; /* Srb shared with kproc (don't free) */
        ulong   ticksent;       /* tick stamp taken at allocation; checked against Maxreqticks */
        ulong   len;    /* bytes not yet assigned to a frame */
        vlong   sector; /* sector for the next frame */
        short   write;
        short   nout;   /* frames outstanding for this request */
        char    *error; /* set by srberror */
        void    *dp;    /* current position in the data buffer */
        void    *data;  /* start of the data buffer */
};

/*
 * state for one frame slot; tag == Tfree marks the slot unused.
 */
typedef struct {
        int     tag;            /* tag from newtag, or Tfree */
        ulong   bcnt;           /* bytes requested by this frame */
        ulong   dlen;           /* payload bytes copied after the header (0 for reads) */
        vlong   lba;
        ulong   ticksent;
        int     nhdr;           /* bytes of hdr in use */
        uchar   hdr[ETHERMINTU];
        void    *dp;            /* where this frame's data lives in the srb buffer */
        Devlink *dl;
        Netlink *nl;
        int     eaidx;          /* index into dl->eatab of the chosen destination */
        Srb     *srb;           /* request this frame belongs to */
} Frame;

/*
 * one aoe target (major.minor).  the embedded QLock guards the
 * frame table, the srb queue and the counters below.
 */
typedef struct Aoedev Aoedev;
struct Aoedev {
        QLock;
        Aoedev  *next;          /* link in devs.d */

        ulong   vers;           /* compared with the qid version on unit reads */

        int     ndl;            /* devlinks in use */
        ulong   dlidx;          /* rotating index used by pickdevlink */
        Devlink *dl;
        Devlink dltab[Ndevlink];

        ushort  fwver;
        ushort  flag;
        int     nopen;          /* opens of the data file */
        int     major;
        int     minor;
        int     unit;
        int     lasttag;        /* counter feeding the high bits of new tags */
        int     nframes;
        Frame   *frames;
        vlong   bsize;          /* size reported for, and enforced on, the data file */
        vlong   realbsize;

        uint    maxbcnt;        /* largest byte count placed in a single frame */
        ushort  nout;           /* frames currently outstanding */
        ushort  maxout;         /* window: limit on nout */
        ulong   lastwadj;       /* tick of the last maxout adjustment */
        Srb     *head;          /* queue of requests not yet started */
        Srb     *tail;
        Srb     *inprocess;     /* request currently being split into frames */

        /* magic numbers 'R' us */
        char    serial[20+1];
        char    firmware[8+1];
        char    model[40+1];
        int     nconfig;        /* bytes of config in use */
        uchar   config[1024];
        uchar   ident[512];
};

#pragma varargck type   "æ"    Aoedev*

/*
 * event log ring: Nevents slots of Eventlen bytes.  the first byte
 * of a slot holds the record length (0 = empty), the text follows.
 * the Lock guards rp/wp and the buffer, the QLock is held across a
 * whole eventlogread, and eventlog wakes the Rendez after each record.
 */
static struct {
        Lock;
        QLock;
        Rendez;
        char    buf[Eventlen*Nevents];
        char    *rp;            /* next slot to read */
        char    *wp;            /* next slot to write */
} events;

/* list of known targets; walkers in this file take the lock for reading */
static struct {
        RWlock;
        int     nd;
        Aoedev  *d;             /* head of the list, linked through Aoedev.next */
} devs;

/* table of local interfaces, with per-slot reader bookkeeping */
static struct {
        Lock;
        int     reader[Nnetlink];       /* reader is running. */
        Rendez  rendez[Nnetlink];       /* confirm exit. */
        Netlink nl[Nnetlink];
} netlinks;

/* file-level state */
extern Dev      aoedevtab;
static Ref      units;                  /* its ref count bounds the unit directories listed by aoegen */
static Ref      drivevers;
static int      debug;                  /* gates dprint */
static int      autodiscover    = 1;
static int      rediscover;             /* when set, the sweeper periodically calls discover */

/*
 * allocate a request with sz bytes of buffer placed directly
 * behind the Srb header.  raises Enomem if malloc fails.
 */
static Srb*
srballoc(ulong sz)
{
        Srb *s;

        s = malloc(sizeof *s + sz);
        if(s == nil)
                error(Enomem);
        s->data = s + 1;        /* buffer starts right after the header */
        s->dp = s->data;
        s->shared = 0;
        s->ticksent = MACHP(0)->ticks;
        return s;
}

/*
 * allocate a request that uses the caller's buffer db in place
 * instead of a private one.  the second argument is unused.
 * raises Enomem if malloc fails.
 */
static Srb*
srbkalloc(void *db, ulong)
{
        Srb *s;

        if((s = malloc(sizeof *s)) == nil)
                error(Enomem);
        s->data = db;
        s->dp = db;
        s->shared = 0;
        s->ticksent = MACHP(0)->ticks;
        return s;
}

/*
 * release a request, yielding the processor until nobody
 * else has it marked shared.
 */
static void
srbfree(Srb *srb)
{
        for(;;){
                if(!srb->shared)
                        break;
                sched();
        }
        free(srb);
}

/*
 * record error s on the request and drop one outstanding frame;
 * wake the sleeper once no frames remain outstanding.
 */
static void
srberror(Srb *srb, char *s)
{
        srb->error = s;
        if(--srb->nout != 0)
                return;
        wakeup(srb);
}

/*
 * fail the request attached to frame f with error s and
 * release the frame.  frames that are free or have no
 * request are left alone.
 */
static void
frameerror(Aoedev *d, Frame *f, char *s)
{
        Srb *owner;

        owner = f->srb;
        if(owner == nil || f->tag == Tfree)
                return;
        /* retire the tag first so way-slow responses can't match it */
        f->tag = Tfree;
        f->srb = nil;
        srberror(owner, s);
        d->nout--;
}

/*
 * format "major.minor" for d into up->genbuf and return that
 * buffer; the result is overwritten by the next use of genbuf.
 */
static char*
unitname(Aoedev *d)
{
        snprint(up->genbuf, sizeof up->genbuf, "%d.%d", d->major, d->minor);
        return up->genbuf;
}

/*
 * sleep condition for log readers: non-zero when the slot at
 * the read pointer holds a record (its length byte is set).
 */
static int
eventlogready(void*)
{
        char *slot;

        slot = events.rp;
        return slot[0];
}

static long
eventlogread(void *a, long n)
{
        int len;
        char *p, *buf;

        buf = smalloc(Eventlen);
        qlock(&events);
        lock(&events);
        p = events.rp;
        len = *p;
        if(len == 0){
                n = 0;
                unlock(&events);
        } else {
                if(n > len)
                        n = len;
                /* can't move directly into pageable space with events lock held */
                memmove(buf, p+1, n);
                *p = 0;
                events.rp = p += Eventlen;
                if(p >= events.buf + sizeof events.buf)
                        events.rp = events.buf;
                unlock(&events);

                /* the concern here is page faults in memmove below */
                if(waserror()){
                        free(buf);
                        qunlock(&events);
                        nexterror();
                }
                memmove(a, buf, n);
                poperror();
        }
        free(buf);
        qunlock(&events);
        return n;
}

/*
 * append a formatted record to the event ring and wake readers.
 * if the slot being written still held an unread record, the read
 * pointer is dragged forward to the slot after it, so the oldest
 * record is dropped.  returns the formatted length.
 */
static int
eventlog(char *fmt, ...)
{
        int dragrp, n;
        char *p;
        va_list arg;

        lock(&events);
        p = events.wp;
        dragrp = *p++;          /* non-zero: overwriting an unread record */
        va_start(arg, fmt);
        n = vsnprint(p, Eventlen-1, fmt, arg);
        va_end(arg);            /* every va_start needs its va_end */
        *--p = n;               /* length byte precedes the text */
        p = events.wp += Eventlen;
        if(p >= events.buf + sizeof events.buf)
                p = events.wp = events.buf;
        if(dragrp)
                events.rp = p;
        unlock(&events);
        wakeup(&events);
        return n;
}

/*
 * number of unread records in the event ring (0..Nevents).
 * rp and wp are byte pointers that always differ by a multiple
 * of Eventlen, so the distance is converted to slots before it
 * is reduced modulo the ring size.
 */
static int
eventcount(void)
{
        int n;

        lock(&events);
        if(*events.rp == 0)
                n = 0;
        else{
                n = ((events.wp - events.rp) / Eventlen) & (Nevents - 1);
                if(n == 0)
                        n = Nevents;    /* rp == wp with a live record: ring is full */
        }
        unlock(&events);
        return n;
}

/*
 * ticks elapsed since the tag was made, computed modulo 2^16
 * from the tick stamp kept in the tag's low 16 bits.
 */
static int
tsince(int tag)
{
        int now, then, delta;

        now = MACHP(0)->ticks & 0xffff;
        then = tag & 0xffff;
        delta = now - then;
        return delta < 0 ? delta + (1<<16) : delta;
}

/*
 * make a fresh tag: bumped per-device counter in the high half,
 * current tick count in the low 16 bits.  the reserved values
 * Tfree and Tmgmt are never returned.
 */
static int
newtag(Aoedev *d)
{
        int tag;

        for(;;){
                d->lasttag++;
                tag = d->lasttag << 16 | (MACHP(0)->ticks & 0xffff);
                if(tag != Tfree && tag != Tmgmt)
                        return tag;
        }
}

/*
 * mark the device down: clear Dup, fail every frame with
 * Eaoedown, free all frame slots and log the reason.
 */
static void
downdev(Aoedev *d, char *err)
{
        int i;
        Frame *f;

        d->flag &= ~Dup;
        for(i = 0; i < d->nframes; i++){
                f = &d->frames[i];
                frameerror(d, f, Eaoedown);
                f->tag = Tfree;
                f->srb = nil;
        }
        d->inprocess = nil;
        eventlog("%æ: removed; %s\n", d, err);
}

/*
 * build a Block for frame f: header, then dlen bytes of data.
 * the block is at least ETHERMINTU bytes long.
 */
static Block*
allocfb(Frame *f)
{
        int total;
        Block *bp;
        uchar *dst;

        total = f->nhdr + f->dlen;
        if(total < ETHERMINTU)
                total = ETHERMINTU;
        bp = allocb(total);
        dst = bp->wp;
        memmove(dst, f->hdr, f->nhdr);
        if(f->dlen != 0)
                memmove(dst + f->nhdr, f->dp, f->dlen);
        bp->wp = dst + total;
        return bp;
}

/*
 * store the low 48 bits of lba into the header's lba field,
 * least significant byte first.
 */
static void
putlba(Aoeata *a, vlong lba)
{
        int i;

        for(i = 0; i < 6; i++)
                a->lba[i] = lba >> (8*i);
}

/*
 * rotate through the device's links and return the first one
 * that is up; 0 if none is.  dlidx advances on every probe.
 */
static Devlink*
pickdevlink(Aoedev *d)
{
        ulong tries;
        Devlink *cand;

        tries = 0;
        while(tries < d->ndl){
                cand = &d->dl[d->dlidx++ % d->ndl];
                if(cand != nil && (cand->flag & Dup) != 0)
                        return cand;
                tries++;
        }
        return 0;
}

/*
 * choose the next ethernet address index on link l, rotating
 * through eatab.  returns -1 when there is no link or no address.
 */
static int
pickea(Devlink *l)
{
        if(l == 0 || l->nea == 0)
                return -1;
        return l->eaidx++ % l->nea;
}

/*
 * fill in the aoe header h for frame f and pick the link and
 * destination address it will travel on.  assigns f a fresh tag.
 * returns the tag, or -1 when the frame must not be sent:
 *  - its srb is older than Maxreqticks: the srb clock is restarted
 *    for nofail ata requests, otherwise the frame is failed with
 *    Etimedout;
 *  - no link/address is usable: the device is taken down unless
 *    this is a nofail ata request.
 */
static int
hset(Aoedev *d, Frame *f, Aoehdr *h, int cmd)
{
        int i;
        Devlink *l;

        if(f->srb && MACHP(0)->ticks - f->srb->ticksent > Maxreqticks){
                eventlog("%æ: srb timeout\n", d);
                /* Nofail ignores its second argument, so the undeclared s is never evaluated */
                if(cmd == ACata && f->srb && Nofail(d, s))
                        f->srb->ticksent = MACHP(0)->ticks;
                else
                        frameerror(d, f, Etimedout);
                return -1;
        }
        l = pickdevlink(d);
        i = pickea(l);          /* -1 also covers l == 0 */
        if(i == -1){
                if(cmd != ACata || f->srb == nil || !Nofail(d, s))
                        downdev(d, "resend fails; no netlink/ea");
                return -1;
        }
        memmove(h->dst, l->eatab[i], Eaddrlen);
        memmove(h->src, l->nl->ea, sizeof h->src);
        hnputs(h->type, Aoetype);
        h->verflag = Aoever << 4;
        h->error = 0;
        hnputs(h->major, d->major);
        h->minor = d->minor;
        h->cmd = cmd;

        hnputl(h->tag, f->tag = newtag(d));
        /* remember the route so the sweeper and resend can account for it */
        f->dl = l;
        f->nl = l->nl;
        f->eaidx = i;
        f->ticksent = MACHP(0)->ticks;

        return f->tag;
}

/*
 * send frame f again with a new tag and route.  the byte count
 * is clamped to the device's current maxbcnt.  returns 0 on
 * success, -1 if hset refuses or the write raises an error.
 */
static int
resend(Aoedev *d, Frame *f)
{
        ulong cnt;
        Aoeata *ah;

        ah = (Aoeata*)f->hdr;
        if(hset(d, f, ah, ah->cmd) == -1)
                return -1;
        cnt = f->bcnt;
        if(cnt > d->maxbcnt){
                /* mtu mismatch (jumbo fail?) */
                cnt = d->maxbcnt;
                if(f->dlen > cnt)
                        f->dlen = cnt;
        }
        ah->scnt = cnt / Aoesectsz;
        f->dl->npkt++;
        f->dl->resent++;
        if(waserror())
                return -1;
        devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0);
        poperror();
        return 0;
}

/*
 * send an ACconfig query for major/minor out of every netlink
 * that has a control channel, addressed to the all-ones
 * ethernet address.
 */
static void
discover(int major, int minor)
{
        Aoehdr *h;
        Block *b;
        Netlink *nl, *e;

        nl = netlinks.nl;
        e = nl + nelem(netlinks.nl);
        for(; nl < e; nl++){
                if(nl->cc == nil)
                        continue;       /* unused slot */
                b = allocb(ETHERMINTU);
                /* free the block if anything raises an error while it is being filled in */
                if(waserror()){
                        freeb(b);
                        nexterror();
                }
                b->wp = b->rp + ETHERMINTU;
                memset(b->rp, 0, ETHERMINTU);
                h = (Aoehdr*)b->rp;
                memset(h->dst, 0xff, sizeof h->dst);
                memmove(h->src, nl->ea, sizeof h->src);
                hnputs(h->type, Aoetype);
                h->verflag = Aoever << 4;
                hnputs(h->major, major);
                h->minor = minor;
                h->cmd = ACconfig;
                /* the error label is popped before the write, so the handler above no longer covers b */
                poperror();
                /* send b down the queue */
                devtab[nl->dc->type]->bwrite(nl->dc, b, 0);
        }
}

/*
 * Check all frames on device and resend any frames that have been
 * outstanding for 200% of the device round trip time average.
 *
 * kproc body; never returns.  one pass is made roughly every Nms
 * milliseconds.  every Nbcms milliseconds a discover is issued if
 * rediscover is set.  devices whose lock is busy, or that are
 * down, are skipped for the pass.  the send window (maxout) shrinks
 * on a timeout with a full window and grows again after 10 seconds
 * without one.
 */
static void
aoesweepproc(void*)
{
        ulong i, tx, timeout, nbc;
        long ms;
        vlong starttick;
        enum { Nms = 100, Nbcms = 30*1000, };           /* magic */
        uchar *ea;
        Aoeata *a;
        Aoedev *d;
        Devlink *l;
        Frame *f, *e;

        nbc = Nbcms/Nms;
loop:
        if(nbc-- == 0){
                if(rediscover && !waserror()){
                        discover(0xffff, 0xff);
                        poperror();
                }
                nbc = Nbcms/Nms;
        }
        starttick = MACHP(0)->ticks;
        rlock(&devs);
        for(d = devs.d; d; d = d->next){
                if(!canqlock(d))
                        continue;
                if(!UP(d)){
                        qunlock(d);
                        continue;
                }
                tx = 0;
                f = d->frames;
                e = f + d->nframes;
                for (; f < e; f++){
                        if(f->tag == Tfree)
                                continue;
                        l = f->dl;
                        timeout = l->rttavg << 1;
                        i = tsince(f->tag);
                        if(i < timeout)
                                continue;
                        /* timed out with a full window: shrink the window */
                        if(d->nout == d->maxout){
                                if(d->maxout > 1)
                                        d->maxout--;
                                d->lastwadj = MACHP(0)->ticks;
                        }
                        a = (Aoeata*)f->hdr;
                        /* too many lost jumbo frames: fall back to Dbcnt-sized requests */
                        if(a->scnt > Dbcnt / Aoesectsz &&
                           ++f->nl->lostjumbo > (d->nframes << 1)){
                                ea = f->dl->eatab[f->eaidx];
                                eventlog("%æ: jumbo failure on %s:%E; lba%lld\n",
                                        d, f->nl->path, ea, f->lba);
                                d->maxbcnt = Dbcnt;
                                d->flag &= ~Djumbo;
                        }
                        resend(d, f);
                        /* back off the link's rtt estimate once per device per pass */
                        if(tx++ == 0){
                                if((l->rttavg <<= 1) > Rtmax)
                                        l->rttavg = Rtmax;
                                eventlog("%æ: rtt %ldms\n", d, TK2MS(l->rttavg));
                        }
                }
                if(d->nout == d->maxout && d->maxout < d->nframes &&
                   TK2MS(MACHP(0)->ticks - d->lastwadj) > 10*1000){ /* more magic */
                        d->maxout++;
                        d->lastwadj = MACHP(0)->ticks;
                }
                qunlock(d);
        }
        runlock(&devs);
        /*
         * sleep away the rest of the period.  the remainder is kept
         * signed: a pass that overruns Nms must not wrap around to a
         * huge unsigned sleep time.
         */
        ms = Nms - TK2MS(MACHP(0)->ticks - starttick);
        if(ms > 0)
                tsleep(&up->sleep, return0, 0, ms);
        goto loop;
}

/*
 * print-verb handler installed for 'æ': formats an Aoedev*
 * argument as "aoe<major>.<minor>".
 */
static int
fmtæ(Fmt *f)
{
        Aoedev *dev;
        char name[16];

        dev = va_arg(f->args, Aoedev*);
        snprint(name, sizeof name, "aoe%d.%d", dev->major, dev->minor);
        return fmtstrcpy(f, name);
}

static void netbind(char *path);

/*
 * bind the interfaces named in the "aoeif" configuration variable.
 * each token is either "etherN" or "#lN"; anything else is skipped.
 * errors raised while binding one interface do not stop the others.
 */
static void
aoecfg(void)
{
        int ntok, i;
        char unit, *conf, *tok, *f[32], buf[24];

        conf = getconf("aoeif");
        if(conf == nil)
                return;
        ntok = tokenize(conf, f, nelem(f));
        if(ntok < 1)
                return;
        /* goo! */
        for(i = 0; i < ntok; i++){
                tok = f[i];
                /* the single character after the prefix selects the interface */
                if(strncmp(tok, "ether", 5) == 0)
                        unit = tok[5];
                else if(strncmp(tok, "#l", 2) == 0)
                        unit = tok[2];
                else
                        continue;
                snprint(buf, sizeof buf, "#l%c/ether%c", unit, unit);
                if(!waserror()){
                        netbind(buf);
                        poperror();
                }
        }
}

/*
 * one-time driver setup: install the 'æ' print verb, reset the
 * event ring, start the sweeper kproc and bind configured
 * interfaces.  returns at once if another caller holds the
 * init lock.
 */
static void
aoeinit(void)
{
        static int init;
        static QLock l;

        if(!canqlock(&l))
                return;
        if(init != 0){
                qunlock(&l);
                return;
        }
        fmtinstall(L'æ', fmtæ);
        events.rp = events.wp = events.buf;
        kproc("aoesweep", aoesweepproc, nil);
        aoecfg();
        init = 1;
        qunlock(&l);
}

/*
 * attach to the device root.  only the empty spec is accepted;
 * anything else raises Enonexist.  triggers driver setup.
 */
static Chan*
aoeattach(char *spec)
{
        Chan *root;

        if(spec[0] != '\0')
                error(Enonexist);
        aoeinit();
        root = devattach(L'æ', spec);
        mkqid(&root->qid, Qzero, 0, QTDIR);
        return root;
}

/*
 * return the unit'th device on the devs list (counting from 0).
 * raises an error naming the unit and the caller's pc if the
 * list is shorter than that.
 */
static Aoedev*
unit2dev(ulong unit)
{
        int idx;
        Aoedev *dev;

        rlock(&devs);
        for(idx = 0, dev = devs.d; dev != nil; idx++, dev = dev->next){
                if(idx != unit)
                        continue;
                runlock(&devs);
                return dev;
        }
        runlock(&devs);
        uprint("unit lookup failure: %lux pc %#p", unit, getcallerpc(&unit));
        error(up->genbuf);
        return nil;
}

/*
 * fill in dp for file `type' inside the unit directory that c
 * refers to.  sizes are reported only while the unit is up.
 * returns 1 on success, -1 for an unknown type.
 */
static int
unitgen(Chan *c, ulong type, Dir *dp)
{
        int mode, qtype;
        ulong version;
        vlong length;
        char *name;
        Aoedev *dev;
        Qid qid;

        dev = unit2dev(UNIT(c->qid));
        version = dev->vers;
        mode = 0644;
        qtype = QTFILE;
        length = 0;

        switch(type){
        case Qctl:
                name = "ctl";
                break;
        case Qdata:
                name = "data";
                mode = 0640;
                if(UP(dev))
                        length = dev->bsize;
                break;
        case Qconfig:
                name = "config";
                if(UP(dev))
                        length = dev->nconfig;
                break;
        case Qident:
                name = "ident";
                if(UP(dev))
                        length = sizeof dev->ident;
                break;
        case Qdevlinkdir:
                name = "devlink";
                qtype = QTDIR;
                mode = 0555;
                break;
        default:
                return -1;
        }
        mkqid(&qid, QID(UNIT(c->qid), type), version, qtype);
        devdir(c, qid, name, length, eve, mode, dp);
        return 1;
}

/*
 * fill in d for one of the top-level files ("ctl" or "log").
 * the log's length is the count of unread events.
 * returns 1 on success, -1 for an unknown type.
 */
static int
topgen(Chan *c, ulong type, Dir *d)
{
        int mode;
        vlong length;
        char *name;
        Qid qid;

        if(type == Qtopctl){
                name = "ctl";
                mode = 0644;
                length = 0;
        }else if(type == Qtoplog){
                name = "log";
                mode = 0444;
                length = eventcount();
        }else
                return -1;
        mkqid(&qid, type, 0, QTFILE);
        devdir(c, qid, name, length, eve, mode, d);
        return 1;
}

/*
 * directory generator for the device tree, used by walk, stat and open.
 * c is the channel being examined and s the entry index within it
 * (or DEVDOTDOT for the parent).  fills in dp and returns 1, or
 * returns -1 when there is no such entry.
 *
 * tree: root -> "aoe" -> { ctl, log, <major.minor>... }
 *       unit dir -> { ctl, data, config, ident, devlink/ }
 *       devlink dir -> one numbered file per devlink
 */
static int
aoegen(Chan *c, char *, Dirtab *, int, int s, Dir *dp)
{
        int i;
        Aoedev *d;
        Qid q;

        /* root of the device: its only entry is the "aoe" directory */
        if(c->qid.path == 0){
                switch(s){
                case DEVDOTDOT:
                        q.path = 0;
                        q.type = QTDIR;
                        devdir(c, q, "#æ", 0, eve, 0555, dp);
                        break;
                case 0:
                        q.path = Qtopdir;
                        q.type = QTDIR;
                        devdir(c, q, "aoe", 0, eve, 0555, dp);
                        break;
                default:
                        return -1;
                }
                return 1;
        }

        switch(TYPE(c->qid)){
        default:
                return -1;
        case Qtopdir:
                if(s == DEVDOTDOT){
                        mkqid(&q, Qzero, 0, QTDIR);
                        devdir(c, q, "aoe", 0, eve, 0555, dp);
                        return 1;
                }
                /* the fixed files come first, then one directory per unit */
                if(s < Qtopfiles)
                        return topgen(c, Qtopbase + s, dp);
                s -= Qtopfiles;
                if(s >= units.ref)
                        return -1;
                mkqid(&q, QID(s, Qunitdir), 0, QTDIR);
                d = unit2dev(s);
                assert(d != nil);
                devdir(c, q, unitname(d), 0, eve, 0555, dp);
                return 1;
        case Qtopctl:
        case Qtoplog:
                return topgen(c, TYPE(c->qid), dp);
        case Qunitdir:
                if(s == DEVDOTDOT){
                        mkqid(&q, QID(0, Qtopdir), 0, QTDIR);
                        uprint("%uld", UNIT(c->qid));
                        devdir(c, q, up->genbuf, 0, eve, 0555, dp);
                        return 1;
                }
                return unitgen(c, Qunitbase+s, dp);
        case Qctl:
        case Qdata:
        case Qconfig:
        case Qident:
                return unitgen(c, TYPE(c->qid), dp);
        case Qdevlinkdir:
                i = UNIT(c->qid);
                if(s == DEVDOTDOT){
                        mkqid(&q, QID(i, Qunitdir), 0, QTDIR);
                        devdir(c, q, "devlink", 0, eve, 0555, dp);
                        return 1;
                }
                if(i >= units.ref)
                        return -1;
                d = unit2dev(i);
                if(s >= d->ndl)
                        return -1;
                /* devlink files are named by their index */
                uprint("%d", s);
                mkqid(&q, Q3(s, i, Qdevlink), 0, QTFILE);
                devdir(c, q, up->genbuf, 0, eve, 0755, dp);
                return 1;
        case Qdevlink:
                uprint("%d", s);
                mkqid(&q, Q3(s, UNIT(c->qid), Qdevlink), 0, QTFILE);
                devdir(c, q, up->genbuf, 0, eve, 0755, dp);
                return 1;
        }
}

/* walk: defer to devwalk, with aoegen producing the entries */
static Walkqid*
aoewalk(Chan *c, Chan *nc, char **name, int nname)
{
        Walkqid *wq;

        wq = devwalk(c, nc, name, nname, nil, 0, aoegen);
        return wq;
}

/* stat: defer to devstat, with aoegen producing the entries */
static int
aoestat(Chan *c, uchar *db, int n)
{
        int r;

        r = devstat(c, db, n, nil, 0, aoegen);
        return r;
}

/*
 * open a file.  data files can only be opened while the unit is
 * up (Eaoedown otherwise) and are counted in nopen under the
 * device lock; everything else goes straight to devopen.
 */
static Chan*
aoeopen(Chan *c, int omode)
{
        Aoedev *dev;

        if(TYPE(c->qid) == Qdata){
                dev = unit2dev(UNIT(c->qid));
                qlock(dev);
                if(waserror()){
                        qunlock(dev);
                        nexterror();
                }
                if(!UP(dev))
                        error(Eaoedown);
                c = devopen(c, omode, 0, 0, aoegen);
                dev->nopen++;
                poperror();
                qunlock(dev);
                return c;
        }
        return devopen(c, omode, 0, 0, aoegen);
}

/*
 * close a file.  only opened data files matter: drop the open
 * count and, when the last one goes away, send a discover for
 * this unit.  errors raised by the discover are swallowed.
 */
static void
aoeclose(Chan *c)
{
        Aoedev *dev;

        if((c->flag & COPEN) == 0)
                return;
        if(TYPE(c->qid) != Qdata)
                return;

        dev = unit2dev(UNIT(c->qid));
        qlock(dev);
        dev->nopen--;
        if(dev->nopen == 0){
                if(!waserror()){
                        discover(dev->major, dev->minor);
                        poperror();
                }
        }
        qunlock(dev);
}

/*
 * carve the next piece (at most maxbcnt bytes) off the request in
 * d->inprocess, build an ata read/write frame for it in f and send it.
 * if hset refuses the frame, the request is dropped from inprocess
 * and nothing is sent.  an error raised by the write frees the frame
 * tag, clears inprocess and is passed on.
 */
static void
atarw(Aoedev *d, Frame *f)
{
        ulong bcnt;
        char extbit, writebit;
        Aoeata *ah;
        Srb *srb;

        /* bits or'ed into the 0x20 base command below */
        extbit = 0x4;
        writebit = 0x10;

        srb = d->inprocess;
        bcnt = d->maxbcnt;
        if(bcnt > srb->len)
                bcnt = srb->len;
        f->nhdr = AOEATASZ;
        memset(f->hdr, 0, f->nhdr);
        ah = (Aoeata*)f->hdr;
        if(hset(d, f, ah, ACata) == -1) {
                d->inprocess = nil;
                return;
        }
        f->dp = srb->dp;
        f->bcnt = bcnt;
        f->lba = srb->sector;
        f->srb = srb;

        ah->scnt = bcnt / Aoesectsz;
        putlba(ah, f->lba);
        if(d->flag & Dllba)
                ah->aflag |= AAFext;
        else {
                extbit = 0;
                ah->lba[3] &= 0x0f;
                ah->lba[3] |= 0xe0;     /* LBA bit+obsolete 0xa0 */
        }
        if(srb->write){
                ah->aflag |= AAFwrite;
                f->dlen = bcnt;         /* payload travels with the frame */
        }else{
                writebit = 0;
                f->dlen = 0;
        }
        ah->cmdstat = 0x20 | writebit | extbit;

        /* mark tracking fields and load out */
        srb->nout++;
        srb->dp = (uchar*)srb->dp + bcnt;
        srb->len -= bcnt;
        srb->sector += bcnt / Aoesectsz;
        if(srb->len == 0)
                d->inprocess = nil;     /* fully handed out; work() may start the next request */
        d->nout++;
        f->dl->npkt++;
        if(waserror()){
                f->tag = Tfree;
                d->inprocess = nil;
                nexterror();
        }
        devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0);
        poperror();
}

/*
 * translate the error code of an aoe response into a message.
 * returns 0 if the header's error flag is clear; codes outside
 * the table map to the "unknown" entry.
 */
static char*
aoeerror(Aoehdr *h)
{
        int n;
        static char *errs[] = {
                "aoe protocol error: unknown",
                "aoe protocol error: bad command code",
                "aoe protocol error: bad argument param",
                "aoe protocol error: device unavailable",
                "aoe protocol error: config string present",
                "aoe protocol error: unsupported version",
                "aoe protocol error: target is reserved",
        };

        if((h->verflag & AFerr) == 0)
                return 0;
        n = h->error;
        /* valid indices are 0..nelem(errs)-1; nelem(errs) itself is already out of range */
        if(n >= nelem(errs))
                n = 0;
        return errs[n];
}

/*
 * fold a round trip sample into the link's estimate.
 * a negative rtt carries a sample (its magnitude, clamped to
 * [Rtmin, Rtmax]) that also moves mintimer halfway towards it;
 * a non-negative rtt is clamped to [mintimer, Rtmax].
 * rttavg then moves a quarter of the way towards the sample.
 */
static void
rtupdate(Devlink *l, int rtt)
{
        int n, delta;

        n = rtt;
        if(rtt < 0){
                n = -rtt;
                if(n < Rtmin)
                        n = Rtmin;
                else if(n > Rtmax)
                        n = Rtmax;
                /*
                 * take the difference as a signed value: mintimer is
                 * unsigned, and an unsigned subtraction would wrap to a
                 * huge number whenever the sample is below mintimer.
                 */
                delta = n - (int)l->mintimer;
                l->mintimer += delta >> 1;
        } else if(n < l->mintimer)
                n = l->mintimer;
        else if(n > Rtmax)
                n = Rtmax;

        /* g == .25; cf. Congestion Avoidance and Control, Jacobson&Karels; 1988 */
        n -= l->rttavg;
        l->rttavg += n >> 2;
}

static int
srbready(void *v)
{
        Srb *s;

        s = v;
        return s->error || (s->nout == 0 && s->len == 0);
}

/* find the first frame slot of d carrying tag; nil if none does */
static Frame*
getframe(Aoedev *d, int tag)
{
        int i;

        for(i = 0; i < d->nframes; i++)
                if(d->frames[i].tag == tag)
                        return &d->frames[i];
        return nil;
}

/*
 * return an unused frame slot, or nil if the send window
 * (maxout) is full or no slot is free.
 */
static Frame*
freeframe(Aoedev *d)
{
        if(d->nout >= d->maxout)
                return nil;
        return getframe(d, Tfree);
}

/*
 * keep sending frames while slots are available: continue the
 * request in progress, or dequeue the next one from head.
 */
static void
work(Aoedev *d)
{
        Frame *f;
        Srb *next;

        for(;;){
                f = freeframe(d);
                if(f == nil)
                        break;
                if(d->inprocess == nil){
                        next = d->head;
                        if(next == nil)
                                return;         /* nothing queued */
                        d->inprocess = next;
                        d->head = next->next;
                        if(d->head == nil)
                                d->tail = nil;
                }
                atarw(d, f);
        }
}

/*
 * queue srb on device d, start as much i/o as the window allows,
 * then sleep until srbready says the request is finished.
 * errors raised while queueing release the device lock and propagate.
 */
static void
strategy(Aoedev *d, Srb *srb)
{
        qlock(d);
        if(waserror()){
                qunlock(d);
                nexterror();
        }
        /* append to the tail of the device's request queue */
        srb->next = nil;
        if(d->tail)
                d->tail->next = srb;
        d->tail = srb;
        if(d->head == nil)
                d->head = srb;
        srb->shared = 1;
        work(d);
        poperror();
        qunlock(d);

        /* an error raised out of sleep lands back here and the sleep is simply retried */
        while(waserror())
                ;
        sleep(srb, srbready, srb);
        poperror();
}

#define iskaddr(a)      ((uintptr)(a) > KZERO)

/*
 * read or write len bytes at byte offset off of device d.
 * off and len must be multiples of the sector size.  requests
 * starting at or beyond the end of the device, or of zero length,
 * return 0; requests running past the end are truncated.
 * buffers at kernel addresses are used in place; anything else
 * is staged through a private buffer in pieces of at most Srbsz
 * bytes.  returns the number of bytes transferred and raises an
 * error if the device is down or a piece fails.
 */
static long
rw(Aoedev *d, int write, uchar *db, long len, uvlong off)
{
        long n, nlen, copy;
        enum { Srbsz = 1<<19, };        /* magic allocation */
        Srb *srb;

        if((off|len) & (Aoesectsz-1))
                error("offset and length must be sector multiple.\n");
        /*
         * off == bsize must be rejected too: it would clamp len to 0
         * and a zero-length request would be queued and sent.
         */
        if(off >= d->bsize || len == 0)
                return 0;
        if(off + len > d->bsize)
                len = d->bsize - off;
        /* note: copy == 1 means db is used directly and no copying is done */
        copy = 0;
        if(iskaddr(db)){
                srb = srbkalloc(db, len);
                copy = 1;
        }else
                srb = srballoc(Srbsz <= len? Srbsz: len);
        if(waserror()){
                srbfree(srb);
                nexterror();
        }
        nlen = len;
        srb->write = write;
        do {
                if(!UP(d))
                        error(Eio);
                srb->sector = off / Aoesectsz;
                srb->dp = srb->data;
                n = nlen;
                if(n > Srbsz)
                        n = Srbsz;
                srb->len = n;
                if(write && !copy)
                        memmove(srb->data, db, n);
                strategy(d, srb);
                if(srb->error)
                        error(srb->error);
                if(!write && !copy)
                        memmove(db, srb->data, n);
                nlen -= n;
                db += n;
                off += n;
        } while (nlen > 0);
        poperror();
        srbfree(srb);
        return len;
}

/*
 * copy up to n bytes starting at offset off of the size-byte
 * region src into dst.  returns the number of bytes copied,
 * 0 when off is at or past the end.
 */
static long
readmem(ulong off, void *dst, long n, void *src, long size)
{
        ulong end;
        uchar *from;

        if(off >= size)
                return 0;
        end = off + n;
        if(end > size)
                n = size - off;
        from = src;
        memmove(dst, from + off, n);
        return n;
}

static char *
pflag(char *s, char *e, uchar f)
{
        uchar i;

        for(i = 0; i < 8; i++)
                if(f & (1 << i))
                        s = seprint(s, e, "%s ", flagname[i]? flagname[i]: "oops");
        return seprint(s, e, "\n");
}

/*
 * produce the text of a unit's ctl file and copy the part
 * selected by off/len into db.  returns what readstr returns.
 * raises Enomem if the scratch buffer can't be allocated.
 */
static int
pstat(Aoedev *d, char *db, int len, int off)
{
        int n;
        char *buf, *w, *end;

        buf = malloc(READSTR);
        if(buf == nil)
                error(Enomem);
        end = buf + READSTR;

        w = seprint(buf, end,
                "state: %s\n"   "nopen: %d\n"   "nout: %d\n"
                "nmaxout: %d\n" "nframes: %d\n" "maxbcnt: %d\n"
                "fw: %.4ux\n"
                "model: %s\n"   "serial: %s\n"  "firmware: %s\n",
                UP(d)? "up": "down",    d->nopen,       d->nout,
                d->maxout,      d->nframes,     d->maxbcnt,
                d->fwver,
                d->model,       d->serial,      d->firmware);
        w = seprint(w, end, "flag: ");
        w = pflag(w, end, d->flag);

        /* never hand readstr more than was actually formatted */
        if(w - buf < len)
                len = w - buf;
        n = readstr(off, db, len, buf);
        free(buf);
        return n;
}

/*
 * read from one of a unit's files.  raises Echange if the unit's
 * version no longer matches the channel's qid, and Eaoedown when
 * config or ident is read while the unit is down.
 */
static long
unitread(Chan *c, void *db, long len, vlong off)
{
        Aoedev *dev;

        dev = unit2dev(UNIT(c->qid));
        if(c->qid.vers != dev->vers)
                error(Echange);
        switch(TYPE(c->qid)){
        default:
                error(Ebadarg);
        case Qctl:
                return pstat(dev, db, len, off);
        case Qdata:
                return rw(dev, Read, db, len, off);
        case Qconfig:
        case Qident:
                if(!UP(dev))
                        error(Eaoedown);
                if(TYPE(c->qid) == Qconfig)
                        return readmem(off, db, len, dev->config, dev->nconfig);
                return readmem(off, db, len, dev->ident, sizeof dev->ident);
        }
}

/*
 * read the status text of one device link: its ethernet
 * addresses, packet and resend counts, flags and round-trip
 * figures, followed by the path, address, flags, lostjumbo and
 * datamtu of the network link it runs over.  returns 0 if the
 * link index encoded in the qid is not below d->ndl; raises
 * Enomem if the scratch buffer cannot be allocated.
 */
static int
devlinkread(Chan *c, void *db, int len, int off)
{
        int i;
        char *s, *p, *e;
        Aoedev *d;
        Devlink *l;

        d = unit2dev(UNIT(c->qid));
        i = L(c->qid);
        if(i >= d->ndl)
                return 0;
        l = d->dl + i;

        s = p = malloc(READSTR);
        if(s == nil)
                error(Enomem);
        e = s + READSTR;

        p = seprint(p, e, "addr: ");
        for(i = 0; i < l->nea; i++)
                p = seprint(p, e, "%E ", l->eatab[i]);
        p = seprint(p, e, "\n");
        p = seprint(p, e, "npkt: %uld\n", l->npkt);
        p = seprint(p, e, "resent: %uld\n", l->resent);
        p = seprint(p, e, "flag: "); p = pflag(p, e, l->flag);
        p = seprint(p, e, "rttavg: %uld\n", TK2MS(l->rttavg));
        p = seprint(p, e, "mintimer: %uld\n", TK2MS(l->mintimer));

        p = seprint(p, e, "nl path: %s\n", l->nl->path);
        p = seprint(p, e, "nl ea: %E\n", l->nl->ea);
        /* the netlink's own flags, not a repeat of the devlink's */
        p = seprint(p, e, "nl flag: "); p = pflag(p, e, l->nl->flag);
        p = seprint(p, e, "nl lostjumbo: %d\n", l->nl->lostjumbo);
        p = seprint(p, e, "nl datamtu: %d\n", l->nl->datamtu);

        /* never offer readstr more than was actually formatted */
        if(p - s < len)
                len = p - s;
        i = readstr(off, db, len, s);
        free(s);
        return i;
}

/*
 * read the top-level ctl file: the debug, autodiscover and
 * rediscover settings, then five lines (path, ea, flag,
 * lostjumbo, datamtu) for every netlinks.nl slot whose cc is
 * set.  raises Enomem if the scratch buffer cannot be allocated.
 */
static long
topctlread(Chan *, void *db, int len, int off)
{
        int i;
        char *buf, *w, *lim;
        Netlink *nl;

        buf = malloc(READSTR);
        if(buf == nil)
                error(Enomem);
        w = buf;
        lim = buf + READSTR;

        w = seprint(w, lim, "debug: %d\n", debug);
        w = seprint(w, lim, "autodiscover: %d\n", autodiscover);
        w = seprint(w, lim, "rediscover: %d\n", rediscover);

        for(i = 0; i < Nnetlink; i++){
                nl = &netlinks.nl[i];
                if(nl->cc){
                        w = seprint(w, lim, "if%d path: %s\n", i, nl->path);
                        w = seprint(w, lim, "if%d ea: %E\n", i, nl->ea);
                        w = seprint(w, lim, "if%d flag: ", i);
                        w = pflag(w, lim, nl->flag);
                        w = seprint(w, lim, "if%d lostjumbo: %d\n", i, nl->lostjumbo);
                        w = seprint(w, lim, "if%d datamtu: %d\n", i, nl->datamtu);
                }
        }

        /* never offer readstr more than was actually formatted */
        if(w - buf < len)
                len = w - buf;
        i = readstr(off, db, len, buf);
        free(buf);
        return i;
}

/*
 * read entry point: dispatch on the file type encoded in the
 * qid.  directories are generated through aoegen; the other
 * files go to their own readers.  an unknown type raises Eperm
 * (the code relies on error() not returning).
 */
static long
aoeread(Chan *c, void *db, long n, vlong off)
{
        int t;

        t = TYPE(c->qid);
        switch(t){
        default:
                error(Eperm);
        case Qzero:
        case Qtopdir:
        case Qunitdir:
        case Qdevlinkdir:
                /* every directory level */
                return devdirread(c, db, n, 0, 0, aoegen);

        case Qctl:
        case Qdata:
        case Qconfig:
        case Qident:
                /* per-unit files */
                return unitread(c, db, n, off);

        case Qdevlink:
                return devlinkread(c, db, n, off);

        case Qtopctl:
                return topctlread(c, db, n, off);
        case Qtoplog:
                return eventlogread(db, n);
        }
}

/*
 * send the len bytes at db to the target as a config-string
 * set request (AQCfset) and wait for the response.  on success
 * the bytes left in the private copy (qcfgrsp copies the
 * response into it through f->dp) become d->config and are
 * also copied back into db; len is returned.  returns 0 if
 * hset cannot address the frame.  raises Eaoedown if the device
 * is down, Etoobig if len does not fit in one frame, Enomem on
 * allocation failure, or the error recorded in the srb.
 *
 * every exit releases both the srb and the private copy, and
 * leaves the error stack balanced.
 */
static long
configwrite(Aoedev *d, void *db, long len)
{
        char *s;
        Aoeqc *ch;
        Frame *f;
        Srb *srb;

        if(!UP(d))
                error(Eaoedown);
        if(len > ETHERMAXTU - AOEQCSZ)
                error(Etoobig);
        srb = srballoc(len);
        s = malloc(len);
        if(s == nil){
                /* no waserror is pushed yet, so release the srb by hand */
                srbfree(srb);
                error(Enomem);
        }
        memmove(s, db, len);

        if(waserror()){
                srbfree(srb);
                free(s);
                nexterror();
        }
        /* poll for a free frame, dropping the lock while waiting */
        for (;;) {
                qlock(d);
                if(waserror()){
                        qunlock(d);
                        nexterror();
                }
                f = freeframe(d);
                if(f != nil)
                        break;          /* leaves d locked, 2nd waserror pushed */
                poperror();
                qunlock(d);

                if(waserror())
                        nexterror();
                tsleep(&up->sleep, return0, 0, 100);
                poperror();
        }
        f->nhdr = AOEQCSZ;
        memset(f->hdr, 0, f->nhdr);
        ch = (Aoeqc*)f->hdr;
        if(hset(d, f, ch, ACconfig) == -1) {
                /*
                 * these refer to qlock & waserror in the above for loop.
                 */
                poperror();
                qunlock(d);
                /*
                 * the first waserror is still outstanding: pop it and
                 * release what it guards before returning, otherwise
                 * the error stack is left unbalanced and both
                 * allocations leak.
                 */
                poperror();
                srbfree(srb);
                free(s);
                return 0;
        }
        srb->shared = 1;
        f->srb = srb;
        f->dp = s;
        ch->verccmd = AQCfset;
        hnputs(ch->cslen, len);
        d->nout++;
        srb->nout++;
        f->dl->npkt++;
        f->dlen = len;
        /* these too */
        poperror();
        qunlock(d);

        devtab[f->nl->dc->type]->bwrite(f->nl->dc, allocfb(f), 0);
        sleep(srb, srbready, srb);
        if(srb->error)
                error(srb->error);

        qlock(d);
        if(waserror()){
                qunlock(d);
                nexterror();
        }
        memmove(d->config, s, len);
        d->nconfig = len;
        poperror();
        qunlock(d);

        poperror();                     /* pop first waserror */

        srbfree(srb);
        memmove(db, s, len);
        free(s);
        return len;
}

static int getmtu(Chan*);

/*
 * return the aoe mtu (excluding headers) usable on every link
 * of d that is up on both the devlink and netlink side: the
 * smallest getmtu() among them, or ETHERMAXTU if none lowered
 * the starting value, less the aoe/ata header, rounded down to
 * whole sectors and never less than two sectors.
 */
static int
devmaxdata(Aoedev *d)
{
        enum {
                Nomtu   = 100000,       /* starting value, above any mtu accepted */
        };
        int best, m;
        Devlink *l;

        best = Nomtu;
        for(l = d->dl; l < d->dl + d->ndl; l++)
                if((l->flag & Dup) != 0 && (l->nl->flag & Dup) != 0){
                        m = getmtu(l->nl->mtu);
                        if(m < best)
                                best = m;
                }
        if(best == Nomtu)
                best = ETHERMAXTU;              /* normal ethernet mtu */
        best -= AOEATASZ;
        best -= (uint)best % Aoesectsz;
        if(best < 2*Aoesectsz)                  /* sanity */
                best = 2*Aoesectsz;
        return best;
}

/*
 * return f with bit changed as the optional argument s asks:
 * no argument flips it, "on" sets it, anything else clears it.
 */
static int
toggle(char *s, int f, int bit)
{
        if(s == nil)
                return f ^ bit;
        if(strcmp(s, "on") == 0)
                return f | bit;
        return f & ~bit;
}

static void ataident(Aoedev*);

/*
 * handle a write of n bytes at db to a unit's ctl file.
 * the text is parsed into a command and executed with the
 * device qlocked; the lock and the parsed command are released
 * on both the normal and the error path.  returns n.
 *
 * commands:
 *      failio          take the device down ("i/o failure")
 *      identify        send an ata identify to the target
 *      jumbo [on|off]  toggle/set/clear Djumbo
 *      maxbno [n]      set maxbcnt to n sectors
 *      mtu [n]         set maxbcnt from a frame size of n bytes
 *      nofail [on|off] toggle/set/clear Dnofail
 *      setsize [n]     set bsize to n bytes (default: realbsize)
 */
static long
unitctlwrite(Aoedev *d, void *db, long n)
{
        uint maxbcnt, mtu;
        uvlong bsize;
        enum {
                Failio,
                Ident,
                Jumbo,
                Maxbno,
                Mtu,
                Nofailf,
                Setsize,
        };
        Cmdbuf *cb;
        Cmdtab *ct;
        /*
         * third field: presumably the field count lookupcmd
         * enforces, with 0 meaning unchecked -- TODO confirm.
         */
        static Cmdtab cmds[] = {
                {Failio,        "failio",       1 },
                {Ident,         "identify",     1 },
                {Jumbo,         "jumbo",        0 },
                {Maxbno,        "maxbno",       0 },
                {Mtu,           "mtu",          0 },
                {Nofailf,       "nofail",       0 },
                {Setsize,       "setsize",      0 },
        };

        cb = parsecmd(db, n);
        qlock(d);
        if(waserror()){
                /* any error below: drop the lock and the parsed command */
                qunlock(d);
                free(cb);
                nexterror();
        }
        ct = lookupcmd(cb, cmds, nelem(cmds));
        switch(ct->index){
        case Failio:
                downdev(d, "i/o failure");
                break;
        case Ident:
                ataident(d);
                break;
        case Jumbo:
                d->flag = toggle(cb->f[1], d->flag, Djumbo);
                break;
        case Maxbno:
        case Mtu:
                /* with no argument, reset to the largest size the links allow */
                maxbcnt = devmaxdata(d);
                if(cb->nf > 2)
                        error(Ecmdargs);
                if(cb->nf == 2){
                        mtu = strtoul(cb->f[1], 0, 0);
                        if(ct->index == Maxbno)
                                mtu *= Aoesectsz;       /* sectors to bytes */
                        else{
                                /* frame size: strip the header, round down to sectors */
                                mtu -= AOEATASZ;
                                mtu &= ~(Aoesectsz-1);
                        }
                        if(mtu == 0 || mtu > maxbcnt)
                                cmderror(cb, "mtu out of legal range");
                        maxbcnt = mtu;
                }
                d->maxbcnt = maxbcnt;
                break;
        case Nofailf:
                d->flag = toggle(cb->f[1], d->flag, Dnofail);
                break;
        case Setsize:
                /* with no argument, restore the size saved in realbsize */
                bsize = d->realbsize;
                if(cb->nf > 2)
                        error(Ecmdargs);
                if(cb->nf == 2){
                        bsize = strtoull(cb->f[1], 0, 0);
                        if(bsize % Aoesectsz)
                                cmderror(cb, "disk size must be sector aligned");
                }
                d->bsize = bsize;
                break;
        default:
                cmderror(cb, "unknown aoe control message");
        }
        poperror();
        qunlock(d);
        free(cb);
        return n;
}

/*
 * write to one of a unit's files.  ctl and data writes are
 * handed straight on; ident is read-only (Eperm); any other
 * type but config raises Ebadarg.  a config write is merged at
 * offset off into a copy of the current config string and the
 * first off+n bytes of the result are sent to the target;
 * Etoobig is raised if off+n exceeds the size of d->config.
 */
static long
unitwrite(Chan *c, void *db, long n, vlong off)
{
        long nw;
        char *merged;
        Aoedev *d;

        d = unit2dev(UNIT(c->qid));
        switch(TYPE(c->qid)){
        default:
                error(Ebadarg);
        case Qctl:
                return unitctlwrite(d, db, n);
        case Qident:
                error(Eperm);
        case Qdata:
                return rw(d, Write, db, n, off);
        case Qconfig:
                break;
        }

        /* Qconfig */
        if(off + n > sizeof d->config)
                error(Etoobig);
        merged = malloc(sizeof d->config);
        if(merged == nil)
                error(Enomem);
        if(waserror()){
                free(merged);
                nexterror();
        }
        memmove(merged, d->config, d->nconfig);         /* old contents */
        memmove(merged + off, db, n);                   /* overlay new bytes */
        nw = configwrite(d, merged, n + off);
        poperror();
        free(merged);
        return nw;
}

/*
 * claim the first netlinks.nl slot whose cc is unset, record
 * the interface's path, control/data/mtu channels and ethernet
 * address in it, and mark it up.  raises "out of netlink
 * structures" when every slot is in use.  returns the slot.
 */
static Netlink*
addnet(char *path, Chan *cc, Chan *dc, Chan *mtu, uchar *ea)
{
        Netlink *nl, *e;

        lock(&netlinks);
        if(waserror()){
                unlock(&netlinks);
                nexterror();
        }
        nl = netlinks.nl;
        e = nl + nelem(netlinks.nl);
        for(; nl < e && nl->cc; nl++)
                continue;
        if (nl >= e)
                error("out of netlink structures");
        nl->cc = cc;
        nl->dc = dc;
        nl->mtu = mtu;
        /*
         * strncpy does not terminate the copy when path fills the
         * array; nl->path is later printed with %s and compared
         * with strcmp, so always leave room for and store the NUL.
         */
        strncpy(nl->path, path, sizeof nl->path - 1);
        nl->path[sizeof nl->path - 1] = 0;
        memmove(nl->ea, ea, sizeof nl->ea);
        poperror();
        nl->flag |= Dup;
        unlock(&netlinks);
        return nl;
}

/*
 * hand out the next unit number under the units lock;
 * returns -1 once units.ref has reached Maxunits.
 */
static int
newunit(void)
{
        int x;

        lock(&units);
        x = -1;
        if(units.ref != Maxunits)
                x = units.ref++;
        unlock(&units);
        return x;
}

/*
 * give back one unit: decrement units.ref under the units
 * lock and return the new count.
 */
static int
dropunit(void)
{
        int left;

        lock(&units);
        units.ref--;
        left = units.ref;
        unlock(&units);
        return left;
}

/*
 * allocate and initialise a device for major.minor with n
 * usable frames.  the frame table always has room for
 * Maxframes entries, since maxout may change later; only the
 * first n are marked free here.  raises an error if memory or
 * unit numbers run out; nothing is left allocated in that case.
 */
static Aoedev*
newdev(long major, long minor, int n)
{
        int i;
        Aoedev *d;
        Frame *ftab;

        d = mallocz(sizeof *d, 1);
        ftab = mallocz(sizeof *ftab * Maxframes, 1);
        if (!d || !ftab) {
                free(d);
                free(ftab);
                error("aoe device allocation failure");
        }
        for(i = 0; i < n; i++)
                ftab[i].tag = Tfree;
        d->frames = ftab;
        d->nframes = n;
        d->maxout = n;
        d->major = major;
        d->minor = minor;
        d->maxbcnt = Dbcnt;
        d->flag = Djumbo;
        d->unit = newunit();            /* bzzt.  inaccurate if units removed */
        if(d->unit == -1){
                free(d->frames);
                free(d);
                error("too many units");
        }
        d->dl = d->dltab;
        return d;
}

/*
 * look up a known device by major.minor under the devs read
 * lock.  returns nil, after logging the miss, if there is none.
 */
static Aoedev*
mm2dev(int major, int minor)
{
        Aoedev *d;

        rlock(&devs);
        d = devs.d;
        while(d != nil && (d->major != major || d->minor != minor))
                d = d->next;
        runlock(&devs);
        if(d == nil)
                eventlog("mm2dev: %d.%d not found\n", major, minor);
        return d;
}

/*
 * find major.minor in the device list, creating it with n
 * frames and linking it at the head if it is not yet known.
 * the values 0xffff and 0xff are refused (returns 0).  errors
 * from newdev propagate after the devs write lock is released.
 */
static Aoedev*
getdev(long major, long minor, int n)
{
        Aoedev *d;

        if(major == 0xffff || minor == 0xff)
                return 0;
        wlock(&devs);
        if(waserror()){
                wunlock(&devs);
                nexterror();
        }
        d = devs.d;
        while(d != nil && !(d->major == major && d->minor == minor))
                d = d->next;
        if (d == nil) {
                d = newdev(major, minor, n);
                d->next = devs.d;
                devs.d = d;
        }
        poperror();
        wunlock(&devs);
        return d;
}

/*
 * fetch a 16-bit little-endian value from the (possibly
 * unaligned) bytes at a.
 */
static ushort
gbit16(void *a)
{
        uchar *b;
        ushort lo, hi;

        b = a;
        lo = b[0];
        hi = b[1];
        return hi << 8 | lo;
}

/*
 * fetch a 32-bit little-endian value from the (possibly
 * unaligned) bytes at a.  each byte is widened to ulong before
 * it is shifted: shifting the promoted int left by 24 would
 * move a set top bit into the sign bit, and the result would
 * sign-extend wherever ulong is wider than int.
 */
static ulong
gbit32(void *a)
{
        ulong j;
        uchar *i;

        i = a;
        j  = (ulong)i[3] << 24;
        j |= (ulong)i[2] << 16;
        j |= (ulong)i[1] << 8;
        j |= (ulong)i[0];
        return j;
}

static uvlong
gbit64(void *a)
{
        uchar *i;

        i = a;
        return (uvlong)gbit32(i+4) << 32 | gbit32(a);
}

/*
 * queue an ata identify (Cid) request for d on a free frame
 * and write it to the chosen netlink's data channel.  does
 * nothing if no frame is free or hset cannot address the frame.
 * no srb is attached to the frame here.
 */
static void
ataident(Aoedev *d)
{
        Aoeata *ah;
        Block *b;
        Frame *f;

        if((f = freeframe(d)) == nil)
                return;

        /* fresh, zeroed ata header */
        f->nhdr = AOEATASZ;
        memset(f->hdr, 0, f->nhdr);
        ah = (Aoeata*)f->hdr;
        if(hset(d, f, ah, ACata) == -1)
                return;
        ah->cmdstat = Cid;      /* ata 6, page 110 */
        ah->scnt = 1;
        ah->lba[3] = 0xa0;

        /* one sector expected back, nothing sent beyond the header */
        f->bcnt = 512;
        f->dlen = 0;

        d->nout++;
        f->dl->npkt++;
        b = allocfb(f);
        devtab[f->nl->dc->type]->bwrite(f->nl->dc, b, 0);
}

/*
 * return the mtu read from the channel mtuch: the number
 * parsed from byte 12 onward of what the channel returns.
 * falls back to ETHERMAXTU if mtuch is nil, if the read raises
 * an error, or if it returns 12 bytes or fewer.
 */
static int
getmtu(Chan *mtuch)
{
        enum {
                Skip    = 12,           /* bytes before the number */
        };
        int n, mtu;
        char buf[36];

        if(mtuch == nil)
                return ETHERMAXTU;
        mtu = ETHERMAXTU;
        if(waserror())
                return mtu;
        n = devtab[mtuch->type]->read(mtuch, buf, sizeof buf - 1, 0);
        if(n > Skip){
                buf[n] = 0;
                mtu = strtoul(buf + Skip, 0, 0);
        }
        poperror();
        return mtu;
}

/*
 * make sure ethernet address ea is in the devlink's address
 * table.  returns the index of an existing entry, or the index
 * of the entry just appended, or -1 if the address is new and
 * all Nea entries are taken.
 */
static int
newdlea(Devlink *l, uchar *ea)
{
        int i;

        /* already known? */
        for(i = 0; i < Nea && i < l->nea; i++)
                if(memcmp(l->eatab[i], ea, Eaddrlen) == 0)
                        return i;

        /* table full */
        if(l->nea >= Nea)
                return -1;

        memmove(l->eatab[l->nea], ea, Eaddrlen);
        return l->nea++;
}

/*
 * find or create the devlink that connects device d to netlink
 * n, add the responder's source address (c->src) to it and
 * mark it up.  a new link also gets its initial timer values.
 * returns 0, after logging, when all Ndevlink slots are taken
 * by other netlinks.
 */
static Devlink*
newdevlink(Aoedev *d, Netlink *n, Aoeqc *c)
{
        int i;
        Devlink *l;

        /* an existing link over this netlink? */
        for(i = 0; i < Ndevlink && i != d->ndl; i++){
                l = &d->dl[i];
                if(l->nl == n) {
                        newdlea(l, c->src);
                        l->flag |= Dup;
                        return l;
                }
        }

        if(i < Ndevlink){
                /* i == d->ndl: first unused slot */
                l = &d->dl[i];
                d->ndl++;
                newdlea(l, c->src);
                l->nl = n;
                l->flag |= Dup;
                l->mintimer = Rtmin;
                l->rttavg = Rtmax;
                return l;
        }

        eventlog("%æ: out of links: %s:%E to %E\n", d, n->path, n->ea, c->src);
        return 0;
}

/*
 * an error response arrived in b: fail the outstanding frame
 * it answers, if any, with message s.  responses tagged Tmgmt
 * or Tfree, or naming an unknown device, are ignored.
 */
static void
errrsp(Block *b, char *s)
{
        int tag;
        Aoedev *d;
        Aoehdr *h;
        Frame *f;

        h = (Aoehdr*)b->rp;
        tag = nhgetl(h->tag);
        if(tag != Tmgmt && tag != Tfree){
                d = mm2dev(nhgets(h->major), h->minor);
                if(d != 0){
                        f = getframe(d, tag);
                        if(f != 0)
                                frameerror(d, f, s);
                }
        }
}

/*
 * handle a config/query response received on netlink nl.
 *
 * a response carrying one of our frame tags answers a request
 * sent on that frame: its config string is copied to the
 * frame's data buffer, the waiting srb is woken and the frame
 * is freed.
 *
 * a response tagged Tmgmt answers a discovery broadcast: the
 * device is looked up (or created), this interface is added as
 * a devlink, the firmware version and config string are
 * recorded, the data frame size is recomputed if jumbo frames
 * are enabled, and an identify is sent if nobody has the unit
 * open.
 *
 * lengths taken from the packet are never trusted beyond the
 * bytes actually received nor beyond the size of the buffer
 * they are copied into.
 */
static void
qcfgrsp(Block *b, Netlink *nl)
{
        int major, cmd, cslen, blen;
        unsigned n;
        Aoedev *d;
        Aoeqc *ch;
        Devlink *l;
        Frame *f;

        ch = (Aoeqc*)b->rp;
        major = nhgets(ch->major);
        n = nhgetl(ch->tag);
        if(n != Tmgmt){
                d = mm2dev(major, ch->minor);
                if(d == nil)
                        return;
                qlock(d);
                f = getframe(d, n);
                if(f == nil){
                        qunlock(d);
                        eventlog("%æ: unknown response tag %ux\n", d, n);
                        return;
                }
                cslen = nhgets(ch->cslen);
                blen = BLEN(b) - AOEQCSZ;
                if(cslen < blen && BLEN(b) > 60)
                        eventlog("%æ: cfgrsp: tag %.8ux oversized %d %d\n",
                                d, n, cslen, blen);
                if(cslen > blen){
                        eventlog("%æ: cfgrsp: tag %.8ux runt %d %d\n",
                                d, n, cslen, blen);
                        cslen = blen;
                }
                if(cslen < 0)
                        cslen = 0;
                /*
                 * f->dp was sized by the requester, who recorded
                 * that size in f->dlen; a target must not be able
                 * to make us write past it.
                 */
                if(cslen > f->dlen)
                        cslen = f->dlen;
                memmove(f->dp, ch + 1, cslen);
                f->srb->nout--;
                wakeup(f->srb);
                f->srb->shared = 0;
                d->nout--;
                f->srb = nil;
                f->tag = Tfree;
                qunlock(d);
                return;
        }

        cmd = ch->verccmd & 0xf;
        if(cmd != 0){
                eventlog("aoe%d.%d: cfgrsp: bad command %d\n", major, ch->minor, cmd);
                return;
        }
        n = nhgets(ch->bufcnt);
        if(n > Maxframes)
                n = Maxframes;

        if(waserror()){
                eventlog("getdev: %d.%d ignored: %s\n", major, ch->minor, up->errstr);
                return;
        }
        d = getdev(major, ch->minor, n);
        poperror();
        if(d == 0)
                return;

        qlock(d);
        *up->errstr = 0;
        if(waserror()){
                qunlock(d);
                eventlog("%æ: %s\n", d, up->errstr);
                nexterror();
        }

        l = newdevlink(d, nl, ch);              /* add this interface. */

        d->fwver = nhgets(ch->fwver);
        n = nhgets(ch->cslen);
        if(n > sizeof d->config)
                n = sizeof d->config;
        /* and no more than the packet really carries */
        blen = BLEN(b) - AOEQCSZ;
        if(blen < 0)
                blen = 0;
        if(n > blen)
                n = blen;
        d->nconfig = n;
        memmove(d->config, ch + 1, n);
        if(l != 0 && d->flag & Djumbo){
                /* sectors per frame: limited by our mtu and by the target */
                n = getmtu(nl->mtu) - AOEATASZ;
                n /= Aoesectsz;
                if(n > ch->scnt)
                        n = ch->scnt;
                n = n? n * Aoesectsz: Dbcnt;
                if(n != d->maxbcnt){
                        eventlog("%æ: setting %d byte data frames on %s:%E\n",
                                d, n, nl->path, nl->ea);
                        d->maxbcnt = n;
                }
        }
        if(d->nopen == 0)
                ataident(d);
        poperror();
        qunlock(d);
}

/*
 * copy an n-byte identify string from u to p, swapping the
 * bytes of each 16-bit word, stripping leading and trailing
 * blanks and terminating it with NUL.  p must have room for
 * n+1 bytes (n even); every byte after the string, up to and
 * including the terminator position, is left zero.
 *
 * the final move copies exactly the characters kept, so it
 * never reads beyond the bytes this function wrote.
 */
void
aoeidmove(char *p, ushort *u, unsigned n)
{
        unsigned i;
        char *op, *e, *s, *end;

        op = p;
        /*
         * the ushort `*u' is sometimes not aligned on a short boundary,
         * so dereferencing u[i] causes an alignment exception on
         * some machines.
         */
        s = (char *)u;
        for(i = 0; i < n; i += 2){
                *p++ = s[i + 1];
                *p++ = s[i];
        }
        *p = 0;
        end = p;                        /* position of the terminator */

        /* strip trailing blanks */
        while(p > op && *--p == ' ')
                *p = 0;
        /* e: one past the last character kept */
        e = p;
        if(*e != 0)
                e++;

        /* strip leading blanks */
        p = op;
        while(*p == ' ')
                p++;

        memmove(op, p, e - p);
        memset(op + (e - p), 0, (end - op) + 1 - (e - p));
}

/*
 * digest the identify data at id: recompute the Dllba, Dpower,
 * Dsmart and Dnop flags from words 82, 83 and 86, mark the
 * device up and keep a copy of the data in d->ident.  returns
 * the sector count, taken from word 100 (64 bits) when bit 10
 * of word 83 or 86 is set and from word 60 (32 bits) otherwise.
 */
static vlong
aoeidentify(Aoedev *d, ushort *id)
{
        int w82, w83, w86;
        vlong nsect;

        d->flag &= ~(Dllba|Dpower|Dsmart|Dnop|Dup);

        w83 = gbit16(id+83);
        w86 = gbit16(id+86);
        if(((w83 | w86) & (1<<10)) != 0){
                d->flag |= Dllba;
                nsect = gbit64(id+100);
        }else
                nsect = gbit32(id+60);

        /* the feature bits count only when word 83's top two bits read 01 */
        if((w83>>14) == 1) {
                if(w83 & (1<<3))
                        d->flag |= Dpower;
                w82 = gbit16(id+82);
                if(w82 & 1)
                        d->flag |= Dsmart;
                if(w82 & (1<<14))
                        d->flag |= Dnop;
        }

        d->flag |= Dup;
        memmove(d->ident, id, sizeof d->ident);
        return nsect;
}

/*
 * give d the next value of the drivevers counter, taken and
 * incremented under its lock.
 */
static void
newvers(Aoedev *d)
{
        lock(&drivevers);
        d->vers = drivevers.ref;
        drivevers.ref++;
        unlock(&drivevers);
}

/*
 * process an identify response: update the flags and identify
 * copy (aoeidentify) and the serial, firmware and model
 * strings.  the size and the version of the device are changed
 * only when the size differs from the recorded one (or none was
 * recorded) and the serial number changed as well.  returns 0,
 * or -1 if aoeidentify returns -1.
 */
static int
identify(Aoedev *d, ushort *id)
{
        vlong oldsize, size;
        uchar oserial[21];

        size = aoeidentify(d, id);
        if(size == -1)
                return -1;

        /* remember what we had before overwriting it */
        oldsize = d->realbsize;
        memmove(oserial, d->serial, sizeof d->serial);

        aoeidmove(d->serial, id+10, 20);
        aoeidmove(d->firmware, id+23, 8);
        aoeidmove(d->model, id+27, 40);

        size *= Aoesectsz;              /* sectors to bytes */
        if(oldsize != 0 && oldsize == size)
                return 0;
        if(memcmp(oserial, d->serial, sizeof oserial) == 0)
                return 0;

        d->bsize = size;
        d->realbsize = size;
        newvers(d);
        return 0;
}

/*
 * handle an ata response in block b.  the device is found by
 * the major.minor in the header and the request by its tag;
 * responses that match neither are dropped.  with the device
 * locked:
 *  - an ata error status marks the srb (if any) with Eio;
 *  - read data is copied to the frame's buffer; if the request
 *    is only partly satisfied the frame is advanced and resent;
 *  - identify data is passed to identify().
 * a completed frame is freed, its srb woken once it has
 * nothing left outstanding, and more work is started.
 *
 * major is an int, as in qcfgrsp: a short would turn shelf
 * numbers of 0x8000 and above negative, and mm2dev would then
 * never find the device.
 */
static void
atarsp(Block *b)
{
        unsigned n;
        int major;
        Aoeata *ahin, *ahout;
        Aoedev *d;
        Frame *f;
        Srb *srb;

        ahin = (Aoeata*)b->rp;
        major = nhgets(ahin->major);
        d = mm2dev(major, ahin->minor);
        if(d == nil)
                return;
        qlock(d);
        if(waserror()){
                qunlock(d);
                nexterror();
        }
        n = nhgetl(ahin->tag);
        f = getframe(d, n);
        if(f == nil){
                dprint("%æ: unexpected response; tag %ux\n", d, n);
                goto bail;
        }
        rtupdate(f->dl, tsince(f->tag));
        ahout = (Aoeata*)f->hdr;        /* the request we sent */
        srb = f->srb;

        /* any status bit in mask 0xa9 is treated as a failed command */
        if(ahin->cmdstat & 0xa9){
                eventlog("%æ: ata error cmd %.2ux stat %.2ux\n",
                        d, ahout->cmdstat, ahin->cmdstat);
                if(srb)
                        srb->error = Eio;
        } else {
                n = ahout->scnt * Aoesectsz;    /* bytes this frame asked for */
                switch(ahout->cmdstat){
                case Crd:
                case Crdext:
                        if(BLEN(b) - AOEATASZ < n){
                                eventlog("%æ: runt read blen %ld expect %d\n",
                                        d, BLEN(b), n);
                                goto bail;
                        }
                        memmove(f->dp, (uchar *)ahin + AOEATASZ, n);
                        /* fall through: reads and writes advance alike */
                case Cwr:
                case Cwrext:
                        if(n > Dbcnt)
                                f->nl->lostjumbo = 0;
                        if(f->bcnt -= n){
                                /* more to transfer: step on and reuse the frame */
                                f->lba += n / Aoesectsz;
                                f->dp = (uchar*)f->dp + n;
                                resend(d, f);
                                goto bail;
                        }
                        break;
                case Cid:
                        if(BLEN(b) - AOEATASZ < 512){
                                eventlog("%æ: runt identify blen %ld expect %d\n",
                                        d, BLEN(b), n);
                                goto bail;
                        }
                        identify(d, (ushort*)((uchar *)ahin + AOEATASZ));
                        break;
                default:
                        eventlog("%æ: unknown ata command %.2ux \n",
                                d, ahout->cmdstat);
                }
        }

        if(srb && --srb->nout == 0 && srb->len == 0){
                wakeup(srb);
                srb->shared = 0;
        }
        f->srb = nil;
        f->tag = Tfree;
        d->nout--;

        work(d);
bail:
        poperror();
        qunlock(d);
}

/*
 * kernel process reading aoe frames from one netlink (v).
 * it marks itself running in netlinks.reader[], optionally
 * starts a broadcast discovery, then loops reading blocks from
 * the netlink's data channel and dispatching responses:
 * aoe-level errors to errrsp, ata responses to atarsp and
 * config responses to qcfgrsp.  when the netlink goes down, a
 * read returns nil, or any error is raised, it clears its
 * reader flag, wakes the matching rendezvous and exits.
 */
static void
netrdaoeproc(void *v)
{
        int idx;
        char name[Maxpath+1], *s;
        Aoehdr *h;
        Block *b;
        Netlink *nl;

        nl = (Netlink*)v;
        idx = nl - netlinks.nl;
        netlinks.reader[idx] = 1;
        kstrcpy(name, nl->path, Maxpath);

        if(waserror()){
                eventlog("netrdaoe exiting: %s\n", up->errstr);
                netlinks.reader[idx] = 0;
                wakeup(netlinks.rendez + idx);
                pexit(up->errstr, 1);
        }
        if(autodiscover)
                discover(0xffff, 0xff);
        for (;;) {
                if(!(nl->flag & Dup)) {
                        uprint("%s: netlink is down", name);
                        error(up->genbuf);
                }
                if (nl->dc == nil)
                        panic("netrdaoe: nl->dc == nil");
                b = devtab[nl->dc->type]->bread(nl->dc, 1<<16, 0);
                if(b == nil) {
                        uprint("%s: nil read from network", name);
                        error(up->genbuf);
                }
                h = (Aoehdr*)b->rp;
                /* only responses are of interest; the else pairs with the inner if */
                if(h->verflag & AFrsp)
                        if(s = aoeerror(h)){
                                /* log the error this packet carries, not a stale errstr */
                                eventlog("%s: %s\n", nl->path, s);
                                errrsp(b, s);
                        }else
                                switch(h->cmd){
                                case ACata:
                                        atarsp(b);
                                        break;
                                case ACconfig:
                                        qcfgrsp(b, nl);
                                        break;
                                default:
                                        if((h->cmd & 0xf0) == 0){
                                                eventlog("%s: unknown cmd %d\n",
                                                        nl->path, h->cmd);
                                                errrsp(b, "unknown command");
                                        }
                                        break;
                                }
                freeb(b);
        }
}

/*
 * read the text of <path>/addr and parse it into the ethernet
 * address ea.  the channel is closed whether or not the read
 * raises an error.  raises "parseether failure" if the text
 * read does not parse.
 */
static void
getaddr(char *path, uchar *ea)
{
        int nr;
        char text[2*Eaddrlen+1];
        Chan *ch;

        uprint("%s/addr", path);
        ch = namec(up->genbuf, Aopen, OREAD, 0);
        if(waserror()) {
                cclose(ch);
                nexterror();
        }
        if (ch == nil)
                panic("æ: getaddr: c == nil");
        nr = devtab[ch->type]->read(ch, text, sizeof text-1, 0);
        poperror();
        cclose(ch);

        text[nr] = 0;           /* the read left room for the terminator */
        if(parseether(ea, text) < 0)
                error("parseether failure");
}

/*
 * bind the aoe driver to the ethernet interface at path:
 * dial <path>!<Aoetype> for the data and control channels,
 * open <path>/mtu if that is possible, read the interface
 * address, record it all in a netlink slot and start a reader
 * process named netrdaoe@<path> on it.  if anything after the
 * dial and the mtu open fails, the channels obtained so far are
 * closed before the error propagates.
 */
static void
netbind(char *path)
{
        char addr[Maxpath];
        uchar ea[2*Eaddrlen+1];
        Chan *dc, *cc, *mtu;
        Netlink *nl;

        snprint(addr, sizeof addr, "%s!%#x", path, Aoetype);
        dc = chandial(addr, nil, nil, &cc);
        snprint(addr, sizeof addr, "%s/mtu", path);
        /* the mtu file is optional: carry on without it if the open fails */
        if(waserror())
                mtu = nil;
        else {
                mtu = namec(addr, Aopen, OREAD, 0);
                poperror();
        }

        if(waserror()){
                cclose(dc);
                cclose(cc);
                if(mtu)
                        cclose(mtu);
                nexterror();
        }
        if(dc == nil  || cc == nil)
                error(Enonexist);
        getaddr(path, ea);
        nl = addnet(path, cc, dc, mtu, ea);
        /* addr is reused as scratch for the process name */
        snprint(addr, sizeof addr, "netrdaoe@%s", path);
        kproc(addr, netrdaoeproc, nl);
        poperror();
}

/*
 * sleep condition: true (1) when the int that v points to is
 * non-zero, false (0) otherwise.
 */
static int
unbound(void *v)
{
        int *flag;

        flag = v;
        if(*flag == 0)
                return 0;
        return 1;
}

/*
 * undo a bind of the interface at path.  raises "device not
 * bound" if no netlink with a data channel has that path.
 * in order:
 *  - mark the netlink, and every devlink using it, down;
 *  - wait on the reader's rendezvous;
 *  - resend every outstanding frame that was using the netlink;
 *  - remove the devlinks that used it from each device;
 *  - close the netlink's channels and clear its slot;
 *  - take down and free every device left without a devlink.
 */
static void
netunbind(char *path)
{
        int i, idx;
        Aoedev *d, *p, *next;
        Chan *dc, *cc;
        Devlink *l;
        Frame *f;
        Netlink *n, *e;

        n = netlinks.nl;
        e = n + nelem(netlinks.nl);

        lock(&netlinks);
        for(; n < e; n++)
                if(n->dc && strcmp(n->path, path) == 0)
                        break;
        unlock(&netlinks);
        if (n >= e)
                error("device not bound");

        /*
         * hunt down devices using this interface; disable
         * this also terminates the reader.
         */
        idx = n - netlinks.nl;
        wlock(&devs);
        for(d = devs.d; d; d = d->next){
                qlock(d);
                for(i = 0; i < d->ndl; i++){
                        l = d->dl + i;
                        if(l->nl == n)
                                l->flag &= ~Dup;
                }
                qunlock(d);
        }
        n->flag &= ~Dup;
        wunlock(&devs);

        /*
         * confirm reader is down.  an error raised while
         * sleeping restarts the wait.
         * NOTE(review): unbound() is true while the flag is
         * non-zero, and netrdaoeproc sets reader[idx] to 1 when it
         * starts and to 0 when it exits -- confirm the sense of
         * this wait.
         */
        while(waserror())
                ;
        sleep(netlinks.rendez + idx, unbound, netlinks.reader + idx);
        poperror();

        /* reschedule packets. */
        wlock(&devs);
        for(d = devs.d; d; d = d->next){
                qlock(d);
                for(i = 0; i < d->nframes; i++){
                        f = d->frames + i;
                        if(f->tag != Tfree && f->nl == n)
                                resend(d, f);
                }
                qunlock(d);
        }
        wunlock(&devs);

        /* squeeze devlink pool.  (we assert nobody is using them now) */
        wlock(&devs);
        for(d = devs.d; d; d = d->next){
                qlock(d);
                for(i = 0; i < d->ndl; i++){
                        l = d->dl + i;
                        if(l->nl == n)
                                memmove(l, l + 1, sizeof *l * (--d->ndl - i));
                }
                qunlock(d);
        }
        wunlock(&devs);

        /* close device link. */
        lock(&netlinks);
        dc = n->dc;
        cc = n->cc;
        if(n->mtu)
                cclose(n->mtu);
        memset(n, 0, sizeof *n);
        unlock(&netlinks);

        cclose(dc);
        cclose(cc);

        /*
         * squeeze orphan devices.  p is the last device kept so
         * far.  an orphan is unlinked from the head only when it
         * is the head; otherwise it directly follows p.  (testing
         * p against the head instead would drop a kept head
         * device whenever the device after it is an orphan.)
         */
        wlock(&devs);
        for(p = d = devs.d; d; d = next){
                next = d->next;
                if(d->ndl > 0) {
                        p = d;
                        continue;
                }
                qlock(d);
                downdev(d, "orphan");
                qunlock(d);
                if(d == devs.d)
                        devs.d = next;
                else
                        p->next = next;
                free(d->frames);
                free(d);
                dropunit();
        }
        wunlock(&devs);
}

/*
 * take device d out of service: mark it down, drop all its
 * devlinks, fail every frame with Eaoedown, unlink it from the
 * device list and free it, all under the devs write lock.
 */
static void
removeaoedev(Aoedev *d)
{
        int i;
        Aoedev *prev;

        wlock(&devs);

        /* find d's predecessor; stays 0 when d is the head */
        prev = 0;
        if(d != devs.d){
                prev = devs.d;
                while(prev && prev->next != d)
                        prev = prev->next;
        }

        qlock(d);
        d->flag &= ~Dup;
        /*
         * the version number is deliberately left alone here
         * (no newvers(d)).  changing it is, strictly speaking,
         * correct, but it would mean that deleting a LUN that is
         * not in use invalidates all other LUNs too: a file server
         * with venti arenas or fossil file systems on 1.0 would
         * lose access to 1.0 after 1.1 is deleted, and eventually
         * panic.  note that newdev() does not change the version
         * number either.
         */
        d->ndl = 0;
        qunlock(d);

        for(i = 0; i < d->nframes; i++)
                frameerror(d, &d->frames[i], Eaoedown);

        if(prev)
                prev->next = d->next;
        else
                devs.d = d->next;
        free(d->frames);
        free(d);
        dropunit();
        wunlock(&devs);
}

/*
 * remove the device whose unit name is name; raises "device
 * not bound" if there is none.  the matching device itself is
 * handed to removeaoedev, which finds the predecessor on its
 * own; passing the predecessor would remove the wrong device
 * for every entry but the first.
 *
 * the devs lock is dropped before removeaoedev takes it again.
 */
static void
removedev(char *name)
{
        Aoedev *d;

        wlock(&devs);
        for(d = devs.d; d; d = d->next)
                if(strcmp(name, unitname(d)) == 0) {
                        wunlock(&devs);
                        removeaoedev(d);
                        return;
                }
        wunlock(&devs);
        error("device not bound");
}

/*
 * Parse a discovery target of the form "shelf" or "shelf.slot" and
 * call discover() for it.  Numbers are read with strtol base 0, so
 * decimal, octal and hex are accepted.
 *
 * f == 0 requests discover(0xffff, 0xff) (presumably the AoE
 * broadcast shelf and slot -- confirm against the protocol spec).
 * A missing ".slot" part selects slot 0xff.
 *
 * Raises "bad shelf" if the shelf is missing or exceeds 0xffff and
 * "bad slot" if the slot is missing or exceeds 0xff.
 */
static void
discoverstr(char *f)
{
        ushort shelf, slot;
        ulong sh, sl;
        char *s;

        if(f == 0){
                discover(0xffff, 0xff);
                return;
        }

        shelf = sh = strtol(f, &s, 0);
        if(s == f || sh > 0xffff)
                error("bad shelf");
        f = s;
        if(*f++ == '.'){
                /*
                 * range-check the full-width value: testing the
                 * already truncated ushort would let out-of-range
                 * input such as 0x10005 or a negative number slip
                 * through as a small slot.
                 */
                slot = sl = strtol(f, &s, 0);
                if(s == f || sl > 0xff)
                        error("bad slot");
        }else
                slot = 0xff;
        discover(shelf, slot);
}


/*
 * Remove handler for the device table.  Only a unit directory can
 * be removed, which destroys the device for that unit; any other
 * file type is refused with Eperm.
 */
static void
aoeremove(Chan *c)
{
        if(TYPE(c->qid) != Qunitdir)
                error(Eperm);
        removeaoedev(unit2dev(UNIT(c->qid)));
}

/*
 * Handle a control message written to the top-level ctl file.
 * db holds n bytes of message text; it is parsed, looked up in the
 * command table and dispatched with its first argument.
 * Returns n.  If any step raises an error, the parsed command
 * buffer is freed and the error is passed on with nexterror().
 */
static long
topctlwrite(void *db, long n)
{
        enum {
                Autodiscover,
                Bind,
                Debug,
                Discover,
                Rediscover,
                Remove,
                Unbind,
        };
        char *arg;
        Cmdbuf *cmd;
        Cmdtab *ent;
        /*
         * the third field is presumably the required field count,
         * with 0 meaning "any" -- confirm against Cmdtab.
         */
        static Cmdtab cmds[] = {
                { Autodiscover, "autodiscover", 0 },
                { Bind,         "bind",         2 },
                { Debug,        "debug",        0 },
                { Discover,     "discover",     0 },
                { Rediscover,   "rediscover",   0 },
                { Remove,       "remove",       2 },
                { Unbind,       "unbind",       2 },
        };

        cmd = parsecmd(db, n);
        if(waserror()){
                free(cmd);
                nexterror();
        }
        ent = lookupcmd(cmd, cmds, nelem(cmds));
        arg = cmd->f[1];

        switch(ent->index){
        /* on/off settings */
        case Autodiscover:
                autodiscover = toggle(arg, autodiscover, 1);
                break;
        case Rediscover:
                rediscover = toggle(arg, rediscover, 1);
                break;
        case Debug:
                debug = toggle(arg, debug, 1);
                break;

        /* network interfaces */
        case Bind:
                netbind(arg);
                break;
        case Unbind:
                netunbind(arg);
                break;

        /* devices */
        case Discover:
                discoverstr(arg);
                break;
        case Remove:
                removedev(arg);
                break;

        default:
                cmderror(cmd, "unknown aoe control message");
        }

        poperror();
        free(cmd);
        return n;
}

/*
 * Write handler for the device table.  Writes to the top-level ctl
 * file go to topctlwrite(); writes to a unit's ctl, data, config
 * and ident files go to unitwrite().  Everything else (directories,
 * the log, unknown types) is refused with Eperm.
 */
static long
aoewrite(Chan *c, void *db, long n, vlong off)
{
        switch(TYPE(c->qid)){
        case Qtopctl:
                return topctlwrite(db, n);
        case Qctl:
        case Qdata:
        case Qconfig:
        case Qident:
                return unitwrite(c, db, n, off);
        default:
                error(Eperm);
        }
        return -1;      /* not reached: error() does not return */
}

/*
 * Device-table entry for the AoE driver: device character 'æ',
 * name "aoe".  The aoe* entries name this driver's own handlers
 * (aoewrite and aoeremove are defined just above); the dev* entries
 * are presumably the kernel's generic default routines -- confirm
 * the slot order against the Dev declaration.
 */
Dev aoedevtab = {
        L'æ',
        "aoe",

        devreset,
        devinit,
        devshutdown,
        aoeattach,
        aoewalk,
        aoestat,
        aoeopen,
        devcreate,
        aoeclose,
        aoeread,
        devbread,
        aoewrite,
        devbwrite,
        aoeremove,
        devwstat,
        devpower,
        devconfig,
};