Subversion Repositories planix.SVN

Rev

Blame | Last modification | View Log | RSS feed

typedef struct Config           Config;
typedef struct AMap             AMap;
typedef struct AMapN            AMapN;
typedef struct Arena            Arena;
typedef struct AState   AState;
typedef struct ArenaCIG ArenaCIG;
typedef struct ArenaHead        ArenaHead;
typedef struct ArenaPart        ArenaPart;
typedef struct ArenaTail        ArenaTail;
typedef struct ATailStats       ATailStats;
typedef struct CIBlock          CIBlock;
typedef struct Clump            Clump;
typedef struct ClumpInfo        ClumpInfo;
typedef struct Graph Graph;
typedef struct IAddr            IAddr;
typedef struct IBucket          IBucket;
typedef struct IEStream         IEStream;
typedef struct IEntry           IEntry;
typedef struct IFile            IFile;
typedef struct ISect            ISect;
typedef struct Index            Index;
typedef struct Lump             Lump;
typedef struct DBlock           DBlock;
typedef struct Part             Part;
typedef struct Statbin Statbin;
typedef struct Statdesc Statdesc;
typedef struct Stats            Stats;
typedef struct ZBlock           ZBlock;
typedef struct Round    Round;
typedef struct Bloom    Bloom;

#pragma incomplete IEStream

#define TWID32  ((u32int)~(u32int)0)
#define TWID64  ((u64int)~(u64int)0)
#define TWID8   ((u8int)~(u8int)0)

enum
{
        ABlockLog               = 9,            /* log2(512), the quantum for reading arenas */
        ANameSize               = 64,
        MaxDiskBlock            = 64*1024,      /* max. allowed size for a disk block */
        MaxIoSize               = 64*1024,      /* max. allowed size for a disk io operation */
        PartBlank               = 256*1024,     /* untouched section at beginning of partition */
        HeadSize                = 512,          /* size of a header after PartBlank */
        MinArenaSize            = 1*1024*1024,  /* smallest reasonable arena size */
        IndexBase               = 1024*1024,    /* initial address to use in an index */
        MaxIo                   = 64*1024,      /* max size of a single read or write operation */
        ICacheBits              = 16,           /* default bits for indexing icache */
        MaxAMap                 = 31*1024,      /* max. allowed arenas in an address mapping; must be < 32*1024 */
        Unspecified             = TWID32,

        /*
         * return codes from syncarena
         */
        SyncDataErr     = 1 << 0,               /* problem reading the clump data */
        SyncCIErr       = 1 << 1,               /* found erroneous clump directory entries */
        SyncCIZero      = 1 << 2,               /* found unwritten clump directory entries */
        SyncFixErr      = 1 << 3,               /* error writing fixed data */
        SyncHeader      = 1 << 4,               /* altered header fields */

        /*
         * error severity
         */
        EOk                     = 0,            /* error expected in normal operation */
        EStrange,                               /* strange error that should be logged */
        ECorrupt,                               /* corrupted data found in arenas */
        EICorrupt,                              /* corrupted data found in index */
        EAdmin,                                 /* should be brought to administrators' attention */
        ECrash,                                 /* really bad internal error */
        EBug,                                   /* a limitation which should be fixed */
        EInconsist,                             /* inconsistencies between index and arena */
        EMax,

        /*
         * internal disk formats for the venti archival storage system
         */
        /*
         * magic numbers on disk
         */
        _ClumpMagic             = 0xd15cb10cU,  /* clump header, deprecated */
        ClumpFreeMagic          = 0,            /* free clump; terminates active clump log */

        ArenaPartMagic          = 0xa9e4a5e7U,  /* arena partition header */
        ArenaMagic              = 0xf2a14eadU,  /* arena trailer */
        ArenaHeadMagic          = 0xd15c4eadU,  /* arena header */
        
        BloomMagic              = 0xb1004eadU,  /* bloom filter header */
        BloomMaxHash    = 32,

        ISectMagic              = 0xd15c5ec7U,  /* index header */

        ArenaPartVersion        = 3,
        ArenaVersion4           = 4,
        ArenaVersion5           = 5,
        BloomVersion            = 1,
        IndexVersion            = 1,
        ISectVersion1           = 1,
        ISectVersion2           = 2,

        /*
         * encodings of clumps on disk
         */
        ClumpEErr               = 0,            /* can't happen */
        ClumpENone,                             /* plain */
        ClumpECompress,                         /* compressed */
        ClumpEMax,

        /*
         * sizes in bytes on disk
         */
        U8Size                  = 1,
        U16Size                 = 2,
        U32Size                 = 4,
        U64Size                 = 8,

        ArenaPartSize           = 4 * U32Size,
        ArenaSize4              = 2 * U64Size + 6 * U32Size + ANameSize + U8Size,
        ArenaSize5                      = ArenaSize4 + U32Size,
        ArenaSize5a             = ArenaSize5 + 2 * U8Size + 2 * U32Size + 2 * U64Size,
        ArenaHeadSize4          = U64Size + 3 * U32Size + ANameSize,
        ArenaHeadSize5          = ArenaHeadSize4 + U32Size,
        BloomHeadSize   = 4 * U32Size,
        ISectSize1              = 7 * U32Size + 2 * ANameSize,
        ISectSize2              = ISectSize1 + U32Size,
        ClumpInfoSize           = U8Size + 2 * U16Size + VtScoreSize,
        ClumpSize               = ClumpInfoSize + U8Size + 3 * U32Size,
        MaxBloomSize            = 1<<(32-3),    /* 2^32 bits */
        MaxBloomHash    = 32,           /* bits per score */
        /*
         * BUG - The various block copies that manipulate entry buckets
         * would be faster if we bumped IBucketSize up to 8 and IEntrySize up to 40,
         * so that everything is word-aligned.  Buildindex is actually cpu-bound
         * by the (byte at a time) copying in qsort.
         */
        IBucketSize             = U32Size + U16Size,
        IEntrySize              = U64Size + U32Size + 2*U16Size + 2*U8Size + VtScoreSize,
        IEntryTypeOff           = VtScoreSize + U32Size + U16Size + U64Size + U16Size,
        IEntryAddrOff           = VtScoreSize + U32Size + U16Size,

        MaxClumpBlocks          =  (VtMaxLumpSize + ClumpSize + (1 << ABlockLog) - 1) >> ABlockLog,
        
        IcacheFrac              = 1000000,      /* denominator */

        SleepForever            = 1000000000,   /* magic value for sleep time */
        /*
         * dirty flags - order controls disk write order
         */
        DirtyArena              = 1,
        DirtyArenaCib,
        DirtyArenaTrailer,
        DirtyMax,
        
        ArenaCIGSize = 10*1024, // about 0.5 MB worth of IEntry.

        VentiZZZZZZZZ
};

extern char TraceDisk[];
extern char TraceLump[];
extern char TraceBlock[];
extern char TraceProc[];
extern char TraceWork[];
extern char TraceQuiet[];
extern char TraceRpc[];

/*
 * results of parsing and initializing a config file
 */
struct Config
{
        char            *index;                 /* name of the index to initialize */
        int             naparts;                /* arena partitions initialized */
        ArenaPart       **aparts;
        int             nsects;                 /* index sections initialized */
        ISect           **sects;
        Bloom   *bloom;         /* bloom filter */
        u32int  bcmem;
        u32int  mem;
        u32int  icmem;
        int             queuewrites;
        char*   haddr;
        char*   vaddr;
        char*   webroot;
};

/*
 * a Part is the low level interface to files or disks.
 * there are two main types of partitions
 *      arena paritions, which some number of arenas, each in a sub-partition.
 *      index partition, which only have one subpartition.
 */
struct Part
{
        int             fd;                     /* rock for accessing the disk */
        int             mode;
        u64int          offset;
        u64int          size;                   /* size of the partiton */
        u32int          blocksize;              /* block size for reads and writes */
        u32int          fsblocksize;    /* minimum file system block size */
        char            *name;
        char            *filename;
        Channel         *writechan;             /* chan[dcache.nblock](DBlock*) */
};

/*
 * a cached block from the partition
 * yuck -- most of this is internal structure for the cache
 * all other routines should only use data
 */
struct DBlock
{
        u8int   *data;

        Part    *part;                  /* partition in which cached */
        u64int  addr;                   /* base address on the partition */
        u32int  size;                   /* amount of data available, not amount allocated; should go away */
        u32int  mode;
        u32int  dirty;
        u32int  dirtying;
        DBlock  *next;                  /* doubly linked hash chains */
        DBlock  *prev;
        u32int  heap;                   /* index in heap table */
        u32int  used;                   /* last reference times */
        u32int  used2;
        u32int  ref;                    /* reference count */
        RWLock  lock;                   /* for access to data only */
        Channel *writedonechan; 
        void*   chanbuf[1];             /* buffer for the chan! */
};

/*
 * a cached block from the partition
 * yuck -- most of this is internal structure for the cache
 * all other routines should only use data
 * double yuck -- this is mostly the same as a DBlock
 */
struct Lump
{
        Packet  *data;

        Part    *part;                  /* partition in which cached */
        u8int   score[VtScoreSize];     /* score of packet */
        u8int   type;                   /* type of packet */
        u32int  size;                   /* amount of data allocated to hold packet */
        Lump    *next;                  /* doubly linked hash chains */
        Lump    *prev;
        u32int  heap;                   /* index in heap table */
        u32int  used;                   /* last reference times */
        u32int  used2;
        u32int  ref;                    /* reference count */
        QLock   lock;                   /* for access to data only */
};

/*
 * mapping between names and address ranges
 */
struct AMap
{
        u64int          start;
        u64int          stop;
        char            name[ANameSize];
};

/*
 * an AMap along with a length
 */
struct AMapN
{
        int             n;
        AMap            *map;
};

/*
 * an ArenaPart is a partition made up of Arenas
 * it exists because most os's don't support many partitions,
 * and we want to have many different Arenas
 */
struct ArenaPart
{
        Part            *part;
        u64int          size;                   /* size of underlying partition, rounded down to blocks */
        Arena           **arenas;
        u32int          tabbase;                /* base address of arena table on disk */
        u32int          tabsize;                /* max. bytes in arena table */

        /*
         * fields stored on disk
         */
        u32int          version;
        u32int          blocksize;              /* "optimal" block size for reads and writes */
        u32int          arenabase;              /* base address of first arena */

        /*
         * stored in the arena mapping table on disk
         */
        AMap            *map;
        int             narenas;
};

/*
 * info about one block in the clump info cache
 */
struct CIBlock
{
        u32int          block;                  /* blocks in the directory */
        int             offset;                 /* offsets of one clump in the data */
        DBlock          *data;
};

/*
 * Statistics kept in the tail. 
 */
struct ATailStats
{
        u32int          clumps;         /* number of clumps */
        u32int          cclumps;                /* number of compressed clumps */
        u64int          used;
        u64int          uncsize;
        u8int           sealed;
};

/*
 * Arena state - represents a point in the data log
 */
struct AState
{
        Arena           *arena;
        u64int          aa;                     /* index address */
        ATailStats              stats;
};

/*
 * an Arena is a log of Clumps, preceeded by an ArenaHeader,
 * and followed by a Arena, each in one disk block.
 * struct on disk is not always up to date, but should be self-consistent.
 * to sync after reboot, follow clumps starting at used until ClumpFreeMagic if found.
 * <struct name="Arena" type="Arena *">
 *      <field name="name" val="s->name" type="AName"/>
 *      <field name="version" val="s->version" type="U32int"/>
 *      <field name="partition" val="s->part->name" type="AName"/>
 *      <field name="blocksize" val="s->blocksize" type="U32int"/>
 *      <field name="start" val="s->base" type="U64int"/>
 *      <field name="stop" val="s->base+2*s->blocksize" type="U64int"/>
 *      <field name="created" val="s->ctime" type="U32int"/>
 *      <field name="modified" val="s->wtime" type="U32int"/>
 *      <field name="sealed" val="s->sealed" type="Sealed"/>
 *      <field name="score" val="s->score" type="Score"/>
 *      <field name="clumps" val="s->clumps" type="U32int"/>
 *      <field name="compressedclumps" val="s->cclumps" type="U32int"/>
 *      <field name="data" val="s->uncsize" type="U64int"/>
 *      <field name="compresseddata" val="s->used - s->clumps * ClumpSize" type="U64int"/>
 *      <field name="storage" val="s->used + s->clumps * ClumpInfoSize" type="U64int"/>
 * </struct>
 */
struct Arena
{
        QLock           lock;                   /* lock for arena fields, writing to disk */
        Part            *part;                  /* partition in which arena lives */
        int             blocksize;              /* size of block to read or write */
        u64int          base;                   /* base address on disk */
        u64int          size;                   /* total space in the arena */
        u8int           score[VtScoreSize];     /* score of the entire sealed & summed arena */

        int             clumpmax;               /* ClumpInfos per block */
        AState          mem;
        int             inqueue;

        /*
         * fields stored on disk
         */
        u32int          version;
        char            name[ANameSize];        /* text label */
        ATailStats              memstats;
        ATailStats              diskstats;
        u32int          ctime;                  /* first time a block was written */
        u32int          wtime;                  /* last time a block was written */
        u32int          clumpmagic;
        
        ArenaCIG        *cig;
        int     ncig;
};

struct ArenaCIG
{
        u64int  offset;  // from arena base
};

/*
 * redundant storage of some fields at the beginning of each arena
 */
struct ArenaHead
{
        u32int          version;
        char            name[ANameSize];
        u32int          blocksize;
        u64int          size;
        u32int          clumpmagic;
};

/*
 * most interesting meta information for a clump.
 * stored in each clump's header and in the Arena's directory,
 * stored in reverse order just prior to the arena trailer
 */
struct ClumpInfo
{
        u8int           type;
        u16int          size;                   /* size of disk data, not including header */
        u16int          uncsize;                /* size of uncompressed data */
        u8int           score[VtScoreSize];     /* score of the uncompressed data only */
};

/*
 * header for an immutable clump of data
 */
struct Clump
{
        ClumpInfo       info;
        u8int           encoding;
        u32int          creator;                /* initial client which wrote the block */
        u32int          time;                   /* creation at gmt seconds since 1/1/1970 */
};

/*
 * index of all clumps according to their score
 * this is just a wrapper to tie together the index sections
 * <struct name="Index" type="Index *">
 *      <field name="name" val="s->name" type="AName"/>
 *      <field name="version" val="s->version" type="U32int"/>
 *      <field name="blocksize" val="s->blocksize" type="U32int"/>
 *      <field name="tabsize" val="s->tabsize" type="U32int"/>
 *      <field name="buckets" val="s->buckets" type="U32int"/>
 *      <field name="buckdiv" val="s->div" type="U32int"/>
 *      <field name="bitblocks" val="s->div" type="U32int"/>
 *      <field name="maxdepth" val="s->div" type="U32int"/>
 *      <field name="bitkeylog" val="s->div" type="U32int"/>
 *      <field name="bitkeymask" val="s->div" type="U32int"/>
 *      <array name="sect" val="&s->smap[i]" elems="s->nsects" type="Amap"/>
 *      <array name="amap" val="&s->amap[i]" elems="s->narenas" type="Amap"/>
 *      <array name="arena" val="s->arenas[i]" elems="s->narenas" type="Arena"/>
 * </struct>
 * <struct name="Amap" type="AMap *">
 *      <field name="name" val="s->name" type="AName"/>
 *      <field name="start" val="s->start" type="U64int"/>
 *      <field name="stop" val="s->stop" type="U64int"/>
 * </struct>
 */
struct Index
{
        u32int          div;                    /* divisor for mapping score to bucket */
        u32int          buckets;                /* last bucket used in disk hash table */
        u32int          blocksize;
        u32int          tabsize;                /* max. bytes in index config */

        int             mapalloc;               /* first arena to check when adding a lump */
        Arena           **arenas;               /* arenas in the mapping */
        ISect           **sects;                /* sections which hold the buckets */
        Bloom           *bloom; /* bloom filter */

        /*
         * fields stored in config file 
         */
        u32int          version;
        char            name[ANameSize];        /* text label */
        int             nsects;
        AMap            *smap;                  /* mapping of buckets to index sections */
        int             narenas;
        AMap            *amap;                  /* mapping from index addesses to arenas */
        
        QLock   writing;
};

/*
 * one part of the bucket storage for an index.
 * the index blocks are sequentially allocated
 * across all of the sections.
 */
struct ISect
{
        Part            *part;
        int             blocklog;               /* log2(blocksize) */
        int             buckmax;                /* max. entries in a index bucket */
        u32int          tabbase;                /* base address of index config table on disk */
        u32int          tabsize;                /* max. bytes in index config */
        Channel *writechan;
        Channel *writedonechan;
        void            *ig;            /* used by buildindex only */
        int             ng;

        /*
         * fields stored on disk
         */
        u32int          version;
        u32int          bucketmagic;
        char            name[ANameSize];        /* text label */
        char            index[ANameSize];       /* index owning the section */
        u32int          blocksize;              /* size of hash buckets in index */
        u32int          blockbase;              /* address of start of on disk index table */
        u32int          blocks;                 /* total blocks on disk; some may be unused */
        u32int          start;                  /* first bucket in this section */
        u32int          stop;                   /* limit of buckets in this section */
};

/*
 * externally interesting part of an IEntry
 */
struct IAddr
{
        u64int          addr;
        u16int          size;                   /* uncompressed size */
        u8int           type;                   /* type of block */
        u8int           blocks;                 /* arena io quanta for Clump + data */
};

/*
 * entries in the index
 * kept in IBuckets in the disk index table,
 * cached in the memory ICache.
 */
struct IEntry
{
        /* on disk data - 32 bytes*/
        u8int   score[VtScoreSize];
        IAddr   ia;
        
        IEntry  *nexthash;
        IEntry  *nextdirty;
        IEntry  *next;
        IEntry  *prev;
        u8int   state;
};
enum {
        IEClean = 0,
        IEDirty = 1,
        IESummary = 2,
};

/*
 * buckets in the on disk index table
 */
struct IBucket
{
        u16int          n;                      /* number of active indices */
        u32int          buck;           /* used by buildindex/checkindex only */
        u8int           *data;
};

/*
 * temporary buffers used by individual threads
 */
struct ZBlock
{
        u32int          len;
        u32int          _size;
        u8int           *data;
        u8int           *free;
};

/*
 * simple input buffer for a '\0' terminated text file
 */
struct IFile
{
        char            *name;                          /* name of the file */
        ZBlock          *b;                             /* entire contents of file */
        u32int          pos;                            /* current position in the file */
};

struct Statdesc
{
        char *name;
        ulong max;
};

/* keep in sync with stats.c:/statdesc and httpd.c:/graphname*/
enum
{
        StatRpcTotal,
        StatRpcRead,
        StatRpcReadOk,
        StatRpcReadFail,
        StatRpcReadBytes,
        StatRpcReadTime,
        StatRpcReadCached,
        StatRpcReadCachedTime,
        StatRpcReadUncached,
        StatRpcReadUncachedTime,
        StatRpcWrite,
        StatRpcWriteNew,
        StatRpcWriteOld,
        StatRpcWriteFail,
        StatRpcWriteBytes,
        StatRpcWriteTime,
        StatRpcWriteNewTime,
        StatRpcWriteOldTime,

        StatLcacheHit,
        StatLcacheMiss,
        StatLcacheRead,
        StatLcacheWrite,
        StatLcacheSize,
        StatLcacheStall,
        StatLcacheReadTime,

        StatDcacheHit,
        StatDcacheMiss,
        StatDcacheLookup,
        StatDcacheRead,
        StatDcacheWrite,
        StatDcacheDirty,
        StatDcacheSize,
        StatDcacheFlush,
        StatDcacheStall,
        StatDcacheLookupTime,

        StatDblockStall,
        StatLumpStall,

        StatIcacheHit,
        StatIcacheMiss,
        StatIcacheRead,
        StatIcacheWrite,
        StatIcacheFill,
        StatIcachePrefetch,
        StatIcacheDirty,
        StatIcacheSize,
        StatIcacheFlush,
        StatIcacheStall,
        StatIcacheReadTime,
        StatIcacheLookup,
        StatScacheHit,
        StatScachePrefetch,

        StatBloomHit,
        StatBloomMiss,
        StatBloomFalseMiss,
        StatBloomLookup,
        StatBloomOnes,
        StatBloomBits,

        StatApartRead,
        StatApartReadBytes,
        StatApartWrite,
        StatApartWriteBytes,

        StatIsectRead,
        StatIsectReadBytes,
        StatIsectWrite,
        StatIsectWriteBytes,

        StatSumRead,
        StatSumReadBytes,
        
        StatCigLoad,
        StatCigLoadTime,

        NStat
};

extern Statdesc statdesc[NStat];

/*
 * statistics about the operation of the server
 * mainly for performance monitoring and profiling.
 */
struct Stats
{
        ulong           now;
        ulong           n[NStat];
};

struct Statbin
{
        uint nsamp;
        uint min;
        uint max;
        uint avg;
};

struct Graph
{
        long (*fn)(Stats*, Stats*, void*);
        void *arg;
        long t0;
        long t1;
        long min;
        long max;
        long wid;
        long ht;
        int fill;
};

/*
 * for kicking background processes that run one round after another after another
 */
struct Round
{
        QLock   lock;
        Rendez  start;
        Rendez  finish;
        Rendez  delaywait;
        int             delaytime;
        int             delaykick;
        char*   name;
        int             last;
        int             current;
        int             next;
        int             doanother;
};

/*
 * Bloom filter of stored block hashes
 */
struct Bloom
{
        RWLock lk;              /* protects nhash, nbits, tab, mb */
        QLock mod;              /* one marker at a time, protects nb */
        int nhash;
        ulong size;             /* bytes in tab */
        ulong bitmask;          /* to produce bit index */
        u8int *data;
        Part *part;
        Channel *writechan;
        Channel *writedonechan;
};

extern  Index           *mainindex;
extern  u32int          maxblocksize;           /* max. block size used by any partition */
extern  int             paranoid;               /* should verify hashes on disk read */
extern  int             queuewrites;            /* put all lump writes on a queue and finish later */
extern  int             readonly;               /* only allowed to read the disk data */
extern  Stats           stats;
extern  u8int           zeroscore[VtScoreSize];
extern  int             compressblocks;
extern  int             writestodevnull;        /* dangerous - for performance debugging */
extern  int             collectstats;
extern  QLock   memdrawlock;
extern  int             icachesleeptime;
extern  int             minicachesleeptime;
extern  int             arenasumsleeptime;
extern  int             manualscheduling;
extern  int             l0quantum;
extern  int             l1quantum;
extern  int             ignorebloom;
extern  int             icacheprefetch;
extern  int             syncwrites;
extern  int             debugarena; /* print in arena error msgs; -1==unknown */

extern  Stats   *stathist;
extern  int     nstathist;
extern  ulong   stattime;

#ifndef PLAN9PORT
#pragma varargck type "V" uchar*
#define ODIRECT 0
#endif