#include "stdinc.h"
#include "dat.h"
#include "fns.h"
#include "error.h"

#include "9.h"	/* for cacheFlush */

typedef struct FreeList FreeList;
typedef struct BAddr BAddr;

enum {
	BadHeap = ~0,
};

/*
 * Store data to the memory cache in c->size blocks
 * with the block zero extended to fill it out.  When writing to
 * Venti, the block will be zero truncated.  The walker will also check
 * that the block fits within psize or dsize as the case may be.
 */

struct Cache
{
	VtLock	*lk;
	int 	ref;
	int	mode;

	Disk 	*disk;
	int	size;			/* block size */
	int	ndmap;		/* size of per-block dirty pointer map used in blockWrite */
	VtSession *z;
	u32int	now;			/* ticks for usage timestamps */
	Block	**heads;		/* hash table for finding address */
	int	nheap;			/* number of available victims */
	Block	**heap;			/* heap for locating victims */
	long	nblocks;		/* number of blocks allocated */
	Block	*blocks;		/* array of block descriptors */
	u8int	*mem;			/* memory for all block data & blists */

	BList	*blfree;
	VtRendez *blrend;

	int 	ndirty;			/* number of dirty blocks in the cache */
	int 	maxdirty;		/* max number of dirty blocks */
	u32int	vers;

	long hashSize;

	FreeList *fl;

	VtRendez *die;			/* daemon threads should die when != nil */

	VtRendez *flush;
	VtRendez *flushwait;
	VtRendez *heapwait;
	BAddr *baddr;
	int bw, br, be;
	int nflush;

	Periodic *sync;

	/* unlink daemon */
	BList *uhead;
	BList *utail;
	VtRendez *unlink;

	/* block counts */
	int nused;
	int ndisk;
};
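
/*
 * Locking, as practiced below: c->lk guards the hash chains, the LRU
 * heap, reference counts, and the dirty/BList accounting; each Block
 * additionally carries its own lk protecting its data and iostate.
 */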

struct BList {
	int part;
	u32int addr;
	uchar type;
	u32int tag;
	u32int epoch;
	u32int vers;

	int recurse;	/* for block unlink */

	/* for roll back */
	int index;			/* -1 indicates not valid */
	union {
		uchar score[VtScoreSize];
		uchar entry[VtEntrySize];
	} old;
	BList *next;
};

struct BAddr {
	int part;
	u32int addr;
	u32int vers;
};

struct FreeList {
	VtLock *lk;
	u32int last;		/* last block allocated */
	u32int end;		/* end of data partition */
	u32int nused;		/* number of used blocks */
	u32int epochLow;	/* low epoch when last updated nused */
};

static FreeList *flAlloc(u32int end);
static void flFree(FreeList *fl);

static Block *cacheBumpBlock(Cache *c);
static void heapDel(Block*);
static void heapIns(Block*);
static void cacheCheck(Cache*);
static void unlinkThread(void *a);
static void flushThread(void *a);
static void unlinkBody(Cache *c);
static int cacheFlushBlock(Cache *c);
static void cacheSync(void*);
static BList *blistAlloc(Block*);
static void blistFree(Cache*, BList*);
static void doRemoveLink(Cache*, BList*);

/*
 * Mapping from local block type to Venti type
 */
int vtType[BtMax] = {
	VtDataType,		/* BtData | 0  */
	VtPointerType0,		/* BtData | 1  */
	VtPointerType1,		/* BtData | 2  */
	VtPointerType2,		/* BtData | 3  */
	VtPointerType3,		/* BtData | 4  */
	VtPointerType4,		/* BtData | 5  */
	VtPointerType5,		/* BtData | 6  */
	VtPointerType6,		/* BtData | 7  */
	VtDirType,		/* BtDir | 0  */
	VtPointerType0,		/* BtDir | 1  */
	VtPointerType1,		/* BtDir | 2  */
	VtPointerType2,		/* BtDir | 3  */
	VtPointerType3,		/* BtDir | 4  */
	VtPointerType4,		/* BtDir | 5  */
	VtPointerType5,		/* BtDir | 6  */
	VtPointerType6,		/* BtDir | 7  */
};

/*
 * Allocate the memory cache.
 */
Cache *
cacheAlloc(Disk *disk, VtSession *z, ulong nblocks, int mode)
{
	int i;
	Cache *c;
	Block *b;
	BList *bl;
	u8int *p;
	int nbl;

	c = vtMemAllocZ(sizeof(Cache));

	/* reasonable number of BList elements */
	nbl = nblocks * 4;

	c->lk = vtLockAlloc();
	c->ref = 1;
	c->disk = disk;
	c->z = z;
	c->size = diskBlockSize(disk);
bwatchSetBlockSize(c->size);
	/* round c->size up to be a nice multiple */
	c->size = (c->size + 127) & ~127;
	c->ndmap = (c->size/20 + 7) / 8;
	c->nblocks = nblocks;
	c->hashSize = nblocks;
	c->heads = vtMemAllocZ(c->hashSize*sizeof(Block*));
	c->heap = vtMemAllocZ(nblocks*sizeof(Block*));
	c->blocks = vtMemAllocZ(nblocks*sizeof(Block));
	c->mem = vtMemAllocZ(nblocks * (c->size + c->ndmap) + nbl * sizeof(BList));
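	/*
	 * Layout of the arena just allocated, as carved up by the loops
	 * below: nblocks data buffers of c->size bytes each, then nbl
	 * BList records, then nblocks dirty maps of c->ndmap bytes each.
	 */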
	c->baddr = vtMemAllocZ(nblocks * sizeof(BAddr));
	c->mode = mode;
	c->vers++;
	p = c->mem;
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->lk = vtLockAlloc();
		b->c = c;
		b->data = p;
		b->heap = i;
		b->ioready = vtRendezAlloc(b->lk);
		c->heap[i] = b;
		p += c->size;
	}
	c->nheap = nblocks;
	for(i = 0; i < nbl; i++){
		bl = (BList*)p;
		bl->next = c->blfree;
		c->blfree = bl;
		p += sizeof(BList);
	}
	/* separate loop to keep blocks and blists reasonably aligned */
	for(i = 0; i < nblocks; i++){
		b = &c->blocks[i];
		b->dmap = p;
		p += c->ndmap;
	}

	c->blrend = vtRendezAlloc(c->lk);

	c->maxdirty = nblocks*(DirtyPercentage*0.01);

	c->fl = flAlloc(diskSize(disk, PartData));

	c->unlink = vtRendezAlloc(c->lk);
	c->flush = vtRendezAlloc(c->lk);
	c->flushwait = vtRendezAlloc(c->lk);
	c->heapwait = vtRendezAlloc(c->lk);
	c->sync = periodicAlloc(cacheSync, c, 30*1000);

	if(mode == OReadWrite){
		c->ref += 2;
		vtThread(unlinkThread, c);
		vtThread(flushThread, c);
	}
	cacheCheck(c);

	return c;
}

/*
 * Free the whole memory cache, flushing all dirty blocks to the disk.
 */
void
cacheFree(Cache *c)
{
	int i;

	/* kill off daemon threads */
	vtLock(c->lk);
	c->die = vtRendezAlloc(c->lk);
	periodicKill(c->sync);
	vtWakeup(c->flush);
	vtWakeup(c->unlink);
	while(c->ref > 1)
		vtSleep(c->die);

	/* flush everything out */
	do {
		unlinkBody(c);
		vtUnlock(c->lk);
		while(cacheFlushBlock(c))
			;
		diskFlush(c->disk);
		vtLock(c->lk);
	} while(c->uhead || c->ndirty);
	vtUnlock(c->lk);

	cacheCheck(c);

	for(i = 0; i < c->nblocks; i++){
		assert(c->blocks[i].ref == 0);
		vtRendezFree(c->blocks[i].ioready);
		vtLockFree(c->blocks[i].lk);
	}
	flFree(c->fl);
	vtMemFree(c->baddr);
	vtMemFree(c->heads);
	vtMemFree(c->blocks);
	vtMemFree(c->mem);
	vtLockFree(c->lk);
	diskFree(c->disk);
	vtRendezFree(c->blrend);
	/* don't close vtSession */
	vtMemFree(c);
}

static void
cacheDump(Cache *c)
{
	int i;
	Block *b;

	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		fprint(2, "%d. p=%d a=%ud %V t=%d ref=%d state=%s io=%s pc=%#p\n",
			i, b->part, b->addr, b->score, b->l.type, b->ref,
			bsStr(b->l.state), bioStr(b->iostate), b->pc);
	}
}

static void
cacheCheck(Cache *c)
{
	u32int size, now;
	int i, k, refed;
	static uchar zero[VtScoreSize];
	Block *b;

	size = c->size;
	now = c->now;

	for(i = 0; i < c->nheap; i++){
		if(c->heap[i]->heap != i)
			vtFatal("mis-heaped at %d: %d", i, c->heap[i]->heap);
		if(i > 0 && c->heap[(i - 1) >> 1]->used - now > c->heap[i]->used - now)
			vtFatal("bad heap ordering");
		k = (i << 1) + 1;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
		k++;
		if(k < c->nheap && c->heap[i]->used - now > c->heap[k]->used - now)
			vtFatal("bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->data != &c->mem[i * size])
			vtFatal("mis-blocked at %d", i);
		if(b->ref && b->heap == BadHeap){
			refed++;
		}
	}
if(c->nheap + refed != c->nblocks){
fprint(2, "%s: cacheCheck: nheap %d refed %d nblocks %ld\n", argv0, c->nheap, refed, c->nblocks);
cacheDump(c);
}
	assert(c->nheap + refed == c->nblocks);
	refed = 0;
	for(i = 0; i < c->nblocks; i++){
		b = &c->blocks[i];
		if(b->ref){
if(1)fprint(2, "%s: p=%d a=%ud %V ref=%d %L\n", argv0, b->part, b->addr, b->score, b->ref, &b->l);
			refed++;
		}
	}
if(refed > 0)fprint(2, "%s: cacheCheck: in used %d\n", argv0, refed);
}


/*
 * locate the block with the oldest second to last use.
 * remove it from the heap, and fix up the heap.
 */
/* called with c->lk held */
static Block *
cacheBumpBlock(Cache *c)
{
	int printed;
	Block *b;

	/*
	 * locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	printed = 0;
	if(c->nheap == 0){
		while(c->nheap == 0){
			vtWakeup(c->flush);
			vtSleep(c->heapwait);
			if(c->nheap == 0){
				printed = 1;
				fprint(2, "%s: entire cache is busy, %d dirty "
					"-- waking flush thread\n",
					argv0, c->ndirty);
			}
		}
		if(printed)
			fprint(2, "%s: cache is okay again, %d dirty\n",
				argv0, c->ndirty);
	}

	b = c->heap[0];
	heapDel(b);

	assert(b->heap == BadHeap);
	assert(b->ref == 0);
	assert(b->iostate != BioDirty && b->iostate != BioReading && b->iostate != BioWriting);
	assert(b->prior == nil);
	assert(b->uhead == nil);

	/*
	 * unchain the block from hash chain
	 */
	if(b->prev){
		*(b->prev) = b->next;
		if(b->next)
			b->next->prev = b->prev;
		b->prev = nil;
	}


if(0)fprint(2, "%s: dropping %d:%x:%V\n", argv0, b->part, b->addr, b->score);
	/* set block to a reasonable state */
	b->ref = 1;
	b->part = PartError;
	memset(&b->l, 0, sizeof(b->l));
	b->iostate = BioEmpty;

	return b;
}

/*
 * look for a particular version of the block in the memory cache.
 */
static Block *
_cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers,
	int waitlock, int *lockfailure)
{
	Block *b;
	ulong h;

	h = addr % c->hashSize;

	if(lockfailure)
		*lockfailure = 0;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr)
			break;
	}
	if(b == nil || b->vers != vers){
		vtUnlock(c->lk);
		return nil;
	}
	if(!waitlock && !vtCanLock(b->lk)){
		*lockfailure = 1;
		vtUnlock(c->lk);
		return nil;
	}
	heapDel(b);
	b->ref++;
	vtUnlock(c->lk);

	bwatchLock(b);
	if(waitlock)
		vtLock(b->lk);
	b->nlock = 1;

	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioEmpty:
		case BioLabel:
		case BioClean:
		case BioDirty:
			if(b->vers != vers){
				blockPut(b);
				return nil;
			}
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioVentiError:
			blockPut(b);
			vtSetError("venti i/o error block 0x%.8ux", addr);
			return nil;
		case BioReadError:
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}
static Block*
cacheLocalLookup(Cache *c, int part, u32int addr, u32int vers)
{
	return _cacheLocalLookup(c, part, addr, vers, Waitlock, 0);
}
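
/*
 * A note on vers: a block's version is bumped each time it goes out
 * to disk, so (part, addr, vers) names one incarnation of a block.
 * A nil return from the vers-qualified lookup therefore means that
 * that incarnation is no longer in memory, which blockWrite (below)
 * takes to mean it has already been written.
 */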


/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 */
Block *
_cacheLocal(Cache *c, int part, u32int addr, int mode, u32int epoch)
{
	Block *b;
	ulong h;

	assert(part != PartVenti);

	h = addr % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != part || b->addr != addr)
			continue;
		if(epoch && b->l.epoch != epoch){
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
			vtUnlock(c->lk);
			vtSetError(ELabelMismatch);
			return nil;
		}
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
		b = cacheBumpBlock(c);

		b->part = part;
		b->addr = addr;
		localToGlobal(addr, b->score);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}

	vtUnlock(c->lk);

	/*
	 * BUG: what if the epoch changes right here?
	 * In the worst case, we could end up in some weird
	 * lock loop, because the block we want no longer exists,
	 * and instead we're trying to lock a block we have no
	 * business grabbing.
	 *
	 * For now, I'm not going to worry about it.
	 */

if(0)fprint(2, "%s: cacheLocal: %d: %d %x\n", argv0, getpid(), b->part, b->addr);
	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;

	if(part == PartData && b->iostate == BioEmpty){
		if(!readLabel(c, &b->l, addr)){
			blockPut(b);
			return nil;
		}
		blockSetIOState(b, BioLabel);
	}
	if(epoch && b->l.epoch != epoch){
		blockPut(b);
fprint(2, "%s: _cacheLocal want epoch %ud got %ud\n", argv0, epoch, b->l.epoch);
		vtSetError(ELabelMismatch);
		return nil;
	}

	b->pc = getcallerpc(&c);
	for(;;){
		switch(b->iostate){
		default:
			abort();
		case BioLabel:
			if(mode == OOverWrite)
				/*
				 * leave iostate as BioLabel because data
				 * hasn't been read.
				 */
				return b;
			/* fall through */
		case BioEmpty:
			diskRead(c->disk, b);
			vtSleep(b->ioready);
			break;
		case BioClean:
		case BioDirty:
			return b;
		case BioReading:
		case BioWriting:
			vtSleep(b->ioready);
			break;
		case BioReadError:
			blockSetIOState(b, BioEmpty);
			blockPut(b);
			vtSetError("error reading block 0x%.8ux", addr);
			return nil;
		}
	}
	/* NOT REACHED */
}

Block *
cacheLocal(Cache *c, int part, u32int addr, int mode)
{
	return _cacheLocal(c, part, addr, mode, 0);
}

/*
 * fetch a local (on-disk) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type.
 */
Block *
cacheLocalData(Cache *c, u32int addr, int type, u32int tag, int mode, u32int epoch)
{
	Block *b;

	b = _cacheLocal(c, PartData, addr, mode, epoch);
	if(b == nil)
		return nil;
	if(b->l.type != type || b->l.tag != tag){
		fprint(2, "%s: cacheLocalData: addr=%d type got %d exp %d: tag got %ux exp %ux\n",
			argv0, addr, b->l.type, type, b->l.tag, tag);
		vtSetError(ELabelMismatch);
		blockPut(b);
		return nil;
	}
	b->pc = getcallerpc(&c);
	return b;
}

/*
 * fetch a global (Venti) block from the memory cache.
 * if it's not there, load it, bumping some other block.
 * check tag and type if it's really a local block in disguise.
 */
Block *
cacheGlobal(Cache *c, uchar score[VtScoreSize], int type, u32int tag, int mode)
{
	int n;
	Block *b;
	ulong h;
	u32int addr;

	addr = globalToLocal(score);
	if(addr != NilBlock){
		b = cacheLocalData(c, addr, type, tag, mode, 0);
		if(b)
			b->pc = getcallerpc(&c);
		return b;
	}

	h = (u32int)(score[0]|(score[1]<<8)|(score[2]<<16)|(score[3]<<24)) % c->hashSize;

	/*
	 * look for the block in the cache
	 */
	vtLock(c->lk);
	for(b = c->heads[h]; b != nil; b = b->next){
		if(b->part != PartVenti || memcmp(b->score, score, VtScoreSize) != 0 || b->l.type != type)
			continue;
		heapDel(b);
		b->ref++;
		break;
	}

	if(b == nil){
if(0)fprint(2, "%s: cacheGlobal %V %d\n", argv0, score, type);

		b = cacheBumpBlock(c);

		b->part = PartVenti;
		b->addr = NilBlock;
		b->l.type = type;
		memmove(b->score, score, VtScoreSize);

		/* chain onto correct hash */
		b->next = c->heads[h];
		c->heads[h] = b;
		if(b->next != nil)
			b->next->prev = &b->next;
		b->prev = &c->heads[h];
	}
	vtUnlock(c->lk);

	bwatchLock(b);
	vtLock(b->lk);
	b->nlock = 1;
	b->pc = getcallerpc(&c);

	switch(b->iostate){
	default:
		abort();
	case BioEmpty:
		n = vtRead(c->z, score, vtType[type], b->data, c->size);
		if(n < 0 || !vtSha1Check(score, b->data, n)){
			blockSetIOState(b, BioVentiError);
			blockPut(b);
			vtSetError(
			"venti error reading block %V or wrong score: %r",
				score);
			return nil;
		}
		vtZeroExtend(vtType[type], b->data, n, c->size);
		blockSetIOState(b, BioClean);
		return b;
	case BioClean:
		return b;
	case BioVentiError:
		blockPut(b);
		vtSetError("venti i/o error or wrong score, block %V", score);
		return nil;
	case BioReadError:
		blockPut(b);
		vtSetError("error reading block %V", b->score);
		return nil;
	}
	/* NOT REACHED */
}

/*
 * allocate a new on-disk block and load it into the memory cache.
 * BUG: if the disk is full, should we flush some of it to Venti?
 */
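/*
 * The scan below: starting just past fl->last, walk the label blocks
 * looking for an entry that is either free or closed in an epoch no
 * snapshot can still see (epochClose <= epochLow, or the block was
 * created and closed in the same epoch), wrapping around the data
 * partition at most twice before declaring the disk full.
 */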
static u32int lastAlloc;

Block *
cacheAllocBlock(Cache *c, int type, u32int tag, u32int epoch, u32int epochLow)
{
	FreeList *fl;
	u32int addr;
	Block *b;
	int n, nwrap;
	Label lab;

	n = c->size / LabelSize;
	fl = c->fl;

	vtLock(fl->lk);
	addr = fl->last;
	b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx %R\n", argv0);
		vtUnlock(fl->lk);
		return nil;
	}
	nwrap = 0;
	for(;;){
		if(++addr >= fl->end){
			addr = 0;
			if(++nwrap >= 2){
				blockPut(b);
				vtSetError("disk is full");
				/*
				 * try to avoid a continuous spew of console
				 * messages.
				 */
				if (fl->last != 0)
					fprint(2, "%s: cacheAllocBlock: xxx1 %R\n",
						argv0);
				fl->last = 0;
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fl->last = addr;
				fprint(2, "%s: cacheAllocBlock: xxx2 %R\n", argv0);
				vtUnlock(fl->lk);
				return nil;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			goto Found;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			goto Found;
	}
Found:
	blockPut(b);
	b = cacheLocal(c, PartData, addr, OOverWrite);
	if(b == nil){
		fprint(2, "%s: cacheAllocBlock: xxx3 %R\n", argv0);
		return nil;
	}
assert(b->iostate == BioLabel || b->iostate == BioClean);
	fl->last = addr;
	lab.type = type;
	lab.tag = tag;
	lab.state = BsAlloc;
	lab.epoch = epoch;
	lab.epochClose = ~(u32int)0;
	if(!blockSetLabel(b, &lab, 1)){
		fprint(2, "%s: cacheAllocBlock: xxx4 %R\n", argv0);
		blockPut(b);
		return nil;
	}
	vtZeroExtend(vtType[type], b->data, 0, c->size);
if(0)diskWrite(c->disk, b);

if(0)fprint(2, "%s: fsAlloc %ud type=%d tag = %ux\n", argv0, addr, type, tag);
	lastAlloc = addr;
	fl->nused++;
	vtUnlock(fl->lk);
	b->pc = getcallerpc(&c);
	return b;
}

int
cacheDirty(Cache *c)
{
	return c->ndirty;
}

void
cacheCountUsed(Cache *c, u32int epochLow, u32int *used, u32int *total, u32int *bsize)
{
	int n;
	u32int addr, nused;
	Block *b;
	Label lab;
	FreeList *fl;

	fl = c->fl;
	n = c->size / LabelSize;
	*bsize = c->size;
	vtLock(fl->lk);
	if(fl->epochLow == epochLow){
		*used = fl->nused;
		*total = fl->end;
		vtUnlock(fl->lk);
		return;
	}
	b = nil;
	nused = 0;
	for(addr=0; addr<fl->end; addr++){
		if(addr%n == 0){
			blockPut(b);
			b = cacheLocal(c, PartLabel, addr/n, OReadOnly);
			if(b == nil){
				fprint(2, "%s: flCountUsed: loading %ux: %R\n",
					argv0, addr/n);
				break;
			}
		}
		if(!labelUnpack(&lab, b->data, addr%n))
			continue;
		if(lab.state == BsFree)
			continue;
		if(lab.state&BsClosed)
		if(lab.epochClose <= epochLow || lab.epoch==lab.epochClose)
			continue;
		nused++;
	}
	blockPut(b);
	if(addr == fl->end){
		fl->nused = nused;
		fl->epochLow = epochLow;
	}
	*used = nused;
	*total = fl->end;
	vtUnlock(fl->lk);
	return;
}

static FreeList *
flAlloc(u32int end)
{
	FreeList *fl;

	fl = vtMemAllocZ(sizeof(*fl));
	fl->lk = vtLockAlloc();
	fl->last = 0;
	fl->end = end;
	return fl;
}

static void
flFree(FreeList *fl)
{
	vtLockFree(fl->lk);
	vtMemFree(fl);
}

u32int
cacheLocalSize(Cache *c, int part)
{
	return diskSize(c->disk, part);
}

/*
 * The thread that has locked b may refer to it by
 * multiple names.  Nlock counts the number of
 * references the locking thread holds.  It will call
 * blockPut once per reference.
 */
void
blockDupLock(Block *b)
{
	assert(b->nlock > 0);
	b->nlock++;
}

/*
 * we're done with the block.
 * unlock it.  can't use it after calling this.
 */
void
blockPut(Block* b)
{
	Cache *c;

	if(b == nil)
		return;

if(0)fprint(2, "%s: blockPut: %d: %d %x %d %s\n", argv0, getpid(), b->part, b->addr, b->c->nheap, bioStr(b->iostate));

	if(b->iostate == BioDirty)
		bwatchDependency(b);

	if(--b->nlock > 0)
		return;

	/*
	 * b->nlock should probably stay at zero while
	 * the block is unlocked, but diskThread and vtSleep
	 * conspire to assume that they can just vtLock(b->lk); blockPut(b),
	 * so we have to keep b->nlock set to 1 even
	 * when the block is unlocked.
	 */
	assert(b->nlock == 0);
	b->nlock = 1;
//	b->pc = 0;

	bwatchUnlock(b);
	vtUnlock(b->lk);
	c = b->c;
	vtLock(c->lk);

	if(--b->ref > 0){
		vtUnlock(c->lk);
		return;
	}

	assert(b->ref == 0);
	switch(b->iostate){
	default:
		b->used = c->now++;
		heapIns(b);
		break;
	case BioEmpty:
	case BioLabel:
		if(c->nheap == 0)
			b->used = c->now++;
		else
			b->used = c->heap[0]->used;
		heapIns(b);
		break;
	case BioDirty:
		break;
	}
	vtUnlock(c->lk);
}

/*
 * set the label associated with a block.
 */
Block*
_blockSetLabel(Block *b, Label *l)
{
	int lpb;
	Block *bb;
	u32int a;
	Cache *c;

	c = b->c;

	assert(b->part == PartData);
	assert(b->iostate == BioLabel || b->iostate == BioClean || b->iostate == BioDirty);
	lpb = c->size / LabelSize;
	a = b->addr / lpb;
	bb = cacheLocal(c, PartLabel, a, OReadWrite);
	if(bb == nil){
		blockPut(b);
		return nil;
	}
	b->l = *l;
	labelPack(l, bb->data, b->addr%lpb);
	blockDirty(bb);
	return bb;
}

int
blockSetLabel(Block *b, Label *l, int allocating)
{
	Block *lb;
	Label oldl;

	oldl = b->l;
	lb = _blockSetLabel(b, l);
	if(lb == nil)
		return 0;

	/*
	 * If we're allocating the block, make sure the label (bl)
	 * goes to disk before the data block (b) itself.  This is to help
	 * the blocks that in turn depend on b.
	 *
	 * Suppose bx depends on (must be written out after) b.
	 * Once we write b we'll think it's safe to write bx.
	 * Bx can't get at b unless it has a valid label, though.
	 *
	 * Allocation is the only case in which having a current label
	 * is vital because:
	 *
	 *	- l.type is set at allocation and never changes.
	 *	- l.tag is set at allocation and never changes.
	 *	- l.state is not checked when we load blocks.
	 *	- the archiver cares deeply about l.state being
	 *		BaActive vs. BaCopied, but that's handled
	 *		by direct calls to _blockSetLabel.
	 */

	if(allocating)
		blockDependency(b, lb, -1, nil, nil);
	blockPut(lb);
	return 1;
}

/*
 * Record that bb must be written out before b.
 * If index is given, we're about to overwrite the score/e
 * at that index in the block.  Save the old value so we
 * can write a safer ``old'' version of the block if pressed.
 */
void
blockDependency(Block *b, Block *bb, int index, uchar *score, Entry *e)
{
	BList *p;

	if(bb->iostate == BioClean)
		return;

	/*
	 * Dependencies for blocks containing Entry structures
	 * or scores must always be explained.  The problem with
	 * only explaining some of them is this.  Suppose we have two
	 * dependencies for the same field, the first explained
	 * and the second not.  We try to write the block when the first
	 * dependency is not written but the second is.  We will roll back
	 * the first change even though the second trumps it.
	 */
	if(index == -1 && bb->part == PartData)
		assert(b->l.type == BtData);

	if(bb->iostate != BioDirty){
		fprint(2, "%s: %d:%x:%d iostate is %d in blockDependency\n",
			argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
		abort();
	}

	p = blistAlloc(bb);
	if(p == nil)
		return;

	assert(bb->iostate == BioDirty);
if(0)fprint(2, "%s: %d:%x:%d depends on %d:%x:%d\n", argv0, b->part, b->addr, b->l.type, bb->part, bb->addr, bb->l.type);

	p->part = bb->part;
	p->addr = bb->addr;
	p->type = bb->l.type;
	p->vers = bb->vers;
	p->index = index;
	if(p->index >= 0){
		/*
		 * This test would just be b->l.type==BtDir except
		 * we need to exclude the super block.
		 */
		if(b->l.type == BtDir && b->part == PartData)
			entryPack(e, p->old.entry, 0);
		else
			memmove(p->old.score, score, VtScoreSize);
	}
	p->next = b->prior;
	b->prior = p;
}
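
/*
 * An illustrative sequence (a sketch, not a call site in this file):
 * a caller overwriting pointer slot i of a dirty block b with the
 * score of a still-dirty child bb records the ordering first:
 *
 *	blockDependency(b, bb, i, oldscore, nil);
 *	memmove(b->data+i*VtScoreSize, bb->score, VtScoreSize);
 *	blockDirty(b);
 *
 * If b has to go to disk while bb is still dirty, blockRollback
 * (below) uses the saved oldscore to write a safe older version of b.
 */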

/*
 * Mark an in-memory block as dirty.  If there are too many
 * dirty blocks, start writing some out to disk.
 *
 * If there were way too many dirty blocks, we used to
 * try to do some flushing ourselves, but it's just too dangerous --
 * it implies that the callers cannot have any of our priors locked,
 * but this is hard to avoid in some cases.
 */
int
blockDirty(Block *b)
{
	Cache *c;

	c = b->c;

	assert(b->part != PartVenti);

	if(b->iostate == BioDirty)
		return 1;
	assert(b->iostate == BioClean || b->iostate == BioLabel);

	vtLock(c->lk);
	b->iostate = BioDirty;
	c->ndirty++;
	if(c->ndirty > (c->maxdirty>>1))
		vtWakeup(c->flush);
	vtUnlock(c->lk);

	return 1;
}

/*
 * We've decided to write out b.  Maybe b has some pointers to blocks
 * that haven't yet been written to disk.  If so, construct a slightly out-of-date
 * copy of b that is safe to write out.  (diskThread will make sure the block
 * remains marked as dirty.)
 */
uchar *
blockRollback(Block *b, uchar *buf)
{
	u32int addr;
	BList *p;
	Super super;

	/* easy case */
	if(b->prior == nil)
		return b->data;

	memmove(buf, b->data, b->c->size);
	for(p=b->prior; p; p=p->next){
		/*
		 * we know p->index >= 0 because blockWrite has vetted this block for us.
		 */
		assert(p->index >= 0);
		assert(b->part == PartSuper || (b->part == PartData && b->l.type != BtData));
		if(b->part == PartSuper){
			assert(p->index == 0);
			superUnpack(&super, buf);
			addr = globalToLocal(p->old.score);
			if(addr == NilBlock){
				fprint(2, "%s: rolling back super block: "
					"bad replacement addr %V\n",
					argv0, p->old.score);
				abort();
			}
			super.active = addr;
			superPack(&super, buf);
			continue;
		}
		if(b->l.type == BtDir)
			memmove(buf+p->index*VtEntrySize, p->old.entry, VtEntrySize);
		else
			memmove(buf+p->index*VtScoreSize, p->old.score, VtScoreSize);
	}
	return buf;
}

/*
 * Try to write block b.
 * If b depends on other blocks:
 *
 *	If the block has been written out, remove the dependency.
 *	If the dependency is replaced by a more recent dependency,
 *		throw it out.
 *	If we know how to write out an old version of b that doesn't
 *		depend on it, do that.
 *
 *	Otherwise, bail.
 */
int
blockWrite(Block *b, int waitlock)
{
	uchar *dmap;
	Cache *c;
	BList *p, **pp;
	Block *bb;
	int lockfail;

	c = b->c;

	if(b->iostate != BioDirty)
		return 1;

	dmap = b->dmap;
	memset(dmap, 0, c->ndmap);
	pp = &b->prior;
	for(p=*pp; p; p=*pp){
		if(p->index >= 0){
			/* more recent dependency has succeeded; this one can go */
			if(dmap[p->index/8] & (1<<(p->index%8)))
				goto ignblock;
		}

		lockfail = 0;
		bb = _cacheLocalLookup(c, p->part, p->addr, p->vers, waitlock,
			&lockfail);
		if(bb == nil){
			if(lockfail)
				return 0;
			/* block not in cache => was written already */
			if(p->index >= 0)
				dmap[p->index/8] |= 1<<(p->index%8);
			goto ignblock;
		}

		/*
		 * same version of block is still in cache.
		 *
		 * the assertion is true because the block still has version p->vers,
		 * which means it hasn't been written out since we last saw it.
		 */
		if(bb->iostate != BioDirty){
			fprint(2, "%s: %d:%x:%d iostate is %d in blockWrite\n",
				argv0, bb->part, bb->addr, bb->l.type, bb->iostate);
			/* probably BioWriting if it happens? */
			if(bb->iostate == BioClean)
				goto ignblock;
		}

		blockPut(bb);

		if(p->index < 0){
			/*
			 * We don't know how to temporarily undo
			 * b's dependency on bb, so just don't write b yet.
			 */
			if(0) fprint(2, "%s: blockWrite skipping %d %x %d %d; need to write %d %x %d\n",
				argv0, b->part, b->addr, b->vers, b->l.type, p->part, p->addr, bb->vers);
			return 0;
		}
		/* keep walking down the list */
		pp = &p->next;
		continue;

ignblock:
		*pp = p->next;
		blistFree(c, p);
		continue;
	}

	/*
	 * DiskWrite must never be called with a double-locked block.
	 * This call to diskWrite is okay because blockWrite is only called
	 * from the cache flush thread, which never double-locks a block.
	 */
	diskWrite(c->disk, b);
	return 1;
}

/*
 * Change the I/O state of block b.
 * Just an assignment except for magic in
 * switch statement (read comments there).
 */
void
blockSetIOState(Block *b, int iostate)
{
	int dowakeup;
	Cache *c;
	BList *p, *q;

if(0) fprint(2, "%s: iostate part=%d addr=%x %s->%s\n", argv0, b->part, b->addr, bioStr(b->iostate), bioStr(iostate));

	c = b->c;

	dowakeup = 0;
	switch(iostate){
	default:
		abort();
	case BioEmpty:
		assert(!b->uhead);
		break;
	case BioLabel:
		assert(!b->uhead);
		break;
	case BioClean:
		bwatchDependency(b);
		/*
		 * If b->prior is set, it means a write just finished.
		 * The prior list isn't needed anymore.
		 */
		for(p=b->prior; p; p=q){
			q = p->next;
			blistFree(c, p);
		}
		b->prior = nil;
		/*
		 * Freeing a block or just finished a write.
		 * Move the blocks from the per-block unlink
		 * queue to the cache unlink queue.
		 */
		if(b->iostate == BioDirty || b->iostate == BioWriting){
			vtLock(c->lk);
			c->ndirty--;
			b->iostate = iostate;	/* change here to keep in sync with ndirty */
			b->vers = c->vers++;
			if(b->uhead){
				/* add unlink blocks to unlink queue */
				if(c->uhead == nil){
					c->uhead = b->uhead;
					vtWakeup(c->unlink);
				}else
					c->utail->next = b->uhead;
				c->utail = b->utail;
				b->uhead = nil;
			}
			vtUnlock(c->lk);
		}
		assert(!b->uhead);
		dowakeup = 1;
		break;
	case BioDirty:
		/*
		 * Wrote out an old version of the block (see blockRollback).
		 * Bump a version count, leave it dirty.
		 */
		if(b->iostate == BioWriting){
			vtLock(c->lk);
			b->vers = c->vers++;
			vtUnlock(c->lk);
			dowakeup = 1;
		}
		break;
	case BioReading:
	case BioWriting:
		/*
		 * Adding block to disk queue.  Bump reference count.
		 * diskThread decs the count later by calling blockPut.
		 * This is here because we need to lock c->lk to
		 * manipulate the ref count.
		 */
		vtLock(c->lk);
		b->ref++;
		vtUnlock(c->lk);
		break;
	case BioReadError:
	case BioVentiError:
		/*
		 * Oops.
		 */
		dowakeup = 1;
		break;
	}
	b->iostate = iostate;
	/*
	 * Now that the state has changed, we can wake the waiters.
	 */
	if(dowakeup)
		vtWakeupAll(b->ioready);
}

/*
 * The active file system is a tree of blocks.
 * When we add snapshots to the mix, the entire file system
 * becomes a dag and thus requires a bit more care.
 *
 * The life of the file system is divided into epochs.  A snapshot
 * ends one epoch and begins the next.  Each file system block
 * is marked with the epoch in which it was created (b.epoch).
 * When the block is unlinked from the file system (closed), it is marked
 * with the epoch in which it was removed (b.epochClose).
 * Once we have discarded or archived all snapshots up to
 * b.epochClose, we can reclaim the block.
 *
 * If a block was created in a past epoch but is not yet closed,
 * it is treated as copy-on-write.  Of course, in order to insert the
 * new pointer into the tree, the parent must be made writable,
 * and so on up the tree.  The recursion stops because the root
 * block is always writable.
 *
 * If blocks are never closed, they will never be reused, and
 * we will run out of disk space.  But marking a block as closed
 * requires some care about dependencies and write orderings.
 *
 * (1) If a block p points at a copy-on-write block b and we
 * copy b to create bb, then p must be written out after bb and
 * lbb (bb's label block).
 *
 * (2) We have to mark b as closed, but only after we switch
 * the pointer, so lb must be written out after p.  In fact, we
 * can't even update the in-memory copy, or the cache might
 * mistakenly give out b for reuse before p gets written.
 *
 * CacheAllocBlock's call to blockSetLabel records a "bb after lbb" dependency.
 * The caller is expected to record a "p after bb" dependency
 * to finish (1), and also expected to call blockRemoveLink
 * to arrange for (2) to happen once p is written.
 *
 * Until (2) happens, some pieces of the code (e.g., the archiver)
 * still need to know whether a block has been copied, so we
 * set the BsCopied bit in the label and force that to disk *before*
 * the copy gets written out.
 */
Block*
blockCopy(Block *b, u32int tag, u32int ehi, u32int elo)
{
	Block *bb, *lb;
	Label l;

	if((b->l.state&BsClosed) || b->l.epoch >= ehi)
		fprint(2, "%s: blockCopy %#ux %L but fs is [%ud,%ud]\n",
			argv0, b->addr, &b->l, elo, ehi);

	bb = cacheAllocBlock(b->c, b->l.type, tag, ehi, elo);
	if(bb == nil){
		blockPut(b);
		return nil;
	}

	/*
	 * Update label so we know the block has been copied.
	 * (It will be marked closed once it has been unlinked from
	 * the tree.)  This must follow cacheAllocBlock since we
	 * can't be holding onto lb when we call cacheAllocBlock.
	 */
	if((b->l.state&BsCopied)==0)
	if(b->part == PartData){	/* not the superblock */
		l = b->l;
		l.state |= BsCopied;
		lb = _blockSetLabel(b, &l);
		if(lb == nil){
			/* can't set label => can't copy block */
			blockPut(b);
			l.type = BtMax;
			l.state = BsFree;
			l.epoch = 0;
			l.epochClose = 0;
			l.tag = 0;
			blockSetLabel(bb, &l, 0);
			blockPut(bb);
			return nil;
		}
		blockDependency(bb, lb, -1, nil, nil);
		blockPut(lb);
	}

	memmove(bb->data, b->data, b->c->size);
	blockDirty(bb);
	blockPut(b);
	return bb;
}

/*
 * Block b once pointed at the block bb at addr/type/tag, but no longer does.
 * If recurse is set, we are unlinking all of bb's children as well.
 *
 * We can't reclaim bb (or its kids) until the block b gets written to disk.  We add
 * the relevant information to b's list of unlinked blocks.  Once b is written,
 * the list will be queued for processing.
 *
 * If b depends on bb, it doesn't anymore, so we remove bb from the prior list.
 */
void
blockRemoveLink(Block *b, u32int addr, int type, u32int tag, int recurse)
{
	BList *p, **pp, bl;

	/* remove bb from prior list */
	for(pp=&b->prior; (p=*pp)!=nil; ){
		if(p->part == PartData && p->addr == addr){
			*pp = p->next;
			blistFree(b->c, p);
		}else
			pp = &p->next;
	}

	bl.part = PartData;
	bl.addr = addr;
	bl.type = type;
	bl.tag = tag;
	if(b->l.epoch == 0)
		assert(b->part == PartSuper);
	bl.epoch = b->l.epoch;
	bl.next = nil;
	bl.recurse = recurse;

	if(b->part == PartSuper && b->iostate == BioClean)
		p = nil;
	else
		p = blistAlloc(b);
	if(p == nil){
		/*
		 * b has already been written to disk.
		 */
		doRemoveLink(b->c, &bl);
		return;
	}

	/* Uhead is only processed when the block goes from Dirty -> Clean */
	assert(b->iostate == BioDirty);

	*p = bl;
	if(b->uhead == nil)
		b->uhead = p;
	else
		b->utail->next = p;
	b->utail = p;
}

/*
 * Process removal of a single block and perhaps its children.
 */
static void
doRemoveLink(Cache *c, BList *p)
{
	int i, n, recurse;
	u32int a;
	Block *b;
	Label l;
	BList bl;

	recurse = (p->recurse && p->type != BtData && p->type != BtDir);

	/*
	 * We're not really going to overwrite b, but if we're not
	 * going to look at its contents, there is no point in reading
	 * them from the disk.
	 */
	b = cacheLocalData(c, p->addr, p->type, p->tag, recurse ? OReadOnly : OOverWrite, 0);
	if(b == nil)
		return;

	/*
	 * When we're unlinking from the superblock, close with the next epoch.
	 */
	if(p->epoch == 0)
		p->epoch = b->l.epoch+1;

	/* sanity check */
	if(b->l.epoch > p->epoch){
		fprint(2, "%s: doRemoveLink: strange epoch %ud > %ud\n",
			argv0, b->l.epoch, p->epoch);
		blockPut(b);
		return;
	}

	if(recurse){
		n = c->size / VtScoreSize;
		for(i=0; i<n; i++){
			a = globalToLocal(b->data + i*VtScoreSize);
			if(a == NilBlock || !readLabel(c, &l, a))
				continue;
			if(l.state&BsClosed)
				continue;
			/*
			 * If stack space becomes an issue...
			p->addr = a;
			p->type = l.type;
			p->tag = l.tag;
			doRemoveLink(c, p);
			 */

			bl.part = PartData;
			bl.addr = a;
			bl.type = l.type;
			bl.tag = l.tag;
			bl.epoch = p->epoch;
			bl.next = nil;
			bl.recurse = 1;
			/* give up the block lock - share with others */
			blockPut(b);
			doRemoveLink(c, &bl);
			b = cacheLocalData(c, p->addr, p->type, p->tag, OReadOnly, 0);
			if(b == nil){
				fprint(2, "%s: warning: lost block in doRemoveLink\n",
					argv0);
				return;
			}
		}
	}

	l = b->l;
	l.state |= BsClosed;
	l.epochClose = p->epoch;
	if(l.epochClose == l.epoch){
		vtLock(c->fl->lk);
		if(l.epoch == c->fl->epochLow)
			c->fl->nused--;
		blockSetLabel(b, &l, 0);
		vtUnlock(c->fl->lk);
	}else
		blockSetLabel(b, &l, 0);
	blockPut(b);
}

/*
 * Allocate a BList so that we can record a dependency
 * or queue a removal related to block b.
 * If we can't find a BList, we write out b and return nil.
 */
static BList *
blistAlloc(Block *b)
{
	Cache *c;
	BList *p;

	if(b->iostate != BioDirty){
		/*
		 * should not happen anymore -
		 * blockDirty used to flush but no longer does.
		 */
		assert(b->iostate == BioClean);
		fprint(2, "%s: blistAlloc: called on clean block\n", argv0);
		return nil;
	}

	c = b->c;
	vtLock(c->lk);
	if(c->blfree == nil){
		/*
		 * No free BLists.  What are our options?
		 */

		/* Block has no priors? Just write it. */
		if(b->prior == nil){
			vtUnlock(c->lk);
			diskWriteAndWait(c->disk, b);
			return nil;
		}

		/*
		 * Wake the flush thread, which will hopefully free up
		 * some BLists for us.  We used to flush a block from
		 * our own prior list and reclaim that BList, but this is
		 * a no-no: some of the blocks on our prior list may
		 * be locked by our caller.  Or maybe their label blocks
		 * are locked by our caller.  In any event, it's too hard
		 * to make sure we can do I/O for ourselves.  Instead,
		 * we assume the flush thread will find something.
		 * (The flush thread never blocks waiting for a block,
		 * so it can't deadlock like we can.)
		 */
		while(c->blfree == nil){
			vtWakeup(c->flush);
			vtSleep(c->blrend);
			if(c->blfree == nil)
				fprint(2, "%s: flushing for blists\n", argv0);
		}
	}

	p = c->blfree;
	c->blfree = p->next;
	vtUnlock(c->lk);
	return p;
}

static void
blistFree(Cache *c, BList *bl)
{
	vtLock(c->lk);
	bl->next = c->blfree;
	c->blfree = bl;
	vtWakeup(c->blrend);
	vtUnlock(c->lk);
}

char*
bsStr(int state)
{
	static char s[100];

	if(state == BsFree)
		return "Free";
	if(state == BsBad)
		return "Bad";

	sprint(s, "%x", state);
	if(!(state&BsAlloc))
		strcat(s, ",Free");	/* should not happen */
	if(state&BsCopied)
		strcat(s, ",Copied");
	if(state&BsVenti)
		strcat(s, ",Venti");
	if(state&BsClosed)
		strcat(s, ",Closed");
	return s;
}
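
/*
 * Note that bsStr formats into a single static buffer, so the result
 * is only good until the next call and the routine is not thread
 * safe; the debugging prints above consume it immediately.
 */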

char *
bioStr(int iostate)
{
	switch(iostate){
	default:
		return "Unknown!!";
	case BioEmpty:
		return "Empty";
	case BioLabel:
		return "Label";
	case BioClean:
		return "Clean";
	case BioDirty:
		return "Dirty";
	case BioReading:
		return "Reading";
	case BioWriting:
		return "Writing";
	case BioReadError:
		return "ReadError";
	case BioVentiError:
		return "VentiError";
	case BioMax:
		return "Max";
	}
}

static char *bttab[] = {
	"BtData",
	"BtData+1",
	"BtData+2",
	"BtData+3",
	"BtData+4",
	"BtData+5",
	"BtData+6",
	"BtData+7",
	"BtDir",
	"BtDir+1",
	"BtDir+2",
	"BtDir+3",
	"BtDir+4",
	"BtDir+5",
	"BtDir+6",
	"BtDir+7",
};

char*
btStr(int type)
{
	if(type < nelem(bttab))
		return bttab[type];
	return "unknown";
}

int
labelFmt(Fmt *f)
{
	Label *l;

	l = va_arg(f->args, Label*);
	return fmtprint(f, "%s,%s,e=%ud,%d,tag=%#ux",
		btStr(l->type), bsStr(l->state), l->epoch, (int)l->epochClose, l->tag);
}

int
scoreFmt(Fmt *f)
{
	uchar *v;
	int i;
	u32int addr;

	v = va_arg(f->args, uchar*);
	if(v == nil){
		fmtprint(f, "*");
	}else if((addr = globalToLocal(v)) != NilBlock)
		fmtprint(f, "0x%.8ux", addr);
	else{
		for(i = 0; i < VtScoreSize; i++)
			fmtprint(f, "%2.2ux", v[i]);
	}

	return 0;
}
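
/*
 * The LRU heap below is keyed on b->used, a timestamp taken from the
 * u32int tick counter c->now.  Comparisons are done on b->used - now
 * rather than on b->used itself so that the ordering remains correct
 * in unsigned modular arithmetic even after the counter wraps.
 */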

static int
upHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int p;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; i != 0; i = p){
		p = (i - 1) >> 1;
		bb = c->heap[p];
		if(b->used - now >= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;

	return i;
}

static int
downHeap(int i, Block *b)
{
	Block *bb;
	u32int now;
	int k;
	Cache *c;

	c = b->c;
	now = c->now;
	for(; ; i = k){
		k = (i << 1) + 1;
		if(k >= c->nheap)
			break;
		if(k + 1 < c->nheap && c->heap[k]->used - now > c->heap[k + 1]->used - now)
			k++;
		bb = c->heap[k];
		if(b->used - now <= bb->used - now)
			break;
		c->heap[i] = bb;
		bb->heap = i;
	}
	c->heap[i] = b;
	b->heap = i;
	return i;
}

/*
 * Delete a block from the heap.
 * Called with c->lk held.
 */
static void
heapDel(Block *b)
{
	int i, si;
	Cache *c;

	c = b->c;

	si = b->heap;
	if(si == BadHeap)
		return;
	b->heap = BadHeap;
	c->nheap--;
	if(si == c->nheap)
		return;
	b = c->heap[c->nheap];
	i = upHeap(si, b);
	if(i == si)
		downHeap(i, b);
}

/*
 * Insert a block into the heap.
 * Called with c->lk held.
 */
static void
heapIns(Block *b)
{
	assert(b->heap == BadHeap);
	upHeap(b->c->nheap++, b);
	vtWakeup(b->c->heapwait);
}

/*
 * Get just the label for a block.
 */
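/*
 * Labels are packed lpb = c->size/LabelSize to a label block, so the
 * label for data block addr lives in label-partition block addr/lpb
 * at offset addr%lpb -- the same arithmetic _blockSetLabel uses.
 */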
int
readLabel(Cache *c, Label *l, u32int addr)
{
	int lpb;
	Block *b;
	u32int a;

	lpb = c->size / LabelSize;
	a = addr / lpb;
	b = cacheLocal(c, PartLabel, a, OReadOnly);
	if(b == nil){
		blockPut(b);
		return 0;
	}

	if(!labelUnpack(l, b->data, addr%lpb)){
		blockPut(b);
		return 0;
	}
	blockPut(b);
	return 1;
}

/*
 * Process unlink queue.
 * Called with c->lk held.
 */
static void
unlinkBody(Cache *c)
{
	BList *p;

	while(c->uhead != nil){
		p = c->uhead;
		c->uhead = p->next;
		vtUnlock(c->lk);
		doRemoveLink(c, p);
		vtLock(c->lk);
		p->next = c->blfree;
		c->blfree = p;
	}
}

/*
 * Occasionally unlink the blocks on the cache unlink queue.
 */
static void
unlinkThread(void *a)
{
	Cache *c = a;

	vtThreadSetName("unlink");

	vtLock(c->lk);
	for(;;){
		while(c->uhead == nil && c->die == nil)
			vtSleep(c->unlink);
		if(c->die != nil)
			break;
		unlinkBody(c);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

static int
baddrCmp(void *a0, void *a1)
{
	BAddr *b0, *b1;
	b0 = a0;
	b1 = a1;

	if(b0->part < b1->part)
		return -1;
	if(b0->part > b1->part)
		return 1;
	if(b0->addr < b1->addr)
		return -1;
	if(b0->addr > b1->addr)
		return 1;
	return 0;
}

/*
 * Scan the block list for dirty blocks; add them to the list c->baddr.
 */
static void
flushFill(Cache *c)
{
	int i, ndirty;
	BAddr *p;
	Block *b;

	vtLock(c->lk);
	if(c->ndirty == 0){
		vtUnlock(c->lk);
		return;
	}

	p = c->baddr;
	ndirty = 0;
	for(i=0; i<c->nblocks; i++){
		b = c->blocks + i;
		if(b->part == PartError)
			continue;
		if(b->iostate == BioDirty || b->iostate == BioWriting)
			ndirty++;
		if(b->iostate != BioDirty)
			continue;
		p->part = b->part;
		p->addr = b->addr;
		p->vers = b->vers;
		p++;
	}
	if(ndirty != c->ndirty){
		fprint(2, "%s: ndirty mismatch expected %d found %d\n",
			argv0, c->ndirty, ndirty);
		c->ndirty = ndirty;
	}
	vtUnlock(c->lk);

	c->bw = p - c->baddr;
	qsort(c->baddr, c->bw, sizeof(BAddr), baddrCmp);
}

/*
 * This is not thread safe, i.e. it can't be called from multiple threads.
 *
 * It's okay how we use it, because it only gets called in
 * the flushThread.  And cacheFree, but only after
 * cacheFree has killed off the flushThread.
 */
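/*
 * The indices into c->baddr: br is the next entry to try, be is the
 * end of the batch gathered by flushFill, and bw is where entries
 * that could not be written this pass are requeued for the next one.
 */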
static int
cacheFlushBlock(Cache *c)
{
	Block *b;
	BAddr *p;
	int lockfail, nfail;

	nfail = 0;
	for(;;){
		if(c->br == c->be){
			if(c->bw == 0 || c->bw == c->be)
				flushFill(c);
			c->br = 0;
			c->be = c->bw;
			c->bw = 0;
			c->nflush = 0;
		}

		if(c->br == c->be)
			return 0;
		p = c->baddr + c->br;
		c->br++;
		b = _cacheLocalLookup(c, p->part, p->addr, p->vers, Nowaitlock,
			&lockfail);

		if(b && blockWrite(b, Nowaitlock)){
			c->nflush++;
			blockPut(b);
			return 1;
		}
		if(b)
			blockPut(b);

		/*
		 * Why didn't we write the block?
		 */

		/* Block already written out */
		if(b == nil && !lockfail)
			continue;

		/* Failed to acquire lock; sleep if happens a lot. */
		if(lockfail && ++nfail > 100){
			sleep(500);
			nfail = 0;
		}
		/* Requeue block. */
		if(c->bw < c->be)
			c->baddr[c->bw++] = *p;
	}
}

/*
 * Occasionally flush dirty blocks from memory to the disk.
 */
static void
flushThread(void *a)
{
	Cache *c = a;
	int i;

	vtThreadSetName("flush");
	vtLock(c->lk);
	while(c->die == nil){
		vtSleep(c->flush);
		vtUnlock(c->lk);
		for(i=0; i<FlushSize; i++)
			if(!cacheFlushBlock(c)){
				/*
				 * If i==0, could be someone is waking us repeatedly
				 * to flush the cache but there's no work to do.
				 * Pause a little.
				 */
				if(i==0){
					// fprint(2, "%s: flushthread found "
					//	"nothing to flush - %d dirty\n",
					//	argv0, c->ndirty);
					sleep(250);
				}
				break;
			}
		if(i==0 && c->ndirty){
			/*
			 * All the blocks are being written right now -- there's nothing to do.
			 * We might be spinning with cacheFlush though -- he'll just keep
			 * kicking us until c->ndirty goes down.  Probably we should sleep
			 * on something that the diskThread can kick, but for now we'll
			 * just pause for a little while waiting for disks to finish.
			 */
			sleep(100);
		}
		vtLock(c->lk);
		vtWakeupAll(c->flushwait);
	}
	c->ref--;
	vtWakeup(c->die);
	vtUnlock(c->lk);
}

/*
 * Flush the cache.
 */
void
cacheFlush(Cache *c, int wait)
{
	vtLock(c->lk);
	if(wait){
		while(c->ndirty){
		//	consPrint("cacheFlush: %d dirty blocks, uhead %p\n",
		//		c->ndirty, c->uhead);
			vtWakeup(c->flush);
			vtSleep(c->flushwait);
		}
	//	consPrint("cacheFlush: done (uhead %p)\n", c->ndirty, c->uhead);
	}else if(c->ndirty)
		vtWakeup(c->flush);
	vtUnlock(c->lk);
}

/*
 * Kick the flushThread every 30 seconds.
 */
static void
cacheSync(void *v)
{
	Cache *c;

	c = v;
	cacheFlush(c, 0);
}